In [19]:
import pandas as pd
import pickle as pk

In [20]:
def select_k_best(indep, dep, k):
    
    from sklearn.feature_selection import SelectKBest, chi2
    skb=SelectKBest(score_func=chi2, k=3)
    
    # fit the selectkbest object to the data
    skb_fit = skb.fit(indep, dep)
    
    # trasform the data to the Select top  K features
    x_new = skb_fit.transform(indep)
    
    # Get the names of the selected features...
    selected_features = indep.columns[skb.get_support()]
    
    return selected_features, x_new

def standard_scalar(xtrain, xtest):
    ### standard scalar

    from sklearn.preprocessing import StandardScaler
    scx = StandardScaler()
    x_train_scaled = scx.fit_transform(xtrain)
    x_test_scaled = scx.fit_transform (xtest)
    return x_train_scaled, x_test_scaled, scx
   

In [21]:
dataset=pd.read_csv("prep_ds.csv")
#dataset

### separation of indep and dep vars

indep = dataset.drop("HeartDisease", axis=1)
dep = dataset["HeartDisease"]

# Applyting the SelectKbest...
selected_features, x_new = select_k_best(indep, dep, 5)

#### selected features:
print ("Selected Features: ", selected_features)


Selected Features:  Index(['MaxHR', 'ST_Slope_Flat', 'ST_Slope_Up'], dtype='object')


In [22]:
dataset[selected_features]

Unnamed: 0,MaxHR,ST_Slope_Flat,ST_Slope_Up
0,156,0,1
1,156,1,0
2,120,0,1
3,120,1,0
4,122,0,1
...,...,...,...
913,132,1,0
914,141,1,0
915,120,1,0
916,156,1,0


In [23]:
### Separation of Training and Test dataset...
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(x_new, dep, test_size=0.30, random_state=1)

# Standard scalar
x_train_scaled, x_test_scaled, scx= standard_scalar(xtrain, xtest)

In [24]:
#Model creation using CV search and linear model
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

param_grid={'penalty':['l2'],
            'solver':['lbfgs'],
            'class_weight':['balanced']
            }  
model= GridSearchCV(LogisticRegression(),param_grid,scoring='f1_weighted', refit=True, n_jobs=-1,cv=5)
model.fit(x_train_scaled,ytrain)

y_pred= model.predict(x_test_scaled)

In [25]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
cm = confusion_matrix(ytest, y_pred)
clf_report = classification_report(ytest, y_pred)
acc_score = accuracy_score(ytest, y_pred)


print("clf_report",clf_report)

### Accuracy_score
print ("Accuracy : ", acc_score)

clf_report               precision    recall  f1-score   support

           0       0.77      0.77      0.77       109
           1       0.85      0.85      0.85       167

    accuracy                           0.82       276
   macro avg       0.81      0.81      0.81       276
weighted avg       0.82      0.82      0.82       276

Accuracy :  0.8188405797101449


In [26]:
# Save model...
pk.dump(model, open("logisticR_finalized_model", 'wb'))

In [27]:
# save the Standard scalar obj...
pk.dump(scx, open("scx", 'wb'))