In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV  

In [2]:
def select_k_best(indep, dep, n):
    
    from sklearn.feature_selection import SelectKBest, chi2
    skb=SelectKBest(score_func=chi2, k=n)
    
    # fit the selectkbest object to the data
    skb_fit = skb.fit(indep, dep)
    
    # trasform the data to the Select top  K features
    x_new = skb_fit.transform(indep)
    
    # Get the names of the selected features...
    selected_features = indep.columns[skb.get_support()]
    
    return selected_features, x_new

def standard_scalar(xtrain, xtest):
    ### standard scalar

    from sklearn.preprocessing import StandardScaler
    scx = StandardScaler()
    x_train_scaled = scx.fit_transform(xtrain)
    x_test_scaled = scx.fit_transform (xtest)
    return x_train_scaled, x_test_scaled

def metrices(ytest, y_pred):
    # making the confusion matrix, classification_report, accuracy_score

    from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
    cm = confusion_matrix(ytest, y_pred)
    clf_report = classification_report(ytest, y_pred, zero_division=1, output_dict=True)
    acc_score = accuracy_score(ytest, y_pred)
    return cm , clf_report, acc_score
    
def hyperparameter_tuning(clf_base, param_grid, x_train_scaled, ytrain, ytest):
    classifier =GridSearchCV(clf_base, param_grid, refit=True, scoring='f1_weighted',verbose=3, n_jobs=-1,cv=5, error_score=np.nan )
    
    
    model =classifier.fit(x_train_scaled,ytrain)
    y_pred = model.predict(x_test_scaled)
    
    # Get the best Hyperparameters
    print("best_parameter=",classifier.best_params_)
    print("best_estimator=",classifier.best_estimator_)

    return classifier.best_estimator_, classifier.best_params_
    
#### 1. logistic regression    
def logistic(x_train_scaled, x_test_scaled, ytrain, ytest):
    
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import GridSearchCV  
    
    # define the base LogiR model
    clf_base =LogisticRegression(random_state=0)

    # LGR hyperparameter tuning 
    param_grid= [ {'penalty':["l2"], 'solver':['lbfgs', 'saga'], 'class_weight':['balanced'] },
                  {'penalty': ["l1"], 'solver': ["liblinear", "saga"], 'class_weight':['balanced'] } 
                ]
    best_lg, best_lg_hyperparams_ =hyperparameter_tuning(clf_base, param_grid, x_train_scaled, ytrain, ytest)
    
    y_pred= best_lg.predict(xtest)
    
    cm,clf_report, acc_score = metrices(ytest, y_pred)
    return cm , clf_report, acc_score ,best_lg, best_lg_hyperparams_ ,y_pred
    
### 2. SVM_linear

def svm_linear(x_train_scaled, x_test_scaled, ytrain, ytest): 
    from sklearn.svm import SVC
    from sklearn.model_selection import GridSearchCV

    # define the base SVMl model
    clf_base = SVC(random_state=0)
    
    # SVML hyperparameter tuning 
    param_grid= {'kernel':["linear", "poly", "rbf", "sigmoid"],
             'C':[10,100,1000,2000,3000],
             'gamma':['auto','scale']
             }
    best_svml, best_svml_hyperparams_ =hyperparameter_tuning(clf_base, param_grid, x_train_scaled, ytrain, ytest)
    
    y_pred= best_svml.predict(xtest)
    
    cm,clf_report, acc_score = metrices(ytest, y_pred)
    return cm , clf_report, acc_score ,best_svml, best_svml_hyperparams_,y_pred
    
    
### 4. Decision Tree
def decision(x_train_scaled, x_test_scaled, ytrain, ytest):
    from sklearn.tree import DecisionTreeClassifier
    
    # define the base Decision model
    clf_base= DecisionTreeClassifier(random_state=0)
    
    # DecisionTree hyperparameter tuning
    param_grid= {'criterion': ['gini', 'entropy', 'log_loss'],
             'splitter':['best', 'random'],
             'max_features':['sqrt', 'log2'],
             
            }
    best_dt, best_dt_hyperparams_ =hyperparameter_tuning(clf_base, param_grid, x_train_scaled, ytrain, ytest)
    
    y_pred= best_dt.predict(xtest)
    
    cm,clf_report, acc_score = metrices(ytest, y_pred)
    return cm , clf_report, acc_score ,best_dt, best_dt_hyperparams_,y_pred
    

### 5. RandomForest
def random(x_train_scaled, x_test_scaled, ytrain, ytest):
    from sklearn.ensemble import RandomForestClassifier
    
    # define the base random model
    clf_base= RandomForestClassifier(random_state=0)
    
    # RandomTree hyperparameter tuning
    param_grid= {'n_estimators':[10],
             'criterion':['gini', 'entropy', 'log_loss'],
             'max_features':['sqrt', 'log2', None],
             
            }
    best_rf, best_rf_hyperparams_ =hyperparameter_tuning(clf_base, param_grid, x_train_scaled, ytrain, ytest)
    
    y_pred= best_rf.predict(xtest)
    
    cm,clf_report, acc_score = metrices(ytest, y_pred)
    return cm , clf_report, acc_score ,best_rf, best_rf_hyperparams_,y_pred 
    

## 6. KNN (KNearestNeighbours)

##Power parameter for the Minkowski metric. When p = 1, 
#####   this is equivalent to using manhattan_distance (l1), and euclidean_distance (l2) for p = 2.
######    For arbitrary p, minkowski_distance (l_p) is used.
## Metric to use for distance computation. Default is "minkowski", which
##### results in the standard Euclidean distance when p = 2. 
def knn(x_train_scaled, x_test_scaled, ytrain, ytest):
    from sklearn.neighbors import KNeighborsClassifier
    
    # define the base random model
    clf_base= KNeighborsClassifier()
    
    # RandomTree hyperparameter tuning
    param_grid= {'n_neighbors':[1,12],
                  'p': [1,2]       
            }
    best_knn, best_knn_hyperparams_ =hyperparameter_tuning(clf_base, param_grid, x_train_scaled, ytrain, ytest )
    y_pred= best_knn.predict(xtest)
    
    cm,clf_report, acc_score = metrices(ytest, y_pred)
    return cm , clf_report, acc_score ,best_knn, best_knn_hyperparams_,y_pred 
    
    
### 7. Naive Bayes
def naive(x_train_scaled, x_test_scaled, ytrain, ytest):
    from sklearn.naive_bayes import GaussianNB

    # define the base Naive Bayes model
    clf_base = GaussianNB()
    
    # Naive Bayes hyperparameter tuning
     # n_jobs is the maximum number of concurrently running workers; in this case, #it is set to -1 which implies that all CPUs are used.
     # verbose is the verbosity: the higher, the more messages; in this case, it is set to 1.
    param_grid= { 'var_smoothing': [1e-9, 1e-8, 1e-7]  }

    best_nb, best_nb_hyperparams_ =hyperparameter_tuning(clf_base, param_grid, x_train_scaled, ytrain, ytest )
    y_pred= best_nb.predict(xtest)
    
    cm,clf_report, acc_score = metrices(ytest, y_pred)
    return cm , clf_report, acc_score ,best_nb, best_nb_hyperparams_ ,y_pred

#### display the accuracy in a table

def view_acc_score(acc_logistic,acc_svmlinear,acc_decision,acc_random,acc_knn,acc_naive):

    table= pd.DataFrame(index=["chi square"], columns=["Logistic","svmlinear","decision","random","knn","naive"])
    # Loop through each index label and assign values from the lists to the corresponding row
    for i, label in enumerate(table.index):
        table.loc[label] = [
            acc_logistic[i], 
            acc_svmlinear[i],  
            acc_decision[i], 
            acc_random[i], 
            acc_knn[i], 
            acc_naive[i]
    ]

    return table

#def evaluating_metrics(clf_report,ytest, y_pred, model_name):
    # extracting metrics
#    metrics= {
#        "precision_0":clf_report["0"]["precision"],
#        'precision_1':clf_report["1"]['precision'],
#        'recall_0':clf_report['0']['recall'],
#        'recall_1':clf_report['1']['recall'],
#        'f1_score_0':clf_report['0']['f1-score'],
#        'f1_score_1':clf_report['1']['f1-score'],
#        'mac_average_precision':clf_report['macro avg']['precision'],
#        'mac_average_recall':clf_report['macro avg']['recall'],
#        'mac_average_f1_score':clf_report['macro avg']['f1-score'],
        
#        "accuracy_p":clf_report['accuracy'],
#        "accuracy_r":clf_report['accuracy'],
#        "accuracy_f":clf_report['accuracy']
#    }
#    #convert this dictionary into Dataframe
#    df =pd.DataFrame(metrics, index=[model_name]).round(2)
#    return df

In [16]:
dataset=pd.read_csv("prep_ds.csv")
#dataset

### separation of indep and dep vars

indep = dataset.drop("HeartDisease", axis=1)
dep = dataset["HeartDisease"]

# Applyting the SelectKbest...
selected_features, x_new = select_k_best(indep, dep, 3)

### Separation of Training and Test dataset...
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(x_new, dep, test_size=0.30, random_state=1)

# Standard scalar
x_train_scaled, x_test_scaled = standard_scalar(xtrain, xtest)



In [17]:
acc_logistic= []
acc_svmlinear = []
acc_svm_nonlinear = []
acc_decision = []
acc_random = []
acc_knn = []
acc_naive = []


cm ,clf_report,acc_score,best_lg, best_lg_hyperparams_,y_pred= logistic(x_train_scaled, x_test_scaled, ytrain, ytest)
print("LG Optimal Hyper parameters: ",best_lg_hyperparams_)
acc_logistic.append(acc_score)
#df =evaluating_metrics(clf_report, ytest, y_pred, 'LR')
#df



Fitting 5 folds for each of 4 candidates, totalling 20 fits
best_parameter= {'class_weight': 'balanced', 'penalty': 'l2', 'solver': 'lbfgs'}
best_estimator= LogisticRegression(class_weight='balanced', random_state=0)
LG Optimal Hyper parameters:  {'class_weight': 'balanced', 'penalty': 'l2', 'solver': 'lbfgs'}


In [18]:
cm ,clf_report,acc_score, best_svml, best_svml_hyperparams_,y_pred= svm_linear(x_train_scaled, x_test_scaled, ytrain, ytest)
print("SVML Optimal Hyper parameters: ",best_svml_hyperparams_)
acc_svmlinear.append(acc_score)
#df =evaluating_metrics(clf_report, ytest, y_pred, 'svml')
#df


Fitting 5 folds for each of 40 candidates, totalling 200 fits
best_parameter= {'C': 10, 'gamma': 'auto', 'kernel': 'rbf'}
best_estimator= SVC(C=10, gamma='auto', random_state=0)
SVML Optimal Hyper parameters:  {'C': 10, 'gamma': 'auto', 'kernel': 'rbf'}


In [19]:
cm ,clf_report,acc_score, best_dt, best_dt_hyperparams_,y_pred= decision(x_train_scaled, x_test_scaled, ytrain, ytest)
print("DecisionTree Optimal Hyper parameters: ",best_dt_hyperparams_)
acc_decision.append(acc_score)
#df =evaluating_metrics(clf_report, ytest, y_pred, 'decision')
#df

Fitting 5 folds for each of 12 candidates, totalling 60 fits
best_parameter= {'criterion': 'gini', 'max_features': 'sqrt', 'splitter': 'random'}
best_estimator= DecisionTreeClassifier(max_features='sqrt', random_state=0, splitter='random')
DecisionTree Optimal Hyper parameters:  {'criterion': 'gini', 'max_features': 'sqrt', 'splitter': 'random'}


In [20]:
cm ,clf_report,acc_score, best_rf, best_rf_hyperparams_ ,y_pred= random(x_train_scaled, x_test_scaled, ytrain, ytest)
print("RandomForest Optimal Hyper parameters: ",best_rf_hyperparams_)
acc_random.append(acc_score)
#df =evaluating_metrics(clf_report, ytest, y_pred, 'random')
#df

Fitting 5 folds for each of 9 candidates, totalling 45 fits
best_parameter= {'criterion': 'gini', 'max_features': None, 'n_estimators': 10}
best_estimator= RandomForestClassifier(max_features=None, n_estimators=10, random_state=0)
RandomForest Optimal Hyper parameters:  {'criterion': 'gini', 'max_features': None, 'n_estimators': 10}


In [21]:
cm ,clf_report,acc_score, best_knn, best_knn_hyperparams_,y_pred= knn(x_train_scaled, x_test_scaled, ytrain, ytest)
print("KNN Optimal Hyper parameters: ",best_knn_hyperparams_)
acc_knn.append(acc_score)
#df =evaluating_metrics(clf_report, ytest, y_pred, 'KNN')
#df

Fitting 5 folds for each of 4 candidates, totalling 20 fits
best_parameter= {'n_neighbors': 12, 'p': 1}
best_estimator= KNeighborsClassifier(n_neighbors=12, p=1)
KNN Optimal Hyper parameters:  {'n_neighbors': 12, 'p': 1}


In [22]:
cm ,clf_report,acc_score, best_nb, best_nb_hyperparams_,y_pred= naive(x_train_scaled, x_test_scaled, ytrain, ytest)
print("Naive Bayes Optimal Hyper parameters: ",best_nb_hyperparams_)
acc_naive.append(acc_score)
#df =evaluating_metrics(clf_report, ytest, y_pred, 'Naive Bayes')
#df

Fitting 5 folds for each of 3 candidates, totalling 15 fits
best_parameter= {'var_smoothing': 1e-09}
best_estimator= GaussianNB()
Naive Bayes Optimal Hyper parameters:  {'var_smoothing': 1e-09}


In [23]:
#### selected features:

print ("Selected Features: ", selected_features)

Selected Features:  Index(['MaxHR', 'ST_Slope_Flat', 'ST_Slope_Up'], dtype='object')


In [24]:
#dataset[selected_features]

In [25]:
#### Disply the accuracy score in a table

table =view_acc_score(acc_logistic,acc_svmlinear,acc_decision,acc_random,acc_knn,acc_naive)


In [26]:
table #3

Unnamed: 0,Logistic,svmlinear,decision,random,knn,naive
chi square,0.394928,0.394928,0.605072,0.76087,0.76087,0.605072


In [14]:
table #5

Unnamed: 0,Logistic,svmlinear,decision,random,knn,naive
chi square,0.394928,0.536232,0.648551,0.76087,0.724638,0.605072


In [15]:
### Conclusion: Accuracy score for Random forest and KNN is higher when selectkBest_K=3
###             so we will save any one of these two model for deployment