In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import LabelEncoder
import itertools

In [2]:
from imblearn.over_sampling import SMOTE
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from matplotlib import pyplot as plt
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

In [4]:
def data_cleaning(filename, onehotencode=False, bool_bmi=True, categorical_col=[], onehotencode_column=[], drop_column = [], drop_value_num=5, drop_nan_percent=0.4):
    """Return the cleaned dataframe or one-hot encoded dataframe plus the label
        Parameters:
            filename (string): the name of the .csv file
            onehotencode (boolean): if true, return one-hot encoded dataframe plus the label 
            bool_bmi (boolean): if true, change bmi to True/False
            categorical_col (list): a list of categorical columns
            onehotencode_column (list): a list of columns to one-hot encode if onehotencode is true
            drop_column (list): a list of columns to drop
            drop_value_num (int): the threshold for droping the value in a column
            drop_nan_percent (float): the threshold for droping the nan (in percentage)
        Returns:
            df (DataFrame): cleaned dataFrame
            or 
            df (DataFrame): cleaned one-hot encodedand dataFrame
            label (Series): label for stroke
    """
    # read in the file to clean
    df = pd.read_csv(filename,index_col=0)
    df = df.sample(frac=1)
    N = len(df)
    # get all the unique column
    columns = df.columns
    
    
    # check if there is duplicate
    if len(df.duplicated()) != 0:
        df = df.drop_duplicates(keep = 'last')
        
    # change
    if bool_bmi:
        df['bmi'] = df['bmi'].isna().astype(int)
    else:
        df = df.drop("bmi", axis=1)
    
    # drop unwanted columns
    df = df.drop(columns=drop_column)
        
    # loop through all the columns to drop unecessary 
    for col in categorical_col:
        # check if it is object categorical feature
        if df[col].dtypes == object:
            # get unique values for the column
            values = pd.unique(df[col])
            # check the number of the values and drop if too few
            for v in values:
                if len(df[df[col] == v]) < drop_value_num:
                    df = df.drop(df[df[col]== v].index)
        # if it is numercial categorical feature
        else:
            # get unique values for the column
            values = pd.unique(df[col])
            # check the number of the values and drop if too few
            for v in values:
                if len(df[df[col] == v]) < drop_value_num:
                    df = df.drop(df[df[col]== v].index)
    # the nan is less than drop_nan_percent percent of the dataset, drop them
    if sum(df.isna().sum()) < N*drop_nan_percent:
        df = df.dropna()
        
    # one hot encode if true
    if onehotencode:
        df = pd.get_dummies(df, columns=onehotencode_column)
        label = df['stroke']
        df = df.drop('stroke', axis=1)
        # return the one hot encode dataframe and label
        return df, label
    
    return df

In [7]:
data, target = data_cleaning("healthcare-dataset-stroke-data.csv", onehotencode=True, categorical_col=['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status'], onehotencode_column=['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status'])

Index(['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status', 'stroke'],
      dtype='object')


In [24]:
#Naive Bayes w/o SMOTE

X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.3)

bernoulliSet = X_train.drop(columns=["avg_glucose_level", "age"])
gaussianSet = X_train[["avg_glucose_level", "age"]]

bernoulliClassifier = BernoulliNB()
gaussianClassifier = GaussianNB()

bernoulliTest = X_test.drop(columns=["avg_glucose_level", "age"])
gaussianTest = X_test[["avg_glucose_level", "age"]]

bernoulliProb = bernoulliClassifier.fit(bernoulliSet, y_train).predict_proba(bernoulliTest)
gaussianProb = gaussianClassifier.fit(gaussianSet, y_train).predict_proba(gaussianTest)
prediction = (np.argmax(bernoulliProb*gaussianProb, axis = 1))

print("Combined Gaussian/Bernoulli w/o SMOTE:")
print("Recall:", recall_score(y_test, prediction))
print("Accuracy:", accuracy_score(y_test, prediction))
print("Precision:", precision_score(y_test, prediction))
print("F1 Score:", f1_score(y_test, prediction))
print()

prediction = bernoulliClassifier.fit(bernoulliSet, y_train).predict(bernoulliTest)

print("Bernoulli (just categorical) w/o SMOTE:")
print("Recall:", recall_score(y_test, prediction))
print("Accuracy:", accuracy_score(y_test, prediction))
print("Precision:", precision_score(y_test, prediction))
print("F1 Score:", f1_score(y_test, prediction))
print()

prediction = gaussianClassifier.fit(gaussianSet, y_train).predict(gaussianTest)

print("Gaussian (just continuous) w/o SMOTE:")
print("Recall:", recall_score(y_test, prediction))
print("Accuracy:", accuracy_score(y_test, prediction))
print("Precision:", precision_score(y_test, prediction))
print("F1 Score:", f1_score(y_test, prediction))

Combined Gaussian/Bernoulli w/o SMOTE:
Recall: 0.02631578947368421
Accuracy: 0.9445531637312459
Precision: 0.15384615384615385
F1 Score: 0.0449438202247191

Bernoulli (just categorical) w/o SMOTE:
Recall: 0.07894736842105263
Accuracy: 0.9360730593607306
Precision: 0.17647058823529413
F1 Score: 0.10909090909090909

Gaussian (just continuous) w/o SMOTE:
Recall: 0.14473684210526316
Accuracy: 0.9256360078277887
Precision: 0.18333333333333332
F1 Score: 0.16176470588235292


In [25]:
#Naive Bayes w/ SMOTE

sm = SMOTE()

X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.3)

X_train, y_train = sm.fit_resample(X_train,y_train)

bernoulliSet = X_train.drop(columns=["avg_glucose_level", "age"])
gaussianSet = X_train[["avg_glucose_level", "age"]]

bernoulliClassifier = BernoulliNB()
gaussianClassifier = GaussianNB()

bernoulliTest = X_test.drop(columns=["avg_glucose_level", "age"])
gaussianTest = X_test[["avg_glucose_level", "age"]]

bernoulliProb = bernoulliClassifier.fit(bernoulliSet, y_train).predict_proba(bernoulliTest)
gaussianProb = gaussianClassifier.fit(gaussianSet, y_train).predict_proba(gaussianTest)
prediction = (np.argmax(bernoulliProb*gaussianProb, axis = 1))

#prediction = bernoulliClassifier.fit(bernoulliSet, y_train).predict(bernoulliTest)
#prediction = gaussianClassifier.fit(gaussianSet, y_train).predict(gaussianTest)

print("Combined Gaussian/Bernoulli w/ SMOTE:")
print("Recall:", recall_score(y_test, prediction))
print("Accuracy:", accuracy_score(y_test, prediction))
print("Precision:", precision_score(y_test, prediction))
print("F1 Score:", f1_score(y_test, prediction))
print()

prediction = bernoulliClassifier.fit(bernoulliSet, y_train).predict(bernoulliTest)

print("Bernoulli (just categorical) w/ SMOTE:")
print("Recall:", recall_score(y_test, prediction))
print("Accuracy:", accuracy_score(y_test, prediction))
print("Precision:", precision_score(y_test, prediction))
print("F1 Score:", f1_score(y_test, prediction))
print()

prediction = gaussianClassifier.fit(gaussianSet, y_train).predict(gaussianTest)

print("Gaussian (just continuous) w/ SMOTE:")
print("Recall:", recall_score(y_test, prediction))
print("Accuracy:", accuracy_score(y_test, prediction))
print("Precision:", precision_score(y_test, prediction))
print("F1 Score:", f1_score(y_test, prediction))

Combined Gaussian/Bernoulli w/ SMOTE:
Recall: 0.5974025974025974
Accuracy: 0.7638617090671885
Precision: 0.1220159151193634
F1 Score: 0.20264317180616742

Bernoulli (just categorical) w/ SMOTE:
Recall: 0.2597402597402597
Accuracy: 0.8016960208741031
Precision: 0.0749063670411985
F1 Score: 0.11627906976744186

Gaussian (just continuous) w/ SMOTE:
Recall: 0.8181818181818182
Accuracy: 0.6999347684279191
Precision: 0.1237721021611002
F1 Score: 0.21501706484641636


In [51]:
#Naive Bayes w/ SMOTE Cross Validated (7-Fold)

kf = KFold(n_splits = 7)

recallScoresCombined = []
accuracyScoresCombined = []
f1ScoresCombined = []

recallScoresGaussian = []
accuracyScoresGaussian = []
f1ScoresGaussian = []

recallScoresBernoulli = []
accuracyScoresBernoulli = []
f1ScoresBernoulli = []

for fold, (train_index, test_index) in enumerate(kf.split(data), 1):
    X_train = data.iloc[train_index]
    y_train = target.iloc[train_index]
    X_test = data.iloc[test_index]
    y_test = target.iloc[test_index]

    X_train_oversampled, y_train_oversampled = X_train, y_train

    bernoulliSet = X_train_oversampled.drop(columns=["avg_glucose_level", "age"])
    gaussianSet = X_train_oversampled[["avg_glucose_level", "age"]]

    bernoulliClassifier = BernoulliNB()
    gaussianClassifier = GaussianNB()

    bernoulliTest = X_test.drop(columns=["avg_glucose_level", "age"])
    gaussianTest = X_test[["avg_glucose_level", "age"]]

    bernoulliProb = bernoulliClassifier.fit(bernoulliSet, y_train_oversampled).predict_proba(bernoulliTest)
    gaussianProb = gaussianClassifier.fit(gaussianSet, y_train_oversampled).predict_proba(gaussianTest)
    prediction = (np.argmax(bernoulliProb*gaussianProb, axis = 1))

    recallScoresCombined.append(recall_score(y_test, prediction))
    accuracyScoresCombined.append(accuracy_score(y_test, prediction))
    f1ScoresCombined.append(f1_score(y_test, prediction))

    prediction = bernoulliClassifier.fit(bernoulliSet, y_train_oversampled).predict(bernoulliTest)
    
    recallScoresBernoulli.append(recall_score(y_test, prediction))
    accuracyScoresBernoulli.append(accuracy_score(y_test, prediction))
    f1ScoresBernoulli.append(f1_score(y_test, prediction))
    
    prediction = gaussianClassifier.fit(gaussianSet, y_train_oversampled).predict(gaussianTest)

    recallScoresGaussian.append(recall_score(y_test, prediction))
    accuracyScoresGaussian.append(accuracy_score(y_test, prediction))
    f1ScoresGaussian.append(f1_score(y_test, prediction))

print("Combined Gaussian/Bernoulli w/o SMOTE (Cross-Validated 7-Fold):")
print("Recall mean:", np.mean(recallScoresCombined))
print("Accuracy mean:", np.mean(accuracyScoresCombined))
print("F1 Score mean:", np.mean(f1ScoresCombined))
print()

print("Bernoulli w/o SMOTE (Cross-Validated 7-Fold):")
print("Recall mean:", np.mean(recallScoresBernoulli))
print("Accuracy mean:", np.mean(accuracyScoresBernoulli))
print("F1 Score mean:", np.mean(f1ScoresBernoulli))
print()

print("Gaussian w/o SMOTE (Cross-Validated 7-Fold):")
print("Recall mean:", np.mean(recallScoresGaussian))
print("Accuracy mean:", np.mean(accuracyScoresGaussian))
print("F1 Score mean:", np.mean(f1ScoresGaussian))

Combined Gaussian/Bernoulli w/o SMOTE (Cross-Validated 7-Fold):
Recall mean: 0.05136448742760537
Accuracy mean: 0.9448025469841808
F1 Score mean: 0.08252040616492118

Bernoulli w/o SMOTE (Cross-Validated 7-Fold):
Recall mean: 0.06917371638234517
Accuracy mean: 0.9369731476783735
F1 Score mean: 0.09562552430874734

Gaussian w/o SMOTE (Cross-Validated 7-Fold):
Recall mean: 0.1548348622552158
Accuracy mean: 0.9222922857626054
F1 Score mean: 0.16226184144629455


In [49]:
#Naive Bayes w/ SMOTE Cross Validated (7-Fold)

kf = KFold(n_splits = 7)

recallScoresCombined = []
accuracyScoresCombined = []
f1ScoresCombined = []

recallScoresGaussian = []
accuracyScoresGaussian = []
f1ScoresGaussian = []

recallScoresBernoulli = []
accuracyScoresBernoulli = []
f1ScoresBernoulli = []

for fold, (train_index, test_index) in enumerate(kf.split(data), 1):
    X_train = data.iloc[train_index]
    y_train = target.iloc[train_index]
    X_test = data.iloc[test_index]
    y_test = target.iloc[test_index]

    sm = SMOTE()
    X_train_oversampled, y_train_oversampled = sm.fit_resample(X_train, y_train)

    bernoulliSet = X_train_oversampled.drop(columns=["avg_glucose_level", "age"])
    gaussianSet = X_train_oversampled[["avg_glucose_level", "age"]]

    bernoulliClassifier = BernoulliNB()
    gaussianClassifier = GaussianNB()

    bernoulliTest = X_test.drop(columns=["avg_glucose_level", "age"])
    gaussianTest = X_test[["avg_glucose_level", "age"]]

    bernoulliProb = bernoulliClassifier.fit(bernoulliSet, y_train_oversampled).predict_proba(bernoulliTest)
    gaussianProb = gaussianClassifier.fit(gaussianSet, y_train_oversampled).predict_proba(gaussianTest)
    prediction = (np.argmax(bernoulliProb*gaussianProb, axis = 1))

    recallScoresCombined.append(recall_score(y_test, prediction))
    accuracyScoresCombined.append(accuracy_score(y_test, prediction))
    f1ScoresCombined.append(f1_score(y_test, prediction))

    prediction = bernoulliClassifier.fit(bernoulliSet, y_train_oversampled).predict(bernoulliTest)
    
    recallScoresBernoulli.append(recall_score(y_test, prediction))
    accuracyScoresBernoulli.append(accuracy_score(y_test, prediction))
    f1ScoresBernoulli.append(f1_score(y_test, prediction))
    
    prediction = gaussianClassifier.fit(gaussianSet, y_train_oversampled).predict(gaussianTest)

    recallScoresGaussian.append(recall_score(y_test, prediction))
    accuracyScoresGaussian.append(accuracy_score(y_test, prediction))
    f1ScoresGaussian.append(f1_score(y_test, prediction))

print("Combined Gaussian/Bernoulli w/ SMOTE (Cross-Validated 7-Fold):")
print("Recall mean:", np.mean(recallScoresCombined))
print("Accuracy mean:", np.mean(accuracyScoresCombined))
print("F1 Score mean:", np.mean(f1ScoresCombined))
print()

print("Bernoulli w/ SMOTE (Cross-Validated 7-Fold):")
print("Recall mean:", np.mean(recallScoresBernoulli))
print("Accuracy mean:", np.mean(accuracyScoresBernoulli))
print("F1 Score mean:", np.mean(f1ScoresBernoulli))
print()

print("Gaussian w/ SMOTE (Cross-Validated 7-Fold):")
print("Recall mean:", np.mean(recallScoresGaussian))
print("Accuracy mean:", np.mean(accuracyScoresGaussian))
print("F1 Score mean:", np.mean(f1ScoresGaussian))

Combined Gaussian/Bernoulli w/ SMOTE (Cross-Validated 7-Fold):
Recall mean: 0.6409513107006365
Accuracy mean: 0.7713818624016494
F1 Score mean: 0.2139777524537915

Bernoulli w/ SMOTE (Cross-Validated 7-Fold):
Recall mean: 0.2615820711496333
Accuracy mean: 0.7946714127333101
F1 Score mean: 0.11314140818365222

Gaussian w/ SMOTE (Cross-Validated 7-Fold):
Recall mean: 0.834993298930181
Accuracy mean: 0.7001355635551475
F1 Score mean: 0.21291448300732174


In [36]:
#K-Nearest Neighbors (No Parameter Tuning/Cross Validated)
knn = KNeighborsClassifier()

print("KNN Classifier w/o SMOTE:")
print("Recall mean:", np.mean(cross_validate(knn, data, target, scoring="recall")["test_score"]))
print("Accuracy mean:", np.mean(cross_validate(knn, data, target, scoring="accuracy")["test_score"]))
print("F1 Score mean:", np.mean(cross_validate(knn, data, target, scoring="f1")["test_score"]))
print()

model = Pipeline([('sampling', SMOTE()),('classification', KNeighborsClassifier())])

print("KNN Classifier w/ SMOTE:")
print("Recall mean:", np.mean(cross_validate(model, data, target, scoring="recall")["test_score"]))
print("Accuracy mean:", np.mean(cross_validate(model, data, target, scoring="accuracy")["test_score"]))
print("F1 Score mean:", np.mean(cross_validate(model, data, target, scoring="f1")["test_score"]))
print()

KNN Classifier w/o SMOTE:
Recall mean: 0.008
Accuracy mean: 0.9451945542818041
F1 Score mean: 0.013793103448275865

KNN Classifier w/ SMOTE:
Recall mean: 0.5460408163265306
Accuracy mean: 0.7860627411443828
F1 Score mean: 0.19855744247162455



In [37]:
#KNN Neighbors Classifier (parameter tuning through gridSearch)

knn = KNeighborsClassifier()

parameters = {'n_neighbors':[2,3,4,5,6,7,8,9,10], "weights":["uniform", "distance"]}
gridSearch = GridSearchCV(knn, parameters, scoring="f1")
gridSearch.fit(data, target)

print("KNN Classifier w/o SMOTE and parameter tuning via GridSearch:")
print("Recall mean:", np.mean(cross_validate(gridSearch.best_estimator_, data, target, scoring="recall")["test_score"]))
print("Accuracy mean:", np.mean(cross_validate(gridSearch.best_estimator_, data, target, scoring="accuracy")["test_score"]))
print("F1 Score mean:", np.mean(cross_validate(gridSearch.best_estimator_, data, target, scoring="f1")["test_score"]))
print()

model = Pipeline([('sampling', SMOTE()),('classification', KNeighborsClassifier())])

parameters = {'classification__n_neighbors':[2,3,4,5,6,7,8,9,10], "classification__weights":["uniform", "distance"]}
gridSearch = GridSearchCV(model, parameters, scoring="f1")
gridSearch.fit(data,target)

print("KNN Classifier w/ SMOTE and parameter tuning via GridSearch:")
print("Recall mean:", np.mean(cross_validate(gridSearch.best_estimator_, data, target, scoring="recall")["test_score"]))
print("Accuracy mean:", np.mean(cross_validate(gridSearch.best_estimator_, data, target, scoring="accuracy")["test_score"]))
print("F1 Score mean:", np.mean(cross_validate(gridSearch.best_estimator_, data, target, scoring="f1")["test_score"]))

KNN Classifier w/o SMOTE and parameter tuning via GridSearch:
Recall mean: 0.14065306122448978
Accuracy mean: 0.9168130703370128
F1 Score mean: 0.13964015239726799

KNN Classifier w/ SMOTE and parameter tuning via GridSearch:
Recall mean: 0.614530612244898
Accuracy mean: 0.7764715916823037
F1 Score mean: 0.21169656545130197


In [38]:
#KNN Neighbors Classifier (parameter tuning through gridSearch/Standard Scalar Preprocessing)

model = Pipeline([('scale', StandardScaler()),('classification', KNeighborsClassifier())])

parameters = {'classification__n_neighbors':[2,3,4,5,6,7,8,9,10], "classification__weights":["uniform", "distance"]}
gridSearch = GridSearchCV(model, parameters, scoring="f1")
gridSearch.fit(data, target)

print("KNN Classifier w/o SMOTE and parameter tuning via GridSearch with Standard Scalar Preprocessing:")
print("Recall mean:", np.mean(cross_validate(gridSearch.best_estimator_, data, target, scoring="recall")["test_score"]))
print("Accuracy mean:", np.mean(cross_validate(gridSearch.best_estimator_, data, target, scoring="accuracy")["test_score"]))
print("F1 Score mean:", np.mean(cross_validate(gridSearch.best_estimator_, data, target, scoring="f1")["test_score"]))
print()

model = Pipeline([('sampling', SMOTE()),('scale', StandardScaler()),('classification', KNeighborsClassifier())])

parameters = {'classification__n_neighbors':[2,3,4,5,6,7,8,9,10], "classification__weights":["uniform", "distance"]}
gridSearch = GridSearchCV(model, parameters, scoring="f1")
gridSearch.fit(data,target)

print("KNN Classifier w/ SMOTE and parameter tuning via GridSearch with Standard Scalar Preprocessing:")
print("Recall mean:", np.mean(cross_validate(gridSearch.best_estimator_, data, target, scoring="recall")["test_score"]))
print("Accuracy mean:", np.mean(cross_validate(gridSearch.best_estimator_, data, target, scoring="accuracy")["test_score"]))
print("F1 Score mean:", np.mean(cross_validate(gridSearch.best_estimator_, data, target, scoring="f1")["test_score"]))

KNN Classifier w/o SMOTE and parameter tuning via GridSearch with Standard Scalar Preprocessing:
Recall mean: 0.10457142857142858
Accuracy mean: 0.918965712215682
F1 Score mean: 0.11061164467196574

KNN Classifier w/ SMOTE and parameter tuning via GridSearch with Standard Scalar Preprocessing:
Recall mean: 0.14057142857142857
Accuracy mean: 0.9342350751632547
F1 Score mean: 0.1581175483766787


In [52]:
#Random Forest Classifier (parameter tuning through gridSearch/Standard Scalar Preprocessing)

model = Pipeline([('classification', RandomForestClassifier())])

print("Random Forest Classifier w/o SMOTE:")
print("Recall mean:", np.mean(cross_validate(model, data, target, scoring="recall")["test_score"]))
print("Accuracy mean:", np.mean(cross_validate(model, data, target, scoring="accuracy")["test_score"]))
print("F1 Score mean:", np.mean(cross_validate(model, data, target, scoring="f1")["test_score"]))
print()

model = Pipeline([('sampling', SMOTE()),('classification', RandomForestClassifier())])


print("Random Forest Classifier w/ SMOTE:")
print("Recall mean:", np.mean(cross_validate(model, data, target, scoring="recall")["test_score"]))
print("Accuracy mean:", np.mean(cross_validate(model, data, target, scoring="accuracy")["test_score"]))
print("F1 Score mean:", np.mean(cross_validate(model, data, target, scoring="f1")["test_score"]))
print()

Random Forest Classifier w/o SMOTE:
Recall mean: 0.044163265306122454
Accuracy mean: 0.9483270114292613
F1 Score mean: 0.07637540666394216

Random Forest Classifier w/ SMOTE:
Recall mean: 0.048
Accuracy mean: 0.9397143355483955
F1 Score mean: 0.08307272908376977



In [42]:
#Random Forest Classifier (parameter tuning through gridSearch/Standard Scalar Preprocessing)

model = Pipeline([('classification', RandomForestClassifier())])

parameters = {'classification__n_estimators':[200], "classification__max_depth": [2,3,4,5], "classification__max_features": ["auto", "log2", None]}
gridSearch = GridSearchCV(model, parameters, scoring="f1")
gridSearch.fit(data, target)

print("Random Forest Classifier w/o SMOTE and parameter tuning via GridSearch:")
print("Recall mean:", np.mean(cross_validate(gridSearch.best_estimator_, data, target, scoring="recall")["test_score"]))
print("Accuracy mean:", np.mean(cross_validate(gridSearch.best_estimator_, data, target, scoring="accuracy")["test_score"]))
print("F1 Score mean:", np.mean(cross_validate(gridSearch.best_estimator_, data, target, scoring="f1")["test_score"]))
print()

model = Pipeline([('scale', StandardScaler()),('classification', RandomForestClassifier())])

parameters = {'classification__n_estimators':[200], "classification__max_depth": [2,3,4,5], "classification__max_features": ["auto", "log2", None]}
gridSearch = GridSearchCV(model, parameters, scoring="f1")
gridSearch.fit(data, target)

print("Random Forest Classifier w/o SMOTE and parameter tuning via GridSearch with Standard Scalar Preprocessing:")
print("Recall mean:", np.mean(cross_validate(gridSearch.best_estimator_, data, target, scoring="recall")["test_score"]))
print("Accuracy mean:", np.mean(cross_validate(gridSearch.best_estimator_, data, target, scoring="accuracy")["test_score"]))
print("F1 Score mean:", np.mean(cross_validate(gridSearch.best_estimator_, data, target, scoring="f1")["test_score"]))
print()

Random Forest Classifier w/o SMOTE and parameter tuning via GridSearch:
Recall mean: 0.036244897959183675
Accuracy mean: 0.9516542049446939
F1 Score mean: 0.07464837049742709

Random Forest Classifier w/o SMOTE and parameter tuning via GridSearch with Standard Scalar Preprocessing:
Recall mean: 0.036244897959183675
Accuracy mean: 0.9512631988515154
F1 Score mean: 0.061393323657474595



In [43]:
#Random Forest Classifier (parameter tuning through gridSearch/Standard Scalar Preprocessing)

model = Pipeline([('sampling', SMOTE()),('classification', RandomForestClassifier())])

parameters = {'classification__n_estimators':[200], "classification__max_depth": [2,3,4,5], "classification__max_features": ["auto", "log2", None]}
gridSearch = GridSearchCV(model, parameters, scoring="f1")
gridSearch.fit(data, target)

print("Random Forest Classifier w/ SMOTE and parameter tuning via GridSearch:")
print("Recall mean:", np.mean(cross_validate(gridSearch.best_estimator_, data, target, scoring="recall")["test_score"]))
print("Accuracy mean:", np.mean(cross_validate(gridSearch.best_estimator_, data, target, scoring="accuracy")["test_score"]))
print("F1 Score mean:", np.mean(cross_validate(gridSearch.best_estimator_, data, target, scoring="f1")["test_score"]))
print()

model = Pipeline([('sampling', SMOTE()),('scale', StandardScaler()),('classification', RandomForestClassifier())])

parameters = {'classification__n_estimators':[200], "classification__max_depth": [2,3,4,5], "classification__max_features": ["auto", "log2", None]}
gridSearch = GridSearchCV(model, parameters, scoring="f1")
gridSearch.fit(data, target)

print("Random Forest Classifier w/ SMOTE and parameter tuning via GridSearch with Standard Scalar Preprocessing:")
print("Recall mean:", np.mean(cross_validate(gridSearch.best_estimator_, data, target, scoring="recall")["test_score"]))
print("Accuracy mean:", np.mean(cross_validate(gridSearch.best_estimator_, data, target, scoring="accuracy")["test_score"]))
print("F1 Score mean:", np.mean(cross_validate(gridSearch.best_estimator_, data, target, scoring="f1")["test_score"]))
print()

Random Forest Classifier w/ SMOTE and parameter tuning via GridSearch:
Recall mean: 0.4619591836734694
Accuracy mean: 0.8434129848523473
F1 Score mean: 0.22900447385735462

Random Forest Classifier w/ SMOTE and parameter tuning via GridSearch with Standard Scalar Preprocessing:
Recall mean: 0.49020408163265305
Accuracy mean: 0.8351913150646597
F1 Score mean: 0.22730059569403274



In [122]:
data, target = data_cleaning("healthcare-dataset-stroke-data.csv", onehotencode=True, categorical_col=['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status'], onehotencode_column=['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status'])

In [123]:
#Random Forest Classifier (parameter tuning through gridSearch/Standard Scalar Preprocessing)

model = Pipeline([('classification', LogisticRegression(solver="newton-cg"))])

print("Logistic Regression Classifier w/o SMOTE:")
print("Recall mean:", np.mean(cross_validate(model, data, target, scoring="recall")["test_score"]))
print("Accuracy mean:", np.mean(cross_validate(model, data, target, scoring="accuracy")["test_score"]))
print("F1 Score mean:", np.mean(cross_validate(model, data, target, scoring="f1")["test_score"]))
print()

model = Pipeline([('sampling', SMOTE()),('classification', LogisticRegression(solver="newton-cg"))])


print("Logistic Regression Classifier w/ SMOTE:")
print("Recall mean:", np.mean(cross_validate(model, data, target, scoring="recall")["test_score"]))
print("Accuracy mean:", np.mean(cross_validate(model, data, target, scoring="accuracy")["test_score"]))
print("F1 Score mean:", np.mean(cross_validate(model, data, target, scoring="f1")["test_score"]))
print()

Logistic Regression Classifier w/o SMOTE:
Recall mean: 0.016
Accuracy mean: 0.9504796533079307
F1 Score mean: 0.03019405472235661

Logistic Regression Classifier w/ SMOTE:
Recall mean: 0.04408163265306122
Accuracy mean: 0.9469556150583347
F1 Score mean: 0.09046198510674353



In [11]:
#Random Forest Classifier (parameter tuning through gridSearch/Standard Scalar Preprocessing)

catcol = ['gender', "smoking_status", 'ever_married','work_type', 'Residence_type']
maxf1 = 0
maxRecall = 0
for r in range(0,5):
    for combo in itertools.combinations(catcol, r):
        current = [x for x in catcol if x not in combo]
        combo = list(combo)
        data, target = data_cleaning("healthcare-dataset-stroke-data.csv", onehotencode=True, categorical_col=current, onehotencode_column=current, drop_column=combo)

        model = Pipeline([('sampling', SMOTE()),('classification', LogisticRegression(solver="newton-cg"))])

        f1 = np.mean(cross_validate(model, data, target, scoring="f1")["test_score"])
        if maxf1 < f1:
            maxf1 = f1
            bestCurrentf1 = current
            bestCombof1 = combo

        recall = np.mean(cross_validate(model, data, target, scoring="recall")["test_score"])
        if maxRecall < recall:
            maxRecall = recall
            bestCurrentRecall = current
            bestComboRecall = combo

print("Best Feature Combination for f1:")
data, target = data_cleaning("healthcare-dataset-stroke-data.csv", onehotencode=True, categorical_col=bestCurrentf1, onehotencode_column=bestCurrentf1, drop_column=bestCombof1)

model = Pipeline([('classification', LogisticRegression(solver="newton-cg"))])

print("Logistic Regression Classifier w/o SMOTE Standard Scalar Preprocessing:")
print("Recall mean:", np.mean(cross_validate(model, data, target, scoring="recall")["test_score"]))
print("Accuracy mean:", np.mean(cross_validate(model, data, target, scoring="accuracy")["test_score"]))
print("F1 Score mean:", np.mean(cross_validate(model, data, target, scoring="f1")["test_score"]))
print()

model = Pipeline([('sampling', SMOTE()),('classification', LogisticRegression(solver="newton-cg"))])


print("Logistic Regression Classifier w/ SMOTE and Standard Scalar Preprocessing:")
print("Recall mean:", np.mean(cross_validate(model, data, target, scoring="recall")["test_score"]))
print("Accuracy mean:", np.mean(cross_validate(model, data, target, scoring="accuracy")["test_score"]))
print("F1 Score mean:", np.mean(cross_validate(model, data, target, scoring="f1")["test_score"]))
print()

print(bestCombof1)

print()
print("Best Feature Combination for recall:")
data, target = data_cleaning("healthcare-dataset-stroke-data.csv", onehotencode=True, categorical_col=bestCurrentRecall, onehotencode_column=bestCurrentRecall, drop_column=bestComboRecall)

model = Pipeline([('classification', LogisticRegression(solver="newton-cg"))])

print("Logistic Regression Classifier w/o SMOTE Standard Scalar Preprocessing:")
print("Recall mean:", np.mean(cross_validate(model, data, target, scoring="recall")["test_score"]))
print("Accuracy mean:", np.mean(cross_validate(model, data, target, scoring="accuracy")["test_score"]))
print("F1 Score mean:", np.mean(cross_validate(model, data, target, scoring="f1")["test_score"]))
print()

model = Pipeline([('sampling', SMOTE()),('classification', LogisticRegression(solver="newton-cg"))])


print("Logistic Regression Classifier w/ SMOTE and Standard Scalar Preprocessing:")
print("Recall mean:", np.mean(cross_validate(model, data, target, scoring="recall")["test_score"]))
print("Accuracy mean:", np.mean(cross_validate(model, data, target, scoring="accuracy")["test_score"]))
print("F1 Score mean:", np.mean(cross_validate(model, data, target, scoring="f1")["test_score"]))
print()

print(bestComboRecall)



Best Feature Combination for f1:
Logistic Regression Classifier w/o SMOTE Standard Scalar Preprocessing:
Recall mean: 0.004
Accuracy mean: 0.9499021526418787
F1 Score mean: 0.007692307692307693

Logistic Regression Classifier w/ SMOTE and Standard Scalar Preprocessing:
Recall mean: 0.42122448979591837
Accuracy mean: 0.8733855185909979
F1 Score mean: 0.24344828508140628

['gender', 'ever_married', 'work_type', 'Residence_type']

Best Feature Combination for recall:
Logistic Regression Classifier w/o SMOTE Standard Scalar Preprocessing:
Recall mean: 0.008
Accuracy mean: 0.9504892367906066
F1 Score mean: 0.015250544662309368

Logistic Regression Classifier w/ SMOTE and Standard Scalar Preprocessing:
Recall mean: 0.614204081632653
Accuracy mean: 0.7724070450097847
F1 Score mean: 0.20786197872210194

['gender', 'smoking_status', 'work_type', 'Residence_type']


Analysis of Learning Algorithms:

We tried a variety of classifiers in an attempt to create a model that could accurately (and more importantly, with high recall) predict stroke occurances using the health data provided. One important characteristic of our dataset is that it is heavily imbalanced. The null hypothesis (predicting no stroke occurances across the board) results in a model with about 95% accuracy. Thus, it is extremely important that we focus on the recall of the model and the f1 scores, which represent a sort of balance between the accuracy and the recall scores. Experimentally, raising a model's recall by changing model parameters or using SMOTE to oversample the training data results in a lowering of the accuracy so there is a practical trade-off between the two scores. However, the null hypothesis model is quite obviously useless so it's a trade-off we need to make in order to get any kind of applicable model.

Treating the natural imbalance in the data is of utmost importance. The Syntethic Minority Oversampling Technique (SMOTE) is used to take the minority class (in this case, stroke occurances) and creating fake examples that are close to the other stroke occurances in the feature space. In all of our attempted models, oversampling with SMOTE greatly improves both recall and the f1 score from the corresponding baseline model that was trained only on the provided data. Regardless of the classifier type, the models were effectively useless without oversampling. Thus, the most important step in model creation for our dataset is using SMOTE in our pipeline. We also played around with other preprocessing steps such as using sklearn's StandardScaler, but this reduces the scoring of our models across the board.

Only two of the provided features are continuous/numerical, so Gaussian Discriminant Analysis was not an appropriate tool to use due to the fact that it assumes that the features are normally distributed. Instead, we turned to Naive Bayes as a potential model because it can handle both Gaussian and Binary distributed features. Due to the fact that Naive Bayes assumes independence between the features, we can split the dataset into Gaussian and Bernoulli features and get final probabilities at the end by multiplying the probabilities of the two separate models together. For each of the sets, we can train a Naive Bayes model of the corresponding type on just the appropriate features. Then, rather than using the predict method to get labels, we grab the probabilities for each class. Multiplying these newly calculated probabilities together from both the Gaussian and Bernoulli models, we get a "combined" probability that accounts for all of the features in the data. Then, assigning class labels is as easy as choosing the argmax of the resultant probabilities for each input. In addition to being theoretically sound (by the assumed independence of features), we experimentally see that the combined model improves on the f1 scores of both subset models. Using this combined model (w/ SMOTE), we end up with a recall of $0.65$, an accuracy of $0.79$, and an f1 of around $0.24$. Unfortunately, there aren't a whole lot of model parameters that can be changed for the Naive Bayes model. The only possiblity being "var_smoothing" which is just used for calculation stability on GaussianNB and "alpha" a smoothing parameter for BernoulliNB. Thus, the best we can do with Naive Bayes is an f1 score of $0.24$. 

Another option we explored as a way to predict stroke occurances was a K-Nearest Neighbors Classifier. Without SMOTE or any fine tuning of parameters, the model is useless with a recall score around $.01$ and an f1 score of about $0.02$. Oversampling immediately yields much better results with a recall score of about $0.51$, an accuracy of $0.79$, and an f1 score of around $0.19$ which is actually already decent compared to our scores on the Naive Bayes. Unfortunately, the tuning of model parameters using GridSearch doesn't improve the SMOTE model significantly. It provides a slight jump up to a recall of $.59$, a slight accuracy decrease to $.78$, and a slight jump up to a f1 score of about $.21$. Thus, even with all the improvements we could make with our KNN classifier, it underperforms when compared to the combined Naive Bayes model we talked about previously.