In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import LabelEncoder

In [2]:
from imblearn.over_sampling import SMOTE
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from matplotlib import pyplot as plt
from sklearn.metrics import accuracy_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [3]:
def cont_bmi_dataframe(meanbmi=False):
    '''
    returns a one-hot encoded dataframe where smoking status is calculated by a knn classifier and
    bmi is calculated by knn regression if meanbmi is false, otherwise nans in the bmi column are 
    replaced by the mean column bmi.

    returns Pandas DataFrame, Target Variable
    '''

    df = pd.read_csv('healthcare-dataset-stroke-data.csv')
    #mix up the data since it came in ordered
    df = df.sample(frac=1)
    #drop the id column since it holds no predictive value as its not discrete
    df = df.drop('id', axis=1)

    #We now check to verify there are sufficient values in each category for the columns to run a prediction
    categories = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']

    #Since there is only one person with gender Other we drop the row because there are not enough for a prediction
    df = df.drop(df[df['gender']=='Other'].index).reset_index(drop=True)
    df = df.dropna()

    '''
    We leave the rows where people never worked since 22 is enough to gather some predictive information and
    never working is very distinct from having a job. However, it may be linked to age, as there are some babies
    in the dataset.

    The smoking status column has the categories former smoker, never smoked, smokes, and unknown. The unknown values are
    essencially Nan values. We are going to use a knn classifier to impute these values.

    Before we do this, we need to one-hot encode the other categorical variables in order for the classifier to work, and
    we need to drop the stroke column, as we don't want it to be used in this imputer because that would bias our final
    predictions. We also need to encode the labels of the smoking_status column in order to be able to use the KNN classifier.
    '''

    le1 = LabelEncoder()
    newcol = le1.fit_transform(df['smoking_status'])
    df['smoking_status'] = pd.Series(newcol)
    keys = le1.classes_
    values = le1.transform(keys)
    dictionary = dict(zip(keys, values))
    inversedictionary = dict(zip(values, keys))

    #one-hot encode the dataframe
    df = pd.get_dummies(df, columns= ['gender', 'ever_married', 'work_type', 'Residence_type'])
    y = df['stroke']
    df = df.drop('stroke', axis=1)

    #Find the best number of neighbors to use for the prediction
    dfclass = df.drop('bmi', axis=1).dropna().copy()
    dfclass['smoking_status'] = pd.Series(newcol)
    dfclass = dfclass[dfclass['smoking_status'] != dictionary['Unknown']].copy()
    yclass = dfclass['smoking_status'].copy()
    dfclass = dfclass.drop('smoking_status', axis=1)

    #Encode the labels in the prediction column
    le = LabelEncoder().fit(yclass)
    knnclass = KNeighborsClassifier()
    parameters = {'n_neighbors':[1,2,3,4,5,6,7,8,9,10,12,14,16,18,20,23,26,29]}
    bestparamsclass = GridSearchCV(knnclass, parameters).fit(dfclass, yclass).best_params_


    #Use the best value to fit a knnclassifier
    df['smoking_status'] = df['smoking_status'].replace(to_replace=dictionary['Unknown'], value=np.nan)
    yclassifier = df['smoking_status'].copy()
    dfclassifier = df.drop(['bmi','smoking_status'], axis=1).copy()
    knnclassifier = KNeighborsClassifier(n_neighbors=bestparamsclass['n_neighbors']).fit(dfclass, yclass)

    #predict nan values only
    for i in range(len(df)):
        if np.isnan(yclassifier[i]):
            yclassifier[i] = int(knnclassifier.predict(dfclassifier.loc[[i]]))

    #replace all values with original or new predicted values
    for i in range(len(yclassifier)):
        yclassifier[i] = inversedictionary[yclassifier[i]]

    #We next explore options to account for the Nan values in the BMI column
    if meanbmi:

        #We replace Nan values with the mean bmi
        imputermean = SimpleImputer(missing_values=np.nan, strategy='mean')
        dfmean = pd.DataFrame(imputermean.fit_transform(df), columns=df.columns)
        dfmean['smoking_status'] = yclassifier
        dfmean = pd.get_dummies(dfmean, columns=['smoking_status'])
        return dfmean, y

    else:
        #We use knn to fill in the missing values
        #We begin by finding the optimal number of neighbors to use for the prediction
        dfreg = df.dropna()
        yreg = dfreg['bmi']
        dfreg = dfreg.drop(['bmi','smoking_status'], axis=1)
        knnreg = KNeighborsRegressor()
        parameters = {'n_neighbors':[1,2,3,4,5,6,7,8,9,10,12,14,16,18,20,23,26,29]}
        bestparams = GridSearchCV(knnreg, parameters).fit(dfreg, yreg).best_params_

        #Use the best value to impute the values to use for bmi
        imputerKNN = KNNImputer(n_neighbors=bestparams['n_neighbors'])
        dfKNN = pd.DataFrame(imputerKNN.fit_transform(df), columns = df.columns)
        dfKNN['smoking_status'] = yclassifier
        dfKNN = pd.get_dummies(dfKNN, columns=['smoking_status'])
        return dfKNN, y


In [5]:
df = pd.read_csv('healthcare-dataset-stroke-data.csv')
df = df.drop('id', axis=1)

data,target = cont_bmi_dataframe()

In [7]:
#Naive Bayes w/o SMOTE

X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.3)

bernoulliSet = X_train.drop(columns=["avg_glucose_level", "bmi", "age"])
gaussianSet = X_train[["avg_glucose_level", "bmi", "age"]]

bernoulliClassifier = BernoulliNB()
gaussianClassifier = GaussianNB()

bernoulliTest = X_test.drop(columns=["avg_glucose_level", "bmi", "age"])
gaussianTest = X_test[["avg_glucose_level", "bmi", "age"]]

bernoulliProb = bernoulliClassifier.fit(bernoulliSet, y_train).predict_proba(bernoulliTest)
gaussianProb = gaussianClassifier.fit(gaussianSet, y_train).predict_proba(gaussianTest)
prediction = (np.argmax(bernoulliProb*gaussianProb, axis = 1))

print("Combined Gaussian/Bernoulli w/o SMOTE:")
print("Recall:", recall_score(y_test, prediction))
print("Accuracy:", accuracy_score(y_test, prediction))
print("F1 Score:", f1_score(y_test, prediction))
print()

prediction = bernoulliClassifier.fit(bernoulliSet, y_train).predict(bernoulliTest)

print("Bernoulli (just categorical) w/o SMOTE:")
print("Recall:", recall_score(y_test, prediction))
print("Accuracy:", accuracy_score(y_test, prediction))
print("F1 Score:", f1_score(y_test, prediction))
print()

prediction = gaussianClassifier.fit(gaussianSet, y_train).predict(gaussianTest)

print("Gaussian (just continuous) w/o SMOTE:")
print("Recall:", recall_score(y_test, prediction))
print("Accuracy:", accuracy_score(y_test, prediction))
print("F1 Score:", f1_score(y_test, prediction))

Combined Gaussian/Bernoulli w/o SMOTE:
Recall: 0.044444444444444446
Accuracy: 0.9354207436399217
F1 Score: 0.07476635514018692

Bernoulli (just categorical) w/o SMOTE:
Recall: 0.05555555555555555
Accuracy: 0.928897586431833
F1 Score: 0.08403361344537814

Gaussian (just continuous) w/o SMOTE:
Recall: 0.14444444444444443
Accuracy: 0.9080234833659491
F1 Score: 0.155688622754491


In [28]:
#Naive Bayes w/ SMOTE

sm = SMOTE()

X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.3)

X_train, y_train = sm.fit_resample(X_train,y_train)

bernoulliSet = X_train.drop(columns=["avg_glucose_level", "bmi", "age"])
gaussianSet = X_train[["avg_glucose_level", "bmi", "age"]]

bernoulliClassifier = BernoulliNB()
gaussianClassifier = GaussianNB()

bernoulliTest = X_test.drop(columns=["avg_glucose_level", "bmi", "age"])
gaussianTest = X_test[["avg_glucose_level", "bmi", "age"]]

bernoulliProb = bernoulliClassifier.fit(bernoulliSet, y_train).predict_proba(bernoulliTest)
gaussianProb = gaussianClassifier.fit(gaussianSet, y_train).predict_proba(gaussianTest)
prediction = (np.argmax(bernoulliProb*gaussianProb, axis = 1))

#prediction = bernoulliClassifier.fit(bernoulliSet, y_train).predict(bernoulliTest)
#prediction = gaussianClassifier.fit(gaussianSet, y_train).predict(gaussianTest)

print("Combined Gaussian/Bernoulli w/ SMOTE:")
print("Recall:", recall_score(y_test, prediction))
print("Accuracy:", accuracy_score(y_test, prediction))
print("F1 Score:", f1_score(y_test, prediction))
print()

prediction = bernoulliClassifier.fit(bernoulliSet, y_train).predict(bernoulliTest)

print("Bernoulli (just categorical) w/ SMOTE:")
print("Recall:", recall_score(y_test, prediction))
print("Accuracy:", accuracy_score(y_test, prediction))
print("F1 Score:", f1_score(y_test, prediction))
print()

prediction = gaussianClassifier.fit(gaussianSet, y_train).predict(gaussianTest)

print("Gaussian (just continuous) w/ SMOTE:")
print("Recall:", recall_score(y_test, prediction))
print("Accuracy:", accuracy_score(y_test, prediction))
print("F1 Score:", f1_score(y_test, prediction))

Combined Gaussian/Bernoulli w/ SMOTE:
Recall: 0.5952380952380952
Accuracy: 0.776255707762557
F1 Score: 0.22573363431151242

Bernoulli (just categorical) w/ SMOTE:
Recall: 0.40476190476190477
Accuracy: 0.8114807566862361
F1 Score: 0.1904761904761905

Gaussian (just continuous) w/ SMOTE:
Recall: 0.8214285714285714
Accuracy: 0.6797129810828441
F1 Score: 0.21939586645469


In [43]:
#Naive Bayes w/ SMOTE Cross Validated (7-Fold)

kf = KFold(n_splits = 7)

recallScoresCombined = []
accuracyScoresCombined = []
f1ScoresCombined = []

recallScoresGaussian = []
accuracyScoresGaussian = []
f1ScoresGaussian = []

recallScoresBernoulli = []
accuracyScoresBernoulli = []
f1ScoresBernoulli = []

for fold, (train_index, test_index) in enumerate(kf.split(data), 1):
    X_train = data.iloc[train_index]
    y_train = target.iloc[train_index]
    X_test = data.iloc[test_index]
    y_test = target.iloc[test_index]

    sm = SMOTE()
    X_train_oversampled, y_train_oversampled = sm.fit_resample(X_train, y_train)

    bernoulliSet = X_train_oversampled.drop(columns=["avg_glucose_level", "bmi", "age"])
    gaussianSet = X_train_oversampled[["avg_glucose_level", "bmi", "age"]]

    bernoulliClassifier = BernoulliNB()
    gaussianClassifier = GaussianNB()

    bernoulliTest = X_test.drop(columns=["avg_glucose_level", "bmi", "age"])
    gaussianTest = X_test[["avg_glucose_level", "bmi", "age"]]

    bernoulliProb = bernoulliClassifier.fit(bernoulliSet, y_train_oversampled).predict_proba(bernoulliTest)
    gaussianProb = gaussianClassifier.fit(gaussianSet, y_train_oversampled).predict_proba(gaussianTest)
    prediction = (np.argmax(bernoulliProb*gaussianProb, axis = 1))

    recallScoresCombined.append(recall_score(y_test, prediction))
    accuracyScoresCombined.append(accuracy_score(y_test, prediction))
    f1ScoresCombined.append(f1_score(y_test, prediction))

    prediction = bernoulliClassifier.fit(bernoulliSet, y_train_oversampled).predict(bernoulliTest)
    
    recallScoresBernoulli.append(recall_score(y_test, prediction))
    accuracyScoresBernoulli.append(accuracy_score(y_test, prediction))
    f1ScoresBernoulli.append(f1_score(y_test, prediction))
    
    prediction = gaussianClassifier.fit(gaussianSet, y_train_oversampled).predict(gaussianTest)

    recallScoresGaussian.append(recall_score(y_test, prediction))
    accuracyScoresGaussian.append(accuracy_score(y_test, prediction))
    f1ScoresGaussian.append(f1_score(y_test, prediction))

print("Combined Gaussian/Bernoulli w/ SMOTE (Cross-Validated 7-Fold):")
print("Recall mean:", np.mean(recallScoresCombined))
print("Accuracy mean:", np.mean(accuracyScoresCombined))
print("F1 Score mean:", np.mean(f1ScoresCombined))
print()

print("Bernoulli w/ SMOTE (Cross-Validated 7-Fold):")
print("Recall mean:", np.mean(recallScoresBernoulli))
print("Accuracy mean:", np.mean(accuracyScoresBernoulli))
print("F1 Score mean:", np.mean(f1ScoresBernoulli))
print()

print("Gaussian w/ SMOTE (Cross-Validated 7-Fold):")
print("Recall mean:", np.mean(recallScoresGaussian))
print("Accuracy mean:", np.mean(accuracyScoresGaussian))
print("F1 Score mean:", np.mean(f1ScoresGaussian))

Combined Gaussian/Bernoulli w/ SMOTE (Cross-Validated 7-Fold):
Recall mean: 0.651331924603814
Accuracy mean: 0.7948708656471214
F1 Score mean: 0.23541575539480236

Bernoulli w/ SMOTE (Cross-Validated 7-Fold):
Recall mean: 0.37933502749170955
Accuracy mean: 0.8404800828951007
F1 Score mean: 0.18759931790199497

Gaussian w/ SMOTE (Cross-Validated 7-Fold):
Recall mean: 0.833274636270028
Accuracy mean: 0.6979834585618451
F1 Score mean: 0.21109612339930808


In [61]:
data,target = cont_bmi_dataframe()

In [85]:
#K-Nearest Neighbors (No Parameter Tuning/Cross Validated)
knn = KNeighborsClassifier()

print("KNN Classifier w/o SMOTE:")
print("Recall mean:", np.mean(cross_validate(knn, data, target, scoring="recall")["test_score"]))
print("Accuracy mean:", np.mean(cross_validate(knn, data, target, scoring="accuracy")["test_score"]))
print("F1 Score mean:", np.mean(cross_validate(knn, data, target, scoring="f1")["test_score"]))
print()

model = Pipeline([('sampling', SMOTE()),('classification', KNeighborsClassifier())])

print("KNN Classifier w/ SMOTE:")
print("Recall mean:", np.mean(cross_validate(model, data, target, scoring="recall")["test_score"]))
print("Accuracy mean:", np.mean(cross_validate(model, data, target, scoring="accuracy")["test_score"]))
print("F1 Score mean:", np.mean(cross_validate(model, data, target, scoring="f1")["test_score"]))
print()

KNN Classifier w/o SMOTE:
Recall mean: 0.02
Accuracy mean: 0.9436299549001305
F1 Score mean: 0.029968066814050603

KNN Classifier w/ SMOTE:
Recall mean: 0.5141224489795919
Accuracy mean: 0.7929164646149068
F1 Score mean: 0.2039133111623574



In [82]:
#KNN Neighbors Classifier (parameter tuning through gridSearch)

knn = KNeighborsClassifier()

parameters = {'n_neighbors':[2,3,4,5,6,7,8,9,10], "weights":["uniform", "distance"]}
gridSearch = GridSearchCV(knn, parameters, scoring="f1")
gridSearch.fit(data, target)

print("KNN Classifier w/o SMOTE and parameter tuning via GridSearch:")
print("Recall mean:", np.mean(cross_validate(gridSearch.best_estimator_, data, target, scoring="recall")["test_score"]))
print("Accuracy mean:", np.mean(cross_validate(gridSearch.best_estimator_, data, target, scoring="accuracy")["test_score"]))
print("F1 Score mean:", np.mean(cross_validate(gridSearch.best_estimator_, data, target, scoring="f1")["test_score"]))
print()

model = Pipeline([('sampling', SMOTE()),('classification', KNeighborsClassifier())])

parameters = {'classification__n_neighbors':[2,3,4,5,6,7,8,9,10], "classification__weights":["uniform", "distance"]}
gridSearch = GridSearchCV(model, parameters, scoring="f1")
gridSearch.fit(data,target)

print("KNN Classifier w/ SMOTE and parameter tuning via GridSearch:")
print("Recall mean:", np.mean(cross_validate(gridSearch.best_estimator_, data, target, scoring="recall")["test_score"]))
print("Accuracy mean:", np.mean(cross_validate(gridSearch.best_estimator_, data, target, scoring="accuracy")["test_score"]))
print("F1 Score mean:", np.mean(cross_validate(gridSearch.best_estimator_, data, target, scoring="f1")["test_score"]))

KNN Classifier w/o SMOTE and parameter tuning via GridSearch:
Recall mean: 0.10840816326530611
Accuracy mean: 0.9121186971830311
F1 Score mean: 0.10461931919376691

KNN Classifier w/ SMOTE and parameter tuning via GridSearch:
Recall mean: 0.5944489795918366
Accuracy mean: 0.779803960278381
F1 Score mean: 0.2092442818078263


In [91]:
#KNN Neighbors Classifier (parameter tuning through gridSearch/Standard Scalar Preprocessing)

model = Pipeline([('scale', StandardScaler()),('classification', KNeighborsClassifier())])

parameters = {'classification__n_neighbors':[2,3,4,5,6,7,8,9,10], "classification__weights":["uniform", "distance"]}
gridSearch = GridSearchCV(model, parameters, scoring="f1")
gridSearch.fit(data, target)

print("KNN Classifier w/o SMOTE and parameter tuning via GridSearch with Standard Scalar Preprocessing:")
print("Recall mean:", np.mean(cross_validate(gridSearch.best_estimator_, data, target, scoring="recall")["test_score"]))
print("Accuracy mean:", np.mean(cross_validate(gridSearch.best_estimator_, data, target, scoring="accuracy")["test_score"]))
print("F1 Score mean:", np.mean(cross_validate(gridSearch.best_estimator_, data, target, scoring="f1")["test_score"]))
print()

model = Pipeline([('sampling', SMOTE()),('scale', StandardScaler()),('classification', KNeighborsClassifier())])

parameters = {'classification__n_neighbors':[2,3,4,5,6,7,8,9,10], "classification__weights":["uniform", "distance"]}
gridSearch = GridSearchCV(model, parameters, scoring="f1")
gridSearch.fit(data,target)

print("KNN Classifier w/ SMOTE and parameter tuning via GridSearch with Standard Scalar Preprocessing:")
print("Recall mean:", np.mean(cross_validate(gridSearch.best_estimator_, data, target, scoring="recall")["test_score"]))
print("Accuracy mean:", np.mean(cross_validate(gridSearch.best_estimator_, data, target, scoring="accuracy")["test_score"]))
print("F1 Score mean:", np.mean(cross_validate(gridSearch.best_estimator_, data, target, scoring="f1")["test_score"]))

KNN Classifier w/o SMOTE and parameter tuning via GridSearch with Standard Scalar Preprocessing:
Recall mean: 0.11632653061224489
Accuracy mean: 0.9132896070963772
F1 Score mean: 0.11556957895102224

KNN Classifier w/ SMOTE and parameter tuning via GridSearch with Standard Scalar Preprocessing:
Recall mean: 0.2529795918367347
Accuracy mean: 0.8833431404306051
F1 Score mean: 0.1767126099888079


Analysis of Learning Algorithms:

We tried a variety of classifiers in an attempt to create a model that could accurately (and more importantly, with high recall) predict stroke occurances using the health data provided. One important characteristic of our dataset is that it is heavily imbalanced. The null hypothesis (predicting no stroke occurances across the board) results in a model with about 95% accuracy. Thus, it is extremely important that we focus on the recall of the model and the f1 scores, which represent a sort of balance between the accuracy and the recall scores. Experimentally, raising a model's recall by changing model parameters or using SMOTE to oversample the training data results in a lowering of the accuracy so there is a practical trade-off between the two scores. However, the null hypothesis model is quite obviously useless so it's a trade-off we need to make in order to get any kind of applicable model.

Treating the natural imbalance in the data is of utmost importance. The Syntethic Minority Oversampling Technique (SMOTE) is used to take the minority class (in this case, stroke occurances) and creating fake examples that are close to the other stroke occurances in the feature space. In all of our attempted models, oversampling with SMOTE greatly improves both recall and the f1 score from the corresponding baseline model that was trained only on the provided data. Regardless of the classifier type, the models were effectively useless without oversampling. Thus, the most important step in model creation for our dataset is using SMOTE in our pipeline. We also played around with other preprocessing steps such as using sklearn's StandardScaler, but this reduces the scoring of our models across the board.

Only two of the provided features are continuous/numerical, so Gaussian Discriminant Analysis was not an appropriate tool to use due to the fact that it assumes that the features are normally distributed. Instead, we turned to Naive Bayes as a potential model because it can handle both Gaussian and Binary distributed features. Due to the fact that Naive Bayes assumes independence between the features, we can split the dataset into Gaussian and Bernoulli features and get final probabilities at the end by multiplying the probabilities of the two separate models together. For each of the sets, we can train a Naive Bayes model of the corresponding type on just the appropriate features. Then, rather than using the predict method to get labels, we grab the probabilities for each class. Multiplying these newly calculated probabilities together from both the Gaussian and Bernoulli models, we get a "combined" probability that accounts for all of the features in the data. Then, assigning class labels is as easy as choosing the argmax of the resultant probabilities for each input. In addition to being theoretically sound (by the assumed independence of features), we experimentally see that the combined model improves on the f1 scores of both subset models. Using this combined model (w/ SMOTE), we end up with a recall of $0.65$, an accuracy of $0.79$, and an f1 of around $0.24$. Unfortunately, there aren't a whole lot of model parameters that can be changed for the Naive Bayes model. The only possiblity being "var_smoothing" which is just used for calculation stability on GaussianNB and "alpha" a smoothing parameter for BernoulliNB. Thus, the best we can do with Naive Bayes is an f1 score of $0.24$. 

Another option we explored as a way to predict stroke occurances was a K-Nearest Neighbors Classifier. Without SMOTE or any fine tuning of parameters, the model is useless with a recall score around $.01$ and an f1 score of about $0.02$. Oversampling immediately yields much better results with a recall score of about $0.51$, an accuracy of $0.79$, and an f1 score of around $0.19$ which is actually already decent compared to our scores on the Naive Bayes. Unfortunately, the tuning of model parameters using GridSearch doesn't improve the SMOTE model significantly. It provides a slight jump up to a recall of $.59$, a slight accuracy decrease to $.78$, and a slight jump up to a f1 score of about $.21$. Thus, even with all the improvements we could make with our KNN classifier, it underperforms when compared to the combined Naive Bayes model we talked about previously.