An ensemble approach is used to combine a naive Bayes model with logistic regression. 

In [12]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import ComplementNB, GaussianNB
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import f1_score
import scipy.stats as st
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler

In [3]:
# Load the raw data and record its row count.
df = pd.read_csv('framingham.csv')
size = len(df)

# Missing values are imputed with the column mean.
# These attributes take the mean rounded to two decimals.
na_cols_mean_replace = ['cigsPerDay', 'totChol', 'BMI', 'heartRate', 'glucose']

# These attributes are later treated as discrete levels, so the replacement
# must be one of the existing whole-number values: the mean is rounded to 0 decimals.
na_cols_mean_replace_round = ['education', 'BPMeds']

# One pass per (column group, rounding precision) pair.
for cols, ndigits in ((na_cols_mean_replace, 2), (na_cols_mean_replace_round, 0)):
    for col in cols:
        avg = round(df[col].mean(), ndigits)
        df[col] = df[col].fillna(avg)

In [4]:
# Re-encode every column as integer category codes: each distinct value in a
# column is replaced by its position among that column's sorted unique values.
df_c = pd.DataFrame(
    {name: df[name].astype('category').cat.codes for name in df.columns},
    index=df.index,
)

df_c.head(10)

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,7,3,0,0,0,0,0,0,59,31,33,747,36,32,0
1,0,14,1,0,0,0,0,0,0,115,61,55,897,51,31,0
2,1,16,0,1,20,0,0,0,0,110,74,53,595,30,25,0
3,0,29,2,1,24,0,0,1,0,89,119,83,883,20,59,1
4,0,14,2,1,21,0,0,0,0,150,79,61,387,41,41,0
5,0,11,1,0,0,0,0,1,0,92,176,113,1025,33,55,0
6,0,31,0,0,0,0,0,0,0,69,95,35,1199,15,41,1
7,0,13,1,1,20,0,0,0,0,178,19,35,268,35,33,0
8,1,20,0,0,0,0,0,1,0,125,102,71,688,32,34,0
9,1,11,0,1,24,0,0,1,0,89,143,107,434,49,44,0


In [5]:
# Features are every column except the last; the target is the last column.
# Both are taken from the encoded frame df_c so that features and labels come
# from the same source (the original mixed df_c for x with the raw df for y).
x = df_c.iloc[:, :-1].values
y = df_c.iloc[:, -1].values

# 80/20 shuffled split with a fixed seed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=10, shuffle=True
)

print(x_train.shape, y_train.shape)

(3390, 15) (3390,)


In [6]:
# Stacked ensemble: a Gaussian naive Bayes base learner whose predictions are
# combined by a logistic-regression meta-learner. LogisticRegression is what
# StackingClassifier uses by default; it is spelled out here so the model
# matches the description in the text. The base learner is labelled 'gnb'
# (it was previously labelled 'cnb' although it is GaussianNB).
estimators = [('gnb', GaussianNB())]

ens = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())
ens.fit(x_train, y_train)

y_pred_ens = ens.predict(x_test)

# Macro-averaged F1 weights both classes equally, so it exposes poor
# minority-class performance that plain accuracy hides.
accuracy_ens = metrics.accuracy_score(y_test, y_pred_ens)
f1_ens = f1_score(y_test, y_pred_ens, average='macro')
print("accuracy: {} \nF1 score: {}".format(accuracy_ens, f1_ens))


accuracy: 0.8714622641509434 
F1 score: 0.4656584751102709


Ultimately, using GaussianNB and LogisticRegression, no significant improvement was seen over the previous logistic regression model alone. Note that additions of random forest, support vector machine, and Gaussian naive Bayes did not significantly increase prediction accuracy. 

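Note that an accuracy of about 0.87 paired with a macro F1 of about 0.47 is consistent with a model that predicts the negative class for every test sample, so the ensemble is not identifying CHD-positive cases. Below, it is investigated whether balancing the data prior to training benefits this approach as it did with the individual models.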

In [7]:
# Undersample the majority class: CHD-negative rows are dropped until the
# negative and positive classes are the same size (a 50% CHD rate).
df_cp = df_c.copy()

# The rows are first shuffled to remove any kind of trends in ordering.
df_cp = df_cp.sample(frac=1, random_state=10)

# Work out how many CHD-negative rows must go instead of hard-coding the count,
# so the cell stays correct if the data set changes.
neg_index = df_cp[df_cp.TenYearCHD == 0].index
n_pos = len(df_cp) - len(neg_index)
n_drop = max(len(neg_index) - n_pos, 0)  # never negative if positives outnumber negatives

df_cp2 = df_cp.drop(neg_index[:n_drop])

# Proportion of positive CHD cases in the balanced dataset (target is coded 0/1).
df_cp2['TenYearCHD'].mean()

0.5

In [8]:
# Split the balanced dataset into target and features, then into
# train and test subsets (80/20, fixed seed).
y2 = df_cp2['TenYearCHD']
x2 = df_cp2.drop(columns='TenYearCHD')

x2_train, x2_test, y2_train, y2_test = train_test_split(
    x2, y2, random_state=10, test_size=0.2
)
print(x2_train.shape, y2_train.shape)

(1030, 15) (1030,)


In [9]:
# A stacked ensemble is trained and tested on the balanced data. The base
# learner here is ComplementNB; its predictions are combined by the
# logistic-regression meta-learner (StackingClassifier's default, written out).
estimators_bal = [('cnb', ComplementNB())]

ens_bal = StackingClassifier(
    estimators=estimators_bal,
    final_estimator=LogisticRegression(),
)

# fit() returns the fitted estimator, so prediction can be chained onto it.
y2_pred_ens_bal = ens_bal.fit(x2_train, y2_train).predict(x2_test)

f1_ens_bal = f1_score(y2_test, y2_pred_ens_bal, average='macro')
accuracy_ens_bal = metrics.accuracy_score(y2_test, y2_pred_ens_bal)
print("accuracy: {} \nF1 score: {}".format(accuracy_ens_bal, f1_ens_bal))

accuracy: 0.5968992248062015 
F1 score: 0.5966810966810967


In [11]:
# A stacked ensemble with different base learners is trained and tested in an
# attempt to increase accuracy.
# GaussianNB performs better here, so it is used over ComplementNB.
# Labels match the estimators they name: 'gnb' for GaussianNB and 'svc' for the
# scaled LinearSVC classifier (previously 'cnb' and 'svr').
estimators_bal2 = [
    ('gnb', GaussianNB()),
    # LinearSVC is sensitive to feature scale, so inputs are standardised first.
    ('svc', make_pipeline(StandardScaler(),
                          LinearSVC(random_state=10, max_iter=10000))),
]

# LogisticRegression is StackingClassifier's default meta-learner; it is
# written out so the full model is visible in the cell.
ens_bal2 = StackingClassifier(estimators=estimators_bal2,
                              final_estimator=LogisticRegression())
ens_bal2.fit(x2_train, y2_train)

y2_pred_ens_bal2 = ens_bal2.predict(x2_test)

accuracy_ens_bal2 = metrics.accuracy_score(y2_test, y2_pred_ens_bal2)
f1_ens_bal2 = f1_score(y2_test, y2_pred_ens_bal2, average='macro')
print("accuracy: {} \nF1 score: {}".format(accuracy_ens_bal2, f1_ens_bal2))

accuracy: 0.6782945736434108 
F1 score: 0.6774757135326455


The performance here is just over that of the previous logistic regression model alone. The addition of linear support vector classification was shown to benefit the model further. 