# Predicting Diabetes in Women of the Pima Indigenous People

### Prediction: 

Predict whether a woman in the Pima Tribe will be diagnosed with diabetes in the next 5 years.

### Data Set Info : 

Several constraints were placed on the selection of these instances from a larger database. In particular, all patients here are females at least 21 years old of Pima Indian heritage.


### Dataset Attributes :

Pregnancies:  Number of times pregnant

Glucose:  Plasma glucose concentration at 2 hours in an oral glucose tolerance test

BloodPressure:  Diastolic blood pressure (mm Hg) 

SkinThickness:  Triceps skin fold thickness (mm) 

Insulin:  2-Hour serum insulin (mu U/ml) 

BMI:  Body mass index (weight in kg/(height in m)^2) 

DiabetesPedigreeFunction:  measure of genetic influence and hereditary risk 

Age:  Age (years) 

Outcome:  Class variable (0 or 1)

In [1]:
import pickle
import warnings

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(style="whitegrid")
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import linear_model
from sklearn.metrics import accuracy_score, f1_score, make_scorer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks 
from sklearn.utils import resample
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV,cross_val_score
from sklearn.linear_model import LassoCV
from sklearn import tree 
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import export_graphviz
try:
    from sklearn.externals.six import StringIO
except ImportError:
    # sklearn.externals.six was removed in scikit-learn 0.23; the stdlib class is equivalent here
    from io import StringIO
from IPython.display import Image  
import pydotplus
import xgboost as xgb
from xgboost import plot_importance
warnings.filterwarnings('ignore')



# DATA CLEANING

(cleaning steps were moved into a function in the EDA section)

In [2]:
df = pd.read_csv('diabetes.csv')

FileNotFoundError: [Errno 2] File b'diabetes.csv' does not exist: b'diabetes.csv'

In [None]:
df.head(2)

In [None]:
df.shape

In [None]:
#df.info()

In [None]:
df.Pregnancies.unique()

In [None]:
df.loc[df['Pregnancies'] >= 15]

In [None]:
df.describe()  #min 0 must be missing data for vitals/blood columns

In [None]:
#df.pregnancies.value_counts()  #bin these values

In [None]:
#df.SkinThickness.value_counts() # 227 missing values for skin thickness

In [None]:
#df.BMI.value_counts() #only 11 missing values for BMI, will replace with mean

In [None]:
#df.BMI.median()

In [None]:
#df.Insulin.value_counts() # 374 missing values for insulin level

In [None]:
#df.Insulin.median()

In [None]:
#df.BloodPressure.median()

# EDA & FEATURE TRANSFORMATION

In [None]:
# sns.set_style('darkgrid',{'axes.edgecolor': '.6'})
# sns.pairplot(df, hue='Outcome', palette='husl')

### Baseline Model 

In [None]:
# Baseline works on the raw frame as loaded: original column names, no cleaning applied yet.
df1 = df.copy()
targ = df1['Outcome']
feat = df1.drop('Outcome', axis=1)
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
Xtrain, Xtest, ytrain, ytest = train_test_split(feat, targ, random_state=42,test_size=0.2)

In [None]:
# Baseline: default logistic regression on the unscaled raw features.
logreg = LogisticRegression()
logreg.fit(Xtrain, ytrain)
# Score on the training rows first so it can be compared with the held-out score below.
pred_trn = logreg.predict(Xtrain)
print('TRAINING F1: ', metrics.f1_score(ytrain, pred_trn))
print(confusion_matrix(ytrain,pred_trn))

# Same fitted model evaluated on the held-out 20%.
pred_tst = logreg.predict(Xtest)
print('TESTING F1: ', metrics.f1_score(ytest, pred_tst))
print(confusion_matrix(ytest,pred_tst))

In [None]:
sns.set_style('darkgrid',{'axes.edgecolor': '.9'})
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally,
# and the keyword form also works on older releases.
sns.jointplot(x=df.Insulin, y=df.Glucose, color='lightseagreen')

In [None]:
sns.set_style('darkgrid',{'axes.edgecolor': '.9'})
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally,
# and the keyword form also works on older releases.
sns.jointplot(x=df.BMI, y=df.SkinThickness, color='violet')

In [None]:
# Work on a copy so the exploratory binning does not touch the raw frame.
df_copy = df.copy()
# BMI of 0 is a placeholder; fill it with 32 before binning so those rows land in a band.
df_copy.BMI = df_copy.BMI.replace({0:32})
# Ten-point BMI bands: (10, 20], (20, 30], ... (50, 60].
bins = [10,20,30,40,50,60]
bins_bmi = pd.cut(df_copy['BMI'], bins)
bins_bmi = bins_bmi.cat.as_unordered()
df_copy['BinBMI'] = bins_bmi
# Mean skin thickness per BMI band (displayed as the cell output).
df_copy.groupby(['BinBMI']).SkinThickness.mean()

# Zero skin-thickness readings are replaced with the average of their BMI group;
# that replacement is implemented in the data cleaning function (data_shape).

In [None]:
def data_shape(df):
    """Clean the raw Pima diabetes frame and return a new, model-ready copy.

    Steps, in order:
      1. lower-case the column names and shorten two of them
         (``diabetespedigreefunction`` -> ``dpf``, ``outcome`` -> ``diabetes``);
      2. drop outlier rows (insulin >= 600, skinthickness >= 70, bmi >= 55)
         and rows whose glucose is 0;
      3. replace the 0 placeholders in bloodpressure, insulin and bmi with
         fixed fill values;
      4. replace 0 skin-thickness readings with a fill value chosen by BMI band.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data containing the original columns (any letter case).

    Returns
    -------
    pandas.DataFrame
        The cleaned data. Surviving rows keep their original index labels.
        The input frame is not modified (the previous version renamed the
        caller's columns in place as a side effect).
    """
    # Copy first so neither the rename nor the assignments below leak into the caller's frame.
    df = df.copy()
    df.columns = df.columns.str.lower()
    df = df.rename({'diabetespedigreefunction': 'dpf', 'outcome': 'diabetes'}, axis=1)

    # Row filters: outliers out, zero glucose out. One combined mask keeps the same
    # rows as applying the four filters one after another.
    keep = (
        (df['insulin'] < 600)
        & (df['skinthickness'] < 70)
        & (df['bmi'] < 55)
        & (df['glucose'] > 0)
    )
    # Explicit copy: later column assignments must not act on a view of the input.
    df = df.loc[keep].copy()

    df['bloodpressure'] = df['bloodpressure'].replace({0: 72})  # replace zeros with mean
    # NOTE(review): 30.5 looks like the raw insulin median explored earlier in the notebook - confirm.
    df['insulin'] = df['insulin'].replace({0: 30.5})
    df['bmi'] = df['bmi'].replace({0: 32})  # replace zeros with mean

    # The fill values are fractional, so make the column float up front; writing a
    # float into an integer column through .loc is deprecated in recent pandas.
    df['skinthickness'] = df['skinthickness'].astype(float)

    # (upper BMI bound, fill value) in ascending order. Each pass only touches rows
    # that are still 0, so a row receives the value of the first band it fits in.
    skin_fill_by_bmi = (
        (20, 9.64),
        (30, 14.36),
        (40, 23.37),
        (50, 28.26),
        (60, 32.71),
    )
    for upper_bmi, fill_value in skin_fill_by_bmi:
        still_missing = (df['skinthickness'] == 0) & (df['bmi'] <= upper_bmi)
        df.loc[still_missing, 'skinthickness'] = fill_value

    # NOTE: the healthy_* indicator columns and health_score are derived later in
    # the notebook, not here.
    return df

In [None]:
df = data_shape(df)

In [None]:
origin_df = df.copy() #for future use

In [None]:
# Split the cleaned frame into features (A) and target (b) and show the class balance.
A = df.drop('diabetes', axis = 1)
b = df['diabetes'] 
feature_cols = A.columns
plt.figure(figsize = (10,5))
# The series is passed as x=: newer seaborn treats the first positional argument
# as `data`, so the positional form no longer plots the class counts.
sns.countplot(x=b, alpha =.50, palette= ['tomato','c'], edgecolor='gray')
plt.title('Diabetes vs No Diabetes')
plt.ylabel('# of Women')
plt.show()

In [None]:
df.describe()

In [None]:
# Histogram + KDE of every cleaned input feature on a 3x3 grid (last panel stays empty).
sns.set_style('darkgrid', {'axes.edgecolor': '.9'})
f, ax = plt.subplots(3, 3, figsize=(20, 16))
plt.rcParams["patch.force_edgecolor"] = True

raw_features = ["pregnancies", "glucose", "bloodpressure", "skinthickness",
                "insulin", "bmi", "dpf", "age"]
# ax.ravel() walks the grid row by row, matching the original panel order.
for panel, feature in zip(ax.ravel(), raw_features):
    sns.distplot(df[feature], bins=10, color='tomato',
                 hist_kws=dict(edgecolor="k", linewidth=2), ax=panel)


In [None]:
log_df = df.copy()

In [None]:
log_df['insulin_log'] = np.log(log_df.insulin)
log_df['dpf_log']= np.log(log_df.dpf)



In [None]:
sns.set_style('darkgrid',{'axes.edgecolor': '.9'},)
f, ax = plt.subplots(1,2,figsize = (15,5))
plt.rcParams["patch.force_edgecolor"] = True

v1 = sns.distplot(log_df["insulin_log"],bins=10,color='teal',hist_kws=dict(edgecolor="k", linewidth=2), ax=ax[0])
v2 = sns.distplot(log_df["dpf_log"],bins=10, color='teal',hist_kws=dict(edgecolor="k", linewidth=2), ax=ax[1])


In [None]:
log_df.drop(columns=['insulin','dpf'], inplace=True) #logging insulin did not improve the distribution

# FEATURE ENGINEERING

In [None]:
def find_best_k(X_train, y_train, X_test, y_test, min_k=None, max_k=None):
    """Find the KNN neighbour count with the best F1 score on the evaluation split.

    A ``KNeighborsClassifier`` is fitted on ``(X_train, y_train)`` for every k in
    ``min_k..max_k`` (inclusive) and scored with F1 on ``(X_test, y_test)``.
    Ties keep the smallest k. The winner is printed and also returned.

    Parameters
    ----------
    X_train, y_train : training features and labels.
    X_test, y_test : features and labels used to score each candidate k.
    min_k : int, optional
        Smallest k to try; defaults to 1.
    max_k : int, optional
        Largest k to try; defaults to 25, capped at the number of training rows.

    Returns
    -------
    tuple
        ``(best_k, best_score)``; ``(0, 0.0)`` if no k scored above zero.
    """
    # The previous None defaults fell straight into range() and raised TypeError.
    if min_k is None:
        min_k = 1
    if max_k is None:
        max_k = min(25, len(X_train))

    best_k = 0
    best_score = 0.0
    for k in range(min_k, max_k + 1):
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(X_train, y_train)
        score = f1_score(y_test, knn.predict(X_test))
        # Strict comparison: on a tie the earlier (smaller) k is kept.
        if score > best_score:
            best_k, best_score = k, score

    print("Best Value for k: {}".format(best_k))
    print("F1-Score: {}".format(best_score))
    return best_k, best_score

## Test Model 1 : Clean Data

In [None]:
l_target = log_df['diabetes']
l_features = log_df.drop('diabetes', axis=1)
Xtrain_l, Xtest_l, ytrain_l, ytest_l = train_test_split(l_features, l_target, random_state=42,test_size=0.2)

In [None]:
scalar_lg = StandardScaler()
scalar_lg.fit(Xtrain_l)
Xtrain_l_scaled  = scalar_lg.transform(Xtrain_l)
Xtest_l_scaled = scalar_lg.transform(Xtest_l)

In [None]:
# Model 1: logistic regression on the cleaned + logged features.
# Fit on the standardized matrices produced by the scaler cell above; previously
# they were computed but the model was trained on the unscaled frames.
logreg = LogisticRegression()
logreg.fit(Xtrain_l_scaled, ytrain_l)
predl_tr = logreg.predict(Xtrain_l_scaled)
# Argument order is (y_true, y_pred), as in the other model cells, so the
# confusion matrix has true classes on the rows.
print('TRAINING F1: ', metrics.f1_score(ytrain_l, predl_tr))
print(confusion_matrix(ytrain_l, predl_tr))

predl_tst = logreg.predict(Xtest_l_scaled)
print('TESTING F1: ', metrics.f1_score(ytest_l, predl_tst))
print(confusion_matrix(ytest_l, predl_tst))

In [None]:
#Bin two variables to test whether this will increase our f1 score.

In [None]:
feat_df = df.copy()

In [None]:
sns.distplot(feat_df.pregnancies, bins=30, color='tomato',hist_kws=dict(edgecolor="k", linewidth=.8))

In [None]:
# Pregnancy-count bands; intervals are left-open, so the -1 edge puts 0 pregnancies in (-1, 0].
bins = [-1, 0, 1, 3, 5, 8, 17]
preg_bs = pd.cut(feat_df['pregnancies'], bins)
preg_bs = preg_bs.cat.as_unordered()
# One indicator column per band, prefixed "preg".
preg_bins = pd.get_dummies(preg_bs, prefix="preg")
# Swap the numeric column for its indicator columns.
feat_df = feat_df.drop(columns=['pregnancies'])
feat_df = pd.concat([feat_df, preg_bins],axis=1)


In [None]:
bin_dist = feat_df.groupby([preg_bs]).size()
bin_dist.plot(kind='barh',color='violet',edgecolor='gray')

In [None]:
sns.distplot(feat_df.age, bins=30, color='tomato',hist_kws=dict(edgecolor="k", linewidth=.8))

In [None]:
# pd.cut intervals are open on the left and every patient is at least 21, so the
# lowest edge must sit below 21; with an edge of 21 all 21-year-olds fell outside
# every band (NaN band, all-zero indicators, missing from the band-count plot).
# The top edge is widened when needed so the oldest patient is not dropped either.
bins = [20, 23, 26, 30, 38, 46, max(80, int(feat_df['age'].max()))]
age_bs = pd.cut(feat_df['age'], bins)
age_bs = age_bs.cat.as_unordered()
# One indicator column per band, prefixed "age".
age_bins = pd.get_dummies(age_bs, prefix="age")
# Swap the numeric column for its indicator columns.
feat_df = feat_df.drop(columns=['age'])
feat_df = pd.concat([feat_df, age_bins],axis=1)
    


In [None]:
bin_dist = feat_df.groupby([age_bs]).size()
bin_dist.plot(kind='barh',color='c',edgecolor='gray')

## Test Model 2 : Binned Features (Pregnancies & Age)

In [None]:
df2 = feat_df.copy()
targ2 = df2['diabetes']
feat2 = df2.drop('diabetes', axis=1)
Xtrain2, Xtest2, ytrain2, ytest2 = train_test_split(feat2, targ2, random_state=42,test_size=0.2)

In [None]:
scalar_lg = StandardScaler()
scalar_lg.fit(Xtrain2)
Xtrain2_scaled  = scalar_lg.transform(Xtrain2)
Xtest2_scaled = scalar_lg.transform(Xtest2)

In [None]:
# Model 2: logistic regression on the binned pregnancy/age features.
# Fit on the standardized matrices produced by the scaler cell above; previously
# they were computed but the model was trained on the unscaled frames.
logreg = LogisticRegression()
logreg.fit(Xtrain2_scaled, ytrain2)
pred2_tr = logreg.predict(Xtrain2_scaled)
# Argument order is (y_true, y_pred), as in the other model cells, so the
# confusion matrix has true classes on the rows.
print('TRAINING F1: ', metrics.f1_score(ytrain2, pred2_tr))
print(confusion_matrix(ytrain2, pred2_tr))

pred2 = logreg.predict(Xtest2_scaled)
print('TESTING F1: ', metrics.f1_score(ytest2, pred2))
print(confusion_matrix(ytest2, pred2))

#this did not perform better than the unbinned data

### Polynomial Features

In [None]:
poly_df = df.copy()

In [None]:
#train test split on the data without logging
p_target = poly_df['diabetes']
p_features = poly_df.drop('diabetes', axis=1)
pXtrain, pXtest, pytrain, pytest = train_test_split(p_features, p_target, random_state=42,test_size=0.2)

In [None]:
# Expand the cleaned features into all polynomial/interaction terms up to degree 3.
poly_2 = PolynomialFeatures(degree=3, include_bias=False)
poly_2.fit(pXtrain)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# it exists and fall back on older installs.
if hasattr(poly_2, 'get_feature_names_out'):
    poly_2_cols = poly_2.get_feature_names_out(p_features.columns)
else:
    poly_2_cols = poly_2.get_feature_names(p_features.columns)
# The transformer is fitted on the training rows only and then applied to both splits.
X_train_2 = pd.DataFrame(poly_2.transform(pXtrain), columns=poly_2_cols)
X_test_2 = pd.DataFrame(poly_2.transform(pXtest), columns=poly_2_cols)

In [None]:
X_train_2.head(2)

## Test Model 3 : Interaction Features

In [None]:
scalar_2 = StandardScaler()
scalar_2.fit(X_train_2)
X_train_2_scaled  = scalar_2.transform(X_train_2)
X_test_2_scaled = scalar_2.transform(X_test_2)

In [None]:
logreg = LogisticRegression()
logreg.fit(X_train_2_scaled, pytrain)
y_train_2_pred = logreg.predict(X_train_2_scaled)
print('TRAINING F1: ', metrics.f1_score(pytrain, y_train_2_pred))
print(confusion_matrix(pytrain, y_train_2_pred))

y_test_2_pred = logreg.predict(X_test_2_scaled)
print('TESTING F1: ', metrics.f1_score(pytest, y_test_2_pred))
print(confusion_matrix(pytest, y_test_2_pred))

In [None]:
find_best_k(X_train_2_scaled, pytrain, X_test_2_scaled, pytest,min_k=1, max_k=10)

In [None]:
#so far this is the best performing model

### More Features

In [None]:
#determine who has healthy BMI 18.5 to 24.9, bloodpressure <=120, glucose <=110, insulin < 100

In [None]:
# 1 when BMI is inside the healthy range 18.5-24.9 (both ends included), else 0.
# Vectorized; Series.apply([lambda]) returned a one-column DataFrame and only worked by accident.
df['healthy_bmi'] = df['bmi'].between(18.5, 24.9).astype(int)

In [None]:
# 1 when blood pressure is at most 120, else 0 (vectorized instead of apply([lambda])).
# NOTE(review): the column is documented as diastolic pressure; 120 looks like a systolic cut-off - confirm.
df['healthy_bp'] = (df['bloodpressure'] <= 120).astype(int)

In [None]:
# 1 when glucose is at most 110, else 0 (vectorized instead of apply([lambda])).
df['healthy_glu'] = (df['glucose'] <= 110).astype(int)

In [None]:
# 1 when insulin is at most 100, else 0 (vectorized instead of apply([lambda])).
df['healthy_ins'] = (df['insulin'] <= 100).astype(int)

In [None]:
print('Diabetic DPF: ',df[df['diabetes'] == 1].dpf.mean())
print('Undiabetic DPF: ',df[df['diabetes'] == 0].dpf.mean())

#the difference is minimal

In [None]:
# 1 when the pedigree function is at most 0.4259, else 0 (vectorized instead of apply([lambda])).
# NOTE(review): 0.4259 presumably comes from the group means printed in the previous cell - confirm.
df['healthy_dpf'] = (df['dpf'] <= 0.4259).astype(int)

In [None]:
#pregnancy paper https://www.ncbi.nlm.nih.gov/pubmed/12177894
#complications increase after pregnancy 5

In [None]:
# 1 for fewer than 5 pregnancies, else 0 (vectorized instead of apply([lambda])).
df['healthy_preg'] = (df['pregnancies'] < 5).astype(int)

In [None]:
df['health_score'] = df.healthy_preg + df.healthy_ins + df.healthy_glu + df.healthy_bp + df.healthy_dpf + df.healthy_bmi

In [None]:
df.head()

In [None]:
df4 = df.copy()

In [None]:
corr = df4.corr()
fig, ax = plt.subplots(figsize=(10,10))
ax = sns.heatmap(
    corr, 
    vmin=-1, vmax=1, center=0,
    cmap=sns.diverging_palette(20, 220, n=200),
    square=True
)
ax.set_xticklabels(
    ax.get_xticklabels(),
    rotation=45,
    horizontalalignment='right'
);


In [None]:
sns.distplot(df4.health_score, bins=30, color='tomato',hist_kws=dict(edgecolor="k", linewidth=.8))

In [None]:
#log dpf

df4['dpf_log']= np.log(df4.dpf)
df4.drop(columns=['dpf'], inplace=True)


In [None]:
#train test split
target4 = df4['diabetes']
features4 = df4.drop('diabetes', axis=1)
Xtrain4, Xtest4, ytrain4, ytest4 = train_test_split(features4, target4, random_state=42,test_size=0.2)

## Test Model 4 : Feature Engineering

In [None]:
#scale the features
scalar4 = StandardScaler()
scalar4.fit(Xtrain4)
Xtrain4_scaled  = scalar4.transform(Xtrain4)
Xtest4_scaled = scalar4.transform(Xtest4)

In [None]:
#baseline
logreg = LogisticRegression()
logreg.fit(Xtrain4_scaled, ytrain4)
ytrain4_pred = logreg.predict(Xtrain4_scaled)
print('TRAINING F1: ', metrics.f1_score(ytrain4, ytrain4_pred))
print(confusion_matrix(ytrain4, ytrain4_pred))

ytest4_pred = logreg.predict(Xtest4_scaled)
print('TESTING F1: ', metrics.f1_score(ytest4, ytest4_pred))
print(confusion_matrix(ytest4, ytest4_pred))

In [None]:
# Expand the engineered features into squares and pairwise interactions (degree 2).
poly_4 = PolynomialFeatures(degree=2, include_bias=False)
poly_4.fit(Xtrain4)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# it exists and fall back on older installs.
if hasattr(poly_4, 'get_feature_names_out'):
    poly_4_cols = poly_4.get_feature_names_out(features4.columns)
else:
    poly_4_cols = poly_4.get_feature_names(features4.columns)
# The transformer is fitted on the training rows only and then applied to both splits.
X_train_4 = pd.DataFrame(poly_4.transform(Xtrain4), columns=poly_4_cols)
X_test_4 = pd.DataFrame(poly_4.transform(Xtest4), columns=poly_4_cols)

## Test Model 5 : Polynomial Features  + Feature Eng

In [None]:
scalar_4 = StandardScaler()
scalar_4.fit(X_train_4)
X_train_4_scaled  = scalar_4.transform(X_train_4)
X_test_4_scaled = scalar_4.transform(X_test_4)

In [None]:
logreg = LogisticRegression()
logreg.fit(X_train_4_scaled, ytrain4)
y_train_4_pred = logreg.predict(X_train_4_scaled)
print('TRAINING F1: ', metrics.f1_score(ytrain4, y_train_4_pred))
print(confusion_matrix(ytrain4, y_train_4_pred))

y_test_4_pred = logreg.predict(X_test_4_scaled)
print('TESTING F1: ', metrics.f1_score(ytest4, y_test_4_pred))
print(confusion_matrix(ytest4, y_test_4_pred))

# FEATURE SELECTION

In [None]:
def find_best_k(X_train, y_train, X_test, y_test, min_k=None, max_k=None):
    """Find the KNN neighbour count with the best F1 score on the evaluation split.

    A ``KNeighborsClassifier`` is fitted on ``(X_train, y_train)`` for every k in
    ``min_k..max_k`` (inclusive) and scored with F1 on ``(X_test, y_test)``.
    Ties keep the smallest k. The winner is printed and also returned.

    Parameters
    ----------
    X_train, y_train : training features and labels.
    X_test, y_test : features and labels used to score each candidate k.
    min_k : int, optional
        Smallest k to try; defaults to 1.
    max_k : int, optional
        Largest k to try; defaults to 25, capped at the number of training rows.

    Returns
    -------
    tuple
        ``(best_k, best_score)``; ``(0, 0.0)`` if no k scored above zero.
    """
    # The previous None defaults fell straight into range() and raised TypeError.
    if min_k is None:
        min_k = 1
    if max_k is None:
        max_k = min(25, len(X_train))

    best_k = 0
    best_score = 0.0
    for k in range(min_k, max_k + 1):
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(X_train, y_train)
        score = f1_score(y_test, knn.predict(X_test))
        # Strict comparison: on a tie the earlier (smaller) k is kept.
        if score > best_score:
            best_k, best_score = k, score

    print("Best Value for k: {}".format(best_k))
    print("F1-Score: {}".format(best_score))
    return best_k, best_score

In [None]:
def run_model(model,X_train,X_test,y_train,y_test):
    """Print ``model.score`` and the RMSE of ``model.predict`` for both splits.

    ``model`` must already be fitted. The score is printed under an "R^2"
    label. NOTE(review): for classifiers ``score`` is normally accuracy rather
    than R^2 - confirm how the printed label should be read in that case.
    """
    def _report(split_name, X, y):
        # Score first, then predict, in the same order for each split.
        print(split_name + ' R^2 :', model.score(X, y))
        predicted = model.predict(X)
        rmse = np.sqrt(metrics.mean_squared_error(y, predicted))
        print(split_name + ' Root Mean Square Error', rmse)

    _report('Training', X_train, y_train)
    print('\n----------------\n')
    _report('Testing', X_test, y_test)

In [None]:
#test the function
log = LogisticRegression()
log.fit(X_train_4_scaled, ytrain4)
run_model(log,X_train_4_scaled, X_test_4_scaled, ytrain4, ytest4 )

In [None]:
# Fit a cross-validated Lasso (5 folds, at most 100 iterations) on the scaled
# polynomial features with the 0/1 diabetes label as target. The following cells
# read its coefficients to see which features are kept (non-zero).
lasso = LassoCV(max_iter=100,cv=5)
lasso.fit(X_train_4_scaled, ytrain4)
run_model(lasso ,X_train_4_scaled, X_test_4_scaled, ytrain4, ytest4)
# alpha_ is the regularization strength chosen by the cross-validation.
print("The optimal alpha for the Lasso Regression is: ",lasso.alpha_)

In [None]:
# Summarize how many polynomial features the Lasso kept.
coeff_used = np.sum(lasso.coef_!=0)
n_coefs = len(lasso.coef_)
# Coefficients within 1e-10 of zero count as eliminated.
n_zeroed = sum(abs(lasso.coef_) < 10**(-10))

print("Number of coefs: ", n_coefs)
print("Number at 0: ", n_zeroed)
print("Number of coef used: ",coeff_used)
# Divide by the actual coefficient count instead of a hard-coded 135, so the
# ratio stays correct if the feature set changes. The value is a fraction (0-1).
print("Percent reduced: ", n_zeroed/n_coefs)


In [None]:
X_train_4.columns[(lasso.coef_ != 0).tolist()]


In [None]:
cols = ['pregnancies glucose', 'pregnancies insulin', 'glucose^2',
       'glucose bloodpressure', 'glucose insulin', 'glucose bmi',
       'bloodpressure^2', 'bloodpressure age', 'skinthickness^2',
       'skinthickness insulin', 'insulin^2', 'insulin bmi', 'insulin age',
       'bmi age', 'age^2']

In [None]:
#using original df4 train test split that is not yet scaled
X_train_5 = X_train_4[cols]
X_test_5 = X_test_4[cols]

In [None]:
scaler_5 = StandardScaler()
scaler_5.fit(X_train_5)

In [None]:
X_train_5_scaled = pd.DataFrame(scaler_5.transform(X_train_5),columns=X_train_5.columns)
X_test_5_scaled = pd.DataFrame(scaler_5.transform(X_test_5), columns = X_train_5.columns)


In [None]:
## Graphing remaining features

sns.set_style('darkgrid',{'axes.edgecolor': '.9'},)
f, ax = plt.subplots(5,3,figsize = (30,30))
plt.rcParams["patch.force_edgecolor"] = True

# One panel per selected feature, driven by `cols` so the grid cannot drift from
# the selection: the hand-written list plotted 'insulin^2' twice and never
# plotted 'bloodpressure age'. ax.ravel() walks the 5x3 grid row by row.
for panel, col in zip(ax.ravel(), cols):
    sns.distplot(X_train_4[col], bins=10, color='teal',
                 hist_kws=dict(edgecolor="k", linewidth=2), ax=panel)


## Test Model 6 : Features with Non-Zero Lasso Coefficients

In [None]:
# Model 6: logistic regression on the Lasso-selected, standardized features.
logreg = LogisticRegression()
logreg.fit(X_train_5_scaled, ytrain4)
y_train_5_pred = logreg.predict(X_train_5_scaled)
print('TRAINING F1: ', metrics.f1_score(ytrain4, y_train_5_pred))
# Use this model's own predictions; the matrix previously showed y_train_4_pred
# from Model 5 and so did not match the F1 printed above it.
print(confusion_matrix(ytrain4, y_train_5_pred))

y_test_5_pred = logreg.predict(X_test_5_scaled)
print('TESTING F1: ', metrics.f1_score(ytest4, y_test_5_pred))
print(confusion_matrix(ytest4, y_test_5_pred))

# RESAMPLING

In [None]:
#USE THESE :  X_train_5_scaled, ytrain4, X_test_5_scaled, ytest4

In [None]:
print(X_train_5_scaled.shape)
print(X_test_5_scaled.shape)

In [None]:
#create new dataframe with xtrain and ytrain values
# Re-attach the labels to the scaled training features so rows can be split by class.
resampling = X_train_5_scaled.copy()
# .values attaches the labels by position: the scaled frame was rebuilt from an
# array, so its index no longer matches the index of ytrain4.
resampling['diabetes']= ytrain4.values

In [None]:
#split the data on the binary value of the outcomes
no_diabetes = resampling[resampling['diabetes']==0]
diabetes = resampling[resampling['diabetes']==1]
print('no diabetes count: '+ str(len(no_diabetes)))
print('diabetes count: '+ str(len(diabetes)))

### Downsample (resampling)

In [None]:
#downsample the imbalanced data, there are more observations with zeros
no_d_downsampled = resample(no_diabetes,
                                replace = False, 
                                n_samples = len(diabetes), 
                                random_state = 34) 

In [None]:
#concatenate the downsampled data (zeros) with the outcome data that was split before downsampling (ones)
downsampled_train5 = pd.concat([no_d_downsampled, diabetes])
downsampled_train5['diabetes'].value_counts()

In [None]:
#return to the xtrain and ytrain format
X_train5_downsampled = downsampled_train5.drop('diabetes', axis=1)
y_train5_downsampled = downsampled_train5['diabetes']

In [None]:
logreg = LogisticRegression()
logreg.fit(X_train5_downsampled, y_train5_downsampled)
y_train_pred_ds1 = logreg.predict(X_train5_downsampled)
print('TRAINING F1: ', metrics.f1_score(y_train5_downsampled, y_train_pred_ds1))
print(confusion_matrix(y_train5_downsampled, y_train_pred_ds1))

y_test_pred_ds1 = logreg.predict(X_test_5_scaled)
print('TESTING F1: ', metrics.f1_score(ytest4, y_test_pred_ds1))
print(confusion_matrix(ytest4, y_test_pred_ds1))

### Downsample(TOMEK)

In [None]:
# Under-sample the training set by removing Tomek links.
tl = TomekLinks()
# fit_resample replaces fit_sample, which was removed from imbalanced-learn.
X_tl_train, y_tl_train = tl.fit_resample(X_train_5_scaled, ytrain4)

In [None]:
# Logistic regression trained on the Tomek-link-cleaned training set.
logreg = LogisticRegression()
logreg.fit(X_tl_train, y_tl_train)
y_train_pred_tl1 = logreg.predict(X_tl_train)
print('TRAINING F1: ', metrics.f1_score(y_tl_train, y_train_pred_tl1))
print(confusion_matrix(y_tl_train, y_train_pred_tl1))

y_test_pred_tl1 = logreg.predict(X_test_5_scaled)
print('TESTING F1: ', metrics.f1_score(ytest4, y_test_pred_tl1))
# Use this model's own test predictions; the matrix previously showed
# y_test_pred_ds1 from the random-downsample model.
print(confusion_matrix(ytest4, y_test_pred_tl1))

### Upsample (resampling)


In [None]:
#using the split from the downsampling effort above

d_train_upsampled = resample(diabetes,
                          replace=True, 
                          n_samples=len(no_diabetes),
                          random_state=34)


In [None]:
upsampled_train5 = pd.concat([d_train_upsampled, no_diabetes])
upsampled_train5['diabetes'].value_counts()

In [None]:
X_train5_upsampled = upsampled_train5.drop('diabetes', axis=1)
y_train5_upsampled = upsampled_train5['diabetes']

In [None]:
logreg = LogisticRegression()
logreg.fit(X_train5_upsampled, y_train5_upsampled)
y_train_pred_up1 = logreg.predict(X_train5_upsampled)
print('TRAINING F1: ', metrics.f1_score(y_train5_upsampled, y_train_pred_up1))
print(confusion_matrix(y_train5_upsampled, y_train_pred_up1))

y_test_pred_up1 = logreg.predict(X_test_5_scaled)
print('TESTING F1: ', metrics.f1_score(ytest4, y_test_pred_up1))
print(confusion_matrix(ytest4, y_test_pred_up1))

### Upsampling (SMOTE)

In [None]:
# Over-sample the minority class with SMOTE until both classes are the same size.
# sampling_strategy replaces the removed `ratio` argument (1.0 = minority/majority ratio
# after resampling); fit_resample replaces the removed fit_sample.
sm = SMOTE(random_state=34, sampling_strategy=1.0)
X_train_sm, y_train_sm = sm.fit_resample(X_train_5_scaled, ytrain4)

In [None]:
logreg = LogisticRegression()
logreg.fit(X_train_sm, y_train_sm)
y_train_pred_sm1 = logreg.predict(X_train_sm)
print('TRAINING F1: ', metrics.f1_score(y_train_sm, y_train_pred_sm1))
print(confusion_matrix(y_train_sm, y_train_pred_sm1))

y_test_pred_sm1 = logreg.predict(X_test_5_scaled)
print('TESTING F1: ', metrics.f1_score(ytest4, y_test_pred_sm1))
print(confusion_matrix(ytest4, y_test_pred_sm1))

# HYPERPARAMETER TUNING

In [None]:
# All tuned models below train on the randomly downsampled (class-balanced) training set...
X_train = X_train5_downsampled.copy()
y_train = y_train5_downsampled.copy()
# ...and are evaluated on the scaled test split, which was not resampled.
X_test = X_test_5_scaled.copy()
y_test = ytest4.copy()

### KNN Model

In [None]:
find_best_k(X_train, y_train, X_test, y_test, min_k=1, max_k=50)

In [None]:
def knn_error_rate(X_train, y_train, X_test, y_test, k_values=range(1, 60)):
    """Return the misclassification rate of a KNN classifier for each candidate k.

    Parameters
    ----------
    X_train, y_train : training features and labels.
    X_test, y_test : features and labels the error is measured on.
    k_values : iterable of int, optional
        Neighbour counts to evaluate, in order. Defaults to 1..59, the range
        that used to be hard-coded, so existing calls behave as before.

    Returns
    -------
    list
        One error rate (fraction of wrong predictions) per entry of ``k_values``.
    """
    error_rate = []
    for k in k_values:
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(X_train, y_train)
        predicted = knn.predict(X_test)
        # Mean of the element-wise mismatch mask = share of misclassified rows.
        error_rate.append(np.mean(predicted != y_test))
    return error_rate

In [None]:
error_rate = knn_error_rate(X_train, y_train, X_test, y_test)
plt.figure(figsize=(7,7))
plt.plot(range(1,60), error_rate, color='teal',linestyle='--', marker='o', markerfacecolor ='tomato', markersize=10)
plt.title('ERROR RATE VS K')
plt.xlabel("K")
plt.ylabel("Error Rate")

In [None]:
knn = KNeighborsClassifier(n_neighbors=23)
knn.fit(X_train, y_train)
pred_k = knn.predict(X_test)
print("F1_score:",metrics.f1_score(y_test, pred_k))
print(confusion_matrix(y_test, pred_k))
print('')
print('')
print(classification_report(y_test, pred_k))

### Decision Tree Model

In [None]:
# GRID SEARCH 
parameters={'criterion': ['gini','entropy'], 
            'min_samples_leaf' : range(5,200,15),
            'max_depth': range(2,20,2)}

In [None]:
clf_tree=DecisionTreeClassifier(random_state=34)
grid_tree=GridSearchCV(clf_tree, parameters, cv=5, scoring='f1')
grid_tree.fit(X_train, y_train)

In [None]:
print(grid_tree.best_score_)
print(grid_tree.best_params_)
print(grid_tree.best_estimator_)

In [None]:
y_pred_t = grid_tree.best_estimator_.predict(X_test)
print("F1_score:",metrics.f1_score(y_test, y_pred_t))


In [None]:
# Refit a single tree with fixed hyper-parameters and report it on the test split.
# NOTE(review): the values presumably come from the grid search above - confirm.
# random_state matches the grid-search estimator so the tree is reproducible.
d_tree = DecisionTreeClassifier(criterion= 'gini', max_depth= 2, min_samples_leaf= 110, random_state=34)
d_tree.fit(X_train, y_train)
d_pred = d_tree.predict(X_test)
print("F1_score:",metrics.f1_score(y_test, d_pred))
print(confusion_matrix(y_test, d_pred))
# Blank spacer before the report; this was the literal text '/n' (typo for '\n').
print('\n')
print(classification_report(y_test, d_pred))

In [None]:
# Render the fitted tree: export it as Graphviz DOT text into an in-memory
# buffer, then turn that text into a PNG shown inline.
feature_cols= X_train.columns
dot_data = StringIO()
export_graphviz(d_tree, out_file=dot_data,  
                filled=True, rounded=True,
                special_characters=True, feature_names = feature_cols, class_names=['0','1'])

# pydotplus parses the DOT source collected in the buffer and produces the PNG bytes.
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
Image(graph.create_png())

### Random Forest Model

In [None]:
rfc = RandomForestClassifier()

In [None]:
rfc_param_grid = {
    'n_estimators': [10, 100, 300, 400],
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 2, 10, 15],
    'min_samples_split': [10, 20],
    'min_samples_leaf': [1, 2, 5]}

In [None]:
# Exhaustive 10-fold grid search over the random-forest parameter grid.
rfc_grid_search = GridSearchCV(rfc, rfc_param_grid, cv= 10)
rfc_grid_search.fit(X_train, y_train)

# best_score_ is the mean cross-validated score of the best parameter set
# (accuracy here, as no scoring= is given), computed on the training data only.
# It was previously mislabelled as a testing accuracy.
print("Mean CV Accuracy: {:.4}%".format(rfc_grid_search.best_score_ * 100))
print("")
print("Optimal Parameters: {}".format(rfc_grid_search.best_params_))


In [None]:
rfc_1 = RandomForestClassifier(criterion= 'entropy', max_depth= 15, min_samples_leaf= 1, min_samples_split= 10, n_estimators= 10)

In [None]:
rfc_1.fit(X_train, y_train)
rfc_1_pred = rfc_1.predict(X_test)
print('Test Accuracy score: ', accuracy_score(y_test, rfc_1_pred))
print('Test F1 score: ', f1_score(y_test, rfc_1_pred))
print(classification_report(y_test,rfc_1_pred))

### XGBoost Model

In [None]:
import xgboost as xgb
xg1 = xgb.XGBClassifier(max_depth=3)
xg1.fit(X_train, y_train)

In [None]:
training_preds = xg1.predict(X_train)
preds = xg1.predict(X_test)
training_accuracy = accuracy_score(y_train, training_preds)
val_accuracy = accuracy_score(y_test, preds)

print("Training Accuracy: {:.4}%".format(training_accuracy * 100))
print("Validation accuracy: {:.4}%".format(val_accuracy * 100))
test0_f1 = f1_score(y_test, preds)
print(classification_report(y_test, preds))

In [None]:
cv_params = {'max_depth': [3,5,7], 'min_child_weight': [1,3,5]}
ind_params = {'learning_rate': .01, 'n_estimators': 1000, 'seed':0, 'subsample': 0.8, 'colsample_bytree': 0.8, 
             'objective': 'binary:logistic'}

In [None]:
optimized_GBM = GridSearchCV(xgb.XGBClassifier(**ind_params), 
                            cv_params, 
                             scoring = 'accuracy', cv = 5, n_jobs = -1) 

In [None]:
optimized_GBM.fit(X_train, y_train)

In [None]:
best_parameters = optimized_GBM.best_params_

In [None]:
print("Grid Search found the following optimal parameters: ")
for param_name in sorted(best_parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))

training_preds = optimized_GBM.predict(X_train)
val_preds = optimized_GBM.predict(X_test)
training_accuracy = accuracy_score(y_train, training_preds)
val_accuracy = accuracy_score(y_test, val_preds)

print("")
print("Training Accuracy: {:.4}%".format(training_accuracy * 100))
print("Validation accuracy: {:.4}%".format(val_accuracy * 100))
test1_f1 = f1_score(y_test, val_preds)
test1_f1

# SAVING MODEL

In [None]:
#import pickle

In [None]:
# pickle the knn model

model_pickle_path = 'knn.pkl'

# Serialize the fitted KNN classifier. The context manager closes the file even
# if dump() raises, which the manual open()/close() pair did not guarantee.
with open(model_pickle_path, 'wb') as model_pickle:
    pickle.dump(knn, model_pickle)