In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import sys
import matplotlib
import seaborn as sns
import scipy as sp
import IPython
from IPython import display
import sklearn 
import random
import time
import warnings
warnings.filterwarnings('ignore')

from xgboost import XGBClassifier

from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input directory and print the full path of every file in it.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the diabetes CSV from the Kaggle input directory.
data_raw = pd.read_csv('/kaggle/input/pima-indians-diabetes-database/diabetes.csv')

# Work on a deep copy so the raw frame stays available, unmodified, for reference.
data1 = data_raw.copy(deep = True)

# DataFrame.info() writes its summary to stdout and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" line.
data_raw.info()

# Summary statistics of every column (rendered as the cell output).
data_raw.describe()

In [None]:
# Pairwise correlation between all columns of the raw data, drawn as an
# annotated heatmap to help decide which features to keep or combine.
correlation = data_raw.corr()
heatmap_options = {"annot": True, "cbar": True, "cmap": "RdYlGn"}
sns.heatmap(correlation, **heatmap_options)

# Author's reading of the heatmap:
# - Correlated with Outcome: Glucose, BMI, Age, Pregnancies, DiabetesPedigreeFunction, Insulin
# - BloodPressure correlates with BMI, SkinThickness with Insulin -> candidates to combine
# - Plan: build a baseline model on the basic features first
# - Open question: discretize glucose and insulin levels?

In [None]:
# Check the distribution of every column, one stacked panel per column.
import matplotlib.pyplot as plt

# squeeze=False always returns a 2-D array of axes, so `.flat` works even
# if the frame ever has a single column.
fig, axes = plt.subplots(data1.shape[1], 1, figsize=(10,30), dpi=90, squeeze=False)

for ax, feature in zip(axes.flat, data1.columns):
    # sns.distplot is deprecated; a density-scaled histplot with a KDE overlay
    # is the documented replacement and draws the equivalent picture.
    sns.histplot(data1[feature], kde=True, stat="density", kde_kws=dict(cut=3),
                 color="skyblue", ax=ax)
fig.tight_layout()
# Author's observation: Pregnancies, Insulin, DiabetesPedigreeFunction and Age are skewed.
# Scaling may be needed depending on the model used.

In [None]:
# Preview the first five rows of the working copy.
data1.head(n=5)

In [None]:
# Replace zero placeholders in BloodPressure, Glucose, SkinThickness and Insulin
# with the median of the observed (non-zero) readings, computed separately for
# each Outcome class.
# The zeros themselves must be excluded from the median: if more than half of a
# column is zero (plausible for sparsely recorded columns), a median that
# includes them is 0 and the replacement silently does nothing.
# NOTE(review): imputing per Outcome class, and before the train/test split,
# lets label and test-set information leak into the features; consider
# learning the medians from the training rows only.
zero_as_missing = ['BloodPressure', 'Glucose', 'SkinThickness', 'Insulin']
dflist = []
for i in [0, 1]:
    # Explicit copy so the column assignments below never touch data1 itself.
    df1 = data1.loc[data1['Outcome'] == i].copy()
    for col in zero_as_missing:
        observed = df1.loc[df1[col] != 0, col]
        if observed.empty:
            continue  # no non-zero reading to learn a median from; leave as is
        df1[col] = df1[col].replace(0, observed.median())
    dflist.append(df1)
# Rows end up ordered by class (Outcome 0 first, then 1); original index labels are kept.
data1 = pd.concat(dflist)

# Stratified split on Outcome; with no explicit size train_test_split uses 75% / 25%.
train1_x, test1_x, train1_y, test1_y = model_selection.train_test_split(
    data1.drop(columns='Outcome'),
    data1['Outcome'],
    random_state = 0,
    stratify = data1['Outcome'])
# Independent copies: later cells add feature columns to these frames in place.
train1_x = train1_x.copy()
test1_x = test1_x.copy()

print("Data1 Shape: {}".format(data1.shape))
print("Train1 Shape: {}".format(train1_x.shape))
print("Test1 Shape: {}".format(test1_x.shape))
train1_x.head()



In [None]:
# For every non-float feature, print the mean Outcome for each distinct value
# of that feature.
for x in data1:
    # Skip the target itself: selecting ['Outcome', 'Outcome'] produces two
    # columns with the same name, which groupby cannot use as a grouping key
    # (it raises ValueError).
    if x == 'Outcome':
        continue
    if data1[x].dtype != 'float64' :
        print('Outcome Correlation by:', x)
        print(data1[[x, 'Outcome']].groupby(x, as_index=False).mean())
        print('-'*10, '\n')

In [None]:
# Baseline feature set: the eight predictor columns as given, no engineered features.
feat_cols = [
    'Glucose',
    'Age',
    'BMI',
    'Pregnancies',
    'DiabetesPedigreeFunction',
    'Insulin',
    'BloodPressure',
    'SkinThickness',
]

# Results recorded by the author (training / test accuracy):
#   without SkinThickness: 89% / 83%
#   with SkinThickness:    90% / 85%

In [None]:
# Engineered feature set: two interaction terms plus binned Glucose / Insulin / Age.
# Author's motivation (from the correlation heatmap):
# - Correlated with Outcome: Glucose, BMI, Age, Pregnancies, DiabetesPedigreeFunction, Insulin
# - BloodPressure correlates with BMI, SkinThickness with Insulin -> combine them
# - Open question: discretize glucose and insulin levels?

n_bins = 5
binned_sources = {'GluBin': 'Glucose', 'InsBin': 'Insulin', 'AgeBin': 'Age'}

# Learn the bin edges from the training split only and reuse them for the test
# split. Cutting each split independently would give train and test different
# edges, so the same bin code would mean different value ranges in each.
bin_edges = {}
for bin_col, source_col in binned_sources.items():
    edges = pd.cut(train1_x[source_col].astype(int), n_bins, retbins=True)[1]
    edges = np.array(edges, dtype=float)
    # Open the outer bins so test values outside the training range still get a bin.
    edges[0], edges[-1] = -np.inf, np.inf
    bin_edges[bin_col] = edges

data_cleaner = [train1_x, test1_x]
for df in data_cleaner:
    # Interaction features
    df['BPxBMI'] = df['BloodPressure']*df['BMI']
    df['STxINSUL'] = df['SkinThickness']*df['Insulin']
    for bin_col, source_col in binned_sources.items():
        # Interval-valued bin column, then its ordinal position as an integer code.
        # Codes come from the shared category order, so an empty bin in one
        # split cannot shift the numbering of the others.
        df[bin_col] = pd.cut(df[source_col].astype(int), bins=bin_edges[bin_col])
        df[bin_col + '_Code'] = df[bin_col].cat.codes


feat_cols = ['GluBin_Code', 'InsBin_Code', 'AgeBin_Code', 'Pregnancies', 'DiabetesPedigreeFunction',
             'BPxBMI', 'STxINSUL']

# Results recorded by the author (training / test accuracy):
#   new features, without BloodPressure, BMI, SkinThickness: 88% / 85%
#   new features plus BloodPressure, BMI, SkinThickness:     88% / 83% - probably double represented

In [None]:
# Alternative feature set: discretized Glucose, Insulin and Age only
# (no interaction terms).

n_bins = 5
binned_sources = {'GluBin': 'Glucose', 'InsBin': 'Insulin', 'AgeBin': 'Age'}

# Learn the bin edges from the training split only and reuse them for the test
# split. Cutting each split independently would give train and test different
# edges, so the same bin code would mean different value ranges in each.
bin_edges = {}
for bin_col, source_col in binned_sources.items():
    edges = pd.cut(train1_x[source_col].astype(int), n_bins, retbins=True)[1]
    edges = np.array(edges, dtype=float)
    # Open the outer bins so test values outside the training range still get a bin.
    edges[0], edges[-1] = -np.inf, np.inf
    bin_edges[bin_col] = edges

data_cleaner = [train1_x, test1_x]
for df in data_cleaner:
    for bin_col, source_col in binned_sources.items():
        # Interval-valued bin column, then its ordinal position as an integer code.
        df[bin_col] = pd.cut(df[source_col].astype(int), bins=bin_edges[bin_col])
        df[bin_col + '_Code'] = df[bin_col].cat.codes

feat_cols = ['GluBin_Code', 'InsBin_Code', 'AgeBin_Code', 'Pregnancies', 'DiabetesPedigreeFunction']

# Result recorded by the author without BloodPressure, BMI, SkinThickness: 70% / 75%

In [None]:
# Random forest classifier (bagged ensemble of trees) on the currently selected feat_cols.
# Imported here because this cell runs before the evaluation cell that also
# imports it; on a fresh kernel the name would otherwise be undefined.
from sklearn.ensemble import RandomForestClassifier

# NOTE(review): an earlier note said n_estimators was tuned at 15, but the code uses 200.
# random_state is fixed (matching the seed used for the split) so results are repeatable.
model = RandomForestClassifier(n_estimators = 200,
                              max_features = 'sqrt',
                              random_state = 0)
model.fit(train1_x[feat_cols], train1_y)

# Author's notes: best model so far is the one with the new features
# (88% / 85%); tuning min_samples_leaf over 2-5 made no difference.

In [None]:
# Support vector classifier with a degree-3 polynomial kernel.
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

# The classifier is sensitive to feature scale, so standardize first using
# statistics learned from the training features.
scaler = StandardScaler()
scaler.fit(train1_x[feat_cols])

svc_params = {'kernel': 'poly', 'gamma': 1, 'degree': 3}
model = SVC(**svc_params)
model.fit(scaler.transform(train1_x[feat_cols]), train1_y)

# Author's note: does not perform very well as configured; may need tuning.

In [None]:
# XGBoost gradient-boosting classifier.
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler

# Standardize with statistics learned from the training features only; the
# same fitted scaler is then applied to whatever data is evaluated later.
scaler = StandardScaler()
scaler.fit(train1_x[feat_cols])

boosting_params = {'n_estimators': 100, 'learning_rate': 0.01}
model = XGBClassifier(**boosting_params)
model.fit(scaler.transform(train1_x[feat_cols]), train1_y, verbose = False)

# Author's notes: XGBoost does OK, around 88% / 83%; random forest still seems better.
# Next step: tune n_estimators and learning rate.

In [None]:
# Evaluate the current `model` on unscaled features (intended for the forest,
# which does not need feature scaling).
from sklearn import ensemble   # set up model
from sklearn.base import clone
from sklearn.model_selection import ShuffleSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score

cv = ShuffleSplit(n_splits = 100, test_size = 0.25, random_state = 0)

# Training-side estimate: median accuracy over repeated shuffle-split
# cross-validation inside the training set.
cv_scores = cross_val_score(model, train1_x[feat_cols], train1_y, cv = cv)
# Scale to percent before rounding; rounding first and then multiplying by 100
# leaves float artefacts such as 57.99999999999999.
trainAcc = round(100 * np.median(cv_scores), 0)

# Test-side estimate: fit a fresh clone on the whole training set and score it
# once on the held-out test set. Cross-validating inside the test set would
# retrain on test rows and never measure the training-fitted model. A clone is
# used so the result does not depend on how `model` was last fitted.
eval_model = clone(model).fit(train1_x[feat_cols], train1_y)
testAcc = round(100 * eval_model.score(test1_x[feat_cols], test1_y), 0)

# TODO: report F1 as well (f1_score is imported but not used yet).

print('Acc on train set: ', trainAcc, '%')
print('Acc on test set: ', testAcc, '%')

In [None]:
# Scaled evaluation for scale-sensitive models such as the SVC.
from sklearn import ensemble   # set up model
from sklearn.base import clone
from sklearn.model_selection import ShuffleSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

cv = ShuffleSplit(n_splits = 100, test_size = 0.25, random_state = 0)

# Bundle scaling with a fresh clone of the current model. Inside cross-validation
# the scaler is then re-fitted on each training fold only, and the cell no longer
# depends on a `scaler` fitted elsewhere (possibly on a different feat_cols).
scaled_model = make_pipeline(StandardScaler(), clone(model))

# Training-side estimate: median accuracy over repeated shuffle-split
# cross-validation inside the training set.
cv_scores = cross_val_score(scaled_model, train1_x[feat_cols], train1_y, cv = cv)
# Scale to percent before rounding to avoid float artefacts like 57.99999999999999.
trainAcc = round(100 * np.median(cv_scores), 0)

# Test-side estimate: fit on the whole training set, then score once on the
# held-out test set. Cross-validating inside the test set would retrain on
# test rows instead of measuring the training-fitted model.
scaled_model.fit(train1_x[feat_cols], train1_y)
testAcc = round(100 * scaled_model.score(test1_x[feat_cols], test1_y), 0)

# TODO: report F1 as well (f1_score is imported but not used yet).

print('Acc on train set: ', trainAcc, '%')
print('Acc on test set: ', testAcc, '%')