# Diabetic Analysis

## Import Libraries


In [1]:
# Data Manipulation
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Models, Training and Testing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
df = pd.read_csv('../data/Diabetes.csv')
df.head(10)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,Positive
1,1,85,66,29,0,26.6,0.351,31,Negative
2,8,183,64,0,0,23.3,0.672,32,Positive
3,1,89,66,23,94,28.1,0.167,21,Negative
4,0,137,40,35,168,43.1,2.288,33,Positive
5,5,116,74,0,0,25.6,0.201,30,Negative
6,3,78,50,32,88,31.0,0.248,26,Positive
7,10,115,0,0,0,35.3,0.134,29,Negative
8,2,197,70,45,543,30.5,0.158,53,Positive
9,8,125,96,0,0,0.0,0.232,54,Positive


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    object 
dtypes: float64(2), int64(6), object(1)
memory usage: 54.1+ KB


In [5]:
df.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [6]:
df.shape

(768, 9)

### Encoding

In [7]:
# Encoding
cols = df.select_dtypes(['object', 'category'])
for i in cols:
    df[i] = LabelEncoder().fit_transform(df[i])

df.head(5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [8]:
df.corr()['Outcome']

Pregnancies                 0.221898
Glucose                     0.466581
BloodPressure               0.065068
SkinThickness               0.074752
Insulin                     0.130548
BMI                         0.292695
DiabetesPedigreeFunction    0.173844
Age                         0.238356
Outcome                     1.000000
Name: Outcome, dtype: float64

In [9]:
df['Outcome'].unique()

array([1, 0])

## Training and Test Data Creation

In [10]:
# Creating the training and testing dataset
x = df.drop('Outcome', axis=1)
y = df['Outcome']

### Feature Scaling

In [11]:
from sklearn.preprocessing import StandardScaler
x = StandardScaler().fit_transform(x)

In [13]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

## Determining what model to use
- Since the target is available
    - We can go for supervised models
- Since the target is not a numerical continuous data
    - We can go for Classification algorithm
        - Decision Tree
        - Random Forest
        - KNN Classifier
        - KNN Logistic Regression
        - Naive Bayes
        - SVN Algorithm


### Decision Tree

In [14]:
# Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier().fit(x_train, y_train)

In [15]:
y_pred = model.predict(x_test) # this gets the predicted output
# Checking the performance


In [16]:
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score
dt_accuracy_score = accuracy_score(y_test, y_pred)
confusion_matrix(y_test, y_pred)
dt_precision_score = precision_score(y_test, y_pred)

In [17]:
print ('Accuracy Score (Decision Tree Model): ', dt_accuracy_score)
print ('Precision Score (Decision Tree Model): ', dt_precision_score)

Accuracy Score (Decision Tree Model):  0.7207792207792207
Precision Score (Decision Tree Model):  0.5818181818181818


In [18]:
class ModelInfo:
    def __init__(self):
        self.model_name = 'NA'
        self.model = None
        self.accuracy = 0.0
        self.precision = 0.0
model_list = []

In [19]:
len(model_list)

0

In [20]:
m_dt = ModelInfo()
m_dt.model_name = "Decision Tree"
m_dt.model = model
m_dt.accuracy = dt_accuracy_score
m_dt.precision = dt_precision_score

model_list.append(m_dt)

In [None]:
# Visual Representation
# plt.figure(figsize=(20, 20))
# import sklearn.tree as tree
# tree.plot_tree(model, feature_names=x.columns, filled=True, fontsize=8)
# plt.show()

In [21]:
print("Model Name:", model_list[0].model_name)
print("Model Score", model_list[0].model.score(x_test, y_test))

Model Name: Decision Tree
Model Score 0.7207792207792207


In [22]:
# Testing with a single value
#Pregnancies, Glucose, BloodPressure, SkinThickness, Insulin, BMI, DiabetesPedigreeFunction, Age

# test_value = np.array([[0,132,78,0,0,32.4,0.393,21]])
# test_value_neg = np.array([[0,173,78,32,265,46.5,1.159,58]]) # Negative Input
# test_value_pos = np.array([[4,111,72,47,207,37.1,1.390,56]]) # Positive Input
test_value_pos = np.array([[6, 148, 72, 35, 0, 33.6, 0.627, 50]]) # positive (1)
test_value_neg = np.array([[1, 85, 66, 29, 0, 26.6, 0.351, 31]]) # negative (0)

# 0	173	78	32	265	46.5	1.159	58	Negative
print(f"Prediction for neg values {test_value_neg} is {model_list[0].model.predict(test_value_neg)}")
print(f"Prediction for pos values {test_value_pos} is {model_list[0].model.predict(test_value_pos)}")

Prediction for neg values [[ 1.    85.    66.    29.     0.    26.6    0.351 31.   ]] is [1]
Prediction for pos values [[  6.    148.     72.     35.      0.     33.6     0.627  50.   ]] is [1]


In [23]:
len(model_list)

1

### KNN

In [25]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

knn = KNeighborsClassifier()

# Define the hyper parameter grid
max_len = int(round(np.sqrt(len(x_train)),0))
param_grid = {
    'n_neighbors': range(1,max_len,1), # Test for 1 to 30 step 2 neighbors
    'weights': ['uniform', 'distance'], # Test uniform and distance weights
    'algorithm':['auto', 'ball_tree', 'kd_tree', 'brute'],
    'metric': ['euclidean', 'manhattan', 'minkowski'] # Test differenc distance methods
}
# param_grid = {
#     'n_neighbors': list(np.arange(1,max_len,1)), # Test for 1 to 30 step 2 neighbors
#     'weights': ['uniform', 'distance'], # Test uniform and distance weights
#     'metric': ['euclidean', 'manhattan', 'minkowski'] # Test differenc distance methods
# }


In [33]:
grid_search = GridSearchCV(knn, param_grid=param_grid, scoring='accuracy', cv=10, n_jobs=-1) #n_jobs=-1 will use all pre processor
grid_search.fit(x_train, y_train)

# Step 6: Best hyperparameters and corresponding accuracy
print("Best Hyperparameters:", grid_search.best_params_)
print("Best Training Accuracy:", grid_search.best_score_)

# arange
# Best Hyperparameters: {'metric': 'manhattan', 'n_neighbors': 16, 'weights': 'distance'}
# Best Training Accuracy: 0.7655071304811409

Best Hyperparameters: {'algorithm': 'auto', 'metric': 'manhattan', 'n_neighbors': 23, 'weights': 'distance'}
Best Training Accuracy: 0.7442622950819672


In [36]:
grid_search.best_params_['algorithm']

'auto'

In [27]:
knn_model = grid_search.best_estimator_
y_pred_knn = knn_model.predict(x_test)

In [28]:
knn_accuracy_score = accuracy_score(y_test, y_pred_knn)
confusion_matrix(y_test, y_pred_knn)
knn_precision_score = precision_score(y_test, y_pred_knn)

print ('Accuracy Score (KNN Hyper Tuned Classifer): ', knn_accuracy_score)
print ('Precision Score (KNN Hyper Tuned Classifer): ', knn_precision_score)

Accuracy Score (KNN Hyper Tuned Classifer):  0.7922077922077922
Precision Score (KNN Hyper Tuned Classifer):  0.7941176470588235


In [29]:
print(f"Prediction for neg values {test_value_neg} is {knn_model.predict(test_value_neg)}")
print(f"Prediction for pos values {test_value_pos} is {knn_model.predict(test_value_pos)}")

Prediction for neg values [[ 1.    85.    66.    29.     0.    26.6    0.351 31.   ]] is [1]
Prediction for pos values [[  6.    148.     72.     35.      0.     33.6     0.627  50.   ]] is [1]


In [30]:
m_knn = ModelInfo()
m_knn.model_name = "KNN Hyper-Tuned Classifier"
m_knn.model = knn_model
m_knn.accuracy = knn_accuracy_score
m_knn.precision = knn_precision_score

model_list.append(m_knn)

In [31]:
len(model_list)

2

In [32]:
print(f"Prediction for neg values {test_value_neg} is {model_list[1].model.predict(test_value_neg)}")
print(f"Prediction for pos values {test_value_pos} is {model_list[1].model.predict(test_value_pos)}")

Prediction for neg values [[ 1.    85.    66.    29.     0.    26.6    0.351 31.   ]] is [1]
Prediction for pos values [[  6.    148.     72.     35.      0.     33.6     0.627  50.   ]] is [1]


### SVM

In [37]:
from sklearn.svm import SVC

model_svm = SVC().fit(x_train, y_train)

In [39]:
y_pred_svm = model_svm.predict(x_test)
accuracy_score(y_test, y_pred_svm)

0.7987012987012987

In [51]:
# hyper param tuning
params = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'C': [0.1, 1.0, 10, 100]
}

grid=GridSearchCV(model_svm, param_grid=params, cv=10, scoring='accuracy').fit(x_train, y_train)
grid.best_params_

{'C': 0.1, 'kernel': 'linear'}

In [None]:
# model_tuned = grid.best_estimator_
# cross_val_score (model_tuned
# ASK WITH HARI FOR CODE

In [None]:
x1 = df.drop(['BloodPressure','SkinThickness', 'Outcome'], axis=1)
x_train1, x_test1, y_train1, y_test1 = train_test_split(x1,y,test_size=0.2)
x_train1

In [46]:
model_svm1 = SVC().fit(x_train1, y_train1) 

In [47]:
y_pred1 = model_svm1.predict(x_test1)

In [49]:
accuracy_score(y_test1, y_pred1)

0.7857142857142857