# To predict diabetes using PIMA diabetes data

### Importing libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

%matplotlib inline

### Load and review data

In [2]:
# Read the PIMA diabetes CSV into a DataFrame. The path is relative to the
# kernel's working directory, so the file must exist at ./data/pima-data.csv.
data = pd.read_csv(filepath_or_buffer="./data/pima-data.csv")

FileNotFoundError: File b'./data/pima-data.csv' does not exist

In [None]:
# (rows, columns) of the loaded frame.
data.shape

In [None]:
# Preview the first five rows of the raw data.
data.head(5)

In [None]:
# True if at least one cell anywhere in the frame is null/NaN.
data.isnull().values.any()

In [None]:
def plot_correlated(data, size=11):
    """
    Plot the pairwise correlation matrix of the DataFrame's columns.

    data : pandas DataFrame; its columns are correlated with ``data.corr()``
    size : width and height of the (square) figure, in inches

    Result : a matrix plot with one cell per pair of columns. Colours come
    from matplotlib's active colormap on a fixed -1 .. 1 scale, and the
    colorbar drawn next to the plot shows which colour corresponds to which
    correlation value.
    """
    corr = data.corr()
    fig, ax = plt.subplots(figsize=(size, size))
    # Pin the colour scale to the full correlation range so that plots made
    # from different frames are directly comparable.
    image = ax.matshow(corr, vmin=-1, vmax=1)
    fig.colorbar(image, ax=ax)
    positions = list(range(len(corr.columns)))
    ax.set_xticks(positions)
    # Vertical labels so that long column names do not overlap.
    ax.set_xticklabels(corr.columns, rotation=90)
    ax.set_yticks(positions)
    ax.set_yticklabels(corr.columns)

In [None]:
# Draw the correlation matrix for the columns currently in `data`.
plot_correlated(data)

In [None]:
# Pairwise correlation table for the columns of `data`.
data.corr()

In [None]:
# Remove the 'thickness' column. With errors='ignore' this is a no-op when the
# column is already gone, so the cell can be re-run without raising KeyError.
# NOTE(review): presumably dropped because it carries the same information as
# another column (see the correlation table) — confirm.
data = data.drop('thickness', axis=1, errors='ignore')

In [None]:
# Preview the first five rows after removing the column.
data.head(5)

In [None]:
# Re-draw the correlation matrix for the remaining columns.
plot_correlated(data)

### Unifying the data types of the features

In [None]:
# Preview the first five rows before converting the label column.
data.head(5)

Changing the diabetes column data from boolean to number

In [None]:
# Lookup table turning the boolean label into an integer: True -> 1, False -> 0.
diabetes_map = {flag: int(flag) for flag in (True, False)}

In [None]:
# Replace each label in the 'diabetes' column with its diabetes_map value.
# Series.map yields NaN for any value that is not a key of the mapping.
data['diabetes'] = data['diabetes'].map(diabetes_map)

In [None]:
# Preview the first five rows after converting the label column.
data.head(5)

In [None]:
# Number of rows whose 'diabetes' label compares equal to True (i.e. 1).
diabetes_true_count = int(data['diabetes'].eq(True).sum())

In [None]:
# Number of rows whose 'diabetes' label compares equal to False (i.e. 0).
diabetes_false_count = int(data['diabetes'].eq(False).sum())

In [None]:
# Report the absolute number of rows in each class.
class_counts = (
    ("Diabetes - True : ", diabetes_true_count),
    ("Diabetes - False : ", diabetes_false_count),
)
for class_label, class_count in class_counts:
    print(class_label, class_count)

In [None]:
# Report each class as a percentage of all rows.
total_rows = data.shape[0]
print("Diabetes - True : ", diabetes_true_count/total_rows*100)
print("Diabetes - False : ", diabetes_false_count/total_rows*100)

### Splitting the data
70% for training and 30% for testing

In [None]:
# train_test_split lives in sklearn.model_selection; the former
# sklearn.cross_validation module was removed in scikit-learn 0.20.
from sklearn.model_selection import train_test_split

feature_columns = ['num_preg', 'glucose_conc', 'diastolic_bp', 'insulin', 'bmi', 'diab_pred', 'age', 'skin']
predicted_class = ['diabetes']

# .values yields plain NumPy arrays. Because predicted_class is a one-element
# list, y is a single-column 2-D array; later cells flatten it with .ravel().
X = data[feature_columns].values
y = data[predicted_class].values

# Fraction of the rows held out for testing.
split_test_size = 0.30

# A fixed random_state makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=split_test_size, random_state=10)

Checking the size of the test and train splits

In [None]:
# Share of all rows that ended up in the training split, in percent.
len(X_train) / len(X) * 100



In [None]:
# Share of all rows that ended up in the test split, in percent.
len(X_test) / len(X) * 100

Verifying that the class proportions are similar in the full data set and in both splits

In [None]:
# Class balance of the full data set, in percent (positive class first).
# The denominator is the plain row count: indexing with data.loc[data['diabetes']]
# would treat the 0/1 labels as row-index lookups and only happens to return
# as many rows as the frame has.
total_rows = len(data)
print(len(data.loc[data['diabetes'] == 1]) / total_rows * 100)
print(len(data.loc[data['diabetes'] == 0]) / total_rows * 100)

In [None]:
# Class balance of the training labels, in percent (positive class first).
# Divide by the number of training rows directly instead of by the length of
# y_train indexed with its own 0/1 values.
train_rows = len(y_train)
print(np.count_nonzero(y_train == 1) / train_rows * 100)
print(np.count_nonzero(y_train == 0) / train_rows * 100)

In [None]:
# Class balance of the test labels, in percent (positive class first).
# Divide by the number of test rows directly instead of by the length of
# y_test indexed with its own 0/1 values.
test_rows = len(y_test)
print(np.count_nonzero(y_test == 1) / test_rows * 100)
print(np.count_nonzero(y_test == 0) / test_rows * 100)

### Post split data preparation

#### Hidden missing values

In [None]:
# Preview the first five rows to look for zeros standing in for missing values.
data.head(5)

Check how many hidden missing (zero) values there are in each column.

In [None]:
# Count, per column, the rows that hold 0 — the value this notebook treats as
# a hidden "missing" marker. Each column is reported exactly once.
zero_check_columns = ['glucose_conc', 'diastolic_bp', 'insulin', 'bmi', 'diab_pred', 'age', 'skin']

print("total number of rows : {0}".format(len(data)))
for column in zero_check_columns:
    zero_rows = len(data.loc[data[column] == 0])
    print("number of rows missing {0}: {1}".format(column, zero_rows))

In [None]:
# Replace zeros (treated as missing) with the mean of their column.
# sklearn.preprocessing.Imputer was removed in scikit-learn 0.22; prefer its
# successor SimpleImputer and fall back only on installations that predate it.
try:
    from sklearn.impute import SimpleImputer
    fill_0 = SimpleImputer(missing_values=0, strategy="mean")
except ImportError:
    from sklearn.preprocessing import Imputer
    fill_0 = Imputer(missing_values=0, strategy="mean", axis=0)

# NOTE(review): this replaces zeros in every feature column, including ones
# where 0 may be a legitimate value (e.g. num_preg) — confirm this is intended.

# Learn the column means from the training split only and reuse them for the
# test split, so that no statistics of the test data leak into preprocessing.
X_train = fill_0.fit_transform(X_train)
X_test = fill_0.transform(X_test)

# Training with Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

# Fit a Gaussian Naive Bayes classifier on the training split.
# ravel() flattens the label array to one dimension before fitting.
naive_model = GaussianNB()
naive_model.fit(X=X_train, y=y_train.ravel())

## Performance on training data

In [None]:
from sklearn import metrics

# Accuracy of the Naive Bayes model on the rows it was trained on.
naive_predict_train_data = naive_model.predict(X_train)
naive_train_accuracy = metrics.accuracy_score(y_train, naive_predict_train_data)
print("Accuracy = {0:.3f}".format(naive_train_accuracy))

## Performance on test data

In [None]:
from sklearn import metrics

# Accuracy of the Naive Bayes model on the held-out test rows.
naive_predict_test_data = naive_model.predict(X_test)
naive_test_accuracy = metrics.accuracy_score(y_test, naive_predict_test_data)
print("Accuracy = {0:.3f}".format(naive_test_accuracy))

#### Looking into more details

In [None]:
# Detailed test-set evaluation of the Naive Bayes predictions.
# labels=[1, 0] lists the positive class (1) first in both outputs.
report_labels = [1, 0]

print("Confusion matrix")
naive_confusion = metrics.confusion_matrix(y_test, naive_predict_test_data, labels=report_labels)
print("{0}".format(naive_confusion))

print("Classification report")
naive_report = metrics.classification_report(y_test, naive_predict_test_data, labels=report_labels)
print("{0}".format(naive_report))


# Random Forest



In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest on the training split; random_state is fixed so the
# fitted forest is reproducible across runs.
random_forest_model = RandomForestClassifier(random_state=10)
random_forest_model.fit(X=X_train, y=y_train.ravel())

#### Predicting Training data

In [None]:
from sklearn import metrics

# Accuracy of the random forest on the rows it was trained on.
rf_predict_train_data = random_forest_model.predict(X_train)
rf_train_accuracy = metrics.accuracy_score(y_train, rf_predict_train_data)
print("Accuracy = {0:.3f}".format(rf_train_accuracy))

#### Predicting Test data

In [None]:
from sklearn import metrics

# Accuracy of the random forest on the held-out test rows.
rf_predict_test_data = random_forest_model.predict(X_test)
rf_test_accuracy = metrics.accuracy_score(y_test, rf_predict_test_data)
print("Accuracy = {0:.3f}".format(rf_test_accuracy))

#### Looking into more details

In [None]:
# Detailed test-set evaluation of the random forest predictions.
# labels=[1, 0] lists the positive class (1) first in both outputs.
report_labels = [1, 0]

print("Confusion matrix")
rf_confusion = metrics.confusion_matrix(y_test, rf_predict_test_data, labels=report_labels)
print("{0}".format(rf_confusion))

print("Classification report")
rf_report = metrics.classification_report(y_test, rf_predict_test_data, labels=report_labels)
print("{0}".format(rf_report))

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression with regularisation parameter C=0.7 on the
# training split; random_state is fixed for reproducibility.
log_reg_model = LogisticRegression(random_state=10, C=0.7)
log_reg_model.fit(X=X_train, y=y_train.ravel())

#### Predicting with train data

In [None]:
from sklearn import metrics

# Accuracy of the logistic regression on the rows it was trained on.
lr_predict_train_data = log_reg_model.predict(X_train)
lr_train_accuracy = metrics.accuracy_score(y_train, lr_predict_train_data)
print("Accuracy = {0:.3f}".format(lr_train_accuracy))

#### Predicting with test data

In [None]:
from sklearn import metrics

# Accuracy of the logistic regression on the held-out test rows.
lr_predict_test_data = log_reg_model.predict(X_test)
lr_test_accuracy = metrics.accuracy_score(y_test, lr_predict_test_data)
print("Accuracy = {0:.3f}".format(lr_test_accuracy))

#### Looking into more details

In [None]:
# Detailed test-set evaluation of the logistic regression predictions.
# labels=[1, 0] lists the positive class (1) first in both outputs.
report_labels = [1, 0]

print("Confusion matrix")
lr_confusion = metrics.confusion_matrix(y_test, lr_predict_test_data, labels=report_labels)
print("{0}".format(lr_confusion))

print("Classification report")
lr_report = metrics.classification_report(y_test, lr_predict_test_data, labels=report_labels)
print("{0}".format(lr_report))

##### Selecting regularization parameter for logistic regression

In [None]:
# Sweep the regularisation parameter C of LogisticRegression and record the
# recall on the test split for every value tried.
# NOTE(review): choosing C by its score on the test split lets test data
# influence model selection; consider cross-validating on the training split.
C_start = 0.1
C_end = 5
C_inc = 0.1

# Build the grid from an integer step count instead of repeatedly adding
# C_inc: accumulated floating-point error makes `while C_val < C_end`
# unreliable about whether a value that should equal C_end is still visited.
n_steps = int(round((C_end - C_start) / C_inc))
C_values = [round(C_start + step * C_inc, 10) for step in range(n_steps)]

recall_scores = []
best_recall = 0
# Stays None only if no C value reaches a recall above 0.
best_lr_predict_test = None
for C_val in C_values:
    lr_model_loop = LogisticRegression(C=C_val, random_state=10)
    lr_model_loop.fit(X_train, y_train.ravel())
    lr_predict_loop_test = lr_model_loop.predict(X_test)
    recall_score = metrics.recall_score(y_test, lr_predict_loop_test)
    recall_scores.append(recall_score)
    # Strict '>' keeps the first C value that reaches the maximum recall.
    if recall_score > best_recall:
        best_recall = recall_score
        best_lr_predict_test = lr_predict_loop_test

best_score_C_val = C_values[recall_scores.index(best_recall)]
print("First max value of {0:.3f} occured at {1:.3f}".format(best_recall, best_score_C_val))

# Recall as a function of C.
plt.plot(C_values, recall_scores, "-")
plt.xlabel("C_values")
plt.ylabel("recall_score")


# Logistic Regression with class_weight='balanced'

In [None]:
# Sweep the regularisation parameter C of LogisticRegression with
# class_weight="balanced" and record the recall on the test split for every
# value tried.
# NOTE(review): choosing C by its score on the test split lets test data
# influence model selection; consider cross-validating on the training split.
C_start = 0.1
C_end = 5
C_inc = 0.1

# Build the grid from an integer step count instead of repeatedly adding
# C_inc: accumulated floating-point error makes `while C_val < C_end`
# unreliable about whether a value that should equal C_end is still visited.
n_steps = int(round((C_end - C_start) / C_inc))
C_values = [round(C_start + step * C_inc, 10) for step in range(n_steps)]

recall_scores = []
best_recall = 0
# Stays None only if no C value reaches a recall above 0.
best_lr_predict_test = None
for C_val in C_values:
    lr_model_loop = LogisticRegression(C=C_val, class_weight="balanced", random_state=10)
    lr_model_loop.fit(X_train, y_train.ravel())
    lr_predict_loop_test = lr_model_loop.predict(X_test)
    recall_score = metrics.recall_score(y_test, lr_predict_loop_test)
    recall_scores.append(recall_score)
    # Strict '>' keeps the first C value that reaches the maximum recall.
    if recall_score > best_recall:
        best_recall = recall_score
        best_lr_predict_test = lr_predict_loop_test

best_score_C_val = C_values[recall_scores.index(best_recall)]
print("First max value of {0:.3f} occured at {1:.3f}".format(best_recall, best_score_C_val))

# Recall as a function of C.
plt.plot(C_values, recall_scores, "-")
plt.xlabel("C_values")
plt.ylabel("recall_score")


In [None]:
from sklearn.linear_model import LogisticRegression

# Refit the logistic regression with class_weight="balanced" and C=0.7,
# then evaluate it on the held-out test rows.
log_reg_model = LogisticRegression(class_weight="balanced", C=0.7, random_state=10)
log_reg_model.fit(X=X_train, y=y_train.ravel())

lr_predict_test_data = log_reg_model.predict(X_test)
balanced_lr_accuracy = metrics.accuracy_score(y_test, lr_predict_test_data)
print("Accuracy = {0:.3f}".format(balanced_lr_accuracy))

# labels=[1, 0] lists the positive class (1) first in both outputs.
report_labels = [1, 0]

print("Confusion matrix")
balanced_lr_confusion = metrics.confusion_matrix(y_test, lr_predict_test_data, labels=report_labels)
print("{0}".format(balanced_lr_confusion))

print("Classification report")
balanced_lr_report = metrics.classification_report(y_test, lr_predict_test_data, labels=report_labels)
print("{0}".format(balanced_lr_report))

# LogisticRegressionCV

In [None]:
from sklearn.linear_model import LogisticRegressionCV

# Logistic regression that chooses C by 10-fold cross-validation over 3
# candidate values, using balanced class weights. n_jobs=-1 uses every
# available CPU core; refit=True refits with the selected C afterwards.
lrcv_model = LogisticRegressionCV(
    Cs=3,
    cv=10,
    class_weight="balanced",
    refit=True,
    random_state=10,
    n_jobs=-1,
)
lrcv_model.fit(X=X_train, y=y_train.ravel())

#### Predicting with test data


In [None]:
from sklearn import metrics

# Evaluate the cross-validated logistic regression on the held-out test rows.
lrcv_predict_test_data = lrcv_model.predict(X_test)
lrcv_test_accuracy = metrics.accuracy_score(y_test, lrcv_predict_test_data)
print("Accuracy = {0:.3f}".format(lrcv_test_accuracy))

# labels=[1, 0] lists the positive class (1) first in both outputs.
report_labels = [1, 0]

print("Confusion matrix")
lrcv_confusion = metrics.confusion_matrix(y_test, lrcv_predict_test_data, labels=report_labels)
print("{0}".format(lrcv_confusion))

print("Classification report")
lrcv_report = metrics.classification_report(y_test, lrcv_predict_test_data, labels=report_labels)
print("{0}".format(lrcv_report))