# Introduction
​

### Job Description

Can you develop a machine learning model that can predict whether people have diabetes when their characteristics are specified?

    
### Dataset Story 

* The dataset is part of the large dataset held at the National Institutes of Diabetes-Digestive-Kidney Diseases in the USA.

* Data used for diabetes research on Pima Indian women aged 21 and over living in Phoenix, the 5th largest city of the State of Arizona in the USA.

* It consists of 768 observations and 8 numerical independent variables. The target variable is specified as "outcome"; 1 indicates positive diabetes test result, 0 indicates negative.

    
### Variables

* Pregnancies – Number of pregnancies
* Glucose – Glucose 2-hour plasma glucose concentration in the oral glucose tolerance test
* Blood Pressure (mm Hg)
* SkinThickness – Skin Thickness
* Insulin – 2-hour serum insulin (mu U/ml)
* DiabetesPedigreeFunction: – Function 2-hour plasma glucose concentration in the oral glucose tolerance test
* Age – Age (years)
* Outcome: Have the disease (1) or not (0)

### Mission

Develop a diabetes prediction model by performing literature research, data preprocessing, and feature engineering.


In [1]:

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import plotly.express as px
import random
import plotly.figure_factory as ff


from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report, plot_roc_curve
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler, RobustScaler

import cufflinks as cf 

from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot


from sklearn.neighbors import LocalOutlierFactor

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


/kaggle/input/pima-indians-diabetes-database/diabetes.csv


In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: '%.3f' % x)
pd.set_option('display.width', 500)



## Load and Check Data

In [3]:
df = pd.read_csv("/kaggle/input/pima-indians-diabetes-database/diabetes.csv")

In [4]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


<a id = "2"></a>
## Basic Data Analysis

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [6]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Pregnancies,768.0,3.845,3.37,0.0,1.0,3.0,6.0,17.0
Glucose,768.0,120.895,31.973,0.0,99.0,117.0,140.25,199.0
BloodPressure,768.0,69.105,19.356,0.0,62.0,72.0,80.0,122.0
SkinThickness,768.0,20.536,15.952,0.0,0.0,23.0,32.0,99.0
Insulin,768.0,79.799,115.244,0.0,0.0,30.5,127.25,846.0
BMI,768.0,31.993,7.884,0.0,27.3,32.0,36.6,67.1
DiabetesPedigreeFunction,768.0,0.472,0.331,0.078,0.244,0.372,0.626,2.42
Age,768.0,33.241,11.76,21.0,24.0,29.0,41.0,81.0
Outcome,768.0,0.349,0.477,0.0,0.0,0.0,1.0,1.0


## Outliers

In [7]:
def outlier_thresholds(dataframe, col_name, q1=0.05, q3=0.95):
    quartile1 = dataframe[col_name].quantile(q1)
    quartile3 = dataframe[col_name].quantile(q3)
    interquantile_range = quartile3 - quartile1
    up_limit = quartile3 + 1.5 * interquantile_range
    low_limit = quartile1 - 1.5 * interquantile_range
    return low_limit, up_limit

In [8]:
outlier_thresholds(df, "Pregnancies")

(-15.0, 25.0)

In [9]:
outlier_thresholds(df, "Glucose")

(-74.0, 334.0)

In [10]:
outlier_thresholds(df, "SkinThickness")

(-66.0, 110.0)

In [11]:
outlier_thresholds(df, "Insulin")

(-439.5, 732.5)

In [12]:
outlier_thresholds(df, "BMI")

(-12.09249999999999, 78.2875)

In [13]:
outlier_thresholds(df, "DiabetesPedigreeFunction")

(-1.3483999999999996, 2.621599999999999)

In [14]:
outlier_thresholds(df, "Age")

(-34.5, 113.5)

## Categoric or Numeric Data Analysis

In [15]:
def grab_col_names(dataframe, cat_th=10, car_th=20):

    # cat_cols, cat_but_car
    cat_cols = [col for col in dataframe.columns if dataframe[col].dtypes == "O"]

    num_but_cat = [col for col in dataframe.columns if dataframe[col].nunique() < cat_th and
                   dataframe[col].dtypes != "O"]

    cat_but_car = [col for col in dataframe.columns if dataframe[col].nunique() > car_th and
                   dataframe[col].dtypes == "O"]

    cat_cols = cat_cols + num_but_cat

    cat_cols = [col for col in cat_cols if col not in cat_but_car]

    # num_cols
    num_cols = [col for col in dataframe.columns if dataframe[col].dtypes != "O"]

    num_cols = [col for col in num_cols if col not in num_but_cat]

    print(f"Observations: {dataframe.shape[0]}")
    print(f"Variables: {dataframe.shape[1]}")
    print(f'cat_cols: {len(cat_cols)}')
    print(f'num_cols: {len(num_cols)}')
    print(f'cat_but_car: {len(cat_but_car)}')
    print(f'num_but_cat: {len(num_but_cat)}')

    return cat_cols, num_cols, cat_but_car

In [16]:
def check_outlier(dataframe, col_name):
    low_limit, up_limit = outlier_thresholds(dataframe, col_name)
    if dataframe[(dataframe[col_name] > up_limit) | (dataframe[col_name] < low_limit)].any(axis=None):
        return True
    else:
        return False

In [17]:
cat_cols, num_cols, cat_but_car = grab_col_names(df,cat_th=10, car_th=20)

Observations: 768
Variables: 9
cat_cols: 1
num_cols: 8
cat_but_car: 0
num_but_cat: 1


In [18]:
for col in num_cols:
    print(col, check_outlier(df, col))

Pregnancies False
Glucose False
BloodPressure False
SkinThickness False
Insulin True
BMI False
DiabetesPedigreeFunction False
Age False


## Getting rid of outliers

In [19]:
def replace_with_thresholds(dataframe, variable):
    low_limit, up_limit = outlier_thresholds(dataframe, variable)
    dataframe.loc[(dataframe[variable] < low_limit), variable] = low_limit
    dataframe.loc[(dataframe[variable] > up_limit), variable] = up_limit


In [20]:
replace_with_thresholds(df, 'Insulin')

In [21]:
replace_with_thresholds(df, 'Age')

In [22]:
replace_with_thresholds(df, 'BloodPressure')

In [23]:
replace_with_thresholds(df, 'Pregnancies')

In [24]:
replace_with_thresholds(df, 'Glucose')


In [25]:
replace_with_thresholds(df, 'BMI')

In [26]:
replace_with_thresholds(df, 'DiabetesPedigreeFunction')

In [27]:
replace_with_thresholds(df, 'SkinThickness')

In [28]:
cat_cols, num_cols, cat_but_car = grab_col_names(df,cat_th=10, car_th=20)

Observations: 768
Variables: 9
cat_cols: 1
num_cols: 8
cat_but_car: 0
num_but_cat: 1


In [29]:
for col in num_cols:
    print(col, check_outlier(df, col))

Pregnancies False
Glucose False
BloodPressure False
SkinThickness False
Insulin False
BMI False
DiabetesPedigreeFunction False
Age False


Glucose değerleri nasıl olmalı?

Aç Kan Şekeri Seviyesi
* 50/70 mg/ dl Hipoglisemi
* 70/100 Normal
* 100/125 Gizli Şeker
* 126/ve üzeri Diyabet

Tok Kan Şekeri Seviyesi
* 100/140 Normal
* 140/199 Gizli Şeker
* 200/ve üzeri diyabet


In [30]:
bins = [-1, 50, 70, 100, 125, int(df["Glucose"].max())]

In [31]:
mylabels = ['anlamsızca_dusuk','normal1', 'normal2', 'gizli_seker', 'diyabet']

In [32]:
df["Glucose_cat"] = pd.cut(df["Glucose"], bins, labels=mylabels)


Küçük tansiyon (BloodPressure) değerleri nasıl olmalı?
* Düşük Tansiyon <60
* En İdeal Tansiyon 80
* Sağlıklı bir Tansiyon <90
* Yüksek Tansiyon 90 - 100
* Çok yüksek tansiyon >100

In [33]:
bins = [-1, 60, 80, 81, 90, 100, int(df["BloodPressure"].max())]

In [34]:
mylabels = ['cok_dusuk', 'dusuk', 'ideal', 'saglikli', 'yuksek', 'cok_yuksek']

In [35]:
df["BloodPressure_cat"] = pd.cut(df["BloodPressure"], bins, labels=mylabels)

In [36]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,Glucose_cat,BloodPressure_cat
0,6,148,72.0,35,0.0,33.6,0.627,50.0,1,diyabet,dusuk
1,1,85,66.0,29,0.0,26.6,0.351,31.0,0,normal2,dusuk
2,8,183,64.0,0,0.0,23.3,0.672,32.0,1,diyabet,dusuk
3,1,89,66.0,23,94.0,28.1,0.167,21.0,0,normal2,dusuk
4,0,137,40.0,35,168.0,43.1,2.288,33.0,1,diyabet,cok_dusuk


Age
* 0-1 baby
* 1-3 toddler
* 4-12 child
* 13-20 teen
* 20-25 young adult 
* 25-50 adult
* 50-65 middle age 
* 65-.. Senior
* Note: In our data, the age starts at 21. That's why I categorized it starting from young_adult.

In [37]:
bins = [20, 25, 50, 65, int(df["Age"].max())]

In [38]:
mylabels = ['young_adult', 'adult', 'middle_age', 'senior']

In [39]:
df["age_cat"] = pd.cut(df["Age"], bins, labels=mylabels)

In [40]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,Glucose_cat,BloodPressure_cat,age_cat
0,6,148,72.0,35,0.0,33.6,0.627,50.0,1,diyabet,dusuk,adult
1,1,85,66.0,29,0.0,26.6,0.351,31.0,0,normal2,dusuk,adult
2,8,183,64.0,0,0.0,23.3,0.672,32.0,1,diyabet,dusuk,adult
3,1,89,66.0,23,94.0,28.1,0.167,21.0,0,normal2,dusuk,young_adult
4,0,137,40.0,35,168.0,43.1,2.288,33.0,1,diyabet,cok_dusuk,adult


In [41]:
cat_cols, num_cols, cat_but_car = grab_col_names(df,cat_th=10, car_th=20)

Observations: 768
Variables: 12
cat_cols: 4
num_cols: 8
cat_but_car: 0
num_but_cat: 4


BMI
* <18.5 Underweight
* 18.5 - 24.5 Normal
* 25 - 29.9 Overweight
* 30 - 34.9 Obese
* 35< Extremely Obese

In [42]:
bins = [-1, 19, 25, 30, 35, int(df["Age"].max())]

In [43]:
mylabels = ['underweight', 'normal', 'overweight', 'obese' , 'extremelyobese']

In [44]:
df["BMI_cat"] = pd.cut(df["BMI"], bins, labels=mylabels)

In [45]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,Glucose_cat,BloodPressure_cat,age_cat,BMI_cat
0,6,148,72.0,35,0.0,33.6,0.627,50.0,1,diyabet,dusuk,adult,obese
1,1,85,66.0,29,0.0,26.6,0.351,31.0,0,normal2,dusuk,adult,overweight
2,8,183,64.0,0,0.0,23.3,0.672,32.0,1,diyabet,dusuk,adult,normal
3,1,89,66.0,23,94.0,28.1,0.167,21.0,0,normal2,dusuk,young_adult,overweight
4,0,137,40.0,35,168.0,43.1,2.288,33.0,1,diyabet,cok_dusuk,adult,extremelyobese


In [46]:
df[df["BloodPressure_cat"].isnull()]

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,Glucose_cat,BloodPressure_cat,age_cat,BMI_cat


### Visualization for Target

In [47]:
def random_colors(number_of_colors):
    color = ["#"+''.join([random.choice('0123456789ABCDEF') for j in range(6)])
                 for i in range(number_of_colors)]
    return color

In [48]:
species_count = df['Outcome'].value_counts()
data = [go.Bar(
    x = species_count.index,
    y = species_count.values,
    marker = dict(color = random_colors(3),line=dict(color='#000000', width=2)))]

layout = go.Layout(
   {
      "title":"Healthy VS Diabetic",
   }
)

fig = go.Figure(data=data,layout = layout)
iplot(fig)

In [49]:
trace = go.Pie(labels = list(df.Outcome.unique()), values = list(df.Outcome.value_counts()),
                            hole = 0.2,
               marker=dict(colors = random_colors(3), 
                           line=dict(color='#000000', width=2)
                           ))
data = [trace]
layout = go.Layout(
   {
      "title":"Healthy VS Diabetic",
   }
)

fig = go.Figure(data=data,layout = layout)
iplot(fig)

### Correlation

In [50]:
data = [go.Heatmap(z = np.array(df.corr().values),
                   x = np.array(df.corr().columns),
                   y = np.array(df.corr().columns),
                     colorscale='Viridis',)
       ]
layout = go.Layout(dict(title = 'Correlation Matrix for variables',
                            #autosize = False,
                            #height  = 1400,
                            #width   = 1600,
                            margin  = dict(r = 0 ,l = 100,
                                           t = 0,b = 100,
                                         ),
                            yaxis   = dict(tickfont = dict(size = 9)),
                            xaxis   = dict(tickfont = dict(size = 9)),
                           )
                      )

fig = go.Figure(data=data,layout = layout)
iplot(fig)

## Pregnancies Analysis

In [51]:
trace0 = go.Box(
    name = 'Pregnancies',
    y = df["Pregnancies"]
)


data = [trace0]


layout = go.Layout(
   {
      "title":"Pregnancies ",
   }
)


fig = go.Figure(data=data,layout = layout)
iplot(fig)

In [52]:
Diabetic = df[(df['Outcome'] != 0)]
Healthy = df[(df['Outcome'] == 0)]

tmp1 = Diabetic["Pregnancies"]
tmp2 = Healthy["Pregnancies"]
hist_data = [tmp1, tmp2]
    
group_labels = ['diabetic', 'healthy']
colors = random_colors(2)

fig = ff.create_distplot(hist_data, group_labels, colors = colors, show_hist = True, bin_size = 0, curve_type='kde')
    
fig['layout'].update(title = "Pregnancies")

py.iplot(fig, filename = 'Density plot')

## Glucose Analysis

In [53]:
cols = "Glucose"
trace0 = go.Box(
    name = cols,
    y = df[cols]
)

Diabetic = df[(df['Outcome'] != 0)]
Healthy = df[(df['Outcome'] == 0)]

tmp1 = Diabetic[cols]

tmp2 = Healthy[cols]
hist_data = [tmp1, tmp2]
    
group_labels = ['diabetic', 'healthy']
colors = random_colors(2)

fig = ff.create_distplot(hist_data, group_labels, colors = colors, show_hist = True, bin_size = 0, curve_type='kde')
    
fig['layout'].update(title = cols)

py.iplot(fig, filename = 'Density plot')

## BloodPressure Analysis

In [54]:
cols = "BloodPressure"
trace0 = go.Box(
    name = cols,
    y = df[cols]
)

Diabetic = df[(df['Outcome'] != 0)]
Healthy = df[(df['Outcome'] == 0)]

tmp1 = Diabetic[cols]

tmp2 = Healthy[cols]
hist_data = [tmp1, tmp2]
    
group_labels = ['diabetic', 'healthy']
colors = random_colors(2)

fig = ff.create_distplot(hist_data, group_labels, colors = colors, show_hist = True, bin_size = 0, curve_type='kde')
    
fig['layout'].update(title = cols)

py.iplot(fig, filename = 'Density plot')

## Insulin Analysis

In [55]:
cols = "Insulin"
trace0 = go.Box(
    name = cols,
    y = df[cols]
)

Diabetic = df[(df['Outcome'] != 0)]
Healthy = df[(df['Outcome'] == 0)]

tmp1 = Diabetic[cols]

tmp2 = Healthy[cols]
hist_data = [tmp1, tmp2]
    
group_labels = ['diabetic', 'healthy']
colors = random_colors(2)

fig = ff.create_distplot(hist_data, group_labels, colors = colors, show_hist = True, bin_size = 0, curve_type='kde')
    
fig['layout'].update(title = cols)

py.iplot(fig, filename = 'Density plot')

## BMI Analysis

In [56]:
cols = "BMI"
trace0 = go.Box(
    name = cols,
    y = df[cols]
)

Diabetic = df[(df['Outcome'] != 0)]
Healthy = df[(df['Outcome'] == 0)]

tmp1 = Diabetic[cols]

tmp2 = Healthy[cols]
hist_data = [tmp1, tmp2]
    
group_labels = ['diabetic', 'healthy']
colors = random_colors(2)

fig = ff.create_distplot(hist_data, group_labels, colors = colors, show_hist = True, bin_size = 0, curve_type='kde')
    
fig['layout'].update(title = cols)

py.iplot(fig, filename = 'Density plot')

## Age - Glucose - Insulin

In [57]:
ax = px.scatter_3d(df, x="Age", y="Glucose", z="Insulin",template= "plotly_dark",color="Outcome")
ax.show()

In [58]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,Glucose_cat,BloodPressure_cat,age_cat,BMI_cat
0,6,148,72.0,35,0.0,33.6,0.627,50.0,1,diyabet,dusuk,adult,obese
1,1,85,66.0,29,0.0,26.6,0.351,31.0,0,normal2,dusuk,adult,overweight
2,8,183,64.0,0,0.0,23.3,0.672,32.0,1,diyabet,dusuk,adult,normal
3,1,89,66.0,23,94.0,28.1,0.167,21.0,0,normal2,dusuk,young_adult,overweight
4,0,137,40.0,35,168.0,43.1,2.288,33.0,1,diyabet,cok_dusuk,adult,extremelyobese


In [59]:
encode_glucose = {'anlamsızca_dusuk': 0,
                 'normal1': 1,
                 'normal2': 3,
                 'gizli_seker': 4,
                 'diyabet': 5}

In [60]:
df['Glucose_cat'] = df['Glucose_cat'].map(encode_glucose)

In [61]:
encode_bloodpressure = {'cok_dusuk': 0,
                   'dusuk': 1,
                   'ideal': 2,
                   'saglikli': 3,
                   'yuksek': 4,
                   'cok_yuksek': 5}

In [62]:
df['BloodPressure_cat'] = df['BloodPressure_cat'].map(encode_bloodpressure)

In [63]:
encode_age = {'young_adult': 0,
             'adult': 1,
             'middle_age': 2,
             'senior': 3}

In [64]:
df['age_cat'] = df['age_cat'].map(encode_age)

In [65]:
encode_bmi = {'underweight': 0,
              'normal': 1,
              'overweight': 2,
              'obese': 3,
              'extremelyobese': 4}

In [66]:
df['BMI_cat'] = df['BMI_cat'].map(encode_bmi)

In [67]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,Glucose_cat,BloodPressure_cat,age_cat,BMI_cat
0,6,148,72.0,35,0.0,33.6,0.627,50.0,1,5,1,1,3
1,1,85,66.0,29,0.0,26.6,0.351,31.0,0,3,1,1,2
2,8,183,64.0,0,0.0,23.3,0.672,32.0,1,5,1,1,1
3,1,89,66.0,23,94.0,28.1,0.167,21.0,0,3,1,0,2
4,0,137,40.0,35,168.0,43.1,2.288,33.0,1,5,0,1,4


In [68]:
df["BloodPressure_cat"] = df["BloodPressure_cat"].astype(int)

In [69]:
df["Glucose_cat"] = df["Glucose_cat"].astype(int)

In [70]:
df["age_cat"] = df["age_cat"].astype(int)

In [71]:
df["BMI_cat"] = df["BMI_cat"].astype(int)

In [72]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    float64
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    float64
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    float64
 8   Outcome                   768 non-null    int64  
 9   Glucose_cat               768 non-null    int64  
 10  BloodPressure_cat         768 non-null    int64  
 11  age_cat                   768 non-null    int64  
 12  BMI_cat                   768 non-null    int64  
dtypes: float64(5), int64(8)
memory usage: 78.1 KB


In [73]:
df.corr()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome,Glucose_cat,BloodPressure_cat,age_cat,BMI_cat
Pregnancies,1.0,0.129,0.141,-0.082,-0.073,0.018,-0.034,0.544,0.222,0.117,0.154,0.46,0.057
Glucose,0.129,1.0,0.153,0.057,0.331,0.221,0.137,0.264,0.467,0.877,0.208,0.269,0.224
BloodPressure,0.141,0.153,1.0,0.207,0.09,0.282,0.041,0.24,0.065,0.14,0.72,0.223,0.247
SkinThickness,-0.082,0.057,0.207,1.0,0.44,0.393,0.184,-0.114,0.075,0.028,0.076,-0.123,0.401
Insulin,-0.073,0.331,0.09,0.44,1.0,0.2,0.186,-0.045,0.13,0.28,0.02,-0.026,0.221
BMI,0.018,0.221,0.282,0.393,0.2,1.0,0.141,0.036,0.293,0.193,0.251,0.055,0.894
DiabetesPedigreeFunction,-0.034,0.137,0.041,0.184,0.186,0.141,1.0,0.034,0.174,0.104,0.024,0.039,0.121
Age,0.544,0.264,0.24,-0.114,-0.045,0.036,0.034,1.0,0.238,0.22,0.269,0.873,0.062
Outcome,0.222,0.467,0.065,0.075,0.13,0.293,0.174,0.238,1.0,0.39,0.149,0.233,0.296
Glucose_cat,0.117,0.877,0.14,0.028,0.28,0.193,0.104,0.22,0.39,1.0,0.212,0.223,0.195


In [74]:
data = [go.Heatmap(z = np.array(df.corr().values),
                   x = np.array(df.corr().columns),
                   y = np.array(df.corr().columns),
                     colorscale='Viridis',)
       ]
layout = go.Layout(dict(title = 'Correlation Matrix for variables',
                            #autosize = False,
                            #height  = 1400,
                            #width   = 1600,
                            margin  = dict(r = 0 ,l = 100,
                                           t = 0,b = 100,
                                         ),
                            yaxis   = dict(tickfont = dict(size = 9)),
                            xaxis   = dict(tickfont = dict(size = 9)),
                           )
                      )

fig = go.Figure(data=data,layout = layout)
iplot(fig)

In [75]:
df.drop(["Age", "Glucose", "BloodPressure", "BMI"], axis=1, inplace=True)

In [76]:
df.columns

Index(['Pregnancies', 'SkinThickness', 'Insulin', 'DiabetesPedigreeFunction', 'Outcome', 'Glucose_cat', 'BloodPressure_cat', 'age_cat', 'BMI_cat'], dtype='object')

In [77]:
data = [go.Heatmap(z = np.array(df.corr().values),
                   x = np.array(df.corr().columns),
                   y = np.array(df.corr().columns),
                     colorscale='Viridis',)
       ]
layout = go.Layout(dict(title = 'Correlation Matrix for variables',
                            #autosize = False,
                            #height  = 1400,
                            #width   = 1600,
                            margin  = dict(r = 0 ,l = 100,
                                           t = 0,b = 100,
                                         ),
                            yaxis   = dict(tickfont = dict(size = 9)),
                            xaxis   = dict(tickfont = dict(size = 9)),
                           )
                      )

fig = go.Figure(data=data,layout = layout)
iplot(fig)

In [78]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   SkinThickness             768 non-null    int64  
 2   Insulin                   768 non-null    float64
 3   DiabetesPedigreeFunction  768 non-null    float64
 4   Outcome                   768 non-null    int64  
 5   Glucose_cat               768 non-null    int64  
 6   BloodPressure_cat         768 non-null    int64  
 7   age_cat                   768 non-null    int64  
 8   BMI_cat                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


## MODEL
* **Logistic Regression**

In [79]:
y = df['Outcome']

In [80]:
X = df.drop(['Outcome'], axis=1)

In [81]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.20, random_state=1)

In [82]:
log_model = LogisticRegression(solver = 'liblinear').fit(X_train, y_train)

In [83]:
y_pred = log_model.predict(X_train)

In [84]:
y_pred[0:10]

array([1, 1, 0, 0, 0, 0, 1, 1, 0, 0])

In [85]:
# sınıf olasılıkları
log_model.predict_proba(X_train)[0:10]

array([[0.28691185, 0.71308815],
       [0.29215322, 0.70784678],
       [0.5535955 , 0.4464045 ],
       [0.6705208 , 0.3294792 ],
       [0.64518571, 0.35481429],
       [0.96044499, 0.03955501],
       [0.35991346, 0.64008654],
       [0.38147937, 0.61852063],
       [0.85299148, 0.14700852],
       [0.94163231, 0.05836769]])

In [86]:
# 1. sınıfa ait olma olasılıkları:
y_prob = log_model.predict_proba(X_train)[:, 1]

In [87]:

# Model Validation: 10-Fold Cross Validation

y = df["Outcome"]
X = df.drop(["Outcome"], axis=1)


In [88]:
log_model = LogisticRegression(solver = 'liblinear').fit(X, y)

In [89]:
cv_results = cross_validate(log_model,
                            X, y,
                            cv=5,
                            scoring=["accuracy", "precision", "recall", "f1", "roc_auc"])

In [90]:
cv_results['test_accuracy'].mean()

0.7396485867074102

In [91]:
cv_results['test_precision'].mean()

0.6774988600091199

In [92]:
cv_results['test_recall'].mean()

0.5037735849056604

In [93]:
cv_results['test_f1'].mean()

0.5746696053240552

In [94]:
cv_results['test_roc_auc'].mean()

0.8065939902166317

In [95]:
X.columns

Index(['Pregnancies', 'SkinThickness', 'Insulin', 'DiabetesPedigreeFunction', 'Glucose_cat', 'BloodPressure_cat', 'age_cat', 'BMI_cat'], dtype='object')

In [96]:
random_user = X.sample(1, random_state=27)

In [97]:
log_model.predict(random_user)


array([1])