# This notebook explores the raw diabetes dataset and performs exploratory data analysis (EDA)

In [1]:
# import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

## Read data and perform preliminary checks
Ex: duplicates, missingness, and types  
Quick summary statistics

In [2]:
# Load the raw diabetes CSV (path is relative to this notebook) and display it;
# pandas abbreviates the rendering of long frames.
df = pd.read_csv(filepath_or_buffer='..//data/raw/diabetes.csv')
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [3]:
# True if at least one row is an exact repeat of an earlier row
# (the first occurrence itself is not flagged).
df.duplicated(keep='first').any()

False

In [4]:
# Per-column flag: does the column hold at least one NaN/None entry?
# Note this only detects real nulls, not placeholder values such as 0.
df.isnull().any(axis=0)

Pregnancies                 False
Glucose                     False
BloodPressure               False
SkinThickness               False
Insulin                     False
BMI                         False
DiabetesPedigreeFunction    False
Age                         False
Outcome                     False
dtype: bool

In [5]:
# Inspect the dtype pandas inferred for every column, to confirm that all
# columns were parsed as numeric rather than as strings/objects.
df.dtypes

Pregnancies                   int64
Glucose                       int64
BloodPressure                 int64
SkinThickness                 int64
Insulin                       int64
BMI                         float64
DiabetesPedigreeFunction    float64
Age                           int64
Outcome                       int64
dtype: object

In [6]:
# Pairwise Pearson correlation between all columns of the raw data
# (zero placeholders are still included at this stage).
df.corr(method='pearson')

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
Pregnancies,1.0,0.129459,0.141282,-0.081672,-0.073535,0.017683,-0.033523,0.544341,0.221898
Glucose,0.129459,1.0,0.15259,0.057328,0.331357,0.221071,0.137337,0.263514,0.466581
BloodPressure,0.141282,0.15259,1.0,0.207371,0.088933,0.281805,0.041265,0.239528,0.065068
SkinThickness,-0.081672,0.057328,0.207371,1.0,0.436783,0.392573,0.183928,-0.11397,0.074752
Insulin,-0.073535,0.331357,0.088933,0.436783,1.0,0.197859,0.185071,-0.042163,0.130548
BMI,0.017683,0.221071,0.281805,0.392573,0.197859,1.0,0.140647,0.036242,0.292695
DiabetesPedigreeFunction,-0.033523,0.137337,0.041265,0.183928,0.185071,0.140647,1.0,0.033561,0.173844
Age,0.544341,0.263514,0.239528,-0.11397,-0.042163,0.036242,0.033561,1.0,0.238356
Outcome,0.221898,0.466581,0.065068,0.074752,0.130548,0.292695,0.173844,0.238356,1.0


In [7]:
# Summary statistics for the predictors only: remove the 'count' row and the
# last column (Outcome, the target) from the describe() table.
df.describe().drop(index='count').iloc[:, :-1]

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0


**Glucose, BloodPressure, SkinThickness, Insulin, and BMI contain values outside the plausible range: each has a minimum of 0.**  
Ex: a BMI of 0 is not possible

### Path 1: Remove rows that contain inaccurate entries

In [8]:
# Path 1: treat a 0 in any physiological measurement as a missing value and
# keep only the rows in which every measurement is present.
col_check = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]

df2 = df.copy()
# Use np.nan: the np.NaN alias was removed in NumPy 2.0 and raises AttributeError there.
# Replacing on the column subset in one call converts these columns to float.
df2[col_check] = df2[col_check].replace(0, np.nan)
# Rebind instead of inplace=True so the cell gives the same result when re-run.
df2 = df2.dropna()
df2

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1
6,3,78.0,50.0,32.0,88.0,31.0,0.248,26,1
8,2,197.0,70.0,45.0,543.0,30.5,0.158,53,1
13,1,189.0,60.0,23.0,846.0,30.1,0.398,59,1
...,...,...,...,...,...,...,...,...,...
753,0,181.0,88.0,44.0,510.0,43.3,0.222,26,1
755,1,128.0,88.0,39.0,110.0,36.5,1.057,37,1
760,2,88.0,58.0,26.0,16.0,28.4,0.766,22,0
763,10,101.0,76.0,48.0,180.0,32.9,0.171,63,0


In [9]:
# Pairwise Pearson correlation after the rows with zero placeholders were removed.
df2.corr(method='pearson')

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
Pregnancies,1.0,0.198291,0.213355,0.093209,0.078984,-0.025347,0.007562,0.679608,0.256566
Glucose,0.198291,1.0,0.210027,0.198856,0.581223,0.209516,0.14018,0.343641,0.515703
BloodPressure,0.213355,0.210027,1.0,0.232571,0.098512,0.304403,-0.015971,0.300039,0.192673
SkinThickness,0.093209,0.198856,0.232571,1.0,0.182199,0.664355,0.160499,0.167761,0.255936
Insulin,0.078984,0.581223,0.098512,0.182199,1.0,0.226397,0.135906,0.217082,0.301429
BMI,-0.025347,0.209516,0.304403,0.664355,0.226397,1.0,0.158771,0.069814,0.270118
DiabetesPedigreeFunction,0.007562,0.14018,-0.015971,0.160499,0.135906,0.158771,1.0,0.085029,0.20933
Age,0.679608,0.343641,0.300039,0.167761,0.217082,0.069814,0.085029,1.0,0.350804
Outcome,0.256566,0.515703,0.192673,0.255936,0.301429,0.270118,0.20933,0.350804,1.0


There are some moderate correlations between "Age & Pregnancies", "Glucose & Insulin", and "BMI & SkinThickness", but they do not appear to be a cause for concern.

In [10]:
# Predictor summary statistics for the cleaned frame: remove the 'count' row
# and the last column (Outcome, the target).
df2.describe().drop(index='count').iloc[:, :-1]

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
mean,3.30102,122.627551,70.663265,29.145408,156.056122,33.086224,0.523046,30.864796
std,3.211424,30.860781,12.496092,10.516424,118.84169,7.027659,0.345488,10.200777
min,0.0,56.0,24.0,7.0,14.0,18.2,0.085,21.0
25%,1.0,99.0,62.0,21.0,76.75,28.4,0.26975,23.0
50%,2.0,119.0,70.0,29.0,125.5,33.2,0.4495,27.0
75%,5.0,143.0,78.0,37.0,190.0,37.1,0.687,36.0
max,17.0,198.0,110.0,63.0,846.0,67.1,2.42,81.0


### Path 2: Impute Insulin
Remove entries that have 0 in any of "Glucose", "BloodPressure", "SkinThickness", or "BMI"  
Retain more data by imputing Insulin

In [11]:
# Keep the rows whose SkinThickness, BMI, Glucose and BloodPressure are all
# non-zero. Insulin is intentionally left out of the filter: its zero entries
# are the values to be imputed.
filter_df = df[df[['SkinThickness', 'BMI', 'Glucose', 'BloodPressure']].ne(0).all(axis=1)]
filter_df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
6,3,78,50,32,88,31.0,0.248,26,1
...,...,...,...,...,...,...,...,...,...
761,9,170,74,31,0,44.0,0.403,43,1
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0


## Exploratory Data Analysis

In [12]:
# Plotting frame: a new frame derived from df in which the 0/1 Outcome codes
# are replaced by readable labels; df itself is left untouched.
df_plot = df.assign(Outcome=df['Outcome'].replace({0: 'No Diabetes', 1: 'Diabetes'}))

# Fixed colour per outcome label, shared by the Plotly Express figures below.
color_discrete_map = dict([('No Diabetes', '#636EFA'), ('Diabetes', '#EF553B')])

### Plot the distribution of each predictor

#### Pregnancies

In [13]:
# Number of patients for each distinct pregnancy count, most frequent first.
df_plot['Pregnancies'].value_counts(sort=True, ascending=False)

Pregnancies
1     135
0     111
2     103
3      75
4      68
5      57
6      50
7      45
8      38
9      28
10     24
11     11
13     10
12      9
14      2
15      1
17      1
Name: count, dtype: int64

In [14]:
# Count patients for every (Pregnancies, Outcome) pair. After unstack the rows
# are pregnancy counts and the columns are the outcome codes 0 and 1;
# combinations that never occur are filled with 0.
agg_data = df.groupby(['Pregnancies', 'Outcome']).size().unstack(fill_value=0)

# One bar trace per outcome code; the trace names are the legend labels and
# match the 'No Diabetes' / 'Diabetes' labels used in the other figures.
fig = go.Figure(data=[
    go.Bar(name='No Diabetes', x=agg_data.index, y=agg_data[0]),
    go.Bar(name='Diabetes', x=agg_data.index, y=agg_data[1])
])

# Side-by-side (grouped) bars with descriptive axis titles
fig.update_layout(
    title='Frequency of Outcomes by Pregnancy',
    xaxis=dict(title='Number of Pregnancies'),
    yaxis=dict(title='Frequency'),
    barmode='group'
)

# Show plot
fig.show()

#### Glucose

In [15]:
# Box plot of glucose readings, one box per outcome group on the y-axis.
# The x-axis carries the glucose values, so it keeps the 'Glucose Level' title
# supplied through `labels`; it must not be overridden with 'Outcome'.
fig = px.box(df_plot, y='Outcome', x='Glucose',
             color='Outcome',
             color_discrete_map=color_discrete_map,
             title='Glucose Levels by Outcome',
             labels={'Outcome': 'Outcome', 'Glucose': 'Glucose Level'})

# Show the plot
fig.show()

#### BloodPressure

In [16]:
# Box plot of blood pressure readings, one box per outcome group on the y-axis.
# The x-axis carries the blood pressure values, so it keeps the 'Blood Pressure'
# title supplied through `labels`; it must not be overridden with 'Outcome'.
fig = px.box(df_plot, y='Outcome', x='BloodPressure',
             color='Outcome',
             color_discrete_map=color_discrete_map,
             title='Blood Pressure by Outcome',
             labels={'Outcome': 'Outcome', 'BloodPressure': 'Blood Pressure'})

# Show the plot
fig.show()

#### SkinThickness

In [17]:
# Box plot of skin thickness readings, one box per outcome group on the y-axis.
# The x-axis carries the skin thickness values, so it keeps the 'Skin Thickness'
# title supplied through `labels`; it must not be overridden with 'Outcome'.
fig = px.box(df_plot, y='Outcome', x='SkinThickness',
             color='Outcome',
             color_discrete_map=color_discrete_map,
             title='SkinThickness by Outcome',
             labels={'Outcome': 'Outcome', 'SkinThickness': 'Skin Thickness'})

# Show the plot
fig.show()

#### Insulin

In [18]:
# Box plot of insulin readings, one box per outcome group on the y-axis.
# The x-axis carries the insulin values, so it keeps the 'Insulin' title
# supplied through `labels`; it must not be overridden with 'Outcome'.
# NOTE(review): df_plot is derived from the unfiltered df, so the zero
# placeholder readings are part of this distribution.
fig = px.box(df_plot, y='Outcome', x='Insulin',
             color='Outcome',
             color_discrete_map=color_discrete_map,
             title='Insulin by Outcome',
             labels={'Outcome': 'Outcome', 'Insulin': 'Insulin'})

# Show the plot
fig.show()

#### BMI

In [19]:
# Reference BMI chart built from hard-coded values (it does not read the dataset).
# z_data holds one ROW per weight in x_data (24 rows) and one COLUMN per height
# in y_data (17 columns), i.e. z_data[i][j] is the BMI for weight x_data[i] and
# height y_data[j].
z_data = [
    [12, 12, 12, 13, 13, 14, 14, 14, 15, 15, 16, 16, 17, 17, 18, 18.5, 19],
    [12, 13, 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20],
    [13, 13, 14, 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 20, 20, 21],
    [14, 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20, 21, 21, 22],
    [14, 15, 15, 15, 16, 16, 17, 17, 18, 18, 19, 20, 20, 21, 22, 22, 23],
    [15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 22, 22, 23, 24],
    [15, 16, 16, 17, 17, 18, 18, 19, 19, 20, 21, 21, 22, 23, 23, 24, 25],
    [16, 16, 17, 17, 18, 18, 19, 20, 20, 21, 21, 22, 23, 24, 24, 25, 26],
    [17, 17, 18, 18, 19, 19, 20, 20, 21, 22, 22, 23, 24, 24, 25, 26, 27],
    [17, 18, 18, 19, 19, 20, 20, 21, 22, 22, 23, 24, 24, 25, 26, 27, 28],
    [18, 18, 19, 19, 20, 21, 21, 22, 22, 23, 24, 25, 25, 26, 27, 28, 29],
    [18, 19, 19, 20, 21, 21, 22, 22, 23, 24, 25, 25, 26, 27, 28, 29, 30],
    [19, 20, 20, 21, 21, 22, 23, 23, 24, 25, 25, 26, 27, 28, 29, 30, 31],
    [20, 20, 21, 21, 22, 23, 23, 24, 25, 25, 26, 27, 28, 29, 30, 31, 32],
    [20, 21, 21, 22, 23, 23, 24, 25, 25, 26, 27, 28, 29, 30, 31, 32, 33],
    [21, 21, 22, 23, 23, 24, 25, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34],
    [22, 22, 23, 23, 24, 25, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35],
    [22, 23, 24, 25, 25, 26, 27, 28, 28, 29, 30, 31, 32, 33, 34, 35, 36],
    [23, 23, 24, 25, 25, 26, 27, 28, 28, 29, 30, 31, 32, 33, 34, 36, 37],
    [23, 24, 25, 25, 26, 27, 28, 28, 29, 30, 31, 32, 33, 34, 35, 36, 38],
    [24, 25, 25, 26, 27, 28, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 39],
    [25, 25, 26, 27, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 40],
    [25, 26, 27, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 41],
    [26, 26, 27, 28, 29, 30, 30, 31, 32, 33, 34, 35, 37, 38, 39, 40, 42]
]

# Weights in kg (24 values) and heights in cm (17 values), per the axis titles below
x_data = [45.5, 47.7, 50, 52.3, 54.5, 56.8, 59.1, 61.4, 63.6, 65.9, 68.2, 70.5, 72.7, 75, 77.3, 79.5, 81.8, 84.1, 86.4, 88.6, 90.9, 93.2, 95.5, 97.7]
y_data = [193, 190.5, 187.9, 185.4, 182.8, 180.3, 177.8, 175.2, 172.7, 170.1, 167.6, 165.1, 162.5, 160, 157.4, 154.9, 152.4]

# Create the heatmap trace
heatmap_trace = go.Heatmap(
    z=z_data,
    x=x_data,
    y=y_data,
    # By default Plotly maps z rows to y and z columns to x. z_data is laid out
    # the other way round (rows = weights on x, columns = heights on y), so it
    # has to be transposed for every cell to land on its own weight/height pair.
    transpose=True,
    # Step colorscale: each stop position is repeated with two colours, which
    # produces solid colour bands instead of a continuous gradient.
    colorscale=[
        [0, "rgb(255,196,18)"],
        [0.21, "rgb(255,196,18)"],
        [0.21, "rgb(18,203,198)"],
        [0.43, "rgb(18,203,198)"],
        [0.43, "rgb(24,134,169)"],
        [0.6, "rgb(24,134,169)"],
        [0.6, "rgb(242,73,104)"],
        [0.93, "rgb(242,73,104)"],
        [0.93, "rgb(181,53,114)"],
        [1, "rgb(181,53,114)"]
    ],
    # Small gaps between cells so individual cells are distinguishable
    xgap=3,
    ygap=3,
    hovertemplate="Height: %{y} cm <br>Weight: %{x} kg <br>BMI: %{z} <extra></extra>",
    showscale=False
)

# Layout for the heatmap
heatmap_layout = go.Layout(
    height=600,
    yaxis=dict(title="Height (cm)", autorange="reversed", tickfont=dict(size=14, color="rgb(107, 107, 107)")),
    xaxis=dict(title="Weight (kg)", tickfont=dict(size=14, color="rgb(107, 107, 107)")),
    plot_bgcolor="rgb(251,251,253)",
    paper_bgcolor="rgb(251,251,253)",
    margin=dict(t=20),
    legend=dict(orientation="h", yanchor="bottom", y=1.05, x=0.18, font=dict(size=14, color="#3d4465")),
    xaxis_range=[43, 100],
    yaxis_range=[152.4, 193]
)

# Create the figure
fig = go.Figure(data=[heatmap_trace], layout=heatmap_layout)

# Display the figure
fig.show()

#### Age

In [20]:
# Overall age distribution of all patients, drawn in a single colour.
fig = px.histogram(
    df_plot,
    x='Age',
    title='Age Distribution',
    color_discrete_sequence=['#00CC96'],
)
# Label the y-axis 'Frequency' in place of the default title
fig.update_layout(yaxis_title='Frequency')
fig.show()

In [21]:
# Age distribution split by outcome; bars are slightly transparent and use the
# shared outcome colours.
fig = px.histogram(
    df_plot,
    x='Age',
    color='Outcome',
    title='Age Distribution by Outcome',
    opacity=0.8,
    color_discrete_map=color_discrete_map,
)
# Label the y-axis 'Frequency' in place of the default title
fig.update_layout(yaxis_title='Frequency')
fig.show()

#### 