In [2]:
#Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
from scipy.stats import linregress
import numpy as np

# Path to the source CSV (relative, so it resolves against the working directory)
data_set_path = "Healthcare-Diabetes.csv"

# Parse the CSV into a DataFrame
data_set_df = pd.read_csv(filepath_or_buffer=data_set_path)

# Preview the first five rows
data_set_df.head(n=5)



Unnamed: 0,Id,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,1,6,148,72,35,0,33.6,0.627,50,1
1,2,1,85,66,29,0,26.6,0.351,31,0
2,3,8,183,64,0,0,23.3,0.672,32,1
3,4,1,89,66,23,94,28.1,0.167,21,0
4,5,0,137,40,35,168,43.1,2.288,33,1


# Clean the Data

In [7]:
# Tally how many rows carry each Id (any tally above 1 marks a duplicated Id).
# Not displayed here: only the final expression of a cell is rendered.
id_unique = data_set_df["Id"].value_counts()

# Number of distinct Ids. nunique() ignores missing values by default, the same
# way value_counts() does, so this equals len(id_unique).
id_count = data_set_df["Id"].nunique()
id_count

2768

In [14]:
# Show the column labels of the loaded frame
data_set_df.columns

Index(['Id', 'Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
       'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

In [19]:
# Same columns as the raw frame, reordered so that Age sits directly after Id
data_set_2_df = data_set_df[
    [
        "Id",
        "Age",
        "Pregnancies",
        "Glucose",
        "BloodPressure",
        "SkinThickness",
        "Insulin",
        "BMI",
        "DiabetesPedigreeFunction",
        "Outcome",
    ]
]
data_set_2_df.head()

Unnamed: 0,Id,Age,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Outcome
0,1,50,6,148,72,35,0,33.6,0.627,1
1,2,31,1,85,66,29,0,26.6,0.351,0
2,3,32,8,183,64,0,0,23.3,0.672,1
3,4,21,1,89,66,23,94,28.1,0.167,0
4,5,33,0,137,40,35,168,43.1,2.288,1


In [20]:
# Non-null entries per column; a column whose tally is below the row total
# contains missing values
data_set_2_df.notna().sum()

Id                          2768
Age                         2768
Pregnancies                 2768
Glucose                     2768
BloodPressure               2768
SkinThickness               2768
Insulin                     2768
BMI                         2768
DiabetesPedigreeFunction    2768
Outcome                     2768
dtype: int64

In [21]:
# Show the dtype pandas inferred for each column
data_set_2_df.dtypes

Id                            int64
Age                           int64
Pregnancies                   int64
Glucose                       int64
BloodPressure                 int64
SkinThickness                 int64
Insulin                       int64
BMI                         float64
DiabetesPedigreeFunction    float64
Outcome                       int64
dtype: object

In [22]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column
# NOTE(review): the recorded summary shows a minimum of 0 for Glucose,
# BloodPressure, SkinThickness, Insulin and BMI; these zeros may be stand-ins
# for missing measurements rather than real readings - confirm before analysis.
data_set_2_df.describe()

Unnamed: 0,Id,Age,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Outcome
count,2768.0,2768.0,2768.0,2768.0,2768.0,2768.0,2768.0,2768.0,2768.0,2768.0
mean,1384.5,33.132225,3.742775,121.102601,69.134393,20.824422,80.12789,32.137392,0.471193,0.343931
std,799.197097,11.77723,3.323801,32.036508,19.231438,16.059596,112.301933,8.076127,0.325669,0.475104
min,1.0,21.0,0.0,0.0,0.0,0.0,0.0,0.0,0.078,0.0
25%,692.75,24.0,1.0,99.0,62.0,0.0,0.0,27.3,0.244,0.0
50%,1384.5,29.0,3.0,117.0,72.0,23.0,37.0,32.2,0.375,0.0
75%,2076.25,40.0,6.0,141.0,80.0,32.0,130.0,36.625,0.624,1.0
max,2768.0,81.0,17.0,199.0,122.0,110.0,846.0,80.6,2.42,1.0


In [25]:
# Use the Id column as the row index.
# verify_integrity=True makes pandas raise ValueError if any Id repeats, so a
# duplicated Id cannot silently produce a non-unique index.
data_set_final_df = data_set_2_df.set_index("Id", verify_integrity=True)
data_set_final_df


Unnamed: 0_level_0,Age,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Outcome
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,50,6,148,72,35,0,33.6,0.627,1
2,31,1,85,66,29,0,26.6,0.351,0
3,32,8,183,64,0,0,23.3,0.672,1
4,21,1,89,66,23,94,28.1,0.167,0
5,33,0,137,40,35,168,43.1,2.288,1
...,...,...,...,...,...,...,...,...,...
2764,33,2,75,64,24,55,29.7,0.370,0
2765,36,8,179,72,42,130,32.7,0.719,1
2766,42,6,85,78,0,0,31.2,0.382,0
2767,26,0,129,110,46,130,67.1,0.319,1
