### Exploring health and predicting diabetes among the Akimel O'odham, or "River People"

In [1]:
# Import dependencies
import pandas as pd
import matplotlib.pyplot as plt
import os

In [2]:
# Locate the diabetes CSV under the local 'resources' folder, load it, and preview the first rows.
river_file = os.path.join('resources', 'diabetes.csv')
river_df = pd.read_csv(filepath_or_buffer=river_file)
river_df.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
# Give the target column a self-describing name: 'Outcome' -> 'diabetes_outcome'
river_df = river_df.rename({'Outcome': 'diabetes_outcome'}, axis='columns')
river_df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,diabetes_outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
# Explore summary statistics (count, mean, std, min, quartiles, max) for each numeric column.
# NOTE(review): the output shows a minimum of 0.0 for Glucose, BloodPressure, SkinThickness,
# Insulin and BMI -- presumably placeholders for missing measurements; confirm before modelling.
river_df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,diabetes_outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [5]:
# Add an empty placeholder column for the BMI-based weight category.
# It is overwritten with real labels by the pd.cut binning cell further down.
river_df['weight_category'] = ""
river_df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,diabetes_outcome,weight_category
0,6,148,72,35,0,33.6,0.627,50,1,
1,1,85,66,29,0,26.6,0.351,31,0,
2,8,183,64,0,0,23.3,0.672,32,1,
3,1,89,66,23,94,28.1,0.167,21,0,
4,0,137,40,35,168,43.1,2.288,33,1,
...,...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0,
764,2,122,70,27,0,36.8,0.340,27,0,
765,5,121,72,23,112,26.2,0.245,30,0,
766,1,126,60,0,0,30.1,0.349,47,1,


In [11]:
# Confirm BMI is stored as a numeric dtype before comparing or binning its values.
river_df['BMI'].dtype

dtype('float64')

In [15]:
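# Would this kind of logic have worked? Every row in the column came out as 'underweight'.
# (No: `[river_df['BMI'] < 18.5]` is a non-empty list, which is always truthy, so the first
# branch always runs and one string is assigned to the whole column. Without the brackets the
# `if` would raise "truth value of a Series is ambiguous". pd.cut does this row by row instead.)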
# if [river_df['BMI'] < 18.5]:
#     weight_category = 'underweight'
# elif [river_df['BMI'] >= 18.5] and [river_df['BMI'] < 25]: 
#     weight_category = 'healthy_weight'
# elif [river_df['BMI'] >= 25.0] and [river_df['BMI'] < 30]:   
#     weight_category = 'overweight'
# else:
#     weight_category = 'obese'
# river_df['weight_category'] = weight_category
# river_df


In [17]:
# Create a new column of weight categories by binning BMI:
#   underweight:     BMI < 18.5
#   healthy_weight:  18.5 <= BMI < 25
#   overweight:      25 <= BMI < 30
#   obese:           BMI >= 30
# right=False makes every bin closed on the left and open on the right, i.e. [low, high),
# so a BMI of exactly 25.0 is 'overweight' and exactly 30.0 is 'obese'.
weight_category = ['underweight', 'healthy_weight', 'overweight', 'obese']
# Open-ended top edge: a fixed cap (previously 70) would leave any BMI at or above it
# unlabelled (NaN) instead of 'obese'.
bmi_bins = [0, 18.5, 25, 30, float('inf')]
# NOTE(review): describe() reports a minimum BMI of 0.0; such rows fall into 'underweight'
# here. Presumably 0 marks a missing measurement -- confirm and handle before modelling.

river_df['weight_category'] = pd.cut(river_df['BMI'], bins=bmi_bins, right=False, labels=weight_category)
river_df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,diabetes_outcome,weight_category
0,6,148,72,35,0,33.6,0.627,50,1,obese
1,1,85,66,29,0,26.6,0.351,31,0,overweight
2,8,183,64,0,0,23.3,0.672,32,1,healthy_weight
3,1,89,66,23,94,28.1,0.167,21,0,overweight
4,0,137,40,35,168,43.1,2.288,33,1,obese
...,...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0,obese
764,2,122,70,27,0,36.8,0.340,27,0,obese
765,5,121,72,23,112,26.2,0.245,30,0,overweight
766,1,126,60,0,0,30.1,0.349,47,1,obese


In [25]:
# Check the bin boundaries: a BMI of exactly 25.0 should be classified as 'overweight'
river_df.loc[river_df['BMI'] == 25.0]

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,diabetes_outcome,weight_category
62,5,44,62,0,0,25.0,0.587,36,0,overweight
79,2,112,66,22,0,25.0,0.307,24,0,overweight
509,8,120,78,0,0,25.0,0.409,64,0,overweight
520,2,68,70,32,66,25.0,0.187,25,0,overweight
581,6,109,60,27,0,25.0,0.206,27,0,overweight
697,0,99,0,0,0,25.0,0.253,22,0,overweight


In [None]:
# Create a new column for a general yes/no classification: BMI >= 30 is "yes" (obese), BMI < 30 is "no" (not obese)
