In [15]:
# One-time setup (uncomment and run if the package is missing): %pip install ucimlrepo

# 1. Download the data

In [16]:
import pandas as pd
from ucimlrepo import fetch_ucirepo

# Fetch the CDC Diabetes Health Indicators dataset (UCI repository id 891).
cdc_diabetes_health_indicators = fetch_ucirepo(id=891)

# Features and targets are exposed separately on the fetched object.
X = cdc_diabetes_health_indicators.data.features
y = cdc_diabetes_health_indicators.data.targets

# Show the dataset metadata (source URL, size, target column, ...) for provenance.
print(cdc_diabetes_health_indicators.metadata)

# Join features and target side by side into a single dataframe.
df = pd.concat([X, y], axis="columns")

# Preview the first rows of the combined frame.
df.head()

{'uci_id': 891, 'name': 'CDC Diabetes Health Indicators', 'repository_url': 'https://archive.ics.uci.edu/dataset/891/cdc+diabetes+health+indicators', 'data_url': 'https://archive.ics.uci.edu/static/public/891/data.csv', 'abstract': 'The Diabetes Health Indicators Dataset contains healthcare statistics and lifestyle survey information about people in general along with their diagnosis of diabetes. The 35 features consist of some demographics, lab test results, and answers to survey questions for each patient. The target variable for classification is whether a patient has diabetes, is pre-diabetic, or healthy. ', 'area': 'Health and Medicine', 'tasks': ['Classification'], 'characteristics': ['Tabular', 'Multivariate'], 'num_instances': 253680, 'num_features': 21, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Sex', 'Age', 'Education Level', 'Income'], 'target_col': ['Diabetes_binary'], 'index_col': ['ID'], 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,...,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income,Diabetes_binary
0,1,1,1,40,1,0,0,0,0,1,...,0,5,18,15,1,0,9,4,3,0
1,0,0,0,25,1,0,0,1,0,0,...,1,3,0,0,0,0,7,6,1,0
2,1,1,1,28,0,0,0,0,1,0,...,1,5,30,30,1,0,9,4,8,0
3,1,0,1,27,0,0,0,1,1,1,...,0,2,0,0,0,0,11,3,6,0
4,1,1,1,24,0,0,0,1,1,1,...,0,2,3,0,0,0,11,5,4,0


In [17]:
# Build the modelling frame in a single chained expression so that `df` is
# rebound exactly once (a failure midway cannot leave a half-transformed frame)
# and no column is ever assigned onto a filtered slice, which is what triggers
# pandas' SettingWithCopyWarning.
#
# NOTE: this cell overwrites `df` and removes columns it reads (e.g. CholCheck),
# so it cannot be re-run on its own; re-run the download cell first.
df = (
    # Keep only respondents who HAVE had their cholesterol checked in the last
    # 5 years (CholCheck == 1); everyone else is removed.
    df.loc[df['CholCheck'] == 1]
    # Combine Fruits and Veggies into a single Healthy Eating Score. Assuming
    # both inputs are 0/1 indicators (as in the preview above), the +1 offset
    # makes the score range from 1 to 3.
    .assign(HealthyEating=lambda frame: frame['Fruits'] + frame['Veggies'] + 1)
    # Drop the columns folded into HealthyEating, the now-constant filter
    # column, and the remaining health-status columns not kept for this analysis.
    .drop(columns=[
        'Fruits', 'Veggies', 'CholCheck', 'HighChol', 'HighBP', 'Stroke',
        'HeartDiseaseorAttack', 'NoDocbcCost', 'GenHlth', 'PhysHlth',
        'MentHlth', 'DiffWalk',
    ])
    # Rename Diabetes_binary to Diabetes
    .rename(columns={'Diabetes_binary': 'Diabetes'})
)

In [18]:
# Number of rows remaining after the CholCheck filter.
print(df.shape[0])

244210


In [19]:
# Preview the first five rows of the transformed frame.
df.head(n=5)

Unnamed: 0,BMI,Smoker,PhysActivity,HvyAlcoholConsump,AnyHealthcare,Sex,Age,Education,Income,Diabetes,HealthyEating
0,40,1,0,0,1,0,9,4,3,0,2
2,28,0,0,0,1,0,9,4,8,0,2
3,27,0,1,0,1,0,11,3,6,0,3
4,24,0,1,0,1,0,11,5,4,0,3
5,25,1,1,0,1,1,10,6,8,0,3


In [20]:
# Persist the full cleaned dataset (row index omitted from the file).
df.to_csv('cdc_diabetes_health_indicators.csv', index=False)

from sklearn.model_selection import train_test_split

# Fixed seed so the exported train/test files are identical on every
# Restart & Run All; without it each run writes a different random split.
SPLIT_SEED = 42

# 80/20 train/test split, then halve the training portion into two
# equally sized, disjoint training sets.
train, test = train_test_split(
    df, train_size=0.80, test_size=0.20, random_state=SPLIT_SEED
)
train_1, train_2 = train_test_split(
    train, train_size=0.5, test_size=0.5, random_state=SPLIT_SEED
)

test.to_csv('cdc_diabetes_health_indicators_test.csv', index=False)
train_1.to_csv('cdc_diabetes_health_indicators_train1.csv', index=False)
train_2.to_csv('cdc_diabetes_health_indicators_train2.csv', index=False)

# Report the size of each partition (test, train1, train2).
print(len(test))
print(len(train_1))
print(len(train_2))

48842
97684
97684


In [21]:
# Work on a copy so the correlation analysis is decoupled from `df`.
df_c = df.copy()

# Correlation of every column with the Diabetes label (the label's
# self-correlation is removed), ordered from most negative to most positive.
correlations = (
    df_c.corr()
    .loc[:, 'Diabetes']
    .drop(labels='Diabetes')
    .sort_values()
)
print(correlations)

Income              -0.166894
Education           -0.126289
PhysActivity        -0.120127
HealthyEating       -0.062425
HvyAlcoholConsump   -0.056821
AnyHealthcare        0.010412
Sex                  0.033538
Smoker               0.062246
Age                  0.174538
BMI                  0.217982
Name: Diabetes, dtype: float64
