In [28]:
import pandas as pd
import numpy as np
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
import seaborn as sns

In [2]:
# Load the dataset CSV into a DataFrame (path is relative to the notebook's working directory)
df = pd.read_csv('data/diabetes_downsampled.csv')

In [3]:
# (rows, columns) of the loaded frame
df.shape

(79757, 22)

In [4]:
# Preview the first rows to check the column layout
df.head()

Unnamed: 0,Diabetes_012,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0,0,0,1,25,0,0,0,1,0,...,1,0,2,2,1,0,0,1,5,6
1,0,0,0,1,26,0,0,0,1,0,...,1,0,3,10,0,0,0,1,4,1
2,0,0,0,1,23,0,0,0,1,1,...,1,0,1,0,0,0,0,1,5,3
3,0,0,0,1,28,0,0,0,0,1,...,1,0,1,15,0,0,0,1,5,3
4,0,0,0,1,21,0,0,0,1,0,...,1,0,3,0,14,0,0,1,4,1


### Data Preprocessing

In [5]:
#Remove the diabetes instances: only non-diabetes and pre-diabetes rows are of interest here
#Diabetes_012 encoding: 0 = no diabetes, 1 = pre-diabetes, 2 = diabetes

In [6]:
# Keep only the rows labelled 0 or 1 in Diabetes_012; every other label is dropped
df1 = df[df['Diabetes_012'].isin([0, 1])]

In [7]:
# (rows, columns) remaining after the label filter
df1.shape

(44411, 22)

In [13]:
#Number of non-diabetes instances (label 0)
df1[df1['Diabetes_012'] == 0].shape

(39780, 22)

In [14]:
#Number of pre-diabetes instances (label 1)
df1[df1['Diabetes_012'] == 1].shape

(4631, 22)

In [44]:
# Show the first column (the label) for the label-0 rows, selected in a single .loc call
df1.loc[df1['Diabetes_012'] == 0, df1.columns[0]]

0        0
1        0
2        0
3        0
4        0
        ..
39775    0
39776    0
39777    0
39778    0
39779    0
Name: Diabetes_012, Length: 39780, dtype: int64

#### The data is highly imbalanced: only 10.4% of the remaining rows are pre-diabetes

## Split Dataset before Data sampling

In [48]:
#Build a class-balanced test set: hold out the same number of rows from each class,
#sized as 30% of the minority (pre-diabetes, label 1) class.
#The count is computed once and passed to train_test_split as an integer test_size
#(an absolute number of samples). Expressing the majority-class share as a float
#fraction instead would only give the same count after floating-point rounding inside
#sklearn's ceil(), which can leave the two classes off by one.
TEST_FRACTION = 0.3
n_prediabetes = (df1.Diabetes_012 == 1).sum()
n_test_per_class = int(np.ceil(TEST_FRACTION * n_prediabetes))

test_size1 = n_test_per_class  #rows held out from the majority class (label 0)
test_size2 = n_test_per_class  #rows held out from the minority class (label 1)

In [49]:
#Split the majority class (label 0, no diabetes) into train and test parts;
#test_size1 controls how many rows are held out for the test set.
#The first column of df1 is the label (Diabetes_012); the remaining columns are the features.
#random_state is fixed so the split is identical on every Restart & Run All.
df_no_diabetes = df1[df1.Diabetes_012 == 0]
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(
    df_no_diabetes[df1.columns[1:]],
    df_no_diabetes[df1.columns[0]],
    test_size=test_size1,
    random_state=42,
)

In [50]:
# Training part of the label-0 split: (rows, feature columns)
X_train_1.shape

(38390, 21)

In [51]:
# Test part of the label-0 split: (rows, feature columns)
X_test_1.shape

(1390, 21)

In [52]:
#Split the minority class (label 1, pre-diabetes) into train and test parts;
#test_size2 controls how many rows are held out for the test set.
#The first column of df1 is the label (Diabetes_012); the remaining columns are the features.
#random_state is fixed so the split is identical on every Restart & Run All.
df_prediabetes = df1[df1.Diabetes_012 == 1]
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(
    df_prediabetes[df1.columns[1:]],
    df_prediabetes[df1.columns[0]],
    test_size=test_size2,
    random_state=42,
)

In [53]:
# Training part of the label-1 split: (rows, feature columns)
X_train_2.shape

(3241, 21)

In [54]:
# Test part of the label-1 split: (rows, feature columns)
X_test_2.shape

(1390, 21)

### Merge the training splits and the test splits

In [55]:
# Training features: label-0 rows followed by label-1 rows, re-indexed from 0
X_train = pd.concat(
    [X_train_1, X_train_2],
    axis=0,
    ignore_index=True,
)

In [58]:
# (rows, feature columns) of the merged training features
X_train.shape

(41631, 21)

In [59]:
# Training labels, concatenated in the same order as X_train (label 0 first, then label 1), re-indexed from 0
y_train = pd.concat(
    [y_train_1, y_train_2],
    axis=0,
    ignore_index=True,
)

In [62]:
# Rows per class in the merged training labels
y_train.value_counts()

0    38390
1     3241
Name: Diabetes_012, dtype: int64

In [63]:
# Test features: label-0 rows followed by label-1 rows, re-indexed from 0
X_test = pd.concat(
    [X_test_1, X_test_2],
    axis=0,
    ignore_index=True,
)

In [64]:
# (rows, feature columns) of the merged test features
X_test.shape

(2780, 21)

In [65]:
# Test labels, concatenated in the same order as X_test (label 0 first, then label 1), re-indexed from 0
y_test = pd.concat(
    [y_test_1, y_test_2],
    axis=0,
    ignore_index=True,
)

In [66]:
# Rows per class in the merged test labels
y_test.value_counts()

1    1390
0    1390
Name: Diabetes_012, dtype: int64