In this notebook, we first split the 2015 cleaned dataset into train and test sets. Following this, we also prepare an undersampled training set.

In [None]:
##Mount Google Drive in this Colab runtime and define the path to the pre-processed datasets.

from google.colab import drive
##The mount point used here is also the prefix of data_path below
drive.mount('/content/gdrive')
##Folder containing the pre-processed datasets; file names are appended directly to it when reading and writing .csv files (note the trailing slash)
data_path = '/content/gdrive/Shareddrives/CIS520 Final Project/dataset/pre_processed/'

### 80-20 split on the 2015 cleaned dataset to get train and test sets

In [None]:
##Importing the required libraries
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt

In [None]:
##Load the 2015 cleaned dataset and discard the "Unnamed: 0" index column stored in the .csv file
df = pd.read_csv(data_path + '2015_selected.csv').drop(columns=["Unnamed: 0"])

##Target variable (y): the DIABETE3 column as a NumPy array
y = df['DIABETE3'].values

##Everything except the target is a feature: keep the feature names (col) and the feature values (X)
df = df.drop(columns=['DIABETE3'])
col = df.columns
X = df.values

In [None]:
##80-20 split of the 2015 cleaned dataset into train and test sets.
##Passing y as the 'stratify' parameter keeps the class imbalance of the full dataset in both subsets;
##the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
    stratify=y,
)

In [None]:
##Printing the fraction of diabetic samples, i.e. (number of diabetic samples) / (total number of samples),
##for the cleaned 2015 dataset, train and test sets to show that the class imbalance is conserved.
##NOTE: summing the labels counts the diabetic samples only if DIABETE3 is encoded as 1 = diabetic, 0 = non-diabetic.
for split_name, split_labels in (('2015 cleaned dataset', y), ('Training set', y_train), ('Test set', y_test)):
    ##The message and the value are passed as separate arguments so the printed text keeps its original spacing
    print(f'Class imbalance in {split_name}: ', np.sum(split_labels)/split_labels.shape[0])

Class imbalance in 2015 cleaned dataset:  0.1676053510915896
Class imbalance in Training set:  0.16760634058505772
Class imbalance in Test set:  0.1676013932345475


In [None]:
##Both files are written with the DIABETE3 target as the first column, followed by the feature columns

##Training set -> Train.csv
train_df = pd.DataFrame(np.column_stack((y_train, X_train)), columns=['DIABETE3', *col])
train_df.to_csv(data_path + 'Train.csv', index=False)

##Test set -> Test.csv
test_df = pd.DataFrame(np.column_stack((y_test, X_test)), columns=['DIABETE3', *col])
test_df.to_csv(data_path + 'Test.csv', index=False)

### Random undersampling is performed on the train set to get the undersampled training set

In [None]:
##RandomUnderSampler from the imblearn library performs the random undersampling
from imblearn.under_sampling import RandomUnderSampler

##A sampling strategy of 1.0 gives an equal number of samples for each class in the undersampled set
undersample = RandomUnderSampler(random_state=1, sampling_strategy=1.0)
##Only the training set is passed to the undersampler
X_train_under, y_train_under = undersample.fit_resample(X_train, y_train)

##The label sum is used as the number of diabetic samples, before and after undersampling
n_diabetic_train = y_train.sum()
n_diabetic_under = y_train_under.sum()

##Summarize the class distribution in the original and undersampled train sets
print('Class distribution in the Train set:')
print('diabetic: ', n_diabetic_train)
print('non-diabetic: ', len(y_train) - n_diabetic_train)
print()
print('Class distribution in the undersampled training set:')
print('diabetic: ', n_diabetic_under)
print('non-diabetic: ', len(y_train_under) - n_diabetic_under)

Class distribution in the Train set:
diabetic:  22712.0
non-diabetic:  112796.0

Class distribution in the undersampled training set:
diabetic:  22712.0
non-diabetic:  22712.0


In [None]:
##Write the undersampled training set to a .csv file: DIABETE3 target as the first column, followed by the feature columns
train_under_df = pd.DataFrame(np.column_stack((y_train_under, X_train_under)), columns=['DIABETE3', *col])
train_under_df.to_csv(data_path + 'Train_undersample.csv', index=False)