In [13]:
# setting up environment
import pandas as pd
import numpy as np
import os

from imblearn.over_sampling import SMOTENC

from dotenv import load_dotenv

load_dotenv()

True

In [14]:
# Load the training dataset from a path relative to the current working
# directory. The bare `head()` expression previews the first rows.

train_df = pd.read_csv('../datasets/training.csv')
train_df.head()

Unnamed: 0,sex,age,is_smoking,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,0,48,1,20.0,0.0,0,0,0,232.582353,117.5,67.5,25.699942,76.028902,79.152866,1
1,0,39,1,5.0,0.0,0,0,0,221.246988,117.5,71.0,24.928274,75.95858,79.291391,0
2,1,41,0,0.0,0.0,0,1,0,222.763006,140.0,87.0,25.553621,75.977011,77.313725,0
3,0,40,0,0.0,0.0,0,0,0,217.508021,110.0,75.0,24.993351,76.518325,78.436782,0
4,1,47,1,30.0,0.0,0,0,0,236.92029,112.0,66.0,25.277482,76.184397,85.572581,1


In [15]:
# drop column id before sampling
def dropColomn(df, col):
    """Return ``df`` without the column(s) ``col``; leave it unchanged if absent.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to drop from. It is not modified: ``DataFrame.drop`` returns a
        new frame.
    col : label or list of labels
        Column name(s) to remove.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``col``, or the original ``df`` when ``col`` is
        missing (a notice is printed in that case).
    """
    try:
        # Use the `columns=` keyword instead of a positional axis (`drop(col, 1)`).
        # pandas 2.0 rejects the positional form with a TypeError, which a bare
        # `except:` would misreport as "column not found".
        df = df.drop(columns=col)
    except KeyError:
        # Only a genuinely missing label is treated as "not found"; any other
        # error propagates so it is not silently hidden.
        print('colomn {0} not found'.format(col))
    return df

# Remove the 'id' column if the frame has one (train_df is rebound to the
# result), then preview the first rows.
train_df = dropColomn(train_df,'id')
train_df.head()

colomn id not found


Unnamed: 0,sex,age,is_smoking,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,0,48,1,20.0,0.0,0,0,0,232.582353,117.5,67.5,25.699942,76.028902,79.152866,1
1,0,39,1,5.0,0.0,0,0,0,221.246988,117.5,71.0,24.928274,75.95858,79.291391,0
2,1,41,0,0.0,0.0,0,1,0,222.763006,140.0,87.0,25.553621,75.977011,77.313725,0
3,0,40,0,0.0,0.0,0,0,0,217.508021,110.0,75.0,24.993351,76.518325,78.436782,0
4,1,47,1,30.0,0.0,0,0,0,236.92029,112.0,66.0,25.277482,76.184397,85.572581,1


In [16]:
# Original (pre-resampling) data: report the frame's shape and the number of
# rows in each TenYearCHD class.

print('shape of original data: {0}'.format(train_df.shape))
train_df.groupby('TenYearCHD').size()

shape of original data: (3852, 15)


TenYearCHD
0    3401
1     451
dtype: int64

In [17]:
# Split features and labels.
# NOTE: `pop` removes 'TenYearCHD' from train_df in place, so train_df holds
# only the feature columns afterwards. Re-running this cell without reloading
# train_df raises KeyError because the column is already gone.

labels = np.array(train_df.pop('TenYearCHD'))
features = np.array(train_df)

In [18]:
# shapes of the split feature matrix and label vector
print('shape of original features : {0}'.format(features.shape))
print('shape of original labels : {0}'.format(labels.shape))

shape of original features : (3852, 14)
shape of original labels : (3852,)


In [19]:
# Oversample the minority class with SMOTE-NC (imblearn's SMOTENC), the SMOTE
# variant for mixed categorical and continuous features.
# CATEGORICAL_FEATURES are positional column indices into `features`. Going by
# the column order shown in the preview above, they are 0=sex, 2=is_smoking,
# 4=BPMeds, 5=prevalentStroke, 6=prevalentHyp, 7=diabetes. They must be
# re-checked if the CSV column order ever changes.
CATEGORICAL_FEATURES = [0,2,4,5,6,7]

# random_state is fixed so the resampling is repeatable between runs.
sm = SMOTENC(random_state=42, categorical_features=CATEGORICAL_FEATURES)
features_res, labels_res = sm.fit_resample(features, labels)

# shapes of the resampled feature matrix and label vector
print('shape of sampled features : {0}'.format(features_res.shape))
print('shape of sampled labels : {0}'.format(labels_res.shape))

shape of sampled features : (6802, 14)
shape of sampled labels : (6802,)


In [20]:
# Give the 1-D label vector a leading axis, shape (1, n), so that its
# transpose is an (n, 1) column that can sit beside the feature matrix.
labels_res_new = np.expand_dims(labels_res, axis=0)

# Stitch features and labels together column-wise and name the columns.
# Concatenation yields a single numeric array, so every column (including the
# 0/1 flags and the label) is stored as float in the resulting frame.
balance_df = pd.DataFrame(
    data=np.concatenate([features_res, labels_res_new.T], axis=1),
    columns=[
        'sex',
        'age',
        'is_smoking',
        'cigsPerDay',
        'BPMeds',
        'prevalentStroke',
        'prevalentHyp',
        'diabetes',
        'totChol',
        'sysBP',
        'diaBP',
        'BMI',
        'heartRate',
        'glucose',
        'TenYearCHD',
    ],
)

balance_df.head()

Unnamed: 0,sex,age,is_smoking,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,0.0,48.0,1.0,20.0,0.0,0.0,0.0,0.0,232.582353,117.5,67.5,25.699942,76.028902,79.152866,1.0
1,0.0,39.0,1.0,5.0,0.0,0.0,0.0,0.0,221.246988,117.5,71.0,24.928274,75.95858,79.291391,0.0
2,1.0,41.0,0.0,0.0,0.0,0.0,1.0,0.0,222.763006,140.0,87.0,25.553621,75.977011,77.313725,0.0
3,0.0,40.0,0.0,0.0,0.0,0.0,0.0,0.0,217.508021,110.0,75.0,24.993351,76.518325,78.436782,0.0
4,1.0,47.0,1.0,30.0,0.0,0.0,0.0,0.0,236.92029,112.0,66.0,25.277482,76.184397,85.572581,1.0


In [21]:
# preprocessing after sampling

# Drop duplicated rows (rows whose features and label are all identical).
balance_df = balance_df.drop_duplicates()
print('shape of sampled training dataset after drop duplicates : {0}'.format(balance_df.shape))

# Optionally trim the last N rows, where N comes from the
# NUMOFUNWANTEDSAMPLEDDATA environment variable. os.getenv returns None when
# the variable is unset, so a truthiness check is used: both "unset" and
# "empty string" mean "nothing to trim" and the step is skipped.
unWantedSampledData = os.getenv('NUMOFUNWANTEDSAMPLEDDATA')
if unWantedSampledData:
    try:
        unWantedSampledDataInt = int(unWantedSampledData)
        if unWantedSampledDataInt < 0:
            # DataFrame.tail(n) with negative n returns every row except the
            # first |n|, so a negative setting would delete almost the whole
            # frame. Treat it as an invalid configuration instead.
            raise ValueError('row count must be non-negative')
        # Rebind instead of dropping in place, so this step never mutates a
        # frame object that an earlier cell displayed.
        balance_df = balance_df.drop(balance_df.tail(unWantedSampledDataInt).index)
    except ValueError:
        # Only a malformed or negative setting is reported here; unexpected
        # pandas errors propagate instead of being silently swallowed.
        print('cant process drop unwanted data')
    print('shape of sampled training dataset after drop unwanted data : {0}'.format(balance_df.shape))


shape of sampled training dataset after drop duplicates : (6788, 15)
shape of sampled training dataset after drop unwanted data : (6788, 15)


In [22]:
# Shuffle the sampled dataset n times, where n comes from the
# NUMOFSHUFFLINGDATA environment variable.
# Only the parsing of that variable is guarded: TypeError covers an unset
# variable (int(None)) and ValueError covers a non-integer string. Errors
# raised by the shuffle itself, and KeyboardInterrupt, propagate instead of
# being reported as a generic failure with a half-shuffled frame left behind.
try:
    NUMBEROFSHUFFLING = int(os.getenv('NUMOFSHUFFLINGDATA'))
except (TypeError, ValueError):
    print('failed to shuffle sampled dataset')
else:
    for i in range(NUMBEROFSHUFFLING):
        # The seed is derived from the pass number, so the final row order is
        # deterministic for a given n.
        balance_df = balance_df.sample(frac=1, random_state=i*16)
        print('shuffle sampeld dataset {0} time'.format(i))

shuffle sampeld dataset 0 time
shuffle sampeld dataset 1 time
shuffle sampeld dataset 2 time
shuffle sampeld dataset 3 time
shuffle sampeld dataset 4 time
shuffle sampeld dataset 5 time
shuffle sampeld dataset 6 time
shuffle sampeld dataset 7 time
shuffle sampeld dataset 8 time
shuffle sampeld dataset 9 time
shuffle sampeld dataset 10 time
shuffle sampeld dataset 11 time
shuffle sampeld dataset 12 time
shuffle sampeld dataset 13 time
shuffle sampeld dataset 14 time
shuffle sampeld dataset 15 time


In [23]:
# Persist the balanced dataset; the frame's index is written as a column
# named 'id'.
# NOTE(review): no cell resets the index after drop_duplicates / sample, so
# these ids are the pre-shuffle row labels (non-sequential, with gaps) rather
# than 0..n-1. Confirm that this is what downstream consumers expect.
balance_df.to_csv('../datasets/sampled_dataset.csv',  index_label='id')