In [103]:
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
	
from sklearn.utils import resample

In [104]:
data = pd.read_csv('full_dataset.csv')
data.iloc[0]

on_thyroxine                              f
query_on_thyroxine                        f
on_antithyroid_medication                 f
thyroid_surgery                           f
query_hypothyroid                         f
query_hyperthyroid                        f
pregnant                                  f
sick                                      f
tumor                                     f
lithium                                   f
goitre                                    f
TSH_measured                              n
T3_measured                               y
TT4_measured                              y
T4U_measured                              y
FTI_measured                              y
age                                      72
sex                                       M
TSH                                     0.8
T3                                        1
TT4                                      83
T4U                                    0.95
FTI                             

In [105]:
data.classes.value_counts()

negative          2870
sick-euthyroid     293
Name: classes, dtype: int64

In [106]:
data_encode = pd.get_dummies(data)
data_encode = data_encode.drop('classes_negative', 1)
data_encode = data_encode.rename(columns={'classes_sick-euthyroid': 'classes'})
data_encode.head()

Unnamed: 0,age,TSH,T3,TT4,T4U,FTI,on_thyroxine_f,on_thyroxine_t,query_on_thyroxine_f,query_on_thyroxine_t,...,T3_measured_y,TT4_measured_n,TT4_measured_y,T4U_measured_n,T4U_measured_y,FTI_measured_n,FTI_measured_y,sex_F,sex_M,classes
0,72,0.8,1.0,83.0,0.95,87.0,1,0,1,0,...,1,0,1,0,1,0,1,0,1,1
1,45,1.9,1.0,82.0,0.73,112.0,1,0,1,0,...,1,0,1,0,1,0,1,1,0,1
2,64,0.09,1.0,101.0,0.82,123.0,1,0,1,0,...,1,0,1,0,1,0,1,1,0,1
3,56,0.0,0.8,76.0,0.77,99.0,1,0,1,0,...,1,0,1,0,1,0,1,0,1,1
4,78,2.6,0.3,87.0,0.95,91.0,0,1,1,0,...,1,0,1,0,1,0,1,1,0,1


In [127]:
major_class = data_encode[data_encode.classes == 0]
minor_class = data_encode[data_encode.classes == 1]

In [128]:
minor_class_upsampled = resample(minor_class,
                                replace=True,
                                n_samples=300,
                                random_state=42)

In [131]:
data = pd.concat([major_class, minor_class_upsampled])

In [117]:
features = data.iloc[:, 0:-1]
features.head()

Unnamed: 0,age,TSH,T3,TT4,T4U,FTI,on_thyroxine_f,on_thyroxine_t,query_on_thyroxine_f,query_on_thyroxine_t,...,T3_measured_n,T3_measured_y,TT4_measured_n,TT4_measured_y,T4U_measured_n,T4U_measured_y,FTI_measured_n,FTI_measured_y,sex_F,sex_M
293,20,0.5,1.8,68.0,0.99,68.0,1,0,1,0,...,0,1,0,1,0,1,0,1,1,0
294,29,1.2,2.5,83.0,0.93,89.0,1,0,1,0,...,0,1,0,1,0,1,0,1,1,0
295,66,3.1,1.7,76.0,0.83,92.0,1,0,1,0,...,0,1,0,1,0,1,0,1,1,0
296,62,0.4,2.2,103.0,0.99,104.0,1,0,1,0,...,0,1,0,1,0,1,0,1,0,1
297,72,0.0,1.5,66.0,0.97,69.0,1,0,1,0,...,0,1,0,1,0,1,0,1,1,0


In [126]:
target = data.iloc[:, -1]
print(len(target == 0))

3170


In [111]:
rus = RandomUnderSampler(random_state=0)
features_resampled, target_resample = rus.fit_resample(features, target)
print('Resampled dataset shape %s' % Counter(target_resample))

Resampled dataset shape Counter({0: 293, 1: 293})


In [112]:
x_train, x_test, y_train, y_test = train_test_split(features_resampled, target_resample, random_state=42)
print('Training target statistics: {}'.format(Counter(y_train)))
print('Testing target statistics: {}'.format(Counter(y_test)))

Training target statistics: Counter({1: 229, 0: 210})
Testing target statistics: Counter({0: 83, 1: 64})


In [114]:
clf_rf = RandomForestClassifier(n_estimators=25, random_state=12)
clf_rf.fit(x_train, y_train)
print('Validation Results')
print(clf_rf.score(x_test, y_test))
print(recall_score(y_test, clf_rf.predict(x_test)))

Validation Results
0.9251700680272109
0.875
