In [1]:
import pandas as pd
import numpy as np

train_data = pd.read_excel("Data/tr.xlsx",)

In [2]:
over_missing_sample = []
row_value_count = train_data.apply(pd.Series.value_counts,axis=1,dropna=False)

In [74]:
import tools
#Data transform
train_y = pd.DataFrame(train_data['outcome']) 
train_X = pd.DataFrame(train_data.drop(['outcome'],axis=1))

train_X = tools.data_transform(train_X)

In [75]:
threshhold =len(train_X.columns)*0.25
over_missing = row_value_count[np.nan]<=threshhold
class_0 = train_y['outcome']==1

print(sorted(train_y.value_counts()))
train_X = train_X[over_missing|class_0]
train_y = train_y[over_missing|class_0]
print(sorted(train_y.value_counts()))


[879, 51280]
[879, 27076]


In [76]:
# Data split
from sklearn.model_selection import train_test_split

train_X, val_X, train_y, val_y = train_test_split(
    train_X ,
    train_y,
    test_size=0.2,
    random_state=42)

print(sorted(train_y.value_counts()))
print(sorted(val_y.value_counts()))

[718, 21646]
[161, 5430]


In [77]:
from imblearn.under_sampling import RandomUnderSampler as RUS

print(sorted(train_y.value_counts()))

rus = RUS(sampling_strategy=0.1,random_state=42)
train_X,train_y = rus.fit_resample(train_X,train_y)

print(sorted(train_y.value_counts()))

[718, 21646]
[718, 7180]


In [78]:
from sklearn.feature_selection import VarianceThreshold as VT

vt= VT(0.2)

vt.fit(train_X)
print(train_X.shape)

train_X = pd.DataFrame(vt.transform(train_X)) 
val_X =  pd.DataFrame(vt.transform(val_X))

print(train_X.shape)

(7898, 66)
(7898, 30)


In [79]:
# Missing value imputation
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import tools
feature_kind = tools.init_feature_kind(train_X)
cont,cate = tools.get_feature_kind(train_X,feature_kind)  

strategy = 'median'

imp_mean = IterativeImputer(max_iter=100,random_state=0)
imp_mean.fit(train_X[cont])

train_X[cont] = imp_mean.transform(train_X[cont])
val_X[cont] = imp_mean.transform(val_X[cont])

print("filled continuous missing value with "+strategy)

strategy = 'constant'

imp = SimpleImputer(missing_values=np.nan, strategy=strategy,fill_value=10.0)
imp.fit(train_X[cate])

train_X[cate] = imp.transform(train_X[cate])
val_X[cate] = imp.transform(val_X[cate])


print("filled nominal missing value with ",strategy)


filled continuous missing value with median
filled nominal missing value with  constant


In [80]:
from sklearn import preprocessing


cont,cate = tools.get_feature_kind(train_X,feature_kind)

scaler = preprocessing.StandardScaler().fit(train_X[cont])
train_X[cont] = scaler.transform(train_X[cont])
val_X[cont] = scaler.transform(val_X[cont])

In [81]:
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.model_selection import GridSearchCV
clf = RFC(random_state=42,n_jobs=5)

parameters = {
    'n_estimators':[100,10,200,500],
    'criterion':("gini","entropy"),
    'bootstrap':[True,False]
    }

GS = GridSearchCV(clf, parameters,n_jobs=5,scoring='f1')
GS.fit(train_X, train_y['outcome'])


sorted(GS.cv_results_.keys())

['mean_fit_time',
 'mean_score_time',
 'mean_test_score',
 'param_bootstrap',
 'param_criterion',
 'param_n_estimators',
 'params',
 'rank_test_score',
 'split0_test_score',
 'split1_test_score',
 'split2_test_score',
 'split3_test_score',
 'split4_test_score',
 'std_fit_time',
 'std_score_time',
 'std_test_score']

In [82]:
print(GS.best_params_)
print(pd.DataFrame(GS.cv_results_))

{'bootstrap': False, 'criterion': 'gini', 'n_estimators': 500}
    mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
0        0.658384      0.091159         0.059243        0.007920   
1        0.078238      0.004885         0.016203        0.002135   
2        1.334204      0.189339         0.102243        0.009358   
3        3.477599      0.573176         0.211886        0.012749   
4        0.660148      0.036896         0.051219        0.002711   
5        0.091420      0.008959         0.014004        0.001266   
6        1.608505      0.260770         0.091832        0.007031   
7        4.277824      0.779033         0.250274        0.048961   
8        0.827420      0.017800         0.066626        0.035081   
9        0.135830      0.045750         0.032207        0.021513   
10       2.154429      0.452014         0.158408        0.066380   
11       5.246040      1.129059         0.327818        0.108778   
12       1.002575      0.028427         0.061026     

In [84]:
from sklearn.feature_selection import RFE

best_parameters = {'bootstrap': False, 'criterion': 'gini', 'n_estimators': 500}


clf = RFC(**best_parameters,class_weight={1:1,0:0.1},n_jobs=5) 

clf.fit(train_X, train_y['outcome'])
result = clf.predict(val_X)


print(pd.DataFrame({
    'train':tools.get_performance(train_y,clf.predict(train_X)),
    'val':tools.get_performance(val_y,result)
    }
    ))




                                   train                      val
confusion matrix   [[7180, 0], [0, 718]]  [[5419, 11], [58, 103]]
acc                                  1.0                 0.987659
precision                            1.0                 0.903509
f1_score                             1.0                 0.749091
recall                               1.0                 0.639752
matthews_corrcoef                    1.0                 0.754611
