# Import libraries required

In [1]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell executes, so edits to imported project code (e.g. the
# model_creation helpers imported below) are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
import pickle

from model_creation import create_smote_data, fit_val_data, model_data, perform_base_models
from model_creation import perform_feat_sel_models, perform_grid_search

# Read in the training data and target

In [3]:
# Training features and training target live in two separate CSV files; both are
# read with the first column as the index and the first row as the header.
# NOTE(review): these are absolute, machine-specific Windows paths - consider a
# configurable data directory so the notebook runs on other machines.
train_data_path = 'C:\\DePaulCoursework\\Winter 2019 CSC 529\\Project\\Data\\CervicalCancerRisks_Combo_target_TrainData.csv'
train_target_path = 'C:\\DePaulCoursework\\Winter 2019 CSC 529\\Project\\Data\\CervicalCancerRisks_Combo_target_TrainTarget.csv'

train_data = pd.read_csv(train_data_path, delimiter = ",", index_col = 0, header = 0)
train_target = pd.read_csv(train_target_path, delimiter = ",", index_col = 0, header = 0)

# Read in the validation data and target

In [4]:
# Validation features and validation target, read the same way as the training
# files: first column as the index, first row as the header.
# NOTE(review): these are absolute, machine-specific Windows paths - consider a
# configurable data directory so the notebook runs on other machines.
val_data_path = 'C:\\DePaulCoursework\\Winter 2019 CSC 529\\Project\\Data\\CervicalCancerRisks_Combo_target_ValData.csv'
val_target_path = 'C:\\DePaulCoursework\\Winter 2019 CSC 529\\Project\\Data\\CervicalCancerRisks_Combo_target_ValTarget.csv'

val_data = pd.read_csv(val_data_path, delimiter = ",", index_col = 0, header = 0)
val_target = pd.read_csv(val_target_path, delimiter = ",", index_col = 0, header = 0)

# Declare random state variable used for all modeling

In [5]:
# Single seed shared by the SMOTE calls and the random forest estimators below,
# so that repeated runs of the notebook use the same random state.
rand_st = 1


# Perform SMOTE oversampling with a sampling strategy of 1.0

In [6]:
# Oversample the training set with SMOTE at a sampling strategy of 1.0; the
# printed counts below show the minority class raised to match the majority class.
train_data_sm1p0, train_target_sm1p0 = create_smote_data(
    train_data,
    train_target,
    rand_st,
    1.0,
)

Number of Original Target 0 Value: 506
Number of Original Target 1 Value: 74


Number of SMOTE Target 0 Value: 506
Number of SMOTE Target 1 Value: 506


In [7]:
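# Optional export of the SMOTE 1.0 training data and target to CSV (disabled).
# NOTE: data_save_path and target_name are not defined in the cells above; set them before uncommenting.
#train_sm_data_name = data_save_path + 'CervicalCancerRisks_' + target_name + '_TrainData_SMOTE.csv'
#train_data_sm1p0.to_csv(train_sm_data_name, sep = ',')
#train_sm_target_name = data_save_path + 'CervicalCancerRisks_' + target_name + '_TrainTarget_SMOTE.csv'
#train_target_sm1p0.to_csv(train_sm_target_name, sep = ',')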


# Declare base model and variables used during feature selection

In [8]:
# Display name -> scoring string for the two metrics reported for each model.
scorers = dict(Accuracy = 'accuracy', roc_auc = 'roc_auc')

# Candidate thresholds passed to the feature-selection helper (the low variance
# filter output below reports which threshold was selected).
thresh_vals = [0.4, 0.5, 0.6]

# Candidate k values 5..10 passed to the feature-selection helper
# (presumably the number of features to keep - confirm in model_creation).
k_vals = list(range(5, 11))

In [9]:
# Label used in the printed result headers for the base models.
model_type = 'Random Forest'

# Untuned baseline forest: 100 entropy-split trees with unlimited depth,
# seeded with the shared random state.
clf_base = RandomForestClassifier(
    n_estimators = 100,
    criterion = 'entropy',
    max_depth = None,
    min_samples_split = 3,
    random_state = rand_st,
)


# Create base models with original and SMOTE 1.0 oversampled data

In [10]:
# Fit and score the baseline classifier twice - on the original training data and
# on the SMOTE 1.0 data - reporting CV and validation metrics for each.
# The call is left as the cell's final expression so its return value is displayed.
perform_base_models(
    train_data, train_target,
    val_data, val_target,
    train_data_sm1p0, train_target_sm1p0,
    clf_base, scorers, model_type,
)

Base Random Forest CV Acc: 0.8724401901613297 +/- 0.04253575457525603
Base Random Forest CV AUC: 0.6501187009457248 +/- 0.13926408973075674
CV Runtime: 2.4276037216186523


Base Random Forest Validation Acc: 0.8493150684931506
Base Random Forest Validation AUC: 0.4881889763779528


SMOTE Base Random Forest CV Acc: 0.913395457192778 +/- 0.1790345994627237
SMOTE Base Random Forest CV AUC: 0.9817983352002677 +/- 0.04451299147139268
CV Runtime: 3.623898983001709


SMOTE Base Random Forest Validation Acc: 0.8424657534246576
SMOTE Base Random Forest Validation AUC: 0.5290095317032738




(RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
             max_depth=None, max_features='auto', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=3,
             min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
             oob_score=False, random_state=1, verbose=0, warm_start=False),
 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
             max_depth=None, max_features='auto', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=3,
             min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
             oob_score=False, random_state=1, verbose=0, warm_start=False))

# Perform all types of feature selection on the original data

In [11]:
# Run the feature-selection comparison on the original (non-oversampled)
# training data and keep the per-method results for later use.
results_dict_base = perform_feat_sel_models(
    train_data, train_target,
    val_data, val_target,
    clf_base, model_type,
    thresh_vals, k_vals,
)

-- Low Variance Filtering --
Threshold Selected: 0.4
Selected Model Mean Accuracy Score: 0.8758434885121542
Selected Model Accuracy Deviation: 0.018055859134615666
Selected Model Mean AUC Score: 0.5489963113958455
Selected Model AUC Deviation: 0.11577955046957523
Number of Original Features: 30
Number of Selected Features: 8
Features Selected:
['age', 'num_sex_partners', 'first_sex_int', 'num_pregnancies', 'smokes_yrs', 'smokes_pk_yrs', 'hormonal_contr_yrs', 'iud_yrs']
Features Removed:
['smokes', 'hormonal_contr', 'iud', 'stds', 'stds_num', 'stds_condylomatosis', 'stds_cerv_condylomatosis', 'stds_vag_condylomatosis', 'stds_vp_condylomatosis', 'stds_syphillus', 'stds_pelvic_inf_disease', 'stds_gen_herpes', 'stds_molluscom_contagiosum', 'stds_aids', 'stds_hiv', 'stds_hep_b', 'stds_hpv', 'stds_num_dx', 'dx_cancer', 'dx_cin', 'dx_hpv', 'dx']


Low Variance Filter Random Forest Validation Acc: 0.8561643835616438
Low Variance Filter Random Forest Validation AUC: 0.4921259842519685


-- Mode

# Perform feature selection on the SMOTE 1.0 oversampled data

In [12]:
# Relabel the printed output so these runs are identifiable as SMOTE-based.
model_type = 'SMOTE Random Forest'

# Same feature-selection comparison, now trained on the SMOTE 1.0 data; the
# validation set is the original, un-resampled one.
results_dict_1p0 = perform_feat_sel_models(
    train_data_sm1p0, train_target_sm1p0,
    val_data, val_target,
    clf_base, model_type,
    thresh_vals, k_vals,
)

-- Low Variance Filtering --
Threshold Selected: 0.4
Selected Model Mean Accuracy Score: 0.9034750533876916
Selected Model Accuracy Deviation: 0.15954871079646563
Selected Model Mean AUC Score: 0.9740464858949494
Selected Model AUC Deviation: 0.06721053945764088
Number of Original Features: 30
Number of Selected Features: 8
Features Selected:
['age', 'num_sex_partners', 'first_sex_int', 'num_pregnancies', 'smokes_yrs', 'smokes_pk_yrs', 'hormonal_contr_yrs', 'iud_yrs']
Features Removed:
['smokes', 'hormonal_contr', 'iud', 'stds', 'stds_num', 'stds_condylomatosis', 'stds_cerv_condylomatosis', 'stds_vag_condylomatosis', 'stds_vp_condylomatosis', 'stds_syphillus', 'stds_pelvic_inf_disease', 'stds_gen_herpes', 'stds_molluscom_contagiosum', 'stds_aids', 'stds_hiv', 'stds_hep_b', 'stds_hpv', 'stds_num_dx', 'dx_cancer', 'dx_cin', 'dx_hpv', 'dx']


Low Variance Filter SMOTE Random Forest Validation Acc: 0.8356164383561644
Low Variance Filter SMOTE Random Forest Validation AUC: 0.547451305428926

# Perform SMOTE oversampling with a sampling strategy of 0.3

In [13]:
# Oversample the training set with SMOTE at a sampling strategy of 0.3; the
# printed counts below show the resulting class sizes.
train_data_sm0p3, train_target_sm0p3 = create_smote_data(
    train_data,
    train_target,
    rand_st,
    0.3,
)

Number of Original Target 0 Value: 506
Number of Original Target 1 Value: 74


Number of SMOTE Target 0 Value: 506
Number of SMOTE Target 1 Value: 151


# Create base models with original and SMOTE 0.3 oversampled data

In [14]:
# Fit and score the baseline classifier on the original training data and on the
# SMOTE 0.3 data, reporting CV and validation metrics for each.
#
# The label is passed as a literal rather than via `model_type`: by this point
# `model_type` holds 'SMOTE Random Forest' (set for the feature-selection runs),
# which made this cell print misleading headers such as
# "Base SMOTE Random Forest" and "SMOTE Base SMOTE Random Forest".
# `model_type` itself is deliberately not reassigned, because the following
# feature-selection cell still relies on its current value.
# The call is left as the cell's final expression so its return value is displayed.
perform_base_models(
    train_data, train_target,
    val_data, val_target,
    train_data_sm0p3, train_target_sm0p3,
    clf_base, scorers, 'Random Forest',
)

Base SMOTE Random Forest CV Acc: 0.8724401901613297 +/- 0.04253575457525603
Base SMOTE Random Forest CV AUC: 0.6501187009457248 +/- 0.13926408973075674
CV Runtime: 2.264678716659546


Base SMOTE Random Forest Validation Acc: 0.8493150684931506
Base SMOTE Random Forest Validation AUC: 0.4881889763779528


SMOTE Base SMOTE Random Forest CV Acc: 0.8647420076909832 +/- 0.12010934588010432
SMOTE Base SMOTE Random Forest CV AUC: 0.8717987738052742 +/- 0.1806475854506773
CV Runtime: 2.41658353805542


SMOTE Base SMOTE Random Forest Validation Acc: 0.8356164383561644
SMOTE Base SMOTE Random Forest Validation AUC: 0.48031496062992124




(RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
             max_depth=None, max_features='auto', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=3,
             min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
             oob_score=False, random_state=1, verbose=0, warm_start=False),
 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
             max_depth=None, max_features='auto', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=3,
             min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
             oob_score=False, random_state=1, verbose=0, warm_start=False))

# Perform feature selection on the SMOTE 0.3 oversampled data

In [15]:
# Feature-selection comparison trained on the SMOTE 0.3 data and scored against
# the original validation set; results are kept per selection method.
results_dict_0p3 = perform_feat_sel_models(
    train_data_sm0p3, train_target_sm0p3,
    val_data, val_target,
    clf_base, model_type,
    thresh_vals, k_vals,
)

-- Low Variance Filtering --
Threshold Selected: 0.4
Selected Model Mean Accuracy Score: 0.8571772943809906
Selected Model Accuracy Deviation: 0.12230094964378609
Selected Model Mean AUC Score: 0.8313748035145071
Selected Model AUC Deviation: 0.22177216283535356
Number of Original Features: 30
Number of Selected Features: 8
Features Selected:
['age', 'num_sex_partners', 'first_sex_int', 'num_pregnancies', 'smokes_yrs', 'smokes_pk_yrs', 'hormonal_contr_yrs', 'iud_yrs']
Features Removed:
['smokes', 'hormonal_contr', 'iud', 'stds', 'stds_num', 'stds_condylomatosis', 'stds_cerv_condylomatosis', 'stds_vag_condylomatosis', 'stds_vp_condylomatosis', 'stds_syphillus', 'stds_pelvic_inf_disease', 'stds_gen_herpes', 'stds_molluscom_contagiosum', 'stds_aids', 'stds_hiv', 'stds_hep_b', 'stds_hpv', 'stds_num_dx', 'dx_cancer', 'dx_cin', 'dx_hpv', 'dx']


Low Variance Filter SMOTE Random Forest Validation Acc: 0.8561643835616438
Low Variance Filter SMOTE Random Forest Validation AUC: 0.514504765851637

# Declare parameter values used during grid search

In [16]:
# Grid-search setup: number of CV folds, the metric to optimise, and an
# otherwise-default forest seeded with the shared random state.
cv = 5
score = 'roc_auc'
clf_grid = RandomForestClassifier(random_state = rand_st)

# Candidate values per hyperparameter.
# 2 * 5 * 4 * 6 * 3 = 720 combinations, i.e. 3600 fits with 5 folds.
criterion = ['entropy', 'gini']
max_depth = [None, 3, 5, 10, 20]
n_estimators = [50, 100, 250, 500]
min_samples_split = [3, 5, 7, 10, 15, 20]
class_weight = [None, 'balanced', 'balanced_subsample']

# Single-value alternatives for a quick run of the pipeline (kept disabled):
#n_estimators = [100]
#max_depth = [None]
#min_samples_split = [3]
#criterion = ['entropy']
#class_weight  = [None]

# Parameter grid keyed by RandomForestClassifier argument name.
model_params = dict(
    criterion = criterion,
    max_depth = max_depth,
    n_estimators = n_estimators,
    min_samples_split = min_samples_split,
    class_weight = class_weight,
)

# Perform grid search on the feature selection dataset from low variance filter with SMOTE 1.0 oversampling

In [17]:
# Low variance filter results from the SMOTE 1.0 feature-selection run: the
# reduced training frame and the column labels that were removed. The same
# columns are dropped from the validation frame so both share one feature set.
lvf_results_1p0 = results_dict_1p0['lvf']
lvf_train_data_1p0 = lvf_results_1p0['data']
lvf_del_cols_1p0 = lvf_results_1p0['del_cols']
lvf_val_data_1p0 = val_data.drop(columns = lvf_del_cols_1p0)

# Labels used in the printed result headers.
model_type = 'SMOTE 1.0 Random Forest'
model_desc = 'Low Variance Filter FS'

# NOTE(review): the training data was oversampled before being handed to the
# cross-validated search, so CV scores here may be optimistic relative to the
# validation scores - confirm how the helpers split the data.
opt_model = perform_grid_search(
    clf_grid, lvf_train_data_1p0, train_target_sm1p0,
    model_params, score, cv,
    model_type, model_desc,
)

# Evaluate the tuned estimator with CV and on the validation set.
lvf_model_sm_1p0 = model_data(
    lvf_train_data_1p0, train_target_sm1p0,
    lvf_val_data_1p0, val_target,
    opt_model, scorers, model_type, model_desc, cv,
)


Low Variance Filter FS SMOTE 1.0 Random Forest Grid Search
Fitting 5 folds for each of 720 candidates, totalling 3600 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 3600 out of 3600 | elapsed: 66.4min finished


Grid Search Runtime: 3988.276356935501


Grid Search Optimal Parameters: {'class_weight': 'balanced_subsample', 'criterion': 'entropy', 'max_depth': 20, 'min_samples_split': 3, 'n_estimators': 250}
Grid Search Optimal Parameter Score: 0.9773450490829134


Final Model Parameter Settings:
RandomForestClassifier(bootstrap=True, class_weight='balanced_subsample',
            criterion='entropy', max_depth=20, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=1,
            min_samples_split=3, min_weight_fraction_leaf=0.0,
            n_estimators=250, n_jobs=None, oob_score=False, random_state=1,
            verbose=0, warm_start=False)


Low Variance Filter FS SMOTE 1.0 Random Forest CV Acc: 0.9104154533100368 +/- 0.17119560812381138
Low Variance Filter FS SMOTE 1.0 Random Forest CV AUC: 0.9774593040516152 +/- 0.05863594091149113
CV Runtime: 9.667466163635254


Low Variance Filter FS SMOTE 1.0 Random For

In [18]:
# Persist the tuned low variance filter / SMOTE 1.0 model to disk with pickle.
# The file is opened in a `with` block so the handle is flushed and closed
# deterministically; the bare open() call left that to garbage collection.
# NOTE: only unpickle this file from a trusted location - pickle.load can
# execute arbitrary code.
filename = 'Combo_LVF_FS_SMOTE_1p0_RF_Model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(lvf_model_sm_1p0, model_file)

# Perform grid search on the feature selection dataset from mutual information with SMOTE 1.0 oversampling

In [19]:
# Mutual information results from the SMOTE 1.0 feature-selection run: the
# reduced training frame and the column labels that were removed. The same
# columns are dropped from the validation frame so both share one feature set.
mutinf_results_1p0 = results_dict_1p0['mutinf']
mutinf_train_data_1p0 = mutinf_results_1p0['data']
mutinf_del_cols_1p0 = mutinf_results_1p0['del_cols']
mutinf_val_data_1p0 = val_data.drop(columns = mutinf_del_cols_1p0)

# Labels used in the printed result headers.
model_type = 'SMOTE 1.0 Random Forest'
model_desc = 'Mutual Information FS'

# Search the parameter grid on the reduced, oversampled training data.
opt_model = perform_grid_search(
    clf_grid, mutinf_train_data_1p0, train_target_sm1p0,
    model_params, score, cv,
    model_type, model_desc,
)

# Evaluate the tuned estimator with CV and on the validation set.
mutinf_model_sm_1p0 = model_data(
    mutinf_train_data_1p0, train_target_sm1p0,
    mutinf_val_data_1p0, val_target,
    opt_model, scorers, model_type, model_desc, cv,
)


Mutual Information FS SMOTE 1.0 Random Forest Grid Search
Fitting 5 folds for each of 720 candidates, totalling 3600 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 3600 out of 3600 | elapsed: 64.0min finished


Grid Search Runtime: 3844.6389865875244


Grid Search Optimal Parameters: {'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'min_samples_split': 3, 'n_estimators': 500}
Grid Search Optimal Parameter Score: 0.9798125272885345


Final Model Parameter Settings:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=3,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=None,
            oob_score=False, random_state=1, verbose=0, warm_start=False)


Mutual Information FS SMOTE 1.0 Random Forest CV Acc: 0.9124053581828772 +/- 0.1797892660097043
Mutual Information FS SMOTE 1.0 Random Forest CV AUC: 0.9799239599693979 +/- 0.05690785167522335
CV Runtime: 12.769638061523438


Mutual Information FS SMOTE 1.0 Random Forest Validation Acc: 0.863013698630137
Mutual Inf

In [20]:
# Persist the tuned mutual information / SMOTE 1.0 model to disk with pickle.
# The file is opened in a `with` block so the handle is flushed and closed
# deterministically; the bare open() call left that to garbage collection.
# NOTE: only unpickle this file from a trusted location - pickle.load can
# execute arbitrary code.
filename = 'Combo_MutInf_FS_SMOTE_1p0_RF_Model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(mutinf_model_sm_1p0, model_file)