# Import required libraries

In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. the local model_creation module) are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
import pickle

from model_creation import create_smote_data, fit_val_data, model_data, perform_base_models
from model_creation import perform_feat_sel_models, perform_grid_search

# Read in the training data and target

In [3]:
# Load the training features and training target. Both files are comma-delimited,
# use their first column as the row index and their first row as the header.
# NOTE(review): these are absolute, machine-specific Windows paths; consider a
# configurable data directory so the notebook runs elsewhere.
train_data_file = 'C:\\DePaulCoursework\\Winter 2019 CSC 529\\Project\\Data\\CervicalCancerRisks_Biopsy_TrainData.csv'
train_target_file = 'C:\\DePaulCoursework\\Winter 2019 CSC 529\\Project\\Data\\CervicalCancerRisks_Biopsy_TrainTarget.csv'
train_read_opts = dict(delimiter=",", index_col=0, header=0)

train_data = pd.read_csv(train_data_file, **train_read_opts)
train_target = pd.read_csv(train_target_file, **train_read_opts)

# Read in the validation data and target

In [4]:
# Load the validation features and validation target with the same CSV layout as
# the training files (comma-delimited, first column = index, first row = header).
# NOTE(review): these are absolute, machine-specific Windows paths; consider a
# configurable data directory so the notebook runs elsewhere.
val_data_file = 'C:\\DePaulCoursework\\Winter 2019 CSC 529\\Project\\Data\\CervicalCancerRisks_Biopsy_ValData.csv'
val_target_file = 'C:\\DePaulCoursework\\Winter 2019 CSC 529\\Project\\Data\\CervicalCancerRisks_Biopsy_ValTarget.csv'
val_read_opts = dict(delimiter=",", index_col=0, header=0)

val_data = pd.read_csv(val_data_file, **val_read_opts)
val_target = pd.read_csv(val_target_file, **val_read_opts)

# Declare random state variable used for all modeling

In [5]:
# Single seed shared across the notebook: it is passed to create_smote_data and
# used as random_state for every RandomForestClassifier built below.
rand_st = 1


# Perform SMOTE oversampling with a sampling strategy of 1.0

In [6]:
# Oversample the training split with SMOTE at a sampling strategy of 1.0
# (the recorded output shows the minority class raised from 40 to 540 rows).
smote_1p0_result = create_smote_data(train_data, train_target, rand_st, 1.0)
train_data_sm1p0, train_target_sm1p0 = smote_1p0_result

Number of Original Target 0 Value: 540
Number of Original Target 1 Value: 40


Number of SMOTE Target 0 Value: 540
Number of SMOTE Target 1 Value: 540


In [7]:
# Disabled export of the SMOTE 1.0 training data and target to CSV.
# NOTE(review): `data_save_path` and `target_name` are not defined anywhere in the
# visible notebook, and `data_train_sm1p0` / `target_train_sm1p0` do not match the
# `train_data_sm1p0` / `train_target_sm1p0` names created by the SMOTE cell, so these
# lines would raise NameError if re-enabled as written.
#train_sm_data_name = data_save_path + 'CervicalCancerRisks_' + target_name + '_TrainData_SMOTE.csv'
#data_train_sm1p0.to_csv(train_sm_data_name, sep = ',')
#train_sm_target_name = data_save_path + 'CervicalCancerRisks_' + target_name + '_TrainTarget_SMOTE.csv'
#target_train_sm1p0.to_csv(train_sm_target_name, sep = ',')


# Declare base model and variables used during feature selection

In [8]:
# Metrics reported for every model: display label -> scoring name.
scorers = dict(Accuracy='accuracy', roc_auc='roc_auc')

# Candidate settings handed to the feature-selection routines:
# thresholds for the low-variance filter and feature counts (5 through 10).
thresh_vals = [0.4, 0.5, 0.6]
k_vals = list(range(5, 11))

In [9]:
# Label used in the printed reports for the baseline model.
model_type = 'Random Forest'

# Baseline random forest reused by the base-model and feature-selection cells;
# seeded with the notebook-wide rand_st.
base_rf_settings = {
    'n_estimators': 100,
    'max_depth': None,
    'min_samples_split': 3,
    'criterion': 'entropy',
    'random_state': rand_st,
}
clf_base = RandomForestClassifier(**base_rf_settings)


# Create base models with original and SMOTE 1.0 oversampled data

In [10]:
# Fit and score the baseline classifier on the original training data and on the
# SMOTE 1.0 oversampled data. The call is left as the cell's final expression, so
# the returned pair of classifiers is still displayed.
perform_base_models(
    train_data, train_target,
    val_data, val_target,
    train_data_sm1p0, train_target_sm1p0,
    clf_base, scorers, model_type,
)

Base Random Forest CV Acc: 0.9275862068965518 +/- 0.013793103448275877
Base Random Forest CV AUC: 0.7184027777777777 +/- 0.09777619948221135
CV Runtime: 2.2957048416137695


Base Random Forest Validation Acc: 0.9178082191780822
Base Random Forest Validation AUC: 0.49264705882352944


SMOTE Base Random Forest CV Acc: 0.9657407407407408 +/- 0.06404302394737252
SMOTE Base Random Forest CV AUC: 0.9944530178326474 +/- 0.02004938478079519
CV Runtime: 3.1651611328125


SMOTE Base Random Forest Validation Acc: 0.8972602739726028
SMOTE Base Random Forest Validation AUC: 0.5279411764705882




(RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
             max_depth=None, max_features='auto', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=3,
             min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
             oob_score=False, random_state=1, verbose=0, warm_start=False),
 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
             max_depth=None, max_features='auto', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=3,
             min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
             oob_score=False, random_state=1, verbose=0, warm_start=False))

# Perform all types of feature selection on the original data

In [11]:
# Run every feature-selection method on the original (non-oversampled) training
# data; the result is kept in results_dict_base.
results_dict_base = perform_feat_sel_models(
    train_data, train_target,
    val_data, val_target,
    clf_base, model_type, thresh_vals, k_vals,
)

-- Low Variance Filtering --
Threshold Selected: 0.4
Selected Model Mean Accuracy Score: 0.9310344827586207
Selected Model Accuracy Deviation: 0.0
Selected Model Mean AUC Score: 0.5826388888888889
Selected Model AUC Deviation: 0.16837218921635758
Number of Original Features: 30
Number of Selected Features: 8
Features Selected:
['age', 'num_sex_partners', 'first_sex_int', 'num_pregnancies', 'smokes_yrs', 'smokes_pk_yrs', 'hormonal_contr_yrs', 'iud_yrs']
Features Removed:
['smokes', 'hormonal_contr', 'iud', 'stds', 'stds_num', 'stds_condylomatosis', 'stds_cerv_condylomatosis', 'stds_vag_condylomatosis', 'stds_vp_condylomatosis', 'stds_syphillus', 'stds_pelvic_inf_disease', 'stds_gen_herpes', 'stds_molluscom_contagiosum', 'stds_aids', 'stds_hiv', 'stds_hep_b', 'stds_hpv', 'stds_num_dx', 'dx_cancer', 'dx_cin', 'dx_hpv', 'dx']


Low Variance Filter Random Forest Validation Acc: 0.9315068493150684
Low Variance Filter Random Forest Validation AUC: 0.5


-- Model Wrapper Feature Selection --
M

# Perform feature selection on the SMOTE 1.0 oversampled data

In [12]:
# Relabel the reports, then run every feature-selection method on the
# SMOTE 1.0 oversampled training data.
# Note: model_type keeps this value for later cells until it is reassigned.
model_type = 'SMOTE Random Forest'
results_dict_1p0 = perform_feat_sel_models(
    train_data_sm1p0, train_target_sm1p0,
    val_data, val_target,
    clf_base, model_type, thresh_vals, k_vals,
)

-- Low Variance Filtering --
Threshold Selected: 0.4
Selected Model Mean Accuracy Score: 0.9638888888888889
Selected Model Accuracy Deviation: 0.07508568493597523
Selected Model Mean AUC Score: 0.9933384773662551
Selected Model AUC Deviation: 0.022459371114342066
Number of Original Features: 30
Number of Selected Features: 8
Features Selected:
['age', 'num_sex_partners', 'first_sex_int', 'num_pregnancies', 'smokes_yrs', 'smokes_pk_yrs', 'hormonal_contr_yrs', 'iud_yrs']
Features Removed:
['smokes', 'hormonal_contr', 'iud', 'stds', 'stds_num', 'stds_condylomatosis', 'stds_cerv_condylomatosis', 'stds_vag_condylomatosis', 'stds_vp_condylomatosis', 'stds_syphillus', 'stds_pelvic_inf_disease', 'stds_gen_herpes', 'stds_molluscom_contagiosum', 'stds_aids', 'stds_hiv', 'stds_hep_b', 'stds_hpv', 'stds_num_dx', 'dx_cancer', 'dx_cin', 'dx_hpv', 'dx']


Low Variance Filter SMOTE Random Forest Validation Acc: 0.8767123287671232
Low Variance Filter SMOTE Random Forest Validation AUC: 0.47058823529411

# Perform SMOTE oversampling with a sampling strategy of 0.3

In [13]:
# Oversample the training split with SMOTE at a sampling strategy of 0.3
# (the recorded output shows the minority class raised from 40 to 162 rows).
smote_0p3_result = create_smote_data(train_data, train_target, rand_st, 0.3)
train_data_sm0p3, train_target_sm0p3 = smote_0p3_result

Number of Original Target 0 Value: 540
Number of Original Target 1 Value: 40


Number of SMOTE Target 0 Value: 540
Number of SMOTE Target 1 Value: 162


# Create base models with original and SMOTE 0.3 oversampled data

In [14]:
# Fit and score the baseline classifier on the original training data and on the
# SMOTE 0.3 oversampled data.
#
# The report label is passed explicitly as 'Random Forest'. The notebook-level
# `model_type` still holds 'SMOTE Random Forest' from the SMOTE 1.0 feature-selection
# cell at this point, and perform_base_models adds its own "Base" / "SMOTE Base"
# prefixes, so reusing it mislabels the report as "Base SMOTE Random Forest" and
# "SMOTE Base SMOTE Random Forest". `model_type` itself is left untouched so later
# cells that read it behave as before.
# The call remains the cell's final expression, so the returned pair of
# classifiers is still displayed.
perform_base_models(train_data, train_target, val_data, val_target, train_data_sm0p3, train_target_sm0p3, clf_base, 
                    scorers, 'Random Forest')

Base SMOTE Random Forest CV Acc: 0.9275862068965518 +/- 0.013793103448275877
Base SMOTE Random Forest CV AUC: 0.7184027777777777 +/- 0.09777619948221135
CV Runtime: 2.544536590576172


Base SMOTE Random Forest Validation Acc: 0.9178082191780822
Base SMOTE Random Forest Validation AUC: 0.49264705882352944


SMOTE Base SMOTE Random Forest CV Acc: 0.9389057750759878 +/- 0.10393550738027854
SMOTE Base SMOTE Random Forest CV AUC: 0.9743914842873176 +/- 0.05882664778010867
CV Runtime: 2.5355653762817383


SMOTE Base SMOTE Random Forest Validation Acc: 0.8904109589041096
SMOTE Base SMOTE Random Forest Validation AUC: 0.47794117647058826




(RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
             max_depth=None, max_features='auto', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=3,
             min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
             oob_score=False, random_state=1, verbose=0, warm_start=False),
 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
             max_depth=None, max_features='auto', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=3,
             min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
             oob_score=False, random_state=1, verbose=0, warm_start=False))

# Perform feature selection on the SMOTE 0.3 oversampled data

In [15]:
# Run every feature-selection method on the SMOTE 0.3 oversampled training data.
# model_type is not reassigned here: it still carries 'SMOTE Random Forest' from
# the SMOTE 1.0 feature-selection cell, which is the label seen in the report.
results_dict_0p3 = perform_feat_sel_models(
    train_data_sm0p3, train_target_sm0p3,
    val_data, val_target,
    clf_base, model_type, thresh_vals, k_vals,
)

-- Low Variance Filtering --
Threshold Selected: 0.4
Selected Model Mean Accuracy Score: 0.9303444782168186
Selected Model Accuracy Deviation: 0.09827873697636468
Selected Model Mean AUC Score: 0.964393062570146
Selected Model AUC Deviation: 0.0792369590436736
Number of Original Features: 30
Number of Selected Features: 8
Features Selected:
['age', 'num_sex_partners', 'first_sex_int', 'num_pregnancies', 'smokes_yrs', 'smokes_pk_yrs', 'hormonal_contr_yrs', 'iud_yrs']
Features Removed:
['smokes', 'hormonal_contr', 'iud', 'stds', 'stds_num', 'stds_condylomatosis', 'stds_cerv_condylomatosis', 'stds_vag_condylomatosis', 'stds_vp_condylomatosis', 'stds_syphillus', 'stds_pelvic_inf_disease', 'stds_gen_herpes', 'stds_molluscom_contagiosum', 'stds_aids', 'stds_hiv', 'stds_hep_b', 'stds_hpv', 'stds_num_dx', 'dx_cancer', 'dx_cin', 'dx_hpv', 'dx']


Low Variance Filter SMOTE Random Forest Validation Acc: 0.910958904109589
Low Variance Filter SMOTE Random Forest Validation AUC: 0.4889705882352941



# Declare parameter values used during grid search

In [16]:
# Grid-search configuration shared by the tuning cells below.
cv = 5               # number of cross-validation folds
score = 'roc_auc'    # metric the grid search optimises

# Unconfigured estimator for the search; only the seed is fixed.
clf_grid = RandomForestClassifier(random_state = rand_st)

# Candidate values per hyper-parameter (4 * 5 * 6 * 2 * 3 = 720 combinations).
n_estimators = [50, 100, 250, 500]
max_depth = [None, 3, 5, 10, 20]
min_samples_split = [3, 5, 7, 10, 15, 20]
criterion = ['entropy', 'gini']
class_weight = [None, 'balanced', 'balanced_subsample']

# Disabled single-value alternative (same settings as clf_base plus class_weight None):
#n_estimators = [100]
#max_depth = [None]
#min_samples_split = [3]
#criterion = ['entropy']
#class_weight  = [None]

# Parameter grid keyed by RandomForestClassifier argument name.
model_params = dict(
    criterion=criterion,
    max_depth=max_depth,
    n_estimators=n_estimators,
    min_samples_split=min_samples_split,
    class_weight=class_weight,
)

# Perform grid search on the chi-squared feature-selected dataset with SMOTE 1.0 oversampling

In [17]:
# Take the chi-squared feature-selection result for the SMOTE 1.0 data and drop
# the same removed columns from the validation set so both use one feature set.
chisq_result_1p0 = results_dict_1p0['chisq']
chisq_train_data_1p0 = chisq_result_1p0['data']
chisq_del_cols_1p0 = chisq_result_1p0['del_cols']
chisq_val_data_1p0 = val_data.drop(chisq_del_cols_1p0, axis = 1)

# Labels used in the printed reports.
model_type = 'SMOTE 1.0 Random Forest'
model_desc = 'Chi-Squared FS'

# Search the parameter grid, then pass the selected model to model_data together
# with the training and validation sets (per the recorded output it reports CV and
# validation metrics). The model_data result is what the next cell pickles.
opt_model = perform_grid_search(
    clf_grid, chisq_train_data_1p0, train_target_sm1p0,
    model_params, score, cv, model_type, model_desc,
)
chisq_model_sm_1p0 = model_data(
    chisq_train_data_1p0, train_target_sm1p0,
    chisq_val_data_1p0, val_target,
    opt_model, scorers, model_type, model_desc, cv,
)


Chi-Squared FS SMOTE 1.0 Random Forest Grid Search
Fitting 5 folds for each of 720 candidates, totalling 3600 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 3600 out of 3600 | elapsed: 60.5min finished


Grid Search Runtime: 3632.343730211258


Grid Search Optimal Parameters: {'class_weight': None, 'criterion': 'entropy', 'max_depth': 20, 'min_samples_split': 7, 'n_estimators': 250}
Grid Search Optimal Parameter Score: 0.9468707133058986


Final Model Parameter Settings:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=20, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=7,
            min_weight_fraction_leaf=0.0, n_estimators=250, n_jobs=None,
            oob_score=False, random_state=1, verbose=0, warm_start=False)


Chi-Squared FS SMOTE 1.0 Random Forest CV Acc: 0.887962962962963 +/- 0.06180858988251345
Chi-Squared FS SMOTE 1.0 Random Forest CV AUC: 0.9468707133058984 +/- 0.04860222240146897
CV Runtime: 6.415312767028809


Chi-Squared FS SMOTE 1.0 Random Forest Validation Acc: 0.8698630136986302
Chi-Squared FS SMOTE 1.0 Rando

In [18]:
# Persist the tuned chi-squared / SMOTE 1.0 model to disk with pickle.
# The file is opened in a `with` block so the handle is flushed and closed even if
# pickling fails; the previous inline open() call never closed it.
filename = 'Biopsy_ChiSq_FS_SMOTE_1p0_RF_Model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(chisq_model_sm_1p0, model_file)

# Perform grid search on the chi-squared feature-selected dataset with SMOTE 0.3 oversampling

In [19]:
# Take the chi-squared feature-selection result for the SMOTE 0.3 data and drop
# the same removed columns from the validation set so both use one feature set.
chisq_result_0p3 = results_dict_0p3['chisq']
chisq_train_data_0p3 = chisq_result_0p3['data']
chisq_del_cols_0p3 = chisq_result_0p3['del_cols']
chisq_val_data_0p3 = val_data.drop(chisq_del_cols_0p3, axis = 1)

# Labels used in the printed reports.
model_type = 'SMOTE 0.3 Random Forest'
model_desc = 'Chi-Squared FS'

# Search the parameter grid, then pass the selected model to model_data together
# with the training and validation sets (per the recorded output it reports CV and
# validation metrics). The model_data result is what the next cell pickles.
opt_model = perform_grid_search(
    clf_grid, chisq_train_data_0p3, train_target_sm0p3,
    model_params, score, cv, model_type, model_desc,
)
chisq_model_sm_0p3 = model_data(
    chisq_train_data_0p3, train_target_sm0p3,
    chisq_val_data_0p3, val_target,
    opt_model, scorers, model_type, model_desc, cv,
)


Chi-Squared FS SMOTE 0.3 Random Forest Grid Search
Fitting 5 folds for each of 720 candidates, totalling 3600 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 3600 out of 3600 | elapsed: 51.4min finished


Grid Search Runtime: 3087.280085325241


Grid Search Optimal Parameters: {'class_weight': None, 'criterion': 'entropy', 'max_depth': 10, 'min_samples_split': 7, 'n_estimators': 500}
Grid Search Optimal Parameter Score: 0.8986937273974311


Final Model Parameter Settings:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=7,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=None,
            oob_score=False, random_state=1, verbose=0, warm_start=False)


Chi-Squared FS SMOTE 0.3 Random Forest CV Acc: 0.893242147922999 +/- 0.06621346744674762
Chi-Squared FS SMOTE 0.3 Random Forest CV AUC: 0.898814534231201 +/- 0.09972748014359012
CV Runtime: 10.637908458709717


Chi-Squared FS SMOTE 0.3 Random Forest Validation Acc: 0.8904109589041096
Chi-Squared FS SMOTE 0.3 Rando

In [20]:
# Persist the tuned chi-squared / SMOTE 0.3 model to disk with pickle.
# The file is opened in a `with` block so the handle is flushed and closed even if
# pickling fails; the previous inline open() call never closed it.
filename = 'Biopsy_ChiSq_FS_SMOTE_0p3_RF_Model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(chisq_model_sm_0p3, model_file)
