In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression

from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

from baseline_package.base_predictor import BasePredictor

In [2]:
d_path = "dataset/preprocessed.csv"
r_path = "dataset/resampled.csv"

In [3]:
rsf = RandomForestClassifier()

param_grid_rsf = {
    'criterion' : ['entropy','log_loss'],
    'n_estimators' : [50,100,120,150],
    'max_depth' : [None],
    'min_samples_split' : [4,5,6],
    'min_samples_leaf' : [2,3,4],
    'max_leaf_nodes' : [None],
    'bootstrap' : [False],
    'max_features' : ['sqrt','log2'],
    'min_weight_fraction_leaf' : [0.0],
    'class_weight' : [{0:1, 1:5},'balanced'],
    'warm_start' : [False],
    'ccp_alpha' : [0.005,0.01]
}

bp_rsf = BasePredictor(model = rsf, d_path = d_path, param_grid = param_grid_rsf, n_splits = 5)
bp_rsf.run(features = None, select_path = None)

In [4]:
bp_rsf.sens

[64.28571428571429,
 47.61904761904761,
 40.476190476190474,
 61.904761904761905,
 85.71428571428571]

In [5]:
bp_rsf.specs

[74.16107382550335,
 86.24161073825503,
 90.93959731543623,
 82.88590604026845,
 65.1006711409396]

In [6]:
bp_rsf.best_params

{'bootstrap': False,
 'ccp_alpha': 0.01,
 'class_weight': 'balanced',
 'criterion': 'entropy',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'min_samples_leaf': 2,
 'min_samples_split': 5,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'warm_start': False}

In [7]:
bp_rsf.confs

[array([[221,  77],
        [ 15,  27]]),
 array([[257,  41],
        [ 22,  20]]),
 array([[271,  27],
        [ 25,  17]]),
 array([[247,  51],
        [ 16,  26]]),
 array([[194, 104],
        [  6,  36]])]

In [8]:
rsf = RandomForestClassifier()

param_grid_rsf_re = {
    'criterion' : ['entropy','log_loss'],
    'n_estimators' : [50,70,100,120,150],
    'max_depth' : [None],
    'min_samples_split' : [4,5,6],
    'min_samples_leaf' : [2,3,4],
    'max_leaf_nodes' : [None],
    'bootstrap' : [False],
    'max_features' : ['sqrt','log2'],
    'min_weight_fraction_leaf' : [0.0],
    'class_weight' : [{0:1, 1:5},'balanced'],
    'warm_start' : [False],
    'ccp_alpha' : [0.005,0.01]
}

bp_rsf_resampled = BasePredictor(model = rsf, d_path = r_path, param_grid = param_grid_rsf_re, n_splits = 5)
bp_rsf_resampled.run(features = None, select_path = None)

In [9]:
bp_rsf_resampled.sens

[87.0, 82.0, 87.0, 83.0, 83.5]

In [10]:
bp_rsf_resampled.specs

[80.0, 83.5, 82.0, 88.0, 86.5]

In [11]:
bp_rsf_resampled.confs

[array([[160,  40],
        [ 26, 174]]),
 array([[167,  33],
        [ 36, 164]]),
 array([[164,  36],
        [ 26, 174]]),
 array([[176,  24],
        [ 34, 166]]),
 array([[173,  27],
        [ 33, 167]])]

In [12]:
bp_rsf_resampled.best_params

{'bootstrap': False,
 'ccp_alpha': 0.005,
 'class_weight': 'balanced',
 'criterion': 'entropy',
 'max_depth': None,
 'max_features': 'log2',
 'max_leaf_nodes': None,
 'min_samples_leaf': 2,
 'min_samples_split': 6,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 70,
 'warm_start': False}

In [13]:
log_r = LogisticRegression()

param_grid_logr = {
    'penalty' : ['l2'],
    'dual' : [False],
    'tol' : [1e-3,1e-4],
    'C' : [0.3,0.29,0.28,0.277,0.275,0.274,0.273,0.27],
    'fit_intercept' : [True],
    'class_weight' : [{0:1, 1:42},{0:1,1:20},'balanced'],
    'solver' : ['lbfgs','newton-cg'],
    'max_iter' : [5000,1000],
    'warm_start' : [False],
    'n_jobs' : [None        ],
    'multi_class' : ['auto'],
    'l1_ratio' : [None],
}

bp_logr = BasePredictor(model = log_r, d_path = d_path, param_grid = param_grid_logr, n_splits = 5)
bp_logr.run(features = None, select_path = None)

In [14]:
bp_logr.sens

[73.80952380952381,
 54.761904761904766,
 47.61904761904761,
 64.28571428571429,
 83.33333333333334]

In [15]:
bp_logr.specs

[72.81879194630872,
 85.57046979865773,
 87.58389261744966,
 80.20134228187919,
 65.77181208053692]

In [16]:
bp_logr.best_params

{'C': 0.28,
 'class_weight': 'balanced',
 'dual': False,
 'fit_intercept': True,
 'l1_ratio': None,
 'max_iter': 5000,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'solver': 'lbfgs',
 'tol': 0.001,
 'warm_start': False}

In [17]:
bp_logr.confs

[array([[217,  81],
        [ 11,  31]]),
 array([[255,  43],
        [ 19,  23]]),
 array([[261,  37],
        [ 22,  20]]),
 array([[239,  59],
        [ 15,  27]]),
 array([[196, 102],
        [  7,  35]])]

In [19]:
log_r = LogisticRegression()

param_grid_logr_re = {
    'penalty' : ['l2'],
    'dual' : [False],
    'tol' : [1e-3,1e-4],
    'C' : [0.3,0.29,0.28,0.277,0.275,0.274,0.273,0.27],
    'fit_intercept' : [True],
    'class_weight' : [{0:1, 1:42},{0:1,1:20},'balanced'],
    'solver' : ['lbfgs','newton-cg'],
    'max_iter' : [5000,1000],
    'warm_start' : [False],
    'n_jobs' : [None        ],
    'multi_class' : ['auto'],
    'l1_ratio' : [None],
}

bp_logr_resampled = BasePredictor(model = log_r, d_path = r_path, param_grid = param_grid_logr_re, n_splits = 5)
bp_logr_resampled.run(features = None, select_path = None)

In [20]:
bp_logr_resampled.sens

[67.5, 62.0, 73.0, 71.5, 64.0]

In [25]:
bp_logr_resampled.specs

[78.0, 81.0, 78.0, 80.5, 82.5]

In [26]:
bp_logr_resampled.confs

[array([[156,  44],
        [ 65, 135]]),
 array([[162,  38],
        [ 76, 124]]),
 array([[156,  44],
        [ 54, 146]]),
 array([[161,  39],
        [ 57, 143]]),
 array([[165,  35],
        [ 72, 128]])]

In [27]:
bp_logr_resampled.best_params

{'C': 0.3,
 'class_weight': 'balanced',
 'dual': False,
 'fit_intercept': True,
 'l1_ratio': None,
 'max_iter': 5000,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'solver': 'lbfgs',
 'tol': 0.001,
 'warm_start': False}

In [48]:
etc = ExtraTreesClassifier()

param_grid_etc = {
    'criterion' : ['gini','entropy','log_loss'],
    'max_depth' : [None],
    'min_samples_split' : [2,3,4],
    'min_samples_leaf' : [2,3],
    'min_weight_fraction_leaf' : [0.0],
    'max_features' : ['sqrt','log2'],
    'max_leaf_nodes' : [None],
    'min_impurity_decrease' : [0.0],
    'bootstrap' : [True, False],
    'class_weight' : [{0:1,1:5},'balanced'],
    'warm_start' : [False],
    'ccp_alpha' : [0.007,0.005],
    'random_state' : [0]
}

bp_etc = BasePredictor(model = etc, d_path = d_path, param_grid = param_grid_etc, n_splits = 5)
bp_etc.run(features = None, select_path = None)

In [49]:
bp_etc.sens

[61.904761904761905,
 47.61904761904761,
 40.476190476190474,
 69.04761904761905,
 85.71428571428571]

In [50]:
bp_etc.specs

[77.85234899328859,
 94.63087248322147,
 91.94630872483222,
 86.91275167785236,
 72.14765100671141]

In [51]:
bp_etc.confs

[array([[232,  66],
        [ 16,  26]]),
 array([[282,  16],
        [ 22,  20]]),
 array([[274,  24],
        [ 25,  17]]),
 array([[259,  39],
        [ 13,  29]]),
 array([[215,  83],
        [  6,  36]])]

In [52]:
bp_etc.best_params

{'bootstrap': False,
 'ccp_alpha': 0.005,
 'class_weight': 'balanced',
 'criterion': 'entropy',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 2,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': 0,
 'warm_start': False}

In [53]:
etc = ExtraTreesClassifier()

param_grid_etc_re = {
    'criterion' : ['gini','entropy','log_loss'],
    'max_depth' : [None],
    'min_samples_split' : [2,3,4],
    'min_samples_leaf' : [1,2],
    'min_weight_fraction_leaf' : [0.0],
    'max_features' : ['sqrt','log2'],
    'max_leaf_nodes' : [None],
    'min_impurity_decrease' : [0.0],
    'bootstrap' : [True, False],
    'class_weight' : [{0:1,1:5},'balanced'],
    'warm_start' : [False],
    'ccp_alpha' : [0.0,0.005],
    'random_state' : [0]
}

bp_etc_resampled = BasePredictor(model = etc, d_path = r_path, param_grid = param_grid_etc_re, n_splits = 5)
bp_etc_resampled.run(features = None, select_path = None)

In [54]:
bp_etc_resampled.sens

[99.0, 98.5, 98.5, 100.0, 100.0]

In [55]:
bp_etc_resampled.specs

[95.5, 95.5, 95.0, 96.5, 96.5]

In [56]:
bp_etc_resampled.confs

[array([[191,   9],
        [  2, 198]]),
 array([[191,   9],
        [  3, 197]]),
 array([[190,  10],
        [  3, 197]]),
 array([[193,   7],
        [  0, 200]]),
 array([[193,   7],
        [  0, 200]])]

In [57]:
bp_etc_resampled.best_params

{'bootstrap': False,
 'ccp_alpha': 0.0,
 'class_weight': {0: 1, 1: 5},
 'criterion': 'entropy',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': 0,
 'warm_start': False}