# Feature Selection & Modelling

In this notebook, I will use a decision tree to identify feature importances and then use those importances as weights for the columns.

In [1]:
# Base
import pandas as pd
import numpy as np

# Model
from sklearn.linear_model import LogisticRegressionCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

In [2]:
# Load the pickled inputs for this notebook in one pass. Judging by the file
# names, these are presumably the SMOTE-resampled training data, the scaled
# hold-out split with its labels, and a scaled scoring set -- TODO confirm
# against the notebook that wrote them.
# NOTE: unpickling executes code embedded in the file, so only load pickles
# that come from a trusted source.
res_X, res_y, X_test, y_test, scaled_test = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        'SMOTE_res_x.pickle',
        'SMOTE_res_y.pickle',
        'scaled_X_test.pickle',
        'y_test.pickle',
        'scaled_test.pickle',
    )
)

In [3]:
# Setting AUC as scoring metrics

from sklearn.metrics import roc_auc_score, make_scorer

def roc_auc_score_proba(y_true, proba):
    """Compute ROC AUC from predicted probabilities for a binary problem.

    Parameters
    ----------
    y_true : array-like
        True labels.
    proba : array-like
        Either the full two-column ``predict_proba`` matrix (the positive
        class is taken from column 1) or an already-selected 1-D vector of
        positive-class scores.

    Returns
    -------
    float
        The value returned by ``roc_auc_score``.
    """
    proba = np.asarray(proba)
    # Some scikit-learn versions hand a probability scorer only the
    # positive-class column for binary targets; indexing that 1-D array with
    # [:, 1] would raise IndexError, so accept both layouts.
    positive_scores = proba[:, 1] if proba.ndim == 2 else proba
    return roc_auc_score(y_true, positive_scores)

# Wrap the helper as a scorer object; needs_proba=True asks scikit-learn to
# score on predicted probabilities rather than predicted class labels.
auc = make_scorer(roc_auc_score_proba, needs_proba=True)

# Decision Tree

I will use a decision tree for feature selection, ranking the features by their importance.

In [4]:
# Hyper-parameter grid for the decision tree: 8 depths x 6 feature limits x
# 8 split thresholds = 384 candidate combinations.
dtc_params = {
    'max_depth': list(range(1, 9)),
    'max_features': [None, 'log2', 'sqrt', 2, 3, 4],
    'min_samples_split': [2, 3, 4, 5, 10, 15, 20, 25],
}

# Exhaustive 3-fold grid search over the tree, scored with the custom AUC
# scorer and fitted on the resampled training data.
dtc_gs = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=2),
    param_grid=dtc_params,
    scoring=auc,
    cv=3,
    verbose=1,
)
dtc_gs.fit(res_X, res_y)

Fitting 3 folds for each of 384 candidates, totalling 1152 fits


[Parallel(n_jobs=1)]: Done 1152 out of 1152 | elapsed:  4.0min finished


GridSearchCV(cv=3, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=2,
            splitter='best'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_depth': [1, 2, 3, 4, 5, 6, 7, 8], 'max_features': [None, 'log2', 'sqrt', 2, 3, 4], 'min_samples_split': [2, 3, 4, 5, 10, 15, 20, 25]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=make_scorer(roc_auc_score_proba, needs_proba=True),
       verbose=1)

In [None]:
# Raw feature importances of the best tree found by the grid search, in the
# same order as the columns the tree was fitted on.
dtc_gs.best_estimator_.feature_importances_

In [None]:
# Rank the features by the tuned tree's importance, highest first.
# The original cell indexed by `X.columns`, but `X` is never defined in this
# notebook; the tree was fitted on res_X, so its column labels are the ones
# that line up with the importances. If res_X is not a DataFrame (no
# `.columns`), fall back to a positional index instead of failing.
pd.DataFrame(
    dtc_gs.best_estimator_.feature_importances_,
    index=getattr(res_X, 'columns', None),
).sort_values(by=0, ascending=False)

# Logistic Regression

In [None]:
# L1-penalised logistic regression with built-in 3-fold cross-validation,
# scored with the custom AUC scorer; 'saga' is used as the solver for the
# L1 penalty.
logregcv = LogisticRegressionCV(
    penalty='l1',
    solver='saga',
    cv=3,
    scoring=auc,
    max_iter=200,
    n_jobs=-1,
    random_state=2,
)
logregcv.fit(res_X, res_y)

In [None]:
# Score on the same resampled data the model was fitted on, so this is an
# optimistic figure meant only for comparison with the test score.
logregcv.score(res_X,res_y)   # training score

In [None]:
# Score on the test split loaded from the pickles, which was not passed to
# fit() in this notebook.
logregcv.score(X_test,y_test)   # test score