Feature selection

Genetic Feature Selection

Grid optimisation

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn_genetic import *
from sklearn_genetic.space import *
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, StandardScaler, RobustScaler
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier, SGDRegressor, RidgeCV, LassoCV
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split, KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import accuracy_score, mean_squared_error, classification_report, confusion_matrix, roc_auc_score
from imblearn.over_sampling import SMOTEN

# Lift pandas' column-count display limit (None = no maximum) so wide
# DataFrames are shown with all of their columns.
pd.set_option("display.max_columns", None)

def myGrid(X, y, estimate, param):
    """Grid-search ``estimate`` behind two scalers and print the results.

    The data is split 80/20 with ``random_state=0``.  For each of
    ``StandardScaler`` and ``RobustScaler`` a two-step pipeline
    (``'scaler'`` -> ``'estimate'``) is tuned with a 5-fold ``GridSearchCV``
    that scores and refits on weighted F1.  The refitted best pipeline is
    then evaluated on the held-out 20 %, and the best estimator, best
    parameters, best CV score, confusion matrix, accuracy and classification
    report are printed.

    Parameters
    ----------
    X
        Feature table handed to ``train_test_split``.
    y
        Target values aligned with ``X``.
    estimate
        Estimator placed in the pipeline step named ``'estimate'``.
    param
        Parameter grid for ``GridSearchCV``; keys must address pipeline
        steps, e.g. ``'estimate__n_estimators'``.

    Returns
    -------
    str
        Always the empty string; every result is reported via ``print``.
    """
    # One fixed split shared by both scaler runs so their scores are comparable.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.8, random_state=0
    )

    def build_pipeline(scaler):
        """Return a ``scaler -> estimate`` pipeline for the given scaler."""
        return Pipeline([('scaler', scaler), ('estimate', estimate)])

    def analyse(pipeline, param_grid):
        """Tune ``pipeline`` on the training split and report test metrics."""
        gsSearch = GridSearchCV(estimator=pipeline, param_grid=param_grid,
                                scoring=['f1_weighted'], n_jobs=4,
                                refit='f1_weighted', cv=5,
                                verbose=0, pre_dispatch='8*n_jobs',
                                error_score=np.nan, return_train_score=False)
        gsSearch.fit(X_train, y_train)

        # With refit enabled, best_estimator_ is already retrained on the whole
        # training split, so a single predict call on it is all that is needed.
        best_grid = gsSearch.best_estimator_
        print('best_estimator = ', best_grid)
        print('best_params = ', gsSearch.best_params_)
        print('best_score = ', gsSearch.best_score_)

        y_pred = best_grid.predict(X_test)
        print('confusion_matrix \n', confusion_matrix(y_test, y_pred))
        print("Accuracy Score {}".format(accuracy_score(y_test, y_pred)))
        print('classification_report \n', classification_report(y_test, y_pred))
        return ""

    print('StandardScaler')
    analyse(build_pipeline(StandardScaler()), param)
    print('RobustScaler')
    analyse(build_pipeline(RobustScaler()), param)
    return ""



# Random Forest grid search on the table stored in smoten_noncontin.csv.
print('### Random Forest SMOTEN median noncontin ###')
print()

# 'outcome' is the target column; every remaining column is used as a feature.
smoten_noncontin = pd.read_csv('smoten_noncontin.csv', index_col=False)
y = smoten_noncontin['outcome']
X = smoten_noncontin.drop(columns='outcome')

print('Random Forest')
print()

# Only the number of trees is tuned; the 'estimate__' prefix targets the
# pipeline step of that name built inside myGrid.
n_estimators_lst = [50, 100, 150]
param_grid = [dict(estimate__n_estimators=n_estimators_lst)]

# Every hyper-parameter is spelled out so the configuration does not depend
# on library defaults.
model = RandomForestClassifier(
    criterion='entropy',
    max_depth=None,
    max_features='sqrt',
    max_leaf_nodes=None,
    max_samples=None,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    min_impurity_decrease=0.0,
    ccp_alpha=0.0,
    bootstrap=True,
    oob_score=True,
    class_weight=None,
    warm_start=False,
    random_state=None,
    n_jobs=4,
    verbose=0,
)

myGrid(X, y, model, param_grid)



# Random Forest grid search on the table stored in smoten_knn_contin.csv.
# The banner names Random Forest because that is the estimator this section
# actually fits (a RandomForestClassifier, not AdaBoost).
print('### Random Forest SMOTEN knn contin ###')
print()

# 'outcome' is the target column; every remaining column is used as a feature.
smoten_knn_contin = pd.read_csv('smoten_knn_contin.csv', index_col=False)

X = smoten_knn_contin.drop('outcome', axis=1)
y = smoten_knn_contin['outcome']

print('Random Forest')
print()

# Only the number of trees is tuned; the 'estimate__' prefix targets the
# pipeline step of that name built inside myGrid.
n_estimators_lst = [50, 100, 150]
param_grid = [{
    'estimate__n_estimators': n_estimators_lst}]


# Every hyper-parameter is spelled out so the configuration does not depend
# on library defaults.
model = RandomForestClassifier(
                        criterion='entropy',
                        max_depth=None, min_samples_split=2,
                        min_samples_leaf=1, min_weight_fraction_leaf=0.0,
                        max_features='sqrt', max_leaf_nodes=None,
                        min_impurity_decrease=0.0, bootstrap=True, oob_score=True,
                        n_jobs=4,
                        random_state=None, verbose=0, warm_start=False,
                        class_weight=None, ccp_alpha=0.0, max_samples=None)


myGrid(X, y, model, param_grid)


# Random Forest grid search on the table stored in
# smoten_median_imputed_contin.csv.
print('### Random Forest SMOTEN median contin ###')
print()

# 'outcome' is the target column; every remaining column is used as a feature.
smoten_median_imputed_contin = pd.read_csv(
    'smoten_median_imputed_contin.csv',
    index_col=False,
)
y = smoten_median_imputed_contin['outcome']
X = smoten_median_imputed_contin.drop(columns='outcome')

print('Random Forest')
print()

# Only the number of trees is tuned; the 'estimate__' prefix targets the
# pipeline step of that name built inside myGrid.
n_estimators_lst = [50, 100, 150]
param_grid = [dict(estimate__n_estimators=n_estimators_lst)]

# Every hyper-parameter is spelled out so the configuration does not depend
# on library defaults.
model = RandomForestClassifier(
    criterion='entropy',
    max_depth=None,
    max_features='sqrt',
    max_leaf_nodes=None,
    max_samples=None,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    min_impurity_decrease=0.0,
    ccp_alpha=0.0,
    bootstrap=True,
    oob_score=True,
    class_weight=None,
    warm_start=False,
    random_state=None,
    n_jobs=4,
    verbose=0,
)

myGrid(X, y, model, param_grid)


# Random Forest grid search on the table stored in
# smoten_median_imputed_less_40.csv.
print('### Random Forest SMOTEN all under 40 missing ###')
print()

# 'outcome' is the target column; every remaining column is used as a feature.
smoten_median_imputed_less_40 = pd.read_csv(
    'smoten_median_imputed_less_40.csv',
    index_col=False,
)
y = smoten_median_imputed_less_40['outcome']
X = smoten_median_imputed_less_40.drop(columns='outcome')

print('Random Forest')
print()

# Only the number of trees is tuned; the 'estimate__' prefix targets the
# pipeline step of that name built inside myGrid.
n_estimators_lst = [50, 100, 150]
param_grid = [dict(estimate__n_estimators=n_estimators_lst)]

# Every hyper-parameter is spelled out so the configuration does not depend
# on library defaults.
model = RandomForestClassifier(
    criterion='entropy',
    max_depth=None,
    max_features='sqrt',
    max_leaf_nodes=None,
    max_samples=None,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    min_impurity_decrease=0.0,
    ccp_alpha=0.0,
    bootstrap=True,
    oob_score=True,
    class_weight=None,
    warm_start=False,
    random_state=None,
    n_jobs=4,
    verbose=0,
)

myGrid(X, y, model, param_grid)


### Random Forest SMOTEN median noncontin ###

Random Forest

StandardScaler
best_estimator =  Pipeline(steps=[('scaler', StandardScaler()),
                ('estimate',
                 RandomForestClassifier(criterion='entropy', n_estimators=150,
                                        n_jobs=4, oob_score=True))])
best_params =  {'estimate__n_estimators': 150}
best_score =  0.8474949536701939
confusion_matrix 
 [[5644 1258]
 [ 954 6208]]
Accuracy Score 0.8427189988623436
classification_report 
               precision    recall  f1-score   support

           0       0.86      0.82      0.84      6902
           1       0.83      0.87      0.85      7162

    accuracy                           0.84     14064
   macro avg       0.84      0.84      0.84     14064
weighted avg       0.84      0.84      0.84     14064

RobustScaler
best_estimator =  Pipeline(steps=[('scaler', RobustScaler()),
                ('estimate',
                 RandomForestClassifier(criterion='entropy', n_esti

''