Feature selection

Genetic Feature Selection

Grid optimisation

In [3]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn_genetic import *
from sklearn_genetic.space import *
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, StandardScaler, RobustScaler
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier, SGDRegressor, RidgeCV, LassoCV, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split, KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import accuracy_score, mean_squared_error, classification_report, confusion_matrix, roc_auc_score
from imblearn.over_sampling import SMOTEN

pd.set_option("display.max_columns", None)

def myGrid(X, y, estimate, param):
    """Grid-search ``estimate`` behind a StandardScaler and a RobustScaler.

    The data is split 80/20 with ``random_state=0``. For each of the two
    scalers a ``Pipeline([('scaler', ...), ('estimate', estimate)])`` is
    tuned by a 5-fold ``GridSearchCV`` that scores and refits on
    ``f1_weighted``; the refit best pipeline is then evaluated on the
    held-out 20% and the results are printed.

    Parameters
    ----------
    X : feature table handed to ``train_test_split``.
    y : target values aligned with ``X``.
    estimate : estimator used as the ``'estimate'`` pipeline step.
    param : parameter grid for ``GridSearchCV``; keys address pipeline
        steps, e.g. ``'estimate__C'``.

    Returns
    -------
    str
        Always the empty string; every result is reported via ``print``.
    """
    # NOTE(review): the hold-out metrics are only trustworthy if X/y were
    # not resampled before reaching this split - confirm with the caller.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.8, random_state=0)

    def build_pipeline(scaler):
        """Return a scaler -> estimator pipeline for the given scaler."""
        return Pipeline([('scaler', scaler), ('estimate', estimate)])

    def analyse(model, param_grid):
        """Tune ``model`` on the training split and print test metrics."""
        gsSearch = GridSearchCV(estimator=model, param_grid=param_grid,
                                scoring=['f1_weighted'], n_jobs=4,
                                refit='f1_weighted', cv=5,
                                verbose=0, pre_dispatch='8*n_jobs',
                                error_score=np.nan, return_train_score=False)
        gsSearch.fit(X_train, y_train)

        # refit='f1_weighted' means best_estimator_ is already retrained on
        # the whole training split, so a single prediction pass suffices.
        best_grid = gsSearch.best_estimator_
        print('best_estimator = ', best_grid)
        print('best_params = ', gsSearch.best_params_)
        print('best_score = ', gsSearch.best_score_)

        y_pred = best_grid.predict(X_test)
        print('confusion_matrix \n', confusion_matrix(y_test, y_pred))
        print("Accuracy Score {}".format(accuracy_score(y_test, y_pred)))
        print('classification_report \n', classification_report(y_test, y_pred))
        return ""

    print('StandardScaler')
    analyse(build_pipeline(StandardScaler()), param)
    print('RobustScaler')
    analyse(build_pipeline(RobustScaler()), param)
    return ""



print('### log reg SMOTEN median noncontin ###')
print()

# Load the dataset and separate the 'outcome' target from the features.
smoten_noncontin = pd.read_csv('smoten_noncontin.csv', index_col=False)
y = smoten_noncontin['outcome']
X = smoten_noncontin.drop(columns='outcome')

print('Log Reg')
print()

# Values of C to try; the 'estimate__' prefix routes the parameter to the
# 'estimate' step of the pipeline built inside myGrid.
c_lst = [1, 2, 3]
param_grid = [{'estimate__C': c_lst}]

print('Logistic Regression lbfgs')

# L2-penalised logistic regression with the lbfgs solver; C is left to the
# grid search.
model = LogisticRegression(
    penalty='l2',
    dual=False,
    tol=0.0001,
    fit_intercept=True,
    intercept_scaling=1,
    class_weight=None,
    random_state=None,
    solver='lbfgs',
    max_iter=100,
    multi_class='auto',
    verbose=0,
    warm_start=False,
    n_jobs=None,
    l1_ratio=None,
)

myGrid(X, y, model, param_grid)



print('### Log Reg SMOTEN knn contin ###')
print()

# Load the dataset and separate the 'outcome' target from the features.
smoten_knn_contin = pd.read_csv('smoten_knn_contin.csv', index_col=False)

X = smoten_knn_contin.drop('outcome', axis=1)
y = smoten_knn_contin['outcome']

# This section tunes a LogisticRegression, so it is labelled 'Log Reg'
# (it previously printed 'Random Forest', which did not match the model).
print('Log Reg')
print()

# Values of C to try; the 'estimate__' prefix routes the parameter to the
# 'estimate' step of the pipeline built inside myGrid.
c_lst = [1, 2, 3]
param_grid = [{
    'estimate__C': c_lst}]


print('Logistic Regression lbfgs')

# L2-penalised logistic regression with the lbfgs solver; C is left to the
# grid search.
model = LogisticRegression(penalty='l2', dual=False, tol=0.0001,
                   fit_intercept=True, intercept_scaling=1, class_weight=None,
                   random_state=None, solver='lbfgs', max_iter=100, multi_class='auto',
                   verbose=0, warm_start=False, n_jobs=None, l1_ratio=None)


myGrid(X, y, model, param_grid)


print('### Log Reg SMOTEN median contin ###')
print()

# Load the dataset and separate the 'outcome' target from the features.
smoten_median_imputed_contin = pd.read_csv(
    'smoten_median_imputed_contin.csv', index_col=False
)
y = smoten_median_imputed_contin['outcome']
X = smoten_median_imputed_contin.drop(columns='outcome')

print('Log Reg')
print()

# Values of C to try; the 'estimate__' prefix routes the parameter to the
# 'estimate' step of the pipeline built inside myGrid.
c_lst = [1, 2, 3]
param_grid = [{'estimate__C': c_lst}]

print('Logistic Regression lbfgs')

# L2-penalised logistic regression with the lbfgs solver; C is left to the
# grid search.
model = LogisticRegression(
    penalty='l2',
    dual=False,
    tol=0.0001,
    fit_intercept=True,
    intercept_scaling=1,
    class_weight=None,
    random_state=None,
    solver='lbfgs',
    max_iter=100,
    multi_class='auto',
    verbose=0,
    warm_start=False,
    n_jobs=None,
    l1_ratio=None,
)

myGrid(X, y, model, param_grid)


print('### Log Reg SMOTEN all under 40 missing ###')
print()

# Load the dataset and separate the 'outcome' target from the features.
smoten_median_imputed_less_40 = pd.read_csv(
    'smoten_median_imputed_less_40.csv', index_col=False
)
y = smoten_median_imputed_less_40['outcome']
X = smoten_median_imputed_less_40.drop(columns='outcome')

print('Log Reg')
print()

# Values of C to try; the 'estimate__' prefix routes the parameter to the
# 'estimate' step of the pipeline built inside myGrid.
c_lst = [1, 2, 3]
param_grid = [{'estimate__C': c_lst}]

print('Logistic Regression lbfgs')

# L2-penalised logistic regression with the lbfgs solver; C is left to the
# grid search.
model = LogisticRegression(
    penalty='l2',
    dual=False,
    tol=0.0001,
    fit_intercept=True,
    intercept_scaling=1,
    class_weight=None,
    random_state=None,
    solver='lbfgs',
    max_iter=100,
    multi_class='auto',
    verbose=0,
    warm_start=False,
    n_jobs=None,
    l1_ratio=None,
)

myGrid(X, y, model, param_grid)


### log reg SMOTEN median noncontin ###

Log Reg

Logistic Regression lbfgs
StandardScaler
best_estimator =  Pipeline(steps=[('scaler', StandardScaler()),
                ('estimate', LogisticRegression(C=1))])
best_params =  {'estimate__C': 1}
best_score =  0.6990668621988698
confusion_matrix 
 [[5414 1488]
 [2857 4305]]
Accuracy Score 0.6910551763367463
classification_report 
               precision    recall  f1-score   support

           0       0.65      0.78      0.71      6902
           1       0.74      0.60      0.66      7162

    accuracy                           0.69     14064
   macro avg       0.70      0.69      0.69     14064
weighted avg       0.70      0.69      0.69     14064

RobustScaler
best_estimator =  Pipeline(steps=[('scaler', RobustScaler()),
                ('estimate', LogisticRegression(C=1))])
best_params =  {'estimate__C': 1}
best_score =  0.6990668621988698
confusion_matrix 
 [[5415 1487]
 [2857 4305]]
Accuracy Score 0.6911262798634812
classificatio

''