Feature selection

Genetic Feature Selection

Grid optimisation

In [7]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn_genetic import *
from sklearn_genetic.space import *
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, StandardScaler, RobustScaler
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier, SGDRegressor, RidgeCV, LassoCV
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split, KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import accuracy_score, mean_squared_error, classification_report, confusion_matrix, roc_auc_score
from imblearn.over_sampling import SMOTEN

pd.set_option("display.max_columns", None)

def myGrid(X, y, estimate, param):
    
    model = lambda aPipe: Pipeline([('scaler',  aPipe), ('estimate', estimate)]) 
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state = 0) #train split
    
    def analyse(model, param_grid):
        gsSearch = GridSearchCV(estimator=model, param_grid=param_grid, 
                 scoring=['f1_weighted'], n_jobs=4, 
                 refit='f1_weighted', cv=5, 
                 verbose=0, pre_dispatch='8*n_jobs', 
                 error_score=np.nan, return_train_score=False)

        acc_score = []
        gsSearch.fit(X_train, y_train)
        y_pred = gsSearch.predict(X_test)
        acc = accuracy_score(y_pred , y_test)
        acc_score.append(acc)
        
        print('best_estimator = ', gsSearch.best_estimator_)
        best_grid = gsSearch.best_estimator_
        print('best_params = ', gsSearch.best_params_)
        print('best_score = ', gsSearch.best_score_)
        y_pred=best_grid.predict(X_test)
        print('confusion_matrix \n', confusion_matrix(y_test,y_pred))
        print("Accuracy Score {}".format(accuracy_score(y_test,y_pred)))
        print('classification_report \n', classification_report(y_test,y_pred))
        return ""
    
    print('StandardScaler')
    analyse(model(StandardScaler()), param)
    print('RobustScaler')
    analyse(model(RobustScaler()), param)
    return ""



print('### SMOTEN median noncontin ###')
print()

smoten_noncontin = pd.read_csv('smoten_noncontin.csv', index_col=False)
smoten_noncontin = pd.read_csv('smoten_noncontin.csv', index_col=False)

X = smoten_noncontin.drop('outcome',axis= 1)
y = smoten_noncontin['outcome']

print('SVC rbf')
print()

c_lst = [1, 2 ,3]

param_grid = [{
    'estimate__C': c_lst}]

model = SVC(kernel='rbf')

myGrid(X, y, model, param_grid)




print('### SVC SMOTEN knn contin ###')
print()


smoten_knn_contin = pd.read_csv('smoten_knn_contin.csv', index_col=False)

X = smoten_knn_contin.drop('outcome',axis= 1)
y = smoten_knn_contin['outcome']

print('SVC rbf')
print()

c_lst = [1, 2 ,3]


param_grid = [{
    'estimate__C': c_lst}]

model = SVC(kernel='rbf')

myGrid(X, y, model, param_grid)


print('### SVC SMOTEN median contin ###')
print()


smoten_median_imputed_contin = pd.read_csv('smoten_median_imputed_contin.csv', index_col=False)

X = smoten_median_imputed_contin.drop('outcome',axis= 1)
y = smoten_median_imputed_contin['outcome']

print('SVC rbf')
print()

c_lst = [1, 2 ,3]


param_grid = [{
    'estimate__C': c_lst}]

model = SVC(kernel='rbf')

myGrid(X, y, model, param_grid)


print('### SMOTEN all under 40 missing ###')
print()

smoten_median_imputed_less_40 = pd.read_csv('smoten_median_imputed_less_40.csv', index_col=False)

X = smoten_median_imputed_less_40.drop('outcome',axis= 1)
y = smoten_median_imputed_less_40['outcome']

print('SVC rbf')
print()

c_lst = [1, 2 ,3]

param_grid = [{
    'estimate__C': c_lst}]

model = SVC(kernel='rbf')

myGrid(X, y, model, param_grid)


### SMOTEN median noncontin ###

SVC rbf

StandardScaler
best_estimator =  Pipeline(steps=[('scaler', StandardScaler()), ('estimate', SVC(C=3))])
best_params =  {'estimate__C': 3}
best_score =  0.7659930476658459
confusion_matrix 
 [[5383 1519]
 [1815 5347]]
Accuracy Score 0.7629408418657565
classification_report 
               precision    recall  f1-score   support

           0       0.75      0.78      0.76      6902
           1       0.78      0.75      0.76      7162

    accuracy                           0.76     14064
   macro avg       0.76      0.76      0.76     14064
weighted avg       0.76      0.76      0.76     14064

RobustScaler
best_estimator =  Pipeline(steps=[('scaler', RobustScaler()), ('estimate', SVC(C=3))])
best_params =  {'estimate__C': 3}
best_score =  0.7646751740206018
confusion_matrix 
 [[5327 1575]
 [1788 5374]]
Accuracy Score 0.7608788395904437
classification_report 
               precision    recall  f1-score   support

           0       0.75      0

''

In [8]:

print('SGDClassifier noncontin')
print()

smoten_noncontin = pd.read_csv('smoten_noncontin.csv', index_col=False)
smoten_noncontin = pd.read_csv('smoten_noncontin.csv', index_col=False)

X = smoten_noncontin.drop('outcome',axis= 1)
y = smoten_noncontin['outcome']

alpha_lst = [i/10000 for i in range(1,10000,1000)]
fraction_lst = [i/10 for i in range(1,10,2)]

param_grid = [{
    'estimate__validation_fraction': fraction_lst,
    'estimate__alpha': alpha_lst,}]

model = SGDClassifier(loss='hinge', penalty='l2', alpha=0.0001, l1_ratio=0.15, 
                         fit_intercept=True, max_iter=1000, 
                         tol=0.001, shuffle=True, verbose=0, epsilon=0.1, 
                         n_jobs=2, random_state=None, learning_rate='optimal', 
                         eta0=0.0, power_t=0.5, early_stopping=True, 
                         validation_fraction=0.1, n_iter_no_change=5, 
                         class_weight=None, warm_start=False, average=False)

myGrid(X, y, model, param_grid)




print('SGDClassifier knn contin')
print()

smoten_knn_contin = pd.read_csv('smoten_knn_contin.csv', index_col=False)
smoten_knn_contin = pd.read_csv('smoten_knn_contin.csv', index_col=False)

X = smoten_knn_contin.drop('outcome',axis= 1)
y = smoten_knn_contin['outcome']

alpha_lst = [i/10000 for i in range(1,10000,1000)]
fraction_lst = [i/10 for i in range(1,10,2)]

param_grid = [{
    'estimate__validation_fraction': fraction_lst,
    'estimate__alpha': alpha_lst,}]

model = SGDClassifier(loss='hinge', penalty='l2', alpha=0.0001, l1_ratio=0.15, 
                         fit_intercept=True, max_iter=1000, 
                         tol=0.001, shuffle=True, verbose=0, epsilon=0.1, 
                         n_jobs=2, random_state=None, learning_rate='optimal', 
                         eta0=0.0, power_t=0.5, early_stopping=True, 
                         validation_fraction=0.1, n_iter_no_change=5, 
                         class_weight=None, warm_start=False, average=False)

myGrid(X, y, model, param_grid)


print('SGDClassifier median contin')
print()

smoten_median_imputed_contin = pd.read_csv('smoten_median_imputed_contin.csv', index_col=False)
smoten_median_imputed_contin = pd.read_csv('smoten_median_imputed_contin.csv', index_col=False)

X = smoten_median_imputed_contin.drop('outcome',axis= 1)
y = smoten_median_imputed_contin['outcome']

alpha_lst = [i/10000 for i in range(1,10000,1000)]
fraction_lst = [i/10 for i in range(1,10,2)]

param_grid = [{
    'estimate__validation_fraction': fraction_lst,
    'estimate__alpha': alpha_lst,}]

model = SGDClassifier(loss='hinge', penalty='l2', alpha=0.0001, l1_ratio=0.15, 
                         fit_intercept=True, max_iter=1000, 
                         tol=0.001, shuffle=True, verbose=0, epsilon=0.1, 
                         n_jobs=2, random_state=None, learning_rate='optimal', 
                         eta0=0.0, power_t=0.5, early_stopping=True, 
                         validation_fraction=0.1, n_iter_no_change=5, 
                         class_weight=None, warm_start=False, average=False)

myGrid(X, y, model, param_grid)



print('SGDClassifier all under 40 missing')
print()

smoten_median_imputed_less_40 = pd.read_csv('smoten_median_imputed_less_40.csv', index_col=False)
smoten_median_imputed_less_40 = pd.read_csv('smoten_median_imputed_less_40.csv', index_col=False)

X = smoten_median_imputed_less_40.drop('outcome',axis= 1)
y = smoten_median_imputed_less_40['outcome']

fraction_lst = [i/10 for i in range(1,10,2)]
alpha_lst = [i/10000 for i in range(1,10000,1000)]


param_grid = [{
    'estimate__validation_fraction': fraction_lst,
    'estimate__alpha': alpha_lst,}]

model = SGDClassifier(loss='hinge', penalty='l2', alpha=0.0001, l1_ratio=0.15, 
                         fit_intercept=True, max_iter=1000, 
                         tol=0.001, shuffle=True, verbose=0, epsilon=0.1, 
                         n_jobs=2, random_state=None, learning_rate='optimal', 
                         eta0=0.0, power_t=0.5, early_stopping=True, 
                         validation_fraction=0.1, n_iter_no_change=5, 
                         class_weight=None, warm_start=False, average=False)

myGrid(X, y, model, param_grid)



SGDClassifier noncontin

StandardScaler
best_estimator =  Pipeline(steps=[('scaler', StandardScaler()),
                ('estimate',
                 SGDClassifier(alpha=0.2001, early_stopping=True, n_jobs=2,
                               validation_fraction=0.9))])
best_params =  {'estimate__alpha': 0.2001, 'estimate__validation_fraction': 0.9}
best_score =  0.6960347391802996
confusion_matrix 
 [[5647 1255]
 [3072 4090]]
Accuracy Score 0.692335039817975
classification_report 
               precision    recall  f1-score   support

           0       0.65      0.82      0.72      6902
           1       0.77      0.57      0.65      7162

    accuracy                           0.69     14064
   macro avg       0.71      0.69      0.69     14064
weighted avg       0.71      0.69      0.69     14064

RobustScaler
best_estimator =  Pipeline(steps=[('scaler', RobustScaler()),
                ('estimate',
                 SGDClassifier(alpha=0.1001, early_stopping=True, n_jobs=2))])
best_

''