Feature selection

Genetic Feature Selection

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn_genetic import *
from sklearn_genetic.space import *
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, StandardScaler, RobustScaler
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier, SGDRegressor, RidgeCV, LassoCV
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split, KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import accuracy_score, mean_squared_error, classification_report, confusion_matrix, roc_auc_score
from imblearn.over_sampling import SMOTEN

pd.set_option("display.max_columns", None)

def myGAFeature(X, y, estimate, flag):
    
    model = lambda aPipe: Pipeline([('scaler',  aPipe), ('estimate', estimate)])
    
    cv = 2 
    top_k = 1
    dispatch = '4*n_jobs'
    return_train = True
    gen = 22
    
    def analyse(model):
        geneSelectFeature = GAFeatureSelectionCV(estimator = model, cv=cv, 
                                                 scoring=None, population_size=50, 
                                                 generations=gen, crossover_probability=0.2, 
                                                 mutation_probability=0.8, tournament_size=3, 
                                                 elitism=True, max_features=None, verbose=True, 
                                                 keep_top_k=top_k, criteria='max', 
                                                 algorithm='eaMuPlusLambda', refit=True, 
                                                 n_jobs=4, 
                                                 pre_dispatch=dispatch, error_score=np.nan, 
                                                 return_train_score=return_train, log_config=None)

        geneSelectFeature.fit(X_train, y_train)
        y_pred = geneSelectFeature.predict(X_test)
        print('best estimator ' + str(geneSelectFeature.best_estimator_))
        print()
        outcome_labels = ['Intubation False', 'Intubation True']
        print('Genetic Feature Selection:', geneSelectFeature.support_, '\n')
        
        print('classification report \n', classification_report(y_test, y_pred, target_names=outcome_labels))
        micro_roc_auc_ovr = roc_auc_score(y_test, y_pred, multi_class="ovr", average="micro")
        print(f"Micro-averaged One-vs-Rest ROC AUC score:\n{micro_roc_auc_ovr:.2f}")
        print('\n')
        return ""
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state = 0) #train split
    
    if flag == "std":
        print('StandardScaler')
        analyse(model(StandardScaler()))
        
    elif flag == "rbt":
        print('RobustScaler')
        analyse(model(RobustScaler()))
    return ""


print('### SVM SGDClassifier ###')
print()

print('SMOTEN median noncontin')
print()

print('Genetic Algorithm Feature Selection')
smoten_noncontin = pd.read_csv('smoten_noncontin.csv', index_col=False)
smoten_noncontin = pd.read_csv('smoten_noncontin.csv', index_col=False)

X = smoten_noncontin.drop('outcome',axis= 1)
y = smoten_noncontin['outcome']


best_paramsS =  {'estimate__alpha': 0.2001, 'estimate__validation_fraction': 0.9}
best_paramsR =  {'estimate__alpha': 0.1001, 'estimate__validation_fraction': 0.1}

estimate = SGDClassifier(loss='hinge', penalty='l2', 
                         alpha=best_paramsS['estimate__alpha'], 
                         l1_ratio=0.15, 
                         fit_intercept=True, max_iter=1000, 
                         tol=0.001, shuffle=True, 
                         verbose=0, epsilon=0.1, 
                         n_jobs=None, random_state=None, 
                         learning_rate='optimal', 
                         eta0=0.0, power_t=0.5, 
                         early_stopping=True, 
                         validation_fraction=best_paramsS['estimate__validation_fraction'], 
                         n_iter_no_change=5, 
                         class_weight=None, 
                         warm_start=False, average=False)

myGAFeature(X, y, estimate, 'std')


estimate = SGDClassifier(loss='hinge', penalty='l2', 
                         alpha=best_paramsR['estimate__alpha'], 
                         l1_ratio=0.15, 
                         fit_intercept=True, max_iter=1000, 
                         tol=0.001, shuffle=True, 
                         verbose=0, epsilon=0.1, 
                         n_jobs=None, random_state=None, 
                         learning_rate='optimal', 
                         eta0=0.0, power_t=0.5, 
                         early_stopping=True, 
                         validation_fraction=best_paramsR['estimate__validation_fraction'], 
                         n_iter_no_change=5, 
                         class_weight=None, 
                         warm_start=False, average=False)

myGAFeature(X, y, estimate, 'rbt')

    
    
    
    


### SVM SGDClassifier ###

SMOTEN median noncontin

Genetic Algorithm Feature Selection
StandardScaler
gen	nevals	fitness 	fitness_std	fitness_max	fitness_min
0  	50    	0.633322	0.0897003  	0.700476   	0.494578   
1  	100   	0.689994	0.0414969  	0.700476   	0.522575   
2  	100   	0.683338	0.0515721  	0.700476   	0.509155   
3  	100   	0.686101	0.0487871  	0.700476   	0.509012   
4  	100   	0.686814	0.0463423  	0.700476   	0.524513   
5  	100   	0.679792	0.0560624  	0.700476   	0.515732   
6  	100   	0.673199	0.0625117  	0.700476   	0.523624   
7  	100   	0.679903	0.0557278  	0.700476   	0.524851   
8  	100   	0.690143	0.040909   	0.700476   	0.526131   
9  	100   	0.683227	0.0517769  	0.700476   	0.522486   
10 	100   	0.690188	0.0407294  	0.700476   	0.525171   
11 	100   	0.690132	0.0409545  	0.700476   	0.52494    
12 	100   	0.690011	0.041424   	0.700476   	0.524868   
13 	100   	0.687007	0.0457042  	0.700476   	0.525633   
14 	100   	0.690116	0.0410344  	0.700476   	0.521349   
1



best estimator Pipeline(steps=[('scaler', StandardScaler()),
                ('estimate',
                 SGDClassifier(alpha=0.2001, validation_fraction=0.9))])

Genetic Feature Selection: [False False  True False False False] 

classification report 
                   precision    recall  f1-score   support

Intubation False       0.65      0.82      0.72      6902
 Intubation True       0.77      0.57      0.65      7162

        accuracy                           0.69     14064
       macro avg       0.71      0.69      0.69     14064
    weighted avg       0.71      0.69      0.69     14064

Micro-averaged One-vs-Rest ROC AUC score:
0.69


RobustScaler
gen	nevals	fitness 	fitness_std	fitness_max	fitness_min
0  	50    	0.594872	0.0974739  	0.70053    	0.494312   
1  	100   	0.656387	0.0783763  	0.70053    	0.501938   
2  	100   	0.685004	0.052178   	0.701667   	0.500622   
3  	100   	0.689948	0.0416543  	0.701667   	0.514754   
4  	100   	0.677633	0.0622772  	0.701667   	0.500036



''

In [1]:
#check
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn_genetic import *
from sklearn_genetic.space import *
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, StandardScaler, RobustScaler
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier, SGDRegressor, RidgeCV, LassoCV
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split, KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import accuracy_score, mean_squared_error, classification_report, confusion_matrix, roc_auc_score
from imblearn.over_sampling import SMOTEN

pd.set_option("display.max_columns", None)

def myGAFeature(X, y, estimate, flag):
    
    model = lambda aPipe: Pipeline([('scaler',  aPipe), ('estimate', estimate)])
    
    cv = 2 
    top_k = 1
    dispatch = '4*n_jobs'
    return_train = True
    gen = 22
    
    def analyse(model):
        geneSelectFeature = GAFeatureSelectionCV(estimator = model, cv=cv, 
                                                 scoring=None, population_size=50, 
                                                 generations=gen, crossover_probability=0.2, 
                                                 mutation_probability=0.8, tournament_size=3, 
                                                 elitism=True, max_features=None, verbose=True, 
                                                 keep_top_k=top_k, criteria='max', 
                                                 algorithm='eaMuPlusLambda', refit=True, 
                                                 n_jobs=4, 
                                                 pre_dispatch=dispatch, error_score=np.nan, 
                                                 return_train_score=return_train, log_config=None)

        geneSelectFeature.fit(X_train, y_train)
        y_pred = geneSelectFeature.predict(X_test)
        print('best estimator ' + str(geneSelectFeature.best_estimator_))
        print()
        outcome_labels = ['Intubation False', 'Intubation True']
        print('Genetic Feature Selection:', geneSelectFeature.support_, '\n')
        
        print('classification report \n', classification_report(y_test, y_pred, target_names=outcome_labels))
        micro_roc_auc_ovr = roc_auc_score(y_test, y_pred, multi_class="ovr", average="micro")
        print(f"Micro-averaged One-vs-Rest ROC AUC score:\n{micro_roc_auc_ovr:.2f}")
        print('\n')
        return ""
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state = 0) #train split
    
    if flag == "std":
        print('StandardScaler')
        analyse(model(StandardScaler()))
        
    elif flag == "rbt":
        print('RobustScaler')
        analyse(model(RobustScaler()))
    return ""


print('### SVM SGDClassifier ###')
print()

print('SMOTEN median noncontin')
print()

print('Genetic Algorithm Feature Selection')
smoten_noncontin = pd.read_csv('smoten_noncontin.csv', index_col=False)
smoten_noncontin = pd.read_csv('smoten_noncontin.csv', index_col=False)

X = smoten_noncontin.drop('outcome',axis= 1)
y = smoten_noncontin['outcome']


best_paramsS =  {'estimate__alpha': 0.2001, 'estimate__validation_fraction': 0.9}
best_paramsR =  {'estimate__alpha': 0.1001, 'estimate__validation_fraction': 0.1}

estimate = SGDClassifier(loss='hinge', penalty='l2', 
                         alpha=best_paramsS['estimate__alpha'], 
                         l1_ratio=0.15, 
                         fit_intercept=True, max_iter=1000, 
                         tol=0.001, shuffle=True, 
                         verbose=0, epsilon=0.1, 
                         n_jobs=None, random_state=None, 
                         learning_rate='optimal', 
                         eta0=0.0, power_t=0.5, 
                         early_stopping=True, 
                         validation_fraction=best_paramsS['estimate__validation_fraction'], 
                         n_iter_no_change=5, 
                         class_weight=None, 
                         warm_start=False, average=False)

myGAFeature(X, y, estimate, 'std')


### SVM SGDClassifier ###

SMOTEN median noncontin

Genetic Algorithm Feature Selection
StandardScaler
gen	nevals	fitness 	fitness_std	fitness_max	fitness_min
0  	50    	0.606699	0.0940457  	0.700565   	0.483504   
1  	100   	0.690032	0.0413326  	0.701241   	0.519642   
2  	100   	0.700597	0.000457964	0.702165   	0.698788   
3  	100   	0.697393	0.0227804  	0.702165   	0.537952   
4  	100   	0.686488	0.0476527  	0.702165   	0.509812   
5  	100   	0.689531	0.0428139  	0.702165   	0.509812   
6  	100   	0.686066	0.0490267  	0.700832   	0.508159   
7  	100   	0.682746	0.0535142  	0.702556   	0.508355   
8  	100   	0.680321	0.0545677  	0.702556   	0.529757   
9  	100   	0.683256	0.051889   	0.702556   	0.519713   
10 	100   	0.683058	0.0528062  	0.702556   	0.51127    
11 	100   	0.682937	0.0532448  	0.702556   	0.516425   
12 	100   	0.693705	0.0338606  	0.702556   	0.524975   
13 	100   	0.67599 	0.0607442  	0.701099   	0.515554   
14 	100   	0.693622	0.0339253  	0.701099   	0.525437   
1



''

In [5]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn_genetic import *
from sklearn_genetic.space import *
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, StandardScaler, RobustScaler
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier, SGDRegressor, RidgeCV, LassoCV
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split, KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import accuracy_score, mean_squared_error, classification_report, confusion_matrix, roc_auc_score
from imblearn.over_sampling import SMOTEN

pd.set_option("display.max_columns", None)

def myGAFeature(X, y, estimate, flag):
    
    model = lambda aPipe: Pipeline([('scaler',  aPipe), ('estimate', estimate)])
    
    cv = 2 
    top_k = 1
    dispatch = '4*n_jobs'
    return_train = True
    gen = 22
    
    def analyse(model):
        geneSelectFeature = GAFeatureSelectionCV(estimator = model, cv=cv, 
                                                 scoring=None, population_size=50, 
                                                 generations=gen, crossover_probability=0.2, 
                                                 mutation_probability=0.8, tournament_size=3, 
                                                 elitism=True, max_features=None, verbose=True, 
                                                 keep_top_k=top_k, criteria='max', 
                                                 algorithm='eaMuPlusLambda', refit=True, 
                                                 n_jobs=4, 
                                                 pre_dispatch=dispatch, error_score=np.nan, 
                                                 return_train_score=return_train, log_config=None)

        geneSelectFeature.fit(X_train, y_train)
        y_pred = geneSelectFeature.predict(X_test)
        print('best estimator ' + str(geneSelectFeature.best_estimator_))
        print()
        outcome_labels = ['Intubation False', 'Intubation True']
        print('Genetic Feature Selection:', geneSelectFeature.support_, '\n')
        
        print('classification report \n', classification_report(y_test, y_pred, target_names=outcome_labels))
        micro_roc_auc_ovr = roc_auc_score(y_test, y_pred, multi_class="ovr", average="micro")
        print(f"Micro-averaged One-vs-Rest ROC AUC score:\n{micro_roc_auc_ovr:.2f}")
        print('\n')
        return ""
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state = 0) #train split
    
    if flag == "std":
        print('StandardScaler')
        analyse(model(StandardScaler()))
        
    elif flag == "rbt":
        print('RobustScaler')
        analyse(model(RobustScaler()))
    return ""

print('### SVM SGDClassifier ###')
print()

print('SMOTEN knn contin')
print()

print('Genetic Algorithm Feature Selection')
smoten_knn_contin = pd.read_csv('smoten_knn_contin.csv', index_col=False)
smoten_knn_contin = pd.read_csv('smoten_knn_contin.csv', index_col=False)

X = smoten_knn_contin.drop('outcome',axis= 1)
y = smoten_knn_contin['outcome']


best_paramsS =  {'estimate__alpha': 0.0001, 'estimate__validation_fraction': 0.1}
best_paramsR =  {'estimate__alpha': 0.1001, 'estimate__validation_fraction': 0.5}

estimate = SGDClassifier(loss='hinge', penalty='l2', 
                         alpha=best_paramsS['estimate__alpha'], 
                         l1_ratio=0.15, 
                         fit_intercept=True, max_iter=1000, 
                         tol=0.001, shuffle=True, 
                         verbose=0, epsilon=0.1, 
                         n_jobs=None, random_state=None, 
                         learning_rate='optimal', 
                         eta0=0.0, power_t=0.5, 
                         early_stopping=True, 
                         validation_fraction=best_paramsS['estimate__validation_fraction'], 
                         n_iter_no_change=5, 
                         class_weight=None, 
                         warm_start=False, average=False)

myGAFeature(X, y, estimate, 'std')


estimate = SGDClassifier(loss='hinge', penalty='l2', 
                         alpha=best_paramsR['estimate__alpha'], 
                         l1_ratio=0.15, 
                         fit_intercept=True, max_iter=1000, 
                         tol=0.001, shuffle=True, 
                         verbose=0, epsilon=0.1, 
                         n_jobs=None, random_state=None, 
                         learning_rate='optimal', 
                         eta0=0.0, power_t=0.5, 
                         early_stopping=True, 
                         validation_fraction=best_paramsR['estimate__validation_fraction'], 
                         n_iter_no_change=5, 
                         class_weight=None, 
                         warm_start=False, average=False)

myGAFeature(X, y, estimate, 'rbt')


### SVM SGDClassifier ###

SMOTEN knn contin

Genetic Algorithm Feature Selection
StandardScaler
gen	nevals	fitness 	fitness_std	fitness_max	fitness_min
0  	50    	0.926045	0.0194414  	0.956414   	0.875604   
1  	100   	0.941935	0.00871401 	0.956129   	0.924026   
2  	100   	0.941802	0.0100916  	0.960342   	0.915031   
3  	100   	0.948813	0.0085451  	0.96068    	0.925857   
4  	100   	0.947895	0.0134468  	0.96068    	0.902766   
5  	100   	0.950743	0.00984364 	0.96068    	0.923581   
6  	100   	0.949923	0.011331   	0.96068    	0.918373   
7  	100   	0.94847 	0.0150318  	0.96068    	0.897824   
8  	100   	0.948068	0.0164434  	0.961977   	0.896242   
9  	100   	0.949808	0.0140884  	0.961977   	0.911369   
10 	100   	0.953459	0.0111144  	0.961977   	0.91784    
11 	100   	0.949909	0.0140978  	0.961977   	0.904277   
12 	100   	0.950454	0.01222    	0.961977   	0.91848    
13 	100   	0.95339 	0.0110869  	0.961977   	0.90657    
14 	100   	0.951423	0.0110037  	0.961977   	0.921306   
15 	100



best estimator Pipeline(steps=[('scaler', StandardScaler()),
                ('estimate', SGDClassifier(early_stopping=True))])

Genetic Feature Selection: [ True  True  True  True  True  True  True False False  True  True False
  True  True  True  True  True  True False  True  True False  True False
  True False  True  True] 

classification report 
                   precision    recall  f1-score   support

Intubation False       0.95      0.96      0.96      6902
 Intubation True       0.96      0.95      0.96      7162

        accuracy                           0.96     14064
       macro avg       0.96      0.96      0.96     14064
    weighted avg       0.96      0.96      0.96     14064

Micro-averaged One-vs-Rest ROC AUC score:
0.96


RobustScaler
gen	nevals	fitness 	fitness_std	fitness_max	fitness_min
0  	50    	0.935051	0.0152895  	0.956218   	0.896082   
1  	100   	0.949465	0.00911589 	0.960324   	0.924595   
2  	100   	0.950016	0.0110576  	0.960324   	0.904224   
3  	100  



''

In [6]:
print('### SVM SGDClassifier ###')
print()

print('SMOTEN median contin')
print()

print('Genetic Algorithm Feature Selection')
smoten_median_imputed_contin = pd.read_csv('smoten_median_imputed_contin.csv', index_col=False)
smoten_median_imputed_contin = pd.read_csv('smoten_median_imputed_contin.csv', index_col=False)

X = smoten_median_imputed_contin.drop('outcome',axis= 1)
y = smoten_median_imputed_contin['outcome']

best_paramsS =  {'estimate__alpha': 0.1001, 'estimate__validation_fraction': 0.7}
best_paramsR =  {'estimate__alpha': 0.1001, 'estimate__validation_fraction': 0.1}

estimate = SGDClassifier(loss='hinge', penalty='l2', 
                         alpha=best_paramsS['estimate__alpha'], 
                         l1_ratio=0.15, 
                         fit_intercept=True, max_iter=1000, 
                         tol=0.001, shuffle=True, 
                         verbose=0, epsilon=0.1, 
                         n_jobs=None, random_state=None, 
                         learning_rate='optimal', 
                         eta0=0.0, power_t=0.5, 
                         early_stopping=True, 
                         validation_fraction=best_paramsS['estimate__validation_fraction'], 
                         n_iter_no_change=5, 
                         class_weight=None, 
                         warm_start=False, average=False)

myGAFeature(X, y, estimate, 'std')


estimate = SGDClassifier(loss='hinge', penalty='l2', 
                         alpha=best_paramsR['estimate__alpha'], 
                         l1_ratio=0.15, 
                         fit_intercept=True, max_iter=1000, 
                         tol=0.001, shuffle=True, 
                         verbose=0, epsilon=0.1, 
                         n_jobs=None, random_state=None, 
                         learning_rate='optimal', 
                         eta0=0.0, power_t=0.5, 
                         early_stopping=True, 
                         validation_fraction=best_paramsR['estimate__validation_fraction'], 
                         n_iter_no_change=5, 
                         class_weight=None, 
                         warm_start=False, average=False)

myGAFeature(X, y, estimate, 'rbt')

### SVM SGDClassifier ###

SMOTEN median contin

Genetic Algorithm Feature Selection
StandardScaler
gen	nevals	fitness	fitness_std	fitness_max	fitness_min
0  	50    	0.92915	0.0206875  	0.961231   	0.86357    
1  	100   	0.944163	0.0114101  	0.961231   	0.913929   
2  	100   	0.945014	0.0115978  	0.959827   	0.908223   
3  	100   	0.947303	0.0124712  	0.959933   	0.910783   
4  	100   	0.951802	0.00825768 	0.959933   	0.923581   
5  	100   	0.946112	0.0171612  	0.959702   	0.898873   
6  	100   	0.949646	0.0113465  	0.959702   	0.91608    
7  	100   	0.950477	0.0104023  	0.959702   	0.91608    
8  	100   	0.949344	0.012552   	0.960538   	0.910801   
9  	100   	0.948874	0.0136922  	0.960538   	0.913449   
10 	100   	0.950785	0.0127753  	0.960538   	0.909752   
11 	100   	0.951389	0.0121619  	0.960538   	0.909752   
12 	100   	0.952971	0.0120995  	0.960929   	0.904526   
13 	100   	0.950686	0.015207   	0.960929   	0.900828   
14 	100   	0.951163	0.0147926  	0.961373   	0.908045   
15 	10



best estimator Pipeline(steps=[('scaler', StandardScaler()),
                ('estimate',
                 SGDClassifier(alpha=0.1001, early_stopping=True,
                               validation_fraction=0.7))])

Genetic Feature Selection: [ True False  True  True  True False False  True False  True False False
  True  True  True  True False  True  True  True  True  True  True  True
  True  True  True  True] 

classification report 
                   precision    recall  f1-score   support

Intubation False       0.95      0.97      0.96      6902
 Intubation True       0.97      0.95      0.96      7162

        accuracy                           0.96     14064
       macro avg       0.96      0.96      0.96     14064
    weighted avg       0.96      0.96      0.96     14064

Micro-averaged One-vs-Rest ROC AUC score:
0.96


RobustScaler
gen	nevals	fitness 	fitness_std	fitness_max	fitness_min
0  	50    	0.929691	0.0172763  	0.961942   	0.898997   
1  	100   	0.941923	0.0104872  	0.



''

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn_genetic import *
from sklearn_genetic.space import *
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, StandardScaler, RobustScaler
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier, SGDRegressor, RidgeCV, LassoCV
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split, KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import accuracy_score, mean_squared_error, classification_report, confusion_matrix, roc_auc_score
from imblearn.over_sampling import SMOTEN

pd.set_option("display.max_columns", None)

def myGAFeature(X, y, estimate, flag):
    
    model = lambda aPipe: Pipeline([('scaler',  aPipe), ('estimate', estimate)])
    
    cv = 2 
    top_k = 1
    dispatch = '4*n_jobs'
    return_train = True
    gen = 22
    
    def analyse(model):
        geneSelectFeature = GAFeatureSelectionCV(estimator = model, cv=cv, 
                                                 scoring=None, population_size=50, 
                                                 generations=gen, crossover_probability=0.2, 
                                                 mutation_probability=0.8, tournament_size=3, 
                                                 elitism=True, max_features=None, verbose=True, 
                                                 keep_top_k=top_k, criteria='max', 
                                                 algorithm='eaMuPlusLambda', refit=True, 
                                                 n_jobs=4, 
                                                 pre_dispatch=dispatch, error_score=np.nan, 
                                                 return_train_score=return_train, log_config=None)

        geneSelectFeature.fit(X_train, y_train)
        y_pred = geneSelectFeature.predict(X_test)
        print('best estimator ' + str(geneSelectFeature.best_estimator_))
        print()
        outcome_labels = ['Intubation False', 'Intubation True']
        print('Genetic Feature Selection:', geneSelectFeature.support_, '\n')
        
        print('classification report \n', classification_report(y_test, y_pred, target_names=outcome_labels))
        micro_roc_auc_ovr = roc_auc_score(y_test, y_pred, multi_class="ovr", average="micro")
        print(f"Micro-averaged One-vs-Rest ROC AUC score:\n{micro_roc_auc_ovr:.2f}")
        print('\n')
        return ""
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state = 0) #train split
    
    if flag == "std":
        print('StandardScaler')
        analyse(model(StandardScaler()))
        
    elif flag == "rbt":
        print('RobustScaler')
        analyse(model(RobustScaler()))
    return ""

In [4]:
print('### SVM SGDClassifier ###')
print()

print('SGDClassifier all under 40 missing')
print()

print('Genetic Algorithm Feature Selection')
smoten_median_imputed_less_40 = pd.read_csv('smoten_median_imputed_less_40.csv', index_col=False)
smoten_median_imputed_less_40 = pd.read_csv('smoten_median_imputed_less_40.csv', index_col=False)

X = smoten_median_imputed_less_40.drop('outcome',axis= 1)
y = smoten_median_imputed_less_40['outcome']

best_paramsS =  {'estimate__alpha': 0.1001, 'estimate__validation_fraction': 0.9}
best_paramsR =  {'estimate__alpha': 0.1001, 'estimate__validation_fraction': 0.5}

estimate = SGDClassifier(loss='hinge', penalty='l2', 
                         alpha=best_paramsS['estimate__alpha'], 
                         l1_ratio=0.15, 
                         fit_intercept=True, max_iter=1000, 
                         tol=0.001, shuffle=True, 
                         verbose=0, epsilon=0.1, 
                         n_jobs=-1, random_state=None, 
                         learning_rate='optimal', 
                         eta0=0.0, power_t=0.5, 
                         early_stopping=True, 
                         validation_fraction=best_paramsS['estimate__validation_fraction'], 
                         n_iter_no_change=5, 
                         class_weight=None, 
                         warm_start=False, average=False)

myGAFeature(X, y, estimate, 'std')


estimate = SGDClassifier(loss='hinge', penalty='l2', 
                         alpha=best_paramsR['estimate__alpha'], 
                         l1_ratio=0.15, 
                         fit_intercept=True, max_iter=1000, 
                         tol=0.001, shuffle=True, 
                         verbose=0, epsilon=0.1, 
                         n_jobs=-1, random_state=None, 
                         learning_rate='optimal', 
                         eta0=0.0, power_t=0.5, 
                         early_stopping=True, 
                         validation_fraction=best_paramsR['estimate__validation_fraction'], 
                         n_iter_no_change=5, 
                         class_weight=None, 
                         warm_start=False, average=False)

myGAFeature(X, y, estimate, 'rbt')

### SVM SGDClassifier ###

SGDClassifier all under 40 missing

Genetic Algorithm Feature Selection
StandardScaler
gen	nevals	fitness 	fitness_std	fitness_max	fitness_min
0  	50    	0.942175	0.0136419  	0.961551   	0.900011   
1  	100   	0.952035	0.00708129 	0.961551   	0.936913   
2  	100   	0.953203	0.00701882 	0.962368   	0.932132   
3  	100   	0.954384	0.00801338 	0.96587    	0.931705   
4  	100   	0.953061	0.00930518 	0.963844   	0.931563   
5  	100   	0.953797	0.00862968 	0.963844   	0.931581   
6  	100   	0.95528 	0.00726571 	0.963133   	0.933963   
7  	100   	0.954496	0.00962022 	0.964822   	0.92024    
8  	100   	0.958655	0.00757764 	0.964679   	0.914302   
9  	100   	0.956444	0.0109197  	0.964679   	0.918586   
10 	100   	0.957388	0.00868889 	0.964679   	0.928203   
11 	100   	0.954501	0.0122333  	0.966066   	0.924986   
12 	100   	0.953219	0.0124927  	0.966066   	0.912578   
13 	100   	0.956448	0.0101584  	0.966066   	0.930141   
14 	100   	0.958555	0.00849032 	0.966066   	0.



best estimator Pipeline(steps=[('scaler', StandardScaler()),
                ('estimate',
                 SGDClassifier(alpha=0.1001, early_stopping=True, n_jobs=-1,
                               validation_fraction=0.9))])

Genetic Feature Selection: [ True  True  True  True  True False  True False  True False  True False
 False  True  True False  True False  True False  True False  True  True
 False  True  True  True  True  True  True  True False] 

classification report 
                   precision    recall  f1-score   support

Intubation False       0.96      0.97      0.97      6902
 Intubation True       0.98      0.96      0.97      7162

        accuracy                           0.97     14064
       macro avg       0.97      0.97      0.97     14064
    weighted avg       0.97      0.97      0.97     14064

Micro-averaged One-vs-Rest ROC AUC score:
0.97


RobustScaler
gen	nevals	fitness 	fitness_std	fitness_max	fitness_min
0  	50    	0.941279	0.0146952  	0.961711   	0.897



''

In [38]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn_genetic import *
from sklearn_genetic.space import *
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, StandardScaler, RobustScaler
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier, SGDRegressor, RidgeCV, LassoCV
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split, KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import accuracy_score, mean_squared_error, classification_report, confusion_matrix, roc_auc_score
from imblearn.over_sampling import SMOTEN


def myGAFeature(X, y, estimate, flag, feature_count):
    
    model = lambda aPipe: Pipeline([('scaler',  aPipe), ('estimate', estimate)])
    
    cv = 5 
    top_k = 1
    dispatch = '4*n_jobs'
    return_train = True
    gen = 22
    
    def analyse(model):
        SelectFeature = SequentialFeatureSelector(estimator=model, n_features_to_select=feature_count, tol=None, direction='forward', 
                                                  scoring='f1_micro', cv=cv, n_jobs=None)

        SelectFeature.fit(X_train, y_train)
        #y_pred = SelectFeature.predict(X_test)
        #print('best estimator ' + str(SelectFeature.best_estimator_))
        print()
        outcome_labels = ['Intubation False', 'Intubation True']
        print('Feature Selection:') 
        features = []
        for i in range(len(SelectFeature.feature_names_in_)):
            if SelectFeature.support_[i] == True:
                features.append(SelectFeature.feature_names_in_[i])
        print(features)
        print()
        #print('classification report \n', classification_report(y_test, y_pred, target_names=outcome_labels))
        #micro_roc_auc_ovr = roc_auc_score(y_test, y_pred, multi_class="ovr", average="micro")
        #print(f"Micro-averaged One-vs-Rest ROC AUC score:\n{micro_roc_auc_ovr:.2f}")
        #print('\n')
        return ""
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state = 0) #train split
    
    if flag == "std":
        print('StandardScaler')
        analyse(model(StandardScaler()))
        
    elif flag == "rbt":
        print('RobustScaler')
        analyse(model(RobustScaler()))
    return ""


pd.set_option("display.max_columns", None)


In [41]:
print('### SVM SGDClassifier ### - SFS Forward')
print()
'''
print('SMOTEN median noncontin')
print()

print('SFS Feature Selection')
smoten_noncontin = pd.read_csv('smoten_noncontin.csv', index_col=False)
smoten_noncontin = pd.read_csv('smoten_noncontin.csv', index_col=False)

X = smoten_noncontin.drop('outcome',axis= 1)
y = smoten_noncontin['outcome']


best_paramsS =  {'estimate__alpha': 0.2001, 'estimate__validation_fraction': 0.9}
best_paramsR =  {'estimate__alpha': 0.1001, 'estimate__validation_fraction': 0.1}

estimate = SGDClassifier(loss='hinge', penalty='l2', 
                         alpha=best_paramsS['estimate__alpha'], 
                         l1_ratio=0.15, 
                         fit_intercept=True, max_iter=1000, 
                         tol=0.001, shuffle=True, 
                         verbose=0, epsilon=0.1, 
                         n_jobs=None, random_state=None, 
                         learning_rate='optimal', 
                         eta0=0.0, power_t=0.5, 
                         early_stopping=True, 
                         validation_fraction=best_paramsS['estimate__validation_fraction'], 
                         n_iter_no_change=5, 
                         class_weight=None, 
                         warm_start=False, average=False)

myGAFeature(X, y, estimate, 'std',3)


estimate = SGDClassifier(loss='hinge', penalty='l2', 
                         alpha=best_paramsR['estimate__alpha'], 
                         l1_ratio=0.15, 
                         fit_intercept=True, max_iter=1000, 
                         tol=0.001, shuffle=True, 
                         verbose=0, epsilon=0.1, 
                         n_jobs=None, random_state=None, 
                         learning_rate='optimal', 
                         eta0=0.0, power_t=0.5, 
                         early_stopping=True, 
                         validation_fraction=best_paramsR['estimate__validation_fraction'], 
                         n_iter_no_change=5, 
                         class_weight=None, 
                         warm_start=False, average=False)

myGAFeature(X, y, estimate, 'rbt',3)
'''

print('SMOTEN knn contin')
print()

print('SFS Feature Selection')
smoten_knn_contin = pd.read_csv('smoten_knn_contin.csv', index_col=False).drop(['pt_min','pt_max','urineoutput'],axis=1)

X = smoten_knn_contin.drop('outcome',axis= 1)
y = smoten_knn_contin['outcome']


best_paramsS =  {'estimate__alpha': 0.0001, 'estimate__validation_fraction': 0.1}
best_paramsR =  {'estimate__alpha': 0.1001, 'estimate__validation_fraction': 0.5}

estimate = SGDClassifier(loss='hinge', penalty='l2', 
                         alpha=best_paramsS['estimate__alpha'], 
                         l1_ratio=0.15, 
                         fit_intercept=True, max_iter=1000, 
                         tol=0.001, shuffle=True, 
                         verbose=0, epsilon=0.1, 
                         n_jobs=None, random_state=None, 
                         learning_rate='optimal', 
                         eta0=0.0, power_t=0.5, 
                         early_stopping=True, 
                         validation_fraction=best_paramsS['estimate__validation_fraction'], 
                         n_iter_no_change=5, 
                         class_weight=None, 
                         warm_start=False, average=False)

myGAFeature(X, y, estimate, 'std',5)


estimate = SGDClassifier(loss='hinge', penalty='l2', 
                         alpha=best_paramsR['estimate__alpha'], 
                         l1_ratio=0.15, 
                         fit_intercept=True, max_iter=1000, 
                         tol=0.001, shuffle=True, 
                         verbose=0, epsilon=0.1, 
                         n_jobs=None, random_state=None, 
                         learning_rate='optimal', 
                         eta0=0.0, power_t=0.5, 
                         early_stopping=True, 
                         validation_fraction=best_paramsR['estimate__validation_fraction'], 
                         n_iter_no_change=5, 
                         class_weight=None, 
                         warm_start=False, average=False)

myGAFeature(X, y, estimate, 'rbt',5)



print('SMOTEN median contin')
print()

print('SFS Feature Selection')
smoten_median_imputed_contin = pd.read_csv('smoten_median_imputed_contin.csv', index_col=False).drop(['pt_min','pt_max','urineoutput'],axis=1)

X = smoten_median_imputed_contin.drop('outcome',axis= 1)
y = smoten_median_imputed_contin['outcome']

best_paramsS =  {'estimate__alpha': 0.1001, 'estimate__validation_fraction': 0.7}
best_paramsR =  {'estimate__alpha': 0.1001, 'estimate__validation_fraction': 0.1}

estimate = SGDClassifier(loss='hinge', penalty='l2', 
                         alpha=best_paramsS['estimate__alpha'], 
                         l1_ratio=0.15, 
                         fit_intercept=True, max_iter=1000, 
                         tol=0.001, shuffle=True, 
                         verbose=0, epsilon=0.1, 
                         n_jobs=None, random_state=None, 
                         learning_rate='optimal', 
                         eta0=0.0, power_t=0.5, 
                         early_stopping=True, 
                         validation_fraction=best_paramsS['estimate__validation_fraction'], 
                         n_iter_no_change=5, 
                         class_weight=None, 
                         warm_start=False, average=False)

myGAFeature(X, y, estimate, 'std',5)


estimate = SGDClassifier(loss='hinge', penalty='l2', 
                         alpha=best_paramsR['estimate__alpha'], 
                         l1_ratio=0.15, 
                         fit_intercept=True, max_iter=1000, 
                         tol=0.001, shuffle=True, 
                         verbose=0, epsilon=0.1, 
                         n_jobs=None, random_state=None, 
                         learning_rate='optimal', 
                         eta0=0.0, power_t=0.5, 
                         early_stopping=True, 
                         validation_fraction=best_paramsR['estimate__validation_fraction'], 
                         n_iter_no_change=5, 
                         class_weight=None, 
                         warm_start=False, average=False)

myGAFeature(X, y, estimate, 'rbt',5)
'''
print('SGDClassifier all under 40 missing')
print()

print('SFS Feature Selection')
smoten_median_imputed_less_40 = pd.read_csv('smoten_median_imputed_less_40.csv', index_col=False)
smoten_median_imputed_less_40 = pd.read_csv('smoten_median_imputed_less_40.csv', index_col=False)

X = smoten_median_imputed_less_40.drop('outcome',axis= 1)
y = smoten_median_imputed_less_40['outcome']

best_paramsS =  {'estimate__alpha': 0.1001, 'estimate__validation_fraction': 0.9}
best_paramsR =  {'estimate__alpha': 0.1001, 'estimate__validation_fraction': 0.5}

estimate = SGDClassifier(loss='hinge', penalty='l2', 
                         alpha=best_paramsS['estimate__alpha'], 
                         l1_ratio=0.15, 
                         fit_intercept=True, max_iter=1000, 
                         tol=0.001, shuffle=True, 
                         verbose=0, epsilon=0.1, 
                         n_jobs=-1, random_state=None, 
                         learning_rate='optimal', 
                         eta0=0.0, power_t=0.5, 
                         early_stopping=True, 
                         validation_fraction=best_paramsS['estimate__validation_fraction'], 
                         n_iter_no_change=5, 
                         class_weight=None, 
                         warm_start=False, average=False)

myGAFeature(X, y, estimate, 'std',5)


estimate = SGDClassifier(loss='hinge', penalty='l2', 
                         alpha=best_paramsR['estimate__alpha'], 
                         l1_ratio=0.15, 
                         fit_intercept=True, max_iter=1000, 
                         tol=0.001, shuffle=True, 
                         verbose=0, epsilon=0.1, 
                         n_jobs=-1, random_state=None, 
                         learning_rate='optimal', 
                         eta0=0.0, power_t=0.5, 
                         early_stopping=True, 
                         validation_fraction=best_paramsR['estimate__validation_fraction'], 
                         n_iter_no_change=5, 
                         class_weight=None, 
                         warm_start=False, average=False)

myGAFeature(X, y, estimate, 'rbt',5)
'''


### SVM SGDClassifier ### - SFS Forward

SMOTEN median noncontin

SFS Feature Selection
StandardScaler

Feature Selection:
['sofa_coagulation', 'sofa_cardiovascular', 'sofa_cns']

RobustScaler

Feature Selection:
['sofa_coagulation', 'sofa_cardiovascular', 'sofa_cns']

SMOTEN knn contin

SFS Feature Selection
StandardScaler

Feature Selection:
['age', 'heart_rate_mean', 'sbp_mean', 'temperature_mean', 'hemoglobin_max']

RobustScaler

Feature Selection:
['age', 'dbp_mean', 'temperature_mean', 'glucose_max', 'wbc_max']

SMOTEN median contin

SFS Feature Selection
StandardScaler

Feature Selection:
['age', 'heart_rate_mean', 'dbp_mean', 'temperature_mean', 'hemoglobin_max']

RobustScaler

Feature Selection:
['age', 'dbp_mean', 'temperature_mean', 'glucose_max', 'wbc_max']



"\nprint('SGDClassifier all under 40 missing')\nprint()\n\nprint('SFS Feature Selection')\nsmoten_median_imputed_less_40 = pd.read_csv('smoten_median_imputed_less_40.csv', index_col=False)\nsmoten_median_imputed_less_40 = pd.read_csv('smoten_median_imputed_less_40.csv', index_col=False)\n\nX = smoten_median_imputed_less_40.drop('outcome',axis= 1)\ny = smoten_median_imputed_less_40['outcome']\n\nbest_paramsS =  {'estimate__alpha': 0.1001, 'estimate__validation_fraction': 0.9}\nbest_paramsR =  {'estimate__alpha': 0.1001, 'estimate__validation_fraction': 0.5}\n\nestimate = SGDClassifier(loss='hinge', penalty='l2', \n                         alpha=best_paramsS['estimate__alpha'], \n                         l1_ratio=0.15, \n                         fit_intercept=True, max_iter=1000, \n                         tol=0.001, shuffle=True, \n                         verbose=0, epsilon=0.1, \n                         n_jobs=-1, random_state=None, \n                         learning_rate='optimal',