Feature selection

Genetic Feature Selection

In [8]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn_genetic import *
from sklearn_genetic.space import *
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, StandardScaler, RobustScaler
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier, SGDRegressor, RidgeCV, LassoCV
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split, KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import accuracy_score, mean_squared_error, classification_report, confusion_matrix, roc_auc_score
from imblearn.over_sampling import SMOTEN

pd.set_option("display.max_columns", None)

def myGAFeature(X, y, estimate, flag):
    
    model = lambda aPipe: Pipeline([('scaler',  aPipe), ('estimate', estimate)])
    
    cv = 5 
    top_k = 1
    dispatch = '8*n_jobs'
    return_train = True
    gen = 40
    
    def analyse(model):
        geneSelectFeature = GAFeatureSelectionCV(estimator = model, cv=cv, 
                                                 scoring='f1_weighted', population_size=50, 
                                                 generations=gen, crossover_probability=0.2, 
                                                 mutation_probability=0.8, tournament_size=3, 
                                                 elitism=True, max_features=None, verbose=True, 
                                                 keep_top_k=top_k, criteria='max', 
                                                 algorithm='eaMuPlusLambda', refit=True, 
                                                 n_jobs=4, 
                                                 pre_dispatch=dispatch, error_score=np.nan, 
                                                 return_train_score=return_train, log_config=None)

        geneSelectFeature.fit(X_train, y_train)
        y_pred = geneSelectFeature.predict(X_test)
        print('best estimator ' + str(geneSelectFeature.best_estimator_))
        print()
        outcome_labels = ['Intubation False', 'Intubation True']
        print('Genetic Feature Selection:', geneSelectFeature.support_, '\n')
        for i in range(len(clf.feature_names_in_)):
            print(clf.feature_names_in_[i], ":" ,clf[1].feature_importances_[i])
            print('score \n', geneSelectFeature.score(X_train, y_train))
            print()
        print('classification report \n', classification_report(y_test, y_pred, target_names=outcome_labels))
        micro_roc_auc_ovr = roc_auc_score(y_test, y_pred, multi_class="ovr", average="micro")
        print(f"Micro-averaged One-vs-Rest ROC AUC score:\n{micro_roc_auc_ovr:.2f}")
        print('\n')
        return ""
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state = 0) #train split
    
    if flag == "std":
        print('StandardScaler')
        analyse(model(StandardScaler()))
        
    elif flag == "rbt":
        print('RobustScaler')
        analyse(model(RobustScaler()))
    return ""


print('### SVM SGDClassifier ###')
print()

print('SMOTEN median noncontin')
print()

print('Genetic Algorithm Feature Selection')
smoten_noncontin = pd.read_csv('smoten_noncontin.csv', index_col=False)
smoten_noncontin = pd.read_csv('smoten_noncontin.csv', index_col=False)

X = smoten_noncontin.drop('outcome',axis= 1)
y = smoten_noncontin['outcome']


best_paramsS =  {'estimate__alpha': 0.1001, 'estimate__validation_fraction': 0.1}
best_paramsR =  {'estimate__alpha': 0.1001, 'estimate__validation_fraction': 0.1}

estimate = SGDClassifier(loss='hinge', penalty='l2', 
                         alpha=best_paramsS['estimate__alpha'], 
                         l1_ratio=0.15, 
                         fit_intercept=True, max_iter=1000, 
                         tol=0.001, shuffle=True, 
                         verbose=0, epsilon=0.1, 
                         n_jobs=None, random_state=None, 
                         learning_rate='optimal', 
                         eta0=0.0, power_t=0.5, 
                         early_stopping=False, 
                         validation_fraction=best_paramsS['estimate__validation_fraction'], 
                         n_iter_no_change=5, 
                         class_weight=None, 
                         warm_start=False, average=False)

myGAFeature(X, y, estimate, 'std')


estimate = SGDClassifier(loss='hinge', penalty='l2', 
                         alpha=best_paramsR['estimate__alpha'], 
                         l1_ratio=0.15, 
                         fit_intercept=True, max_iter=1000, 
                         tol=0.001, shuffle=True, 
                         verbose=0, epsilon=0.1, 
                         n_jobs=None, random_state=None, 
                         learning_rate='optimal', 
                         eta0=0.0, power_t=0.5, 
                         early_stopping=True, 
                         validation_fraction=best_paramsR['estimate__validation_fraction'], 
                         n_iter_no_change=5, 
                         class_weight=None, 
                         warm_start=False, average=False)

myGAFeature(X, y, estimate, 'rbt')

    
    
    
    


### SVM SGDClassifier ###

SMOTEN median noncontin

Genetic Algorithm Feature Selection
StandardScaler
gen	nevals	fitness 	fitness_std	fitness_max	fitness_min
0  	50    	0.600015	0.122764   	0.695968   	0.334857   
1  	100   	0.678345	0.0553575  	0.695968   	0.416458   
2  	100   	0.671057	0.0637169  	0.695968   	0.416458   
3  	100   	0.670801	0.0639475  	0.695968   	0.426491   
4  	100   	0.684979	0.0446847  	0.695968   	0.454232   
5  	100   	0.677068	0.0513111  	0.695968   	0.526948   
6  	100   	0.679351	0.0498932  	0.695968   	0.519325   
7  	100   	0.677737	0.0494291  	0.695968   	0.532096   
8  	100   	0.680322	0.0470016  	0.695968   	0.530502   
9  	100   	0.690085	0.028819   	0.695968   	0.548901   
10 	100   	0.663555	0.0657085  	0.695968   	0.46632    
11 	100   	0.674824	0.0524572  	0.695968   	0.533384   
12 	100   	0.683672	0.0417412  	0.695968   	0.533384   
13 	100   	0.69277 	0.022386   	0.695968   	0.536068   
14 	100   	0.683149	0.0435318  	0.695968   	0.528348   
1



NameError: name 'clf' is not defined

In [None]:
print('### SVM SGDClassifier ###')
print()

print('SMOTEN knn contin')
print()

print('Genetic Algorithm Feature Selection')
smoten_knn_contin = pd.read_csv('smoten_knn_contin.csv', index_col=False)
smoten_knn_contin = pd.read_csv('smoten_knn_contin.csv', index_col=False)

X = smoten_knn_contin.drop('outcome',axis= 1)
y = smoten_knn_contin['outcome']


best_paramsS =  {'estimate__alpha': 0.0001, 'estimate__validation_fraction': 0.1}
best_paramsR =  {'estimate__alpha': 0.1001, 'estimate__validation_fraction': 0.5}

estimate = SGDClassifier(loss='hinge', penalty='l2', 
                         alpha=best_paramsS['estimate__alpha'], 
                         l1_ratio=0.15, 
                         fit_intercept=True, max_iter=1000, 
                         tol=0.001, shuffle=True, 
                         verbose=0, epsilon=0.1, 
                         n_jobs=None, random_state=None, 
                         learning_rate='optimal', 
                         eta0=0.0, power_t=0.5, 
                         early_stopping=False, 
                         validation_fraction=best_paramsS['estimate__validation_fraction'], 
                         n_iter_no_change=5, 
                         class_weight=None, 
                         warm_start=False, average=False)

myGAFeature(X, y, estimate, 'std')


estimate = SGDClassifier(loss='hinge', penalty='l2', 
                         alpha=best_paramsR['estimate__alpha'], 
                         l1_ratio=0.15, 
                         fit_intercept=True, max_iter=1000, 
                         tol=0.001, shuffle=True, 
                         verbose=0, epsilon=0.1, 
                         n_jobs=None, random_state=None, 
                         learning_rate='optimal', 
                         eta0=0.0, power_t=0.5, 
                         early_stopping=True, 
                         validation_fraction=best_paramsR['estimate__validation_fraction'], 
                         n_iter_no_change=5, 
                         class_weight=None, 
                         warm_start=False, average=False)

myGAFeature(X, y, estimate, 'rbt')


In [None]:
print('### SVM SGDClassifier ###')
print()

print('SMOTEN median contin')
print()

print('Genetic Algorithm Feature Selection')
smoten_median_imputed_contin = pd.read_csv('smoten_median_imputed_contin.csv', index_col=False)
smoten_median_imputed_contin = pd.read_csv('smoten_median_imputed_contin.csv', index_col=False)

X = smoten_median_imputed_contin.drop('outcome',axis= 1)
y = smoten_median_imputed_contin['outcome']

best_paramsS =  {'estimate__alpha': 0.1001, 'estimate__validation_fraction': 0.7}
best_paramsR =  {'estimate__alpha': 0.1001, 'estimate__validation_fraction': 0.1}

estimate = SGDClassifier(loss='hinge', penalty='l2', 
                         alpha=best_paramsS['estimate__alpha'], 
                         l1_ratio=0.15, 
                         fit_intercept=True, max_iter=1000, 
                         tol=0.001, shuffle=True, 
                         verbose=0, epsilon=0.1, 
                         n_jobs=None, random_state=None, 
                         learning_rate='optimal', 
                         eta0=0.0, power_t=0.5, 
                         early_stopping=False, 
                         validation_fraction=best_paramsS['estimate__validation_fraction'], 
                         n_iter_no_change=5, 
                         class_weight=None, 
                         warm_start=False, average=False)

myGAFeature(X, y, estimate, 'std')


estimate = SGDClassifier(loss='hinge', penalty='l2', 
                         alpha=best_paramsR['estimate__alpha'], 
                         l1_ratio=0.15, 
                         fit_intercept=True, max_iter=1000, 
                         tol=0.001, shuffle=True, 
                         verbose=0, epsilon=0.1, 
                         n_jobs=None, random_state=None, 
                         learning_rate='optimal', 
                         eta0=0.0, power_t=0.5, 
                         early_stopping=True, 
                         validation_fraction=best_paramsR['estimate__validation_fraction'], 
                         n_iter_no_change=5, 
                         class_weight=None, 
                         warm_start=False, average=False)

myGAFeature(X, y, estimate, 'rbt')

In [None]:
print('### SVM SGDClassifier ###')
print()

print('SGDClassifier all under 40 missing')
print()

print('Genetic Algorithm Feature Selection')
smoten_median_imputed_contin = pd.read_csv('smoten_median_imputed_contin.csv', index_col=False)
smoten_median_imputed_contin = pd.read_csv('smoten_median_imputed_contin.csv', index_col=False)

X = smoten_median_imputed_contin.drop('outcome',axis= 1)
y = smoten_median_imputed_contin['outcome']

best_paramsS =  {'estimate__alpha': 0.1001, 'estimate__validation_fraction': 0.9}
best_paramsR =  {'estimate__alpha': 0.1001, 'estimate__validation_fraction': 0.5}

estimate = SGDClassifier(loss='hinge', penalty='l2', 
                         alpha=best_paramsS['estimate__alpha'], 
                         l1_ratio=0.15, 
                         fit_intercept=True, max_iter=1000, 
                         tol=0.001, shuffle=True, 
                         verbose=0, epsilon=0.1, 
                         n_jobs=None, random_state=None, 
                         learning_rate='optimal', 
                         eta0=0.0, power_t=0.5, 
                         early_stopping=False, 
                         validation_fraction=best_paramsS['estimate__validation_fraction'], 
                         n_iter_no_change=5, 
                         class_weight=None, 
                         warm_start=False, average=False)

myGAFeature(X, y, estimate, 'std')


estimate = SGDClassifier(loss='hinge', penalty='l2', 
                         alpha=best_paramsR['estimate__alpha'], 
                         l1_ratio=0.15, 
                         fit_intercept=True, max_iter=1000, 
                         tol=0.001, shuffle=True, 
                         verbose=0, epsilon=0.1, 
                         n_jobs=None, random_state=None, 
                         learning_rate='optimal', 
                         eta0=0.0, power_t=0.5, 
                         early_stopping=True, 
                         validation_fraction=best_paramsR['estimate__validation_fraction'], 
                         n_iter_no_change=5, 
                         class_weight=None, 
                         warm_start=False, average=False)

myGAFeature(X, y, estimate, 'rbt')