In [1]:
import pandas as pd
import numpy as np
from time import time
from config_loader import load
import argparse
import sys
import seaborn as sns
from MyDataUnderstanding import featureAnalysis
from MyPreprocessing import MyPreprocessing
import numpy as np
from time import time
import matplotlib.pyplot as plt
from model.models import models_perform
from sklearn.model_selection import KFold
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from statistics import mean
from sklearn.model_selection import KFold, cross_val_predict
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel
from detect_outliers import detect_outliers
from MyFeatureSelection import MyFeatureSelection

In [2]:
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.svm import SVC
from xgboost import XGBClassifier as XGB
from sklearn.neural_network import MLPClassifier as MLP
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from model.MyIBL import MyIBL as IBL
from sklearn.linear_model import LogisticRegression as LG

In [3]:
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 20)
pd.set_option('display.width', 500)

In [4]:
def getData(path, filenames_type):
    '''
    features_lst = [
        "Pclass", "Survived", "Name", "Sex", "Age",
        "Sibsp", "Parch", "Ticket", "Fare", "Cabin","Embarked"]
    '''
    if filenames_type == 'train':
        filename = 'train'
    elif filenames_type == 'test':
        filename = 'test'
    else:
        filename = 'titanicAll'
    df_features = pd.read_csv(path + filename + '.csv',
                           sep=',')

    if filenames_type not in ['train', 'test']:
        # drop unnecessary columns that don't exist in the official dataset
        df_features.drop(['Boat', 'Body', 'Home.dest'],
                          axis=1,
                         inplace=True)
    #labels = df_features['Survived']
    #df_features = df_features.drop(['Survived'], axis=1)
    return df_features

In [5]:
##
# Loads config
parser = argparse.ArgumentParser()
parser.add_argument(
    "-c", "--config", default="titanic.cfg",
    help="specify the location of the clustering config file"
)
args, _ = parser.parse_known_args()

config_file = args.config
config = load(config_file)

##
verbose = config.get('titanic', 'verbose')
path = config.get('titanic', 'path') + '/'
file_type = config.get('titanic', 'file_type')

filename_type = 'train'
if file_type == 'all':
    filename_type = 'other'

In [6]:
print('Filename type:', filename_type)
print()
## train
trainData = getData(path, filename_type)
# Preprocessing
trainPreprocess = MyPreprocessing(process_type='all',
                                  filename_type=filename_type,
                                  remove_outliers=True)

## test
filename_type = 'test'
testData = getData(path, filename_type)
# Preprocessing
testPreprocess = MyPreprocessing(process_type='all',
                                 filename_type=filename_type,
                                 remove_outliers=False)


Filename type: train



In [7]:
trainPreprocess.fit(trainData)
df_train = trainPreprocess.new_df
# the labels "Survived"
labels = trainPreprocess.labels_
#print(labels.head())
# the initial dataset without any preprocessing
#print(trainPreprocess.df_initial.head())
# the preprocessed data
#print(trainPreprocess.new_df.head())

testPreprocess.fit(testData)
df_test = testPreprocess.new_df

process_type: all
process_type: all


In [8]:
# fix missing columns because of NaNs and one hot encoding without dummy_na
if df_train.shape[1] != df_test.shape[1]:
    missing_cols = set(df_test.columns) - set(df_train.columns)
    for col in missing_cols:
        #df_train[col] = np.zeros([df_train.shape[0], 1])
        df_test.drop([col], axis=1, inplace=True)

    missing_cols = set(df_train.columns) - set(df_test.columns)
    for col in missing_cols:
        #df_test[col] = np.zeros([df_test.shape[0], 1])
        df_train.drop([col], axis=1, inplace=True)

labels_test = testPreprocess.labels_

print(df_train.columns, df_test.columns)
print(df_train.shape, df_test.shape)

Index(['Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'PclassCp_2', 'PclassCp_3', 'Title_Mr0', 'Title_Mr50', 'Title_Mrs', 'Title_Ms', 'FamilySize', 'Em_C', 'Em_Q', 'Em_S', 'Cabin_A', 'Cabin_B', 'Cabin_C', 'Cabin_D', 'Cabin_E', 'Cabin_F', 'Cabin_G', 'Cabin_X', 'Age_bin_Kid', 'Age_bin_Teenager', 'Age_bin_Adult', 'Family_bin_SmallFamily', 'Family_bin_BigFamily', 'Family_bin_Team', 'Fare_bin_Median', 'Fare_bin_Average', 'Fare_bin_High', 'Mother', 'Father', 'Daughter', 'Son', 'Orphan', 'RichWoman', 'MiddleClassWoman', 'PoorWoman', 'RichMan', 'MiddleClassMan', 'PoorMan', 'RichGirl', 'MiddleClassGirl', 'PoorGirl', 'RichBoy', 'MiddleClassBoy', 'PoorBoy'], dtype='object') Index(['Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'PclassCp_2', 'PclassCp_3', 'Title_Mr0', 'Title_Mr50', 'Title_Mrs', 'Title_Ms', 'FamilySize', 'Em_C', 'Em_Q', 'Em_S', 'Cabin_A', 'Cabin_B', 'Cabin_C', 'Cabin_D', 'Cabin_E', 'Cabin_F', 'Cabin_G', 'Cabin_X', 'Age_bin_Kid', 'Age_bin_Teenager', 'Age_bin_Adult', 'Family_bin_SmallFamily', 'Fam

In [9]:
df_train.columns

Index(['Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'PclassCp_2', 'PclassCp_3', 'Title_Mr0', 'Title_Mr50', 'Title_Mrs', 'Title_Ms', 'FamilySize', 'Em_C', 'Em_Q', 'Em_S', 'Cabin_A', 'Cabin_B', 'Cabin_C', 'Cabin_D', 'Cabin_E', 'Cabin_F', 'Cabin_G', 'Cabin_X', 'Age_bin_Kid', 'Age_bin_Teenager', 'Age_bin_Adult', 'Family_bin_SmallFamily', 'Family_bin_BigFamily', 'Family_bin_Team', 'Fare_bin_Median', 'Fare_bin_Average', 'Fare_bin_High', 'Mother', 'Father', 'Daughter', 'Son', 'Orphan', 'RichWoman', 'MiddleClassWoman', 'PoorWoman', 'RichMan', 'MiddleClassMan', 'PoorMan', 'RichGirl', 'MiddleClassGirl', 'PoorGirl', 'RichBoy', 'MiddleClassBoy', 'PoorBoy'], dtype='object')

In [10]:
df_test.columns

Index(['Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'PclassCp_2', 'PclassCp_3', 'Title_Mr0', 'Title_Mr50', 'Title_Mrs', 'Title_Ms', 'FamilySize', 'Em_C', 'Em_Q', 'Em_S', 'Cabin_A', 'Cabin_B', 'Cabin_C', 'Cabin_D', 'Cabin_E', 'Cabin_F', 'Cabin_G', 'Cabin_X', 'Age_bin_Kid', 'Age_bin_Teenager', 'Age_bin_Adult', 'Family_bin_SmallFamily', 'Family_bin_BigFamily', 'Family_bin_Team', 'Fare_bin_Median', 'Fare_bin_Average', 'Fare_bin_High', 'Mother', 'Father', 'Daughter', 'Son', 'Orphan', 'RichWoman', 'MiddleClassWoman', 'PoorWoman', 'RichMan', 'MiddleClassMan', 'PoorMan', 'RichGirl', 'MiddleClassGirl', 'PoorGirl', 'RichBoy', 'MiddleClassBoy', 'PoorBoy'], dtype='object')

In [11]:
#df_test.dtypes

In [12]:
def feature_importance(clf, df_train):
    if hasattr(clf, 'feature_importances_'):
        name = str(clf.__class__).split('.')[-1][:-2]
        feat_imp = pd.DataFrame({'importance':clf.feature_importances_})    
        feat_imp['feature'] = df_train.columns
        feat_imp.sort_values(by='importance', ascending=False, inplace=True)
        #feat_imp = feat_imp.iloc[:top_n]
        feat_imp.sort_values(by='importance', inplace=True)
        feat_imp = feat_imp.set_index('feature', drop=True)
        feat_imp.plot.barh(title='Feature Importance', figsize=(10,10))
        plt.xlabel('%s Feature Importance Score' %name)
        plt.show()
        return feat_imp
    return pd.DataFrame()

In [13]:
def ensemble(clf, x_train, y_train, x_test):
    
    name = str(clf.__class__).split('.')[-1][:-2]
    kf = KFold(n_splits=5)

    ntrain = x_train.shape[0]
    ntest = x_test.shape[0]

    #ens_train = np.zeros((ntrain,))
    ens_train = np.array([])
    #ens_test = np.zeros((ntest,))
    #test_kf = np.empty((kf.n_folds, ntest))
    test_kf = np.zeros((kf.n_splits, ntest))
        
    for i, (train_index, test_index) in enumerate(kf.split(x_train)):
        x_tr = x_train.loc[train_index]
        y_tr = y_train.loc[train_index]
        x_te = x_train.loc[test_index]

        clf.fit(x_tr, y_tr)

        #ens_train[test_index] = clf.predict(x_te)
        ens_train = np.concatenate([ens_train, clf.predict(x_te)])
        # predict from the whole train set 
        #clf.fit(x_train,y_train)
        test_kf[i, :] = clf.predict(x_test)
    
    # features for next ensemble layer
    ens_train = pd.DataFrame(ens_train.reshape(-1,1), columns=[name])
    
    # combine for test
    ens_test = pd.DataFrame(test_kf.T).mode(axis=1)
    ens_test.columns = [name]
    
    return ens_train, ens_test

In [14]:
def modeler(clf, data, labels, train_fidx, validation_fidx, name_prefix):
    name = name_prefix + str(clf.__class__).split('.')[-1][:-2]
    #print(name)
    folds_accuracy = list()
    duration = list()
    start = time()
    for idx, trf in enumerate(train_fidx):
        clf.fit(data.loc[trf], labels.loc[trf])
        prediction_labels = clf.predict(data.loc[validation_fidx[idx]])
        folds_accuracy.append(accuracy_score(labels.loc[validation_fidx[idx]], prediction_labels))
        
    mean_acc = mean(folds_accuracy)
    duration = time() - start
    #print(name, mean_acc, "accuracy at validation stage,", duration, 's')
    df_folds_accuracy = pd.DataFrame([folds_accuracy], index=[name])
    df_folds = pd.DataFrame([[mean_acc, duration/len(train_fidx)]], columns=['Accuracy', 'Duration'], index=[name])
    return clf.fit(data, labels), df_folds, df_folds_accuracy

In [15]:
def get_cv_data(df_train, cv=5):
    kf = KFold(n_splits=cv)
    folds = [(train_idx, validation_idx) for train_idx, validation_idx in kf.split(df_train)]
    train_idx = [f[0] for f in folds]
    validation_idx = [f[1] for f in folds]
    return train_idx, validation_idx

In [16]:
def run_models(clfs, df_train, df_test, train_idx, validation_idx, name_prefix, cv=True, ens=True):
    models = []
    df_ens_features = pd.DataFrame()
    df_ens_test = pd.DataFrame()
    df_model_cvaccuracies = pd.DataFrame()
    df_model_info = pd.DataFrame()
    for clf in clfs:
        name = str(clf.__class__).split('.')[-1][:-2]
        
        # cross validation
        if cv:
            model, df_folds, df_folds_accuracy = modeler(clf, df_train, labels, train_idx, validation_idx, name_prefix)
            models.append(model)
            df_model_cvaccuracies = pd.concat([df_model_cvaccuracies, df_folds_accuracy], sort=False)
            df_model_info = pd.concat([df_model_info, df_folds], sort=False)  
            
            #feature_importance(clf, df_train)
        
        # building ensemble
        if cv and ens:
            #df_feature = pd.DataFrame(cross_val_predict(clf, df_train, labels, cv=ens_kf).reshape(-1,1), columns=[name])
            df_ens_feature, ens_test = ensemble(clf, df_train, labels, df_test)
            df_ens_features = pd.concat([df_ens_features, df_ens_feature], axis=1, sort=False).reset_index(drop=True)
            df_ens_test = pd.concat([df_ens_test, ens_test], axis=1, sort=False).reset_index(drop=True)
            
    return models, df_model_cvaccuracies, df_model_info, df_ens_features, df_ens_test
        

In [17]:
clfs = [IBL(), RF(n_estimators =12), SVC(gamma='scale'), XGB(), MLP(max_iter=1000), KNN(), LDA()]
#clfs = [RF(n_estimators =12), SVC(gamma='scale'), XGB(), KNN(), LDA()]

In [18]:
train_idx, validation_idx = get_cv_data(df_train, cv=5)

In [19]:
import warnings
warnings.filterwarnings("ignore")

In [20]:
print('Original')
print('##################################')
name_prefix = 'ALL_' + str(len(df_train.columns)) + '_'
models, df_results_all_cv, df_results_all, df_ens_features, df_ens_test = run_models(clfs, df_train, df_test, train_idx, validation_idx, name_prefix, cv=True, ens=True)
df_results_all

Original
##################################


Unnamed: 0,Accuracy,Duration
ALL_49_MyIBL,0.716213,2.64659
ALL_49_RandomForestClassifier,0.80631,0.025595
ALL_49_SVC,0.831092,0.032588
ALL_49_XGBClassifier,0.829963,0.150703
ALL_49_MLPClassifier,0.818708,4.661699
ALL_49_KNeighborsClassifier,0.820974,0.014391
ALL_49_LinearDiscriminantAnalysis,0.815337,0.020188


In [50]:
print('PCA')
print('##################################')
df_results_pca = pd.DataFrame()
df_results_pca_cv = pd.DataFrame()
for n_dim in range(15, 22):
    print(n_dim, ' dimensions')
    pca_train, pca_test, ev = MyFeatureSelection.applyPCA(df_train, df_test, n_dim)
    name_prefix = 'PCA_d_' + str(n_dim) + '/' + str(len(df_train.columns)) + '_ev_' + str(round(ev,3)) + '_'
    models, df_model_cvaccuracies, df_model_info, df_ens_features, df_ens_test = run_models(clfs, pca_train, pca_test, train_idx, validation_idx, name_prefix, cv=True, ens=True)
    df_results_pca_cv = pd.concat([df_results_pca_cv, df_model_cvaccuracies], axis=0)
    df_results_pca = pd.concat([df_results_pca, df_model_info], axis=0)

PCA
##################################
15  dimensions
16  dimensions
17  dimensions
18  dimensions
19  dimensions
20  dimensions
21  dimensions


In [51]:
df_results_pca

Unnamed: 0,Accuracy,Duration
PCA_d_15/49_ev_0.889_MyIBL,0.698318,2.427006
PCA_d_15/49_ev_0.889_RandomForestClassifier,0.792833,0.037291
PCA_d_15/49_ev_0.889_SVC,0.825462,0.019388
PCA_d_15/49_ev_0.889_XGBClassifier,0.825462,0.134437
PCA_d_15/49_ev_0.889_MLPClassifier,0.820974,3.42075
PCA_d_15/49_ev_0.889_KNeighborsClassifier,0.820974,0.007395
PCA_d_15/49_ev_0.889_LinearDiscriminantAnalysis,0.823227,0.00759
PCA_d_16/49_ev_0.902_MyIBL,0.730839,2.371318
PCA_d_16/49_ev_0.902_RandomForestClassifier,0.793963,0.038576
PCA_d_16/49_ev_0.902_SVC,0.823215,0.019188


In [52]:
df_results_pca_order = df_results_pca.copy()
df_results_pca_order.sort_values('Accuracy')

Unnamed: 0,Accuracy,Duration
PCA_d_15/49_ev_0.889_MyIBL,0.698318,2.427006
PCA_d_20/49_ev_0.943_MyIBL,0.730794,2.443629
PCA_d_16/49_ev_0.902_MyIBL,0.730839,2.371318
PCA_d_21/49_ev_0.95_MyIBL,0.730896,2.390154
PCA_d_19/49_ev_0.934_MyIBL,0.749971,2.447647
PCA_d_17/49_ev_0.914_MyIBL,0.749971,2.40549
PCA_d_18/49_ev_0.925_MyIBL,0.753412,2.335019
PCA_d_19/49_ev_0.934_RandomForestClassifier,0.786066,0.039543
PCA_d_18/49_ev_0.925_RandomForestClassifier,0.790548,0.039505
PCA_d_21/49_ev_0.95_RandomForestClassifier,0.791703,0.039775


In [53]:
df_results_pca_cv

Unnamed: 0,0,1,2,3,4
PCA_d_15/49_ev_0.889_MyIBL,0.640449,0.702247,0.646067,0.762712,0.740113
PCA_d_15/49_ev_0.889_RandomForestClassifier,0.775281,0.769663,0.797753,0.779661,0.841808
PCA_d_15/49_ev_0.889_SVC,0.831461,0.820225,0.814607,0.785311,0.875706
PCA_d_15/49_ev_0.889_XGBClassifier,0.808989,0.797753,0.859551,0.80226,0.858757
PCA_d_15/49_ev_0.889_MLPClassifier,0.825843,0.775281,0.837079,0.80791,0.858757
PCA_d_15/49_ev_0.889_KNeighborsClassifier,0.780899,0.797753,0.859551,0.80791,0.858757
PCA_d_15/49_ev_0.889_LinearDiscriminantAnalysis,0.775281,0.825843,0.842697,0.79661,0.875706
PCA_d_16/49_ev_0.902_MyIBL,0.702247,0.724719,0.780899,0.745763,0.700565
PCA_d_16/49_ev_0.902_RandomForestClassifier,0.780899,0.758427,0.803371,0.79096,0.836158
PCA_d_16/49_ev_0.902_SVC,0.831461,0.803371,0.820225,0.785311,0.875706


In [54]:
max(df_results_pca['Accuracy'])

0.8277470957912778

In [55]:
print('ICA')
print('##################################')
df_results_ica = pd.DataFrame()
df_results_ica_cv = pd.DataFrame()
for n_dim in [10,20,25,30,35,40,45]:
    print(n_dim, ' dimensions')
    ica_train, ica_test = MyFeatureSelection.applyICA(df_train, df_test, n_dim)
    name_prefix = 'ICA_d_' + str(n_dim) + '/' + str(len(df_train.columns))
    models, df_model_cvaccuracies, df_model_info, df_ens_features, df_ens_test = run_models(clfs, ica_train, ica_test, train_idx, validation_idx, name_prefix, cv=True, ens=True)
    df_results_ica_cv = pd.concat([df_results_ica_cv, df_model_cvaccuracies], axis=0)
    df_results_ica = pd.concat([df_results_ica, df_model_info], axis=0)

ICA
##################################
10  dimensions
20  dimensions
25  dimensions
30  dimensions
35  dimensions
40  dimensions
45  dimensions


In [56]:
df_results_ica

Unnamed: 0,Accuracy,Duration
ICA_d_10/49MyIBL,0.744341,2.392404
ICA_d_10/49RandomForestClassifier,0.775922,0.033422
ICA_d_10/49SVC,0.778137,0.021188
ICA_d_10/49XGBClassifier,0.81646,0.094912
ICA_d_10/49MLPClassifier,0.796217,1.58485
ICA_d_10/49KNeighborsClassifier,0.82324,0.006796
ICA_d_10/49LinearDiscriminantAnalysis,0.790548,0.005997
ICA_d_20/49MyIBL,0.753355,2.596923
ICA_d_20/49RandomForestClassifier,0.764642,0.03997
ICA_d_20/49SVC,0.797315,0.027303


In [57]:
df_results_ica_cv

Unnamed: 0,0,1,2,3,4
ICA_d_10/49MyIBL,0.713483,0.713483,0.831461,0.728814,0.734463
ICA_d_10/49RandomForestClassifier,0.775281,0.741573,0.792135,0.79096,0.779661
ICA_d_10/49SVC,0.786517,0.764045,0.797753,0.734463,0.80791
ICA_d_10/49XGBClassifier,0.808989,0.786517,0.837079,0.80226,0.847458
ICA_d_10/49MLPClassifier,0.786517,0.764045,0.797753,0.757062,0.875706
ICA_d_10/49KNeighborsClassifier,0.803371,0.792135,0.837079,0.80791,0.875706
ICA_d_10/49LinearDiscriminantAnalysis,0.786517,0.769663,0.808989,0.751412,0.836158
ICA_d_20/49MyIBL,0.786517,0.662921,0.831461,0.745763,0.740113
ICA_d_20/49RandomForestClassifier,0.769663,0.769663,0.752809,0.740113,0.79096
ICA_d_20/49SVC,0.786517,0.780899,0.808989,0.768362,0.841808


In [58]:
df_results_ica_order = df_results_ica.copy()
df_results_ica_order.sort_values('Accuracy')

Unnamed: 0,Accuracy,Duration
ICA_d_40/49MyIBL,0.689196,2.734592
ICA_d_30/49MyIBL,0.689221,2.600489
ICA_d_45/49MyIBL,0.709471,2.460483
ICA_d_35/49MyIBL,0.71069,2.619876
ICA_d_25/49MyIBL,0.728687,2.529183
ICA_d_10/49MyIBL,0.744341,2.392404
ICA_d_20/49MyIBL,0.753355,2.596923
ICA_d_20/49RandomForestClassifier,0.764642,0.03997
ICA_d_40/49RandomForestClassifier,0.770253,0.049023
ICA_d_30/49RandomForestClassifier,0.773669,0.042519


In [59]:
max(df_results_ica['Accuracy'])

0.829987938805307

In [60]:
print('INFO GAIN SELECTION')
print('##################################')
df_results_ig = pd.DataFrame()
df_results_ig_cv = pd.DataFrame()
for n_dim in [10,20,25,30,35,40,45]:
    print(n_dim, ' dimensions')
    ig_train, ig_test = MyFeatureSelection.InfoGainSelection(df_train, df_test, labels, n_dim)
    name_prefix = 'IG_d_' + str(n_dim) + '/' + str(len(df_train.columns))
    models, df_model_cvaccuracies, df_model_info, df_ens_features, df_ens_test = run_models(clfs, ig_train, ig_test, train_idx, validation_idx, name_prefix, cv=True, ens=True)
    df_results_ig_cv = pd.concat([df_results_ig_cv, df_model_cvaccuracies], axis=0)
    df_results_ig = pd.concat([df_results_ig, df_model_info], axis=0)

INFO GAIN SELECTION
##################################
10  dimensions
20  dimensions
25  dimensions
30  dimensions
35  dimensions
40  dimensions
45  dimensions


In [61]:
df_results_ig

Unnamed: 0,Accuracy,Duration
IG_d_10/49MyIBL,0.721824,2.701633
IG_d_10/49RandomForestClassifier,0.808589,0.023791
IG_d_10/49SVC,0.80066,0.017101
IG_d_10/49XGBClassifier,0.829969,0.052834
IG_d_10/49MLPClassifier,0.824364,1.938015
IG_d_10/49KNeighborsClassifier,0.822104,0.007397
IG_d_10/49LinearDiscriminantAnalysis,0.807446,0.005798
IG_d_20/49MyIBL,0.732026,2.613834
IG_d_20/49RandomForestClassifier,0.805212,0.025014
IG_d_20/49SVC,0.791659,0.020301


In [62]:
df_results_ig_order = df_results_ig.copy()
df_results_ig_order.sort_values('Accuracy')

Unnamed: 0,Accuracy,Duration
IG_d_45/49MyIBL,0.718517,2.629957
IG_d_10/49MyIBL,0.721824,2.701633
IG_d_35/49MyIBL,0.72298,2.581891
IG_d_30/49MyIBL,0.73087,2.595564
IG_d_20/49MyIBL,0.732026,2.613834
IG_d_25/49MyIBL,0.754542,2.504721
IG_d_40/49MyIBL,0.756783,2.602803
IG_d_20/49SVC,0.791659,0.020301
IG_d_35/49RandomForestClassifier,0.798407,0.023603
IG_d_40/49RandomForestClassifier,0.798419,0.023603


In [63]:
df_results_ig_cv

Unnamed: 0,0,1,2,3,4
IG_d_10/49MyIBL,0.775281,0.679775,0.730337,0.774011,0.649718
IG_d_10/49RandomForestClassifier,0.792135,0.792135,0.814607,0.79096,0.853107
IG_d_10/49SVC,0.825843,0.803371,0.786517,0.757062,0.830508
IG_d_10/49XGBClassifier,0.814607,0.825843,0.837079,0.80791,0.864407
IG_d_10/49MLPClassifier,0.842697,0.780899,0.814607,0.80226,0.881356
IG_d_10/49KNeighborsClassifier,0.825843,0.769663,0.842697,0.80791,0.864407
IG_d_10/49LinearDiscriminantAnalysis,0.792135,0.814607,0.803371,0.785311,0.841808
IG_d_20/49MyIBL,0.730337,0.691011,0.735955,0.785311,0.717514
IG_d_20/49RandomForestClassifier,0.752809,0.803371,0.831461,0.779661,0.858757
IG_d_20/49SVC,0.775281,0.797753,0.808989,0.734463,0.841808


In [64]:
max(df_results_ig['Accuracy'])

0.8355805243445693

In [65]:
print('ANOVA SELECTION')
print('##################################')
df_results_an = pd.DataFrame()
df_results_an_cv = pd.DataFrame()
for n_dim in [10,20,25,30,35,40,45]:
    print(n_dim, ' dimensions')
    an_train, an_test = MyFeatureSelection.AnovaSelection(df_train, df_test, labels, n_dim)
    name_prefix = 'AN_d_' + str(n_dim) + '/' + str(len(df_train.columns))
    models, df_model_cvaccuracies, df_model_info, df_ens_features, df_ens_test = run_models(clfs, an_train, an_test, train_idx, validation_idx, name_prefix, cv=True, ens=True)
    df_results_an_cv = pd.concat([df_results_an_cv, df_model_cvaccuracies], axis=0)
    df_results_an = pd.concat([df_results_an, df_model_info], axis=0)

ANOVA SELECTION
##################################
10  dimensions
20  dimensions
25  dimensions
30  dimensions
35  dimensions
40  dimensions
45  dimensions


In [66]:
df_results_an

Unnamed: 0,Accuracy,Duration
AN_d_10/49MyIBL,0.71394,2.612042
AN_d_10/49RandomForestClassifier,0.797302,0.022314
AN_d_10/49SVC,0.780391,0.016793
AN_d_10/49XGBClassifier,0.806329,0.051943
AN_d_10/49MLPClassifier,0.787202,1.246113
AN_d_10/49KNeighborsClassifier,0.808582,0.006796
AN_d_10/49LinearDiscriminantAnalysis,0.778169,0.005711
AN_d_20/49MyIBL,0.711737,2.645317
AN_d_20/49RandomForestClassifier,0.800724,0.022988
AN_d_20/49SVC,0.800673,0.021789


In [67]:
df_results_an_order = df_results_an.copy()
df_results_an_order.sort_values('Accuracy')

Unnamed: 0,Accuracy,Duration
AN_d_20/49MyIBL,0.711737,2.645317
AN_d_10/49MyIBL,0.71394,2.612042
AN_d_25/49MyIBL,0.728591,2.655629
AN_d_45/49MyIBL,0.733111,2.557232
AN_d_40/49MyIBL,0.738875,2.500354
AN_d_30/49MyIBL,0.756719,2.400575
AN_d_35/49MyIBL,0.757925,2.434744
AN_d_10/49LinearDiscriminantAnalysis,0.778169,0.005711
AN_d_10/49SVC,0.780391,0.016793
AN_d_10/49MLPClassifier,0.787202,1.246113


In [68]:
df_results_an_cv

Unnamed: 0,0,1,2,3,4
AN_d_10/49MyIBL,0.747191,0.657303,0.758427,0.683616,0.723164
AN_d_10/49RandomForestClassifier,0.792135,0.769663,0.825843,0.779661,0.819209
AN_d_10/49SVC,0.780899,0.780899,0.792135,0.734463,0.813559
AN_d_10/49XGBClassifier,0.764045,0.803371,0.831461,0.79661,0.836158
AN_d_10/49MLPClassifier,0.741573,0.797753,0.786517,0.779661,0.830508
AN_d_10/49KNeighborsClassifier,0.803371,0.775281,0.825843,0.79661,0.841808
AN_d_10/49LinearDiscriminantAnalysis,0.747191,0.780899,0.792135,0.757062,0.813559
AN_d_20/49MyIBL,0.646067,0.702247,0.764045,0.711864,0.734463
AN_d_20/49RandomForestClassifier,0.792135,0.758427,0.808989,0.79661,0.847458
AN_d_20/49SVC,0.814607,0.792135,0.797753,0.768362,0.830508


In [69]:
max(df_results_an['Accuracy'])

0.8344696248333651

In [72]:
# Random forest estimator for feature selection
print('Random Forest Estimator Selection')
print('##################################')
rf_train, rf_test = MyFeatureSelection.RandomForestSelection(df_train, df_test, labels, 100)
print(len(rf_train.columns), ' dimensions')
name_prefix = 'RF_d_' + str(len(rf_train.columns)) + '/' + str(len(df_train.columns))
models, df_results_rf_cv, df_results_rf, df_ens_features, df_ens_test = run_models(clfs, rf_train, rf_test, train_idx, validation_idx, name_prefix, cv=True, ens=True)



Random Forest Estimator Selection
##################################
25  dimensions


In [73]:
df_results_rf

Unnamed: 0,Accuracy,Duration
RF_d_25/49MyIBL,0.728674,2.619417
RF_d_25/49RandomForestClassifier,0.805186,0.024819
RF_d_25/49SVC,0.825449,0.02248
RF_d_25/49XGBClassifier,0.834457,0.090197
RF_d_25/49MLPClassifier,0.820974,4.702457
RF_d_25/49KNeighborsClassifier,0.814232,0.010193
RF_d_25/49LinearDiscriminantAnalysis,0.82211,0.010329


In [74]:
df_results_rf_cv

Unnamed: 0,0,1,2,3,4
RF_d_25/49MyIBL,0.668539,0.747191,0.707865,0.762712,0.757062
RF_d_25/49RandomForestClassifier,0.786517,0.780899,0.842697,0.779661,0.836158
RF_d_25/49SVC,0.842697,0.808989,0.825843,0.779661,0.870056
RF_d_25/49XGBClassifier,0.820225,0.820225,0.865169,0.79096,0.875706
RF_d_25/49MLPClassifier,0.808989,0.775281,0.853933,0.80791,0.858757
RF_d_25/49KNeighborsClassifier,0.775281,0.780899,0.848315,0.80226,0.864407
RF_d_25/49LinearDiscriminantAnalysis,0.797753,0.814607,0.820225,0.80791,0.870056


In [75]:
print('Lasso Regression Selection')
print('##################################')
df_results_lr = pd.DataFrame()
df_results_lr_cv = pd.DataFrame()
for alpha in [0.005, 0.003, 0.0009, 0.0005, 0.0002, 0.00001]:
    lr_train, lr_test = MyFeatureSelection.LassoRegressionSelection(df_train, df_test, labels, alpha)
    print(len(lr_train.columns), ' dimensions')
    name_prefix = 'LR_d_' + str(n_dim) + '/' + str(len(df_train.columns))
    models, df_model_cvaccuracies, df_model_info, df_ens_features, df_ens_test = run_models(clfs, lr_train, lr_test, train_idx, validation_idx, name_prefix, cv=True, ens=True)
    df_results_lr_cv = pd.concat([df_results_lr_cv, df_model_cvaccuracies], axis=0)
    df_results_lr = pd.concat([df_results_lr, df_model_info], axis=0)

Lasso Regression Selection
##################################
14  dimensions
18  dimensions
30  dimensions
35  dimensions
39  dimensions
46  dimensions


In [76]:
df_results_lr

Unnamed: 0,Accuracy,Duration
LR_d_45/49MyIBL,0.752365,2.637707
LR_d_45/49RandomForestClassifier,0.815343,0.020794
LR_d_45/49SVC,0.823202,0.018593
LR_d_45/49XGBClassifier,0.826598,0.059238
LR_d_45/49MLPClassifier,0.814207,1.975328
LR_d_45/49KNeighborsClassifier,0.798496,0.007712
LR_d_45/49LinearDiscriminantAnalysis,0.828826,0.006796
LR_d_45/49MyIBL,0.738716,2.637327
LR_d_45/49RandomForestClassifier,0.81872,0.023591
LR_d_45/49SVC,0.828839,0.021191


In [77]:
df_results_lr_cv

Unnamed: 0,0,1,2,3,4
LR_d_45/49MyIBL,0.685393,0.775281,0.696629,0.79096,0.813559
LR_d_45/49RandomForestClassifier,0.814607,0.786517,0.820225,0.79096,0.864407
LR_d_45/49SVC,0.837079,0.814607,0.814607,0.785311,0.864407
LR_d_45/49XGBClassifier,0.831461,0.808989,0.820225,0.79096,0.881356
LR_d_45/49MLPClassifier,0.808989,0.803371,0.814607,0.779661,0.864407
LR_d_45/49KNeighborsClassifier,0.696629,0.814607,0.820225,0.80791,0.853107
LR_d_45/49LinearDiscriminantAnalysis,0.842697,0.831461,0.814607,0.80226,0.853107
LR_d_45/49MyIBL,0.735955,0.752809,0.747191,0.711864,0.745763
LR_d_45/49RandomForestClassifier,0.831461,0.786517,0.814607,0.80226,0.858757
LR_d_45/49SVC,0.831461,0.820225,0.825843,0.79096,0.875706


In [78]:
df_results_lr_order = df_results_lr.copy()
df_results_lr_order.sort_values('Accuracy')

Unnamed: 0,Accuracy,Duration
LR_d_45/49MyIBL,0.737745,2.50673
LR_d_45/49MyIBL,0.738716,2.637327
LR_d_45/49MyIBL,0.7433,2.474821
LR_d_45/49MyIBL,0.751197,2.56353
LR_d_45/49MyIBL,0.752231,2.622679
LR_d_45/49MyIBL,0.752365,2.637707
LR_d_45/49KNeighborsClassifier,0.798496,0.007712
LR_d_45/49RandomForestClassifier,0.802926,0.023825
LR_d_45/49RandomForestClassifier,0.807459,0.023992
LR_d_45/49RandomForestClassifier,0.808576,0.023594


In [79]:
df_total_order =  pd.concat([df_results_all, df_results_pca, df_results_ica, df_results_ig, df_results_an, df_results_rf, df_results_lr], axis=0)
df_total_order.sort_values('Accuracy')

Unnamed: 0,Accuracy,Duration
ICA_d_40/49MyIBL,0.689196,2.734592
ICA_d_30/49MyIBL,0.689221,2.600489
PCA_d_15/49_ev_0.889_MyIBL,0.698318,2.427006
ICA_d_45/49MyIBL,0.709471,2.460483
ICA_d_35/49MyIBL,0.71069,2.619876
AN_d_20/49MyIBL,0.711737,2.645317
AN_d_10/49MyIBL,0.71394,2.612042
ALL_49_MyIBL,0.716213,2.64659
IG_d_45/49MyIBL,0.718517,2.629957
IG_d_10/49MyIBL,0.721824,2.701633


In [80]:
lr_train.shape

(888, 46)