In [1]:
import pandas as pd
import numpy as np
from time import time
from config_loader import load
import argparse
import sys
import seaborn as sns
from MyDataUnderstanding import featureAnalysis
from MyPreprocessing import MyPreprocessing
import numpy as np
from time import time
import matplotlib.pyplot as plt
from model.models import models_perform
from sklearn.model_selection import KFold
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from statistics import mean
from sklearn.model_selection import KFold, cross_val_predict
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel
from detect_outliers import detect_outliers
from MyFeatureSelection import MyFeatureSelection

In [2]:
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.svm import SVC
from xgboost import XGBClassifier as XGB
from sklearn.neural_network import MLPClassifier as MLP
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from model.MyIBL import MyIBL as IBL
from sklearn.linear_model import LogisticRegression as LG

In [3]:
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 20)
pd.set_option('display.width', 500)

In [4]:
def getData(path, filenames_type):
    '''
    features_lst = [
        "Pclass", "Survived", "Name", "Sex", "Age",
        "Sibsp", "Parch", "Ticket", "Fare", "Cabin","Embarked"]
    '''
    if filenames_type == 'train':
        filename = 'train'
    elif filenames_type == 'test':
        filename = 'test'
    else:
        filename = 'titanicAll'
    df_features = pd.read_csv(path + filename + '.csv',
                           sep=',')

    if filenames_type not in ['train', 'test']:
        # drop unnecessary columns that don't exist in the official dataset
        df_features.drop(['Boat', 'Body', 'Home.dest'],
                          axis=1,
                         inplace=True)
    #labels = df_features['Survived']
    #df_features = df_features.drop(['Survived'], axis=1)
    return df_features

In [5]:
##
# Loads config
parser = argparse.ArgumentParser()
parser.add_argument(
    "-c", "--config", default="titanic.cfg",
    help="specify the location of the clustering config file"
)
args, _ = parser.parse_known_args()

config_file = args.config
config = load(config_file)

##
verbose = config.get('titanic', 'verbose')
path = config.get('titanic', 'path') + '/'
file_type = config.get('titanic', 'file_type')

filename_type = 'train'
if file_type == 'all':
    filename_type = 'other'

In [6]:
print('Filename type:', filename_type)
print()
## train
trainData = getData(path, filename_type)
# Preprocessing
trainPreprocess = MyPreprocessing(process_type='all',
                                  filename_type=filename_type,
                                  remove_outliers=True)

## test
filename_type = 'test'
testData = getData(path, filename_type)
# Preprocessing
testPreprocess = MyPreprocessing(process_type='all',
                                 filename_type=filename_type,
                                 remove_outliers=False)


Filename type: train



In [7]:
trainPreprocess.fit(trainData)
df_train = trainPreprocess.new_df
# the labels "Survived"
labels = trainPreprocess.labels_
#print(labels.head())
# the initial dataset without any preprocessing
#print(trainPreprocess.df_initial.head())
# the preprocessed data
#print(trainPreprocess.new_df.head())

testPreprocess.fit(testData)
df_test = testPreprocess.new_df

process_type: all
process_type: all


In [8]:
# fix missing columns because of NaNs and one hot encoding without dummy_na
if df_train.shape[1] != df_test.shape[1]:
    missing_cols = set(df_test.columns) - set(df_train.columns)
    for col in missing_cols:
        #df_train[col] = np.zeros([df_train.shape[0], 1])
        df_test.drop([col], axis=1, inplace=True)

    missing_cols = set(df_train.columns) - set(df_test.columns)
    for col in missing_cols:
        #df_test[col] = np.zeros([df_test.shape[0], 1])
        df_train.drop([col], axis=1, inplace=True)

labels_test = testPreprocess.labels_

print(df_train.columns, df_test.columns)
print(df_train.shape, df_test.shape)

Index(['Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'PclassCp_2', 'PclassCp_3', 'Title_Mr0', 'Title_Mr50', 'Title_Mrs', 'Title_Ms', 'FamilySize', 'Em_C', 'Em_Q', 'Em_S', 'Cabin_A', 'Cabin_B', 'Cabin_C', 'Cabin_D', 'Cabin_E', 'Cabin_F', 'Cabin_G', 'Cabin_X', 'Age_bin_Kid', 'Age_bin_Teenager', 'Age_bin_Adult', 'Family_bin_SmallFamily', 'Family_bin_BigFamily', 'Family_bin_Team', 'Fare_bin_Median', 'Fare_bin_Average', 'Fare_bin_High', 'Mother', 'Father', 'Daughter', 'Son', 'Orphan', 'RichWoman', 'MiddleClassWoman', 'PoorWoman', 'RichMan', 'MiddleClassMan', 'PoorMan', 'RichGirl', 'MiddleClassGirl', 'PoorGirl', 'RichBoy', 'MiddleClassBoy', 'PoorBoy'], dtype='object') Index(['Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'PclassCp_2', 'PclassCp_3', 'Title_Mr0', 'Title_Mr50', 'Title_Mrs', 'Title_Ms', 'FamilySize', 'Em_C', 'Em_Q', 'Em_S', 'Cabin_A', 'Cabin_B', 'Cabin_C', 'Cabin_D', 'Cabin_E', 'Cabin_F', 'Cabin_G', 'Cabin_X', 'Age_bin_Kid', 'Age_bin_Teenager', 'Age_bin_Adult', 'Family_bin_SmallFamily', 'Fam

In [9]:
df_train.columns

Index(['Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'PclassCp_2', 'PclassCp_3', 'Title_Mr0', 'Title_Mr50', 'Title_Mrs', 'Title_Ms', 'FamilySize', 'Em_C', 'Em_Q', 'Em_S', 'Cabin_A', 'Cabin_B', 'Cabin_C', 'Cabin_D', 'Cabin_E', 'Cabin_F', 'Cabin_G', 'Cabin_X', 'Age_bin_Kid', 'Age_bin_Teenager', 'Age_bin_Adult', 'Family_bin_SmallFamily', 'Family_bin_BigFamily', 'Family_bin_Team', 'Fare_bin_Median', 'Fare_bin_Average', 'Fare_bin_High', 'Mother', 'Father', 'Daughter', 'Son', 'Orphan', 'RichWoman', 'MiddleClassWoman', 'PoorWoman', 'RichMan', 'MiddleClassMan', 'PoorMan', 'RichGirl', 'MiddleClassGirl', 'PoorGirl', 'RichBoy', 'MiddleClassBoy', 'PoorBoy'], dtype='object')

In [10]:
df_test.columns

Index(['Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'PclassCp_2', 'PclassCp_3', 'Title_Mr0', 'Title_Mr50', 'Title_Mrs', 'Title_Ms', 'FamilySize', 'Em_C', 'Em_Q', 'Em_S', 'Cabin_A', 'Cabin_B', 'Cabin_C', 'Cabin_D', 'Cabin_E', 'Cabin_F', 'Cabin_G', 'Cabin_X', 'Age_bin_Kid', 'Age_bin_Teenager', 'Age_bin_Adult', 'Family_bin_SmallFamily', 'Family_bin_BigFamily', 'Family_bin_Team', 'Fare_bin_Median', 'Fare_bin_Average', 'Fare_bin_High', 'Mother', 'Father', 'Daughter', 'Son', 'Orphan', 'RichWoman', 'MiddleClassWoman', 'PoorWoman', 'RichMan', 'MiddleClassMan', 'PoorMan', 'RichGirl', 'MiddleClassGirl', 'PoorGirl', 'RichBoy', 'MiddleClassBoy', 'PoorBoy'], dtype='object')

In [11]:
#df_test.dtypes

In [12]:
def feature_importance(clf, df_train):
    if hasattr(clf, 'feature_importances_'):
        name = str(clf.__class__).split('.')[-1][:-2]
        feat_imp = pd.DataFrame({'importance':clf.feature_importances_})    
        feat_imp['feature'] = df_train.columns
        feat_imp.sort_values(by='importance', ascending=False, inplace=True)
        #feat_imp = feat_imp.iloc[:top_n]
        feat_imp.sort_values(by='importance', inplace=True)
        feat_imp = feat_imp.set_index('feature', drop=True)
        feat_imp.plot.barh(title='Feature Importance', figsize=(10,10))
        plt.xlabel('%s Feature Importance Score' %name)
        plt.show()
        return feat_imp
    return pd.DataFrame()

In [13]:
def ensemble(clf, x_train, y_train, x_test):
    
    name = str(clf.__class__).split('.')[-1][:-2]
    kf = KFold(n_splits=5)

    ntrain = x_train.shape[0]
    ntest = x_test.shape[0]

    #ens_train = np.zeros((ntrain,))
    ens_train = np.array([])
    #ens_test = np.zeros((ntest,))
    #test_kf = np.empty((kf.n_folds, ntest))
    test_kf = np.zeros((kf.n_splits, ntest))
        
    for i, (train_index, test_index) in enumerate(kf.split(x_train)):
        x_tr = x_train.loc[train_index]
        y_tr = y_train.loc[train_index]
        x_te = x_train.loc[test_index]

        clf.fit(x_tr, y_tr)

        #ens_train[test_index] = clf.predict(x_te)
        ens_train = np.concatenate([ens_train, clf.predict(x_te)])
        # predict from the whole train set 
        #clf.fit(x_train,y_train)
        test_kf[i, :] = clf.predict(x_test)
    
    # features for next ensemble layer
    ens_train = pd.DataFrame(ens_train.reshape(-1,1), columns=[name])
    
    # combine for test
    ens_test = pd.DataFrame(test_kf.T).mode(axis=1)
    ens_test.columns = [name]
    
    return ens_train, ens_test

In [14]:
def modeler(clf, data, labels, train_fidx, validation_fidx, name_prefix):
    name = name_prefix + str(clf.__class__).split('.')[-1][:-2]
    #print(name)
    folds_accuracy = list()
    duration = list()
    start = time()
    for idx, trf in enumerate(train_fidx):
        clf.fit(data.loc[trf], labels.loc[trf])
        prediction_labels = clf.predict(data.loc[validation_fidx[idx]])
        folds_accuracy.append(accuracy_score(labels.loc[validation_fidx[idx]], prediction_labels))
        
    mean_acc = mean(folds_accuracy)
    duration = time() - start
    #print(name, mean_acc, "accuracy at validation stage,", duration, 's')
    df_folds_accuracy = pd.DataFrame([folds_accuracy], index=[name])
    df_folds = pd.DataFrame([[mean_acc, duration/len(train_fidx)]], columns=['Accuracy', 'Duration'], index=[name])
    return clf.fit(data, labels), df_folds, df_folds_accuracy

In [15]:
def get_cv_data(df_train, cv=5):
    kf = KFold(n_splits=cv)
    folds = [(train_idx, validation_idx) for train_idx, validation_idx in kf.split(df_train)]
    train_idx = [f[0] for f in folds]
    validation_idx = [f[1] for f in folds]
    return train_idx, validation_idx

In [16]:
def run_models(clfs, df_train, df_test, train_idx, validation_idx, name_prefix, cv=True, ens=True):
    models = []
    df_ens_features = pd.DataFrame()
    df_ens_test = pd.DataFrame()
    df_model_cvaccuracies = pd.DataFrame()
    df_model_info = pd.DataFrame()
    for clf in clfs:
        name = str(clf.__class__).split('.')[-1][:-2]
        
        # cross validation
        if cv:
            model, df_folds, df_folds_accuracy = modeler(clf, df_train, labels, train_idx, validation_idx, name_prefix)
            models.append(model)
            df_model_cvaccuracies = pd.concat([df_model_cvaccuracies, df_folds_accuracy], sort=False)
            df_model_info = pd.concat([df_model_info, df_folds], sort=False)  
            
            #feature_importance(clf, df_train)
        
        # building ensemble
        if cv and ens:
            #df_feature = pd.DataFrame(cross_val_predict(clf, df_train, labels, cv=ens_kf).reshape(-1,1), columns=[name])
            df_ens_feature, ens_test = ensemble(clf, df_train, labels, df_test)
            df_ens_features = pd.concat([df_ens_features, df_ens_feature], axis=1, sort=False).reset_index(drop=True)
            df_ens_test = pd.concat([df_ens_test, ens_test], axis=1, sort=False).reset_index(drop=True)
            
    return models, df_model_cvaccuracies, df_model_info, df_ens_features, df_ens_test
        

In [17]:
clfs = [IBL(), RF(n_estimators =12), SVC(gamma='scale'), XGB(), MLP(max_iter=1000), KNN(), LDA()]
#clfs = [RF(n_estimators =12), SVC(gamma='scale'), XGB(), KNN(), LDA()]

In [18]:
train_idx, validation_idx = get_cv_data(df_train, cv=5)

In [19]:
import warnings
warnings.filterwarnings("ignore")

In [20]:
print('Original')
print('##################################')
name_prefix = 'ALL_' + str(len(df_train.columns)) + '_'
models, df_results_all_cv, df_results_all, df_ens_features, df_ens_test = run_models(clfs, df_train, df_test, train_idx, validation_idx, name_prefix, cv=True, ens=True)
df_results_all

Original
##################################


Unnamed: 0,Accuracy,Duration
ALL_49_MyIBL,0.716213,2.64659
ALL_49_RandomForestClassifier,0.80631,0.025595
ALL_49_SVC,0.831092,0.032588
ALL_49_XGBClassifier,0.829963,0.150703
ALL_49_MLPClassifier,0.818708,4.661699
ALL_49_KNeighborsClassifier,0.820974,0.014391
ALL_49_LinearDiscriminantAnalysis,0.815337,0.020188


In [21]:
print('PCA')
print('##################################')
df_results_pca = pd.DataFrame()
df_results_pca_cv = pd.DataFrame()
for n_dim in range(15, 22):
    print(n_dim, ' dimensions')
    pca_train, pca_test, ev = MyFeatureSelection.applyPCA(df_train, df_test, n_dim)
    name_prefix = 'PCA_d_' + str(n_dim) + '/' + str(len(df_train.columns)) + '_ev_' + str(round(ev,3)) + '_'
    models, df_model_cvaccuracies, df_model_info, df_ens_features, df_ens_test = run_models(clfs, df_train, df_test, train_idx, validation_idx, name_prefix, cv=True, ens=True)
    df_results_pca_cv = pd.concat([df_results_pca_cv, df_model_cvaccuracies], axis=0)
    df_results_pca = pd.concat([df_results_pca, df_model_info], axis=0)

PCA
##################################
15  dimensions
16  dimensions
17  dimensions
18  dimensions
19  dimensions
20  dimensions
21  dimensions


In [22]:
df_results_pca

Unnamed: 0,Accuracy,Duration
PCA_d_15/49_ev_0.889_MyIBL,0.745521,2.581354
PCA_d_15/49_ev_0.889_RandomForestClassifier,0.809674,0.02459
PCA_d_15/49_ev_0.889_SVC,0.831092,0.032118
PCA_d_15/49_ev_0.889_XGBClassifier,0.829963,0.146284
PCA_d_15/49_ev_0.889_MLPClassifier,0.818733,4.383952
PCA_d_15/49_ev_0.889_KNeighborsClassifier,0.820974,0.013915
PCA_d_15/49_ev_0.889_LinearDiscriminantAnalysis,0.815337,0.018789
PCA_d_16/49_ev_0.902_MyIBL,0.737567,2.598762
PCA_d_16/49_ev_0.902_RandomForestClassifier,0.808544,0.026184
PCA_d_16/49_ev_0.902_SVC,0.831092,0.032389


In [23]:
df_results_pca_order = df_results_pca.copy()
df_results_pca_order.sort_values('Accuracy')

Unnamed: 0,Accuracy,Duration
PCA_d_20/49_ev_0.943_MyIBL,0.717305,2.57166
PCA_d_18/49_ev_0.925_MyIBL,0.734241,2.616389
PCA_d_16/49_ev_0.902_MyIBL,0.737567,2.598762
PCA_d_21/49_ev_0.95_MyIBL,0.739878,2.610005
PCA_d_17/49_ev_0.914_MyIBL,0.744404,2.574282
PCA_d_15/49_ev_0.889_MyIBL,0.745521,2.581354
PCA_d_19/49_ev_0.934_MyIBL,0.74667,2.724717
PCA_d_17/49_ev_0.914_RandomForestClassifier,0.787158,0.02439
PCA_d_21/49_ev_0.95_RandomForestClassifier,0.796191,0.025584
PCA_d_20/49_ev_0.943_RandomForestClassifier,0.80179,0.024594


In [24]:
df_results_pca_cv

Unnamed: 0,0,1,2,3,4
PCA_d_15/49_ev_0.889_MyIBL,0.668539,0.780899,0.764045,0.774011,0.740113
PCA_d_15/49_ev_0.889_RandomForestClassifier,0.814607,0.786517,0.837079,0.79661,0.813559
PCA_d_15/49_ev_0.889_SVC,0.837079,0.820225,0.825843,0.79661,0.875706
PCA_d_15/49_ev_0.889_XGBClassifier,0.814607,0.814607,0.853933,0.79661,0.870056
PCA_d_15/49_ev_0.889_MLPClassifier,0.814607,0.769663,0.837079,0.813559,0.858757
PCA_d_15/49_ev_0.889_KNeighborsClassifier,0.797753,0.792135,0.848315,0.80226,0.864407
PCA_d_15/49_ev_0.889_LinearDiscriminantAnalysis,0.797753,0.814607,0.814607,0.79661,0.853107
PCA_d_16/49_ev_0.902_MyIBL,0.719101,0.752809,0.780899,0.717514,0.717514
PCA_d_16/49_ev_0.902_RandomForestClassifier,0.792135,0.808989,0.837079,0.774011,0.830508
PCA_d_16/49_ev_0.902_SVC,0.837079,0.820225,0.825843,0.79661,0.875706


In [25]:
max(df_results_pca['Accuracy'])

0.8310924903193043

In [26]:
print('ICA')
print('##################################')
df_results_ica = pd.DataFrame()
df_results_ica_cv = pd.DataFrame()
for n_dim in [10,20,25,30,35,40,45]:
    print(n_dim, ' dimensions')
    ica_train, ica_test = MyFeatureSelection.applyICA(df_train, df_test, n_dim)
    name_prefix = 'ICA_d_' + str(n_dim) + '/' + str(len(df_train.columns))
    models, df_model_cvaccuracies, df_model_info, df_ens_features, df_ens_test = run_models(clfs, df_train, df_test, train_idx, validation_idx, name_prefix, cv=True, ens=True)
    df_results_ica_cv = pd.concat([df_results_ica_cv, df_model_cvaccuracies], axis=0)
    df_results_ica = pd.concat([df_results_ica, df_model_info], axis=0)

ICA
##################################
10  dimensions
20  dimensions
25  dimensions
30  dimensions
35  dimensions
40  dimensions
45  dimensions


In [27]:
df_results_ica

Unnamed: 0,Accuracy,Duration
ICA_d_10/49MyIBL,0.745528,2.511777
ICA_d_10/49RandomForestClassifier,0.791659,0.026303
ICA_d_10/49SVC,0.831092,0.032457
ICA_d_10/49XGBClassifier,0.829963,0.169212
ICA_d_10/49MLPClassifier,0.817609,4.151898
ICA_d_10/49KNeighborsClassifier,0.820974,0.014893
ICA_d_10/49LinearDiscriminantAnalysis,0.815337,0.018654
ICA_d_20/49MyIBL,0.732019,2.664402
ICA_d_20/49RandomForestClassifier,0.814226,0.024643
ICA_d_20/49SVC,0.831092,0.032014


In [28]:
df_results_ica_cv

Unnamed: 0,0,1,2,3,4
ICA_d_10/49MyIBL,0.696629,0.724719,0.786517,0.745763,0.774011
ICA_d_10/49RandomForestClassifier,0.775281,0.758427,0.848315,0.779661,0.79661
ICA_d_10/49SVC,0.837079,0.820225,0.825843,0.79661,0.875706
ICA_d_10/49XGBClassifier,0.814607,0.814607,0.853933,0.79661,0.870056
ICA_d_10/49MLPClassifier,0.825843,0.764045,0.825843,0.819209,0.853107
ICA_d_10/49KNeighborsClassifier,0.797753,0.792135,0.848315,0.80226,0.864407
ICA_d_10/49LinearDiscriminantAnalysis,0.797753,0.814607,0.814607,0.79661,0.853107
ICA_d_20/49MyIBL,0.713483,0.707865,0.741573,0.757062,0.740113
ICA_d_20/49RandomForestClassifier,0.780899,0.780899,0.848315,0.813559,0.847458
ICA_d_20/49SVC,0.837079,0.820225,0.825843,0.79661,0.875706


In [29]:
df_results_ica_order = df_results_ica.copy()
df_results_ica_order.sort_values('Accuracy')

Unnamed: 0,Accuracy,Duration
ICA_d_30/49MyIBL,0.725252,2.674518
ICA_d_20/49MyIBL,0.732019,2.664402
ICA_d_40/49MyIBL,0.734203,2.523907
ICA_d_25/49MyIBL,0.735441,2.568944
ICA_d_35/49MyIBL,0.742113,2.559268
ICA_d_10/49MyIBL,0.745528,2.511777
ICA_d_45/49MyIBL,0.746639,2.608173
ICA_d_35/49RandomForestClassifier,0.78945,0.025344
ICA_d_10/49RandomForestClassifier,0.791659,0.026303
ICA_d_30/49RandomForestClassifier,0.797302,0.024706


In [30]:
max(df_results_ica['Accuracy'])

0.8310924903193043

In [31]:
print('INFO GAIN SELECTION')
print('##################################')
df_results_ig = pd.DataFrame()
df_results_ig_cv = pd.DataFrame()
for n_dim in [10,20,25,30,35,40,45]:
    print(n_dim, ' dimensions')
    ig_train, ig_test = MyFeatureSelection.InfoGainSelection(df_train, df_test, labels, n_dim)
    name_prefix = 'IG_d_' + str(n_dim) + '/' + str(len(df_train.columns))
    models, df_model_cvaccuracies, df_model_info, df_ens_features, df_ens_test = run_models(clfs, df_train, df_test, train_idx, validation_idx, name_prefix, cv=True, ens=True)
    df_results_ig_cv = pd.concat([df_results_ig_cv, df_model_cvaccuracies], axis=0)
    df_results_ig = pd.concat([df_results_ig, df_model_info], axis=0)

INFO GAIN SELECTION
##################################
10  dimensions
20  dimensions
25  dimensions
30  dimensions
35  dimensions
40  dimensions
45  dimensions


In [32]:
df_results_ig

Unnamed: 0,Accuracy,Duration
IG_d_10/49MyIBL,0.734216,2.616977
IG_d_10/49RandomForestClassifier,0.798426,0.024385
IG_d_10/49SVC,0.831092,0.03218
IG_d_10/49XGBClassifier,0.829963,0.144094
IG_d_10/49MLPClassifier,0.822104,4.603249
IG_d_10/49KNeighborsClassifier,0.820974,0.013392
IG_d_10/49LinearDiscriminantAnalysis,0.815337,0.019418
IG_d_20/49MyIBL,0.730896,2.561217
IG_d_20/49RandomForestClassifier,0.805218,0.024185
IG_d_20/49SVC,0.831092,0.030781


In [33]:
df_results_ig_order = df_results_ig.copy()
df_results_ig_order.sort_values('Accuracy')

Unnamed: 0,Accuracy,Duration
IG_d_40/49MyIBL,0.724173,2.651152
IG_d_25/49MyIBL,0.726338,2.646519
IG_d_20/49MyIBL,0.730896,2.561217
IG_d_10/49MyIBL,0.734216,2.616977
IG_d_45/49MyIBL,0.736514,2.708625
IG_d_30/49MyIBL,0.738812,2.698594
IG_d_35/49MyIBL,0.744334,2.633191
IG_d_40/49RandomForestClassifier,0.789386,0.025448
IG_d_30/49RandomForestClassifier,0.792789,0.025784
IG_d_10/49RandomForestClassifier,0.798426,0.024385


In [34]:
df_results_ig_cv

Unnamed: 0,0,1,2,3,4
IG_d_10/49MyIBL,0.747191,0.758427,0.713483,0.706215,0.745763
IG_d_10/49RandomForestClassifier,0.786517,0.780899,0.825843,0.768362,0.830508
IG_d_10/49SVC,0.837079,0.820225,0.825843,0.79661,0.875706
IG_d_10/49XGBClassifier,0.814607,0.814607,0.853933,0.79661,0.870056
IG_d_10/49MLPClassifier,0.814607,0.769663,0.853933,0.80791,0.864407
IG_d_10/49KNeighborsClassifier,0.797753,0.792135,0.848315,0.80226,0.864407
IG_d_10/49LinearDiscriminantAnalysis,0.797753,0.814607,0.814607,0.79661,0.853107
IG_d_20/49MyIBL,0.735955,0.696629,0.724719,0.711864,0.785311
IG_d_20/49RandomForestClassifier,0.786517,0.769663,0.825843,0.779661,0.864407
IG_d_20/49SVC,0.837079,0.820225,0.825843,0.79661,0.875706


In [35]:
max(df_results_ig['Accuracy'])

0.8310924903193043

In [36]:
print('ANOVA SELECTION')
print('##################################')
df_results_an = pd.DataFrame()
df_results_an_cv = pd.DataFrame()
for n_dim in [10,20,25,30,35,40,45]:
    print(n_dim, ' dimensions')
    an_train, an_test = MyFeatureSelection.AnovaSelection(df_train, df_test, labels, n_dim)
    name_prefix = 'AN_d_' + str(n_dim) + '/' + str(len(df_train.columns))
    models, df_model_cvaccuracies, df_model_info, df_ens_features, df_ens_test = run_models(clfs, df_train, df_test, train_idx, validation_idx, name_prefix, cv=True, ens=True)
    df_results_an_cv = pd.concat([df_results_an_cv, df_model_cvaccuracies], axis=0)
    df_results_an = pd.concat([df_results_an, df_model_info], axis=0)

ANOVA SELECTION
##################################
10  dimensions
20  dimensions
25  dimensions
30  dimensions
35  dimensions
40  dimensions
45  dimensions


In [37]:
df_results_an

Unnamed: 0,Accuracy,Duration
AN_d_10/49MyIBL,0.755647,2.650869
AN_d_10/49RandomForestClassifier,0.793912,0.024301
AN_d_10/49SVC,0.831092,0.031437
AN_d_10/49XGBClassifier,0.829963,0.142927
AN_d_10/49MLPClassifier,0.817603,4.219324
AN_d_10/49KNeighborsClassifier,0.820974,0.013791
AN_d_10/49LinearDiscriminantAnalysis,0.815337,0.018189
AN_d_20/49MyIBL,0.734216,2.616844
AN_d_20/49RandomForestClassifier,0.802958,0.024393
AN_d_20/49SVC,0.831092,0.031589


In [38]:
df_results_an_order = df_results_an.copy()
df_results_an_order.sort_values('Accuracy')

Unnamed: 0,Accuracy,Duration
AN_d_25/49MyIBL,0.719628,2.544674
AN_d_40/49MyIBL,0.729753,2.636947
AN_d_30/49MyIBL,0.730826,2.730386
AN_d_20/49MyIBL,0.734216,2.616844
AN_d_35/49MyIBL,0.736495,2.680291
AN_d_45/49MyIBL,0.743262,2.6551
AN_d_10/49MyIBL,0.755647,2.650869
AN_d_10/49RandomForestClassifier,0.793912,0.024301
AN_d_35/49RandomForestClassifier,0.795055,0.024532
AN_d_40/49RandomForestClassifier,0.798419,0.024791


In [39]:
df_results_an_cv

Unnamed: 0,0,1,2,3,4
AN_d_10/49MyIBL,0.752809,0.780899,0.719101,0.723164,0.80226
AN_d_10/49RandomForestClassifier,0.797753,0.769663,0.820225,0.774011,0.80791
AN_d_10/49SVC,0.837079,0.820225,0.825843,0.79661,0.875706
AN_d_10/49XGBClassifier,0.814607,0.814607,0.853933,0.79661,0.870056
AN_d_10/49MLPClassifier,0.820225,0.764045,0.837079,0.824859,0.841808
AN_d_10/49KNeighborsClassifier,0.797753,0.792135,0.848315,0.80226,0.864407
AN_d_10/49LinearDiscriminantAnalysis,0.797753,0.814607,0.814607,0.79661,0.853107
AN_d_20/49MyIBL,0.696629,0.735955,0.786517,0.768362,0.683616
AN_d_20/49RandomForestClassifier,0.780899,0.769663,0.831461,0.79096,0.841808
AN_d_20/49SVC,0.837079,0.820225,0.825843,0.79661,0.875706


In [40]:
max(df_results_an['Accuracy'])

0.8310924903193043

In [41]:
# Random forest estimator for feature selection
print('Random Forest Estimator Selection')
print('##################################')
rf_train, rf_test = MyFeatureSelection.RandomForestSelection(df_train, df_test, labels, 100)
print(len(rf_train.columns), ' dimensions')
name_prefix = 'RF_d_' + str(len(rf_train.columns)) + '/' + str(len(df_train.columns))
models, df_results_rf_cv, df_results_rf, df_ens_features, df_ens_test = run_models(clfs, df_train, df_test, train_idx, validation_idx, name_prefix, cv=True, ens=True)



Random Forest Estimator Selection
##################################
25  dimensions


In [42]:
df_results_rf

Unnamed: 0,Accuracy,Duration
RF_d_25/49MyIBL,0.744347,2.744606
RF_d_25/49RandomForestClassifier,0.807446,0.025192
RF_d_25/49SVC,0.831092,0.032378
RF_d_25/49XGBClassifier,0.829963,0.142247
RF_d_25/49MLPClassifier,0.817584,4.249379
RF_d_25/49KNeighborsClassifier,0.820974,0.014191
RF_d_25/49LinearDiscriminantAnalysis,0.815337,0.018591


In [43]:
df_results_rf_cv

Unnamed: 0,0,1,2,3,4
RF_d_25/49MyIBL,0.741573,0.735955,0.775281,0.706215,0.762712
RF_d_25/49RandomForestClassifier,0.775281,0.792135,0.842697,0.774011,0.853107
RF_d_25/49SVC,0.837079,0.820225,0.825843,0.79661,0.875706
RF_d_25/49XGBClassifier,0.814607,0.814607,0.853933,0.79661,0.870056
RF_d_25/49MLPClassifier,0.814607,0.769663,0.853933,0.80226,0.847458
RF_d_25/49KNeighborsClassifier,0.797753,0.792135,0.848315,0.80226,0.864407
RF_d_25/49LinearDiscriminantAnalysis,0.797753,0.814607,0.814607,0.79661,0.853107


In [44]:
print('Lasso Regression Selection')
print('##################################')
df_results_lr = pd.DataFrame()
df_results_lr_cv = pd.DataFrame()
for alpha in [0.005, 0.003, 0.0009, 0.0005, 0.0002, 0.00001]:
    lr_train, lr_test = MyFeatureSelection.LassoRegressionSelection(df_train, df_test, labels, alpha)
    print(len(lr_train.columns), ' dimensions')
    name_prefix = 'LR_d_' + str(n_dim) + '/' + str(len(df_train.columns))
    models, df_model_cvaccuracies, df_model_info, df_ens_features, df_ens_test = run_models(clfs, df_train, df_test, train_idx, validation_idx, name_prefix, cv=True, ens=True)
    df_results_lr_cv = pd.concat([df_results_lr_cv, df_model_cvaccuracies], axis=0)
    df_results_lr = pd.concat([df_results_lr, df_model_info], axis=0)

Lasso Regression Selection
##################################
14  dimensions
18  dimensions
30  dimensions
35  dimensions
39  dimensions
46  dimensions


In [45]:
df_results_lr

Unnamed: 0,Accuracy,Duration
LR_d_45/49MyIBL,0.734178,2.555974
LR_d_45/49RandomForestClassifier,0.806304,0.025786
LR_d_45/49SVC,0.831092,0.03142
LR_d_45/49XGBClassifier,0.829963,0.142678
LR_d_45/49MLPClassifier,0.817603,4.517008
LR_d_45/49KNeighborsClassifier,0.820974,0.013991
LR_d_45/49LinearDiscriminantAnalysis,0.815337,0.018444
LR_d_45/49MyIBL,0.741014,2.680602
LR_d_45/49RandomForestClassifier,0.795036,0.025734
LR_d_45/49SVC,0.831092,0.032596


In [46]:
df_results_lr_cv

Unnamed: 0,0,1,2,3,4
LR_d_45/49MyIBL,0.747191,0.730337,0.775281,0.700565,0.717514
LR_d_45/49RandomForestClassifier,0.792135,0.786517,0.842697,0.785311,0.824859
LR_d_45/49SVC,0.837079,0.820225,0.825843,0.79661,0.875706
LR_d_45/49XGBClassifier,0.814607,0.814607,0.853933,0.79661,0.870056
LR_d_45/49MLPClassifier,0.814607,0.769663,0.837079,0.813559,0.853107
LR_d_45/49KNeighborsClassifier,0.797753,0.792135,0.848315,0.80226,0.864407
LR_d_45/49LinearDiscriminantAnalysis,0.797753,0.814607,0.814607,0.79661,0.853107
LR_d_45/49MyIBL,0.741573,0.730337,0.730337,0.774011,0.728814
LR_d_45/49RandomForestClassifier,0.769663,0.780899,0.842697,0.768362,0.813559
LR_d_45/49SVC,0.837079,0.820225,0.825843,0.79661,0.875706


In [47]:
df_results_lr_order = df_results_lr.copy()
df_results_lr_order.sort_values('Accuracy')

Unnamed: 0,Accuracy,Duration
LR_d_45/49MyIBL,0.724116,2.554366
LR_d_45/49MyIBL,0.734178,2.555974
LR_d_45/49MyIBL,0.735352,2.59526
LR_d_45/49MyIBL,0.741014,2.680602
LR_d_45/49MyIBL,0.746651,2.511999
LR_d_45/49MyIBL,0.747756,2.571296
LR_d_45/49RandomForestClassifier,0.788288,0.024785
LR_d_45/49RandomForestClassifier,0.795036,0.025734
LR_d_45/49RandomForestClassifier,0.797296,0.025325
LR_d_45/49RandomForestClassifier,0.797315,0.024588


In [48]:
df_total_order =  pd.concat([df_results_all, df_results_pca, df_results_ica, df_results_ig, df_results_an, df_results_rf, df_results_lr], axis=0)
df_total_order.sort_values('Accuracy')

Unnamed: 0,Accuracy,Duration
ALL_49_MyIBL,0.716213,2.64659
PCA_d_20/49_ev_0.943_MyIBL,0.717305,2.57166
AN_d_25/49MyIBL,0.719628,2.544674
LR_d_45/49MyIBL,0.724116,2.554366
IG_d_40/49MyIBL,0.724173,2.651152
ICA_d_30/49MyIBL,0.725252,2.674518
IG_d_25/49MyIBL,0.726338,2.646519
AN_d_40/49MyIBL,0.729753,2.636947
AN_d_30/49MyIBL,0.730826,2.730386
IG_d_20/49MyIBL,0.730896,2.561217


In [49]:
lr_train.shape

(888, 46)