In [24]:

import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import os
import pandas as pd
import seaborn as sns

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

pd.pandas.set_option('display.max_columns', None)

import pickle
import warnings
warnings.simplefilter(action='ignore')

import eli5

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, mutual_info_regression, SelectPercentile, SelectFromModel
from eli5.sklearn import PermutationImportance
from sklearn.neighbors import KNeighborsClassifier
from xgboost.sklearn import XGBClassifier

In [25]:
project_path = r'C:\Users\kchanas\Documents\Learning\Kaggle\Titanic'
X_train = pd.read_csv(os.path.join(project_path, r'data\X_train.csv'), index_col=0)
y_train = pd.read_csv(os.path.join(project_path, r'data\y_train.csv'), index_col=0, names=['Survived'])

In [26]:
X_train.shape
X_train.head()

(710, 29)

Unnamed: 0_level_0,Pclass,Age,Fare,SibSp_enc,Parch_enc,Embarked_enc,Age_Bucket_enc,Fare_Bucket_enc,SibSp_enc_%,Parch_enc_%,Embarked_enc_%,Age_Bucket_enc_%,Fare_Bucket_enc_%,Sex_male,SibSp_1-2,SibSp_3andMore,Parch_1-2,Parch_3andMore,Embarked_Q,Embarked_S,Age_Bucket_18-25,Age_Bucket_25-30,Age_Bucket_30-40,Age_Bucket_40-80,Fare_Bucket_10-15,Fare_Bucket_15-25,Fare_Bucket_25-40,Fare_Bucket_40-75,Fare_Bucket_75-513
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1
302,0.833889,-0.017326,0.28205,1.481301,-0.473054,0.677858,-1.152788,0.122758,1.47211,-0.526577,0.171883,-0.887161,0.281248,0.730395,1.61371,-0.21725,-0.53936,-0.125446,3.177066,-1.61371,-0.464493,1.477788,-0.466732,-0.46225,-0.393029,2.767406,-0.460004,-0.303822,-0.353833
310,-1.563119,0.057272,1.134582,-0.448467,-0.473054,1.944384,-1.152788,1.254678,-0.441015,-0.526577,2.038906,-0.887161,0.501921,-1.369122,-0.61969,-0.21725,-0.53936,-0.125446,-0.314756,-1.61371,-0.464493,1.477788,-0.466732,-0.46225,-0.393029,-0.361349,-0.460004,3.291403,-0.353833
517,-0.364615,0.350944,-0.506659,-0.448467,-0.473054,-0.588667,0.905417,-0.443203,-0.441015,-0.526577,-0.550049,0.980409,0.074323,-1.369122,-0.61969,-0.21725,-0.53936,-0.125446,-0.314756,0.61969,-0.464493,-0.676687,2.142557,-0.46225,2.544344,-0.361349,-0.460004,-0.303822,-0.353833
121,-0.364615,-0.634488,1.368022,1.481301,-0.473054,-0.588667,-0.46672,1.254678,1.47211,-0.526577,-0.550049,-0.824874,0.501921,0.730395,1.61371,-0.21725,-0.53936,-0.125446,-0.314756,0.61969,2.152887,-0.676687,-0.466732,-0.46225,-0.393029,-0.361349,-0.460004,3.291403,-0.353833
571,-0.364615,2.253449,-0.506659,-0.448467,-0.473054,-0.588667,0.219349,-0.443203,-0.441015,-0.526577,-0.550049,-0.089238,0.074323,0.730395,-0.61969,-0.21725,-0.53936,-0.125446,-0.314756,0.61969,-0.464493,-0.676687,-0.466732,2.163331,2.544344,-0.361349,-0.460004,-0.303822,-0.353833


In [27]:
y_train.head()

Unnamed: 0,Survived
302,1
310,1
517,1
121,0
571,1


In the next lines of code I will try out different methods of feature importance assesment and rank variables using each of these methods. The final subset of features will be selected basing on the sum of rankings coming from these methods.

## Feature importance based on correlation with SalePrice

In [28]:
corr = pd.concat([X_train, y_train], axis=1).corr()
corr = np.abs(corr)
imp_corr = corr['Survived'].sort_values(ascending=False).reset_index().rename(columns = 
                                                                               {'index': 'Feature', 'Survived': 'Correlation'})
imp_corr['Corr_Rank'] = imp_corr['Correlation'].rank(ascending=False) - 1
# delete SalePrice from ranking
imp_corr = imp_corr[imp_corr['Corr_Rank']>0].set_index('Feature')
imp_corr.head(30)

Unnamed: 0_level_0,Correlation,Corr_Rank
Feature,Unnamed: 1_level_1,Unnamed: 2_level_1
Sex_male,0.544012,1.0
Fare_Bucket_enc_%,0.372686,2.0
Pclass,0.352763,3.0
Fare_Bucket_enc,0.350372,4.0
Fare,0.334244,5.0
Fare_Bucket_75-513,0.289334,6.0
SibSp_enc_%,0.212966,7.0
SibSp_enc,0.212945,8.0
SibSp_1-2,0.194279,9.0
Parch_enc_%,0.194164,10.0


I will eliminate variables from feature set, so that there will be no highly correlated variables. From each pair of highly correlated variables I will eliminate the one that is less correlated with Survived.

In [29]:

def filter_df_corr(inp_data, imp_corr, corr_val):
    '''
    Returns an array or dataframe (based on type(inp_data) adjusted to drop \
        columns with high correlation to one another. Takes second arg corr_val
        that defines the cutoff

    ----------
    inp_data : np.array, pd.DataFrame
        Values to consider
    corr_val : float
        Value [0, 1] on which to base the correlation cutoff
    '''
    # Creates Correlation Matrix
    if isinstance(inp_data, np.ndarray):
        inp_data = pd.DataFrame(data=inp_data)
        array_flag = True
    else:
        array_flag = False
    corr_matrix = inp_data.corr()

    # Iterates through Correlation Matrix Table to find correlated columns
    drop_cols = []
    n_cols = len(corr_matrix.columns)
    for i in range(n_cols):
        for k in range(i+1, n_cols):
            val = corr_matrix.iloc[k, i]
            col = corr_matrix.columns[i]
            row = corr_matrix.index[k]
            if abs(val) >= corr_val:
                col_corr_with_target = imp_corr.loc[col].iloc[0]
                row_corr_with_target = imp_corr.loc[row].iloc[0]
                # Prints the correlated feature set and the corr val
                print(col, "|", row, "|", round(val, 2))
                if col_corr_with_target > row_corr_with_target:
                    drop_cols.append(row)
                else:
                    drop_cols.append(col)

    # Drops the correlated columns
    drop_cols = set(drop_cols)
    print(drop_cols)
    inp_data = inp_data.drop(columns=drop_cols)
    # Return same type as inp
    if array_flag:
        return inp_data.values
    else:
        return inp_data

X_train = filter_df_corr(X_train, imp_corr, 0.8)

Fare | Fare_Bucket_enc | 0.92
Fare | Fare_Bucket_enc_% | 0.88
SibSp_enc | SibSp_enc_% | 1.0
SibSp_enc | SibSp_1-2 | 0.92
Parch_enc | Parch_enc_% | 0.98
Parch_enc | Parch_1-2 | 0.96
Embarked_enc | Embarked_enc_% | 0.99
Embarked_enc | Embarked_S | -0.95
Age_Bucket_enc | Age_Bucket_enc_% | 0.97
Fare_Bucket_enc | Fare_Bucket_enc_% | 0.94
SibSp_enc_% | SibSp_1-2 | 0.91
Parch_enc_% | Parch_1-2 | 1.0
Embarked_enc_% | Embarked_S | -0.89
{'Embarked_enc', 'Embarked_S', 'Parch_enc', 'SibSp_1-2', 'SibSp_enc', 'Fare', 'Age_Bucket_enc', 'Fare_Bucket_enc', 'Parch_1-2'}


In [30]:
X_train.head()

Unnamed: 0_level_0,Pclass,Age,SibSp_enc_%,Parch_enc_%,Embarked_enc_%,Age_Bucket_enc_%,Fare_Bucket_enc_%,Sex_male,SibSp_3andMore,Parch_3andMore,Embarked_Q,Age_Bucket_18-25,Age_Bucket_25-30,Age_Bucket_30-40,Age_Bucket_40-80,Fare_Bucket_10-15,Fare_Bucket_15-25,Fare_Bucket_25-40,Fare_Bucket_40-75,Fare_Bucket_75-513
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
302,0.833889,-0.017326,1.47211,-0.526577,0.171883,-0.887161,0.281248,0.730395,-0.21725,-0.125446,3.177066,-0.464493,1.477788,-0.466732,-0.46225,-0.393029,2.767406,-0.460004,-0.303822,-0.353833
310,-1.563119,0.057272,-0.441015,-0.526577,2.038906,-0.887161,0.501921,-1.369122,-0.21725,-0.125446,-0.314756,-0.464493,1.477788,-0.466732,-0.46225,-0.393029,-0.361349,-0.460004,3.291403,-0.353833
517,-0.364615,0.350944,-0.441015,-0.526577,-0.550049,0.980409,0.074323,-1.369122,-0.21725,-0.125446,-0.314756,-0.464493,-0.676687,2.142557,-0.46225,2.544344,-0.361349,-0.460004,-0.303822,-0.353833
121,-0.364615,-0.634488,1.47211,-0.526577,-0.550049,-0.824874,0.501921,0.730395,-0.21725,-0.125446,-0.314756,2.152887,-0.676687,-0.466732,-0.46225,-0.393029,-0.361349,-0.460004,3.291403,-0.353833
571,-0.364615,2.253449,-0.441015,-0.526577,-0.550049,-0.089238,0.074323,0.730395,-0.21725,-0.125446,-0.314756,-0.464493,-0.676687,-0.466732,2.163331,2.544344,-0.361349,-0.460004,-0.303822,-0.353833


In [31]:
X_train.shape

(710, 20)

## Feature importance from models

### KNeighbors

In [51]:
kn = KNeighborsClassifier(n_neighbors=10)
kn.fit(X_train, y_train)
perm = PermutationImportance(kn, random_state=1).fit(X_train, y_train)
perm_imp_kn = eli5.explain_weights_df(perm, feature_names = X_train.columns.tolist(), top=100)
perm_imp_kn = perm_imp_kn.set_index('feature')
perm_imp_kn['Perm_Kn_Rank'] = perm_imp_kn['weight'].rank(ascending=False)
perm_imp_kn.head(15)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=10, p=2,
                     weights='uniform')

Unnamed: 0_level_0,weight,std,Perm_Kn_Rank
feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Sex_male,0.05295775,0.007485,1.0
SibSp_3andMore,0.02056338,0.003285,2.0
Embarked_Q,0.01605634,0.001911,3.0
Fare_Bucket_10-15,0.01239437,0.002582,4.0
Parch_3andMore,0.007042254,0.00252,5.0
Age,0.002816901,0.005418,6.0
Fare_Bucket_enc_%,0.0008450704,0.003404,7.0
Fare_Bucket_40-75,6.661338000000001e-17,0.003883,8.0
Age_Bucket_enc_%,-0.0005633803,0.002612,9.0
SibSp_enc_%,-0.002535211,0.004992,10.0


### RandomForest

In [44]:
rf = RandomForestClassifier(n_estimators=200, oob_score=True, random_state=1)
rf.fit(X_train, y_train)
imp_rf = pd.DataFrame(rf.feature_importances_, 
                      index=X_train.columns, columns=['Importance']).sort_values('Importance', ascending=False)
imp_rf['Rf_Rank'] = imp_rf['Importance'].rank(ascending=False)
imp_rf.head(15)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=True, random_state=1, verbose=0,
                       warm_start=False)

Unnamed: 0,Importance,Rf_Rank
Sex_male,0.273879,1.0
Age,0.24439,2.0
Pclass,0.09743,3.0
Fare_Bucket_enc_%,0.069343,4.0
SibSp_enc_%,0.045679,5.0
Parch_enc_%,0.043294,6.0
Embarked_enc_%,0.042659,7.0
Age_Bucket_enc_%,0.042638,8.0
Fare_Bucket_75-513,0.024244,9.0
Age_Bucket_30-40,0.014192,10.0


In [45]:
rf = RandomForestClassifier(n_estimators=200, oob_score=True, random_state=1)
rf.fit(X_train, y_train)
perm = PermutationImportance(rf, random_state=1).fit(X_train, y_train)
perm_imp_rf = eli5.explain_weights_df(perm, feature_names = X_train.columns.tolist(), top=100)
perm_imp_rf = perm_imp_rf.set_index('feature')
perm_imp_rf['Perm_Rf_Rank'] = perm_imp_rf['weight'].rank(ascending=False)
perm_imp_rf.head(15)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=True, random_state=1, verbose=0,
                       warm_start=False)

Unnamed: 0_level_0,weight,std,Perm_Rf_Rank
feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Sex_male,0.22,0.010851,1.0
Age,0.112113,0.011004,2.0
Pclass,0.10169,0.006068,3.0
Embarked_enc_%,0.037183,0.00276,4.0
Fare_Bucket_enc_%,0.035493,0.002873,5.0
SibSp_enc_%,0.028451,0.004489,6.0
Parch_enc_%,0.027042,0.004121,7.0
Age_Bucket_enc_%,0.024789,0.002288,8.0
Fare_Bucket_25-40,0.013803,0.001054,9.0
Fare_Bucket_10-15,0.007887,0.001436,10.0


In [46]:
feat_imp_rf = pd.concat([imp_rf, perm_imp_rf], axis=1)
rank_columns = [col for col in feat_imp_rf if 'Rank' in col]
feat_imp_rf = feat_imp_rf[rank_columns]
feat_imp_rf['Overall_Rank'] = feat_imp_rf.sum(axis=1)
feat_imp_rf = feat_imp_rf.sort_values('Overall_Rank', ascending=True)
feat_imp_rf.head(30)

Unnamed: 0,Rf_Rank,Perm_Rf_Rank,Overall_Rank
Sex_male,1.0,1.0,2.0
Age,2.0,2.0,4.0
Pclass,3.0,3.0,6.0
Fare_Bucket_enc_%,4.0,5.0,9.0
Embarked_enc_%,7.0,4.0,11.0
SibSp_enc_%,5.0,6.0,11.0
Parch_enc_%,6.0,7.0,13.0
Age_Bucket_enc_%,8.0,8.0,16.0
Fare_Bucket_75-513,9.0,11.5,20.5
Fare_Bucket_25-40,12.0,9.0,21.0


### XGB

In [47]:
xgb = XGBClassifier(n_estimators=200, random_state=1)
xgb.fit(X_train, y_train)
imp_xgb = pd.DataFrame(xgb.feature_importances_, 
                      index=X_train.columns, columns=['Importance']).sort_values('Importance', ascending=False)
imp_xgb['Xgb_Rank'] = imp_xgb['Importance'].rank(ascending=False)
imp_xgb.head(15)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
              max_depth=3, min_child_weight=1, missing=None, n_estimators=200,
              n_jobs=1, nthread=None, objective='binary:logistic',
              random_state=1, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              seed=None, silent=True, subsample=1)

Unnamed: 0,Importance,Xgb_Rank
Sex_male,0.439127,1.0
Pclass,0.159881,2.0
Age_Bucket_enc_%,0.069712,3.0
Embarked_enc_%,0.0597,4.0
Fare_Bucket_10-15,0.048128,5.0
SibSp_enc_%,0.04257,6.0
Embarked_Q,0.036207,7.0
Fare_Bucket_enc_%,0.030264,8.0
Age,0.025603,9.0
Age_Bucket_30-40,0.024939,10.0


In [48]:
xgb = XGBRegressor(n_estimators=500, random_state=1)
xgb.fit(X_train, y_train)
perm = PermutationImportance(xgb, random_state=1).fit(X_train, y_train)
perm_imp_xgb = eli5.explain_weights_df(perm, feature_names = X_train.columns.tolist(), top=100)
perm_imp_xgb = perm_imp_xgb.set_index('feature')
perm_imp_xgb['Perm_Xgb_Rank'] = perm_imp_xgb['weight'].rank(ascending=False)
perm_imp_xgb.head(15)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bytree=1, gamma=0, importance_type='gain',
             learning_rate=0.1, max_delta_step=0, max_depth=3,
             min_child_weight=1, missing=None, n_estimators=500, n_jobs=1,
             nthread=None, objective='reg:linear', random_state=1, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
             subsample=1)

Unnamed: 0_level_0,weight,std,Perm_Xgb_Rank
feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Sex_male,0.59778,0.039723,1.0
Age,0.435131,0.028559,2.0
Pclass,0.326514,0.012005,3.0
Fare_Bucket_enc_%,0.101401,0.005974,4.0
SibSp_enc_%,0.08639,0.005225,5.0
Embarked_enc_%,0.040274,0.004137,6.0
Embarked_Q,0.031857,0.004515,7.0
Parch_enc_%,0.030071,0.007148,8.0
Age_Bucket_enc_%,0.029797,0.003752,9.0
Fare_Bucket_25-40,0.018346,0.003046,10.0


In [49]:
feat_imp_xgb = pd.concat([imp_xgb, perm_imp_xgb], axis=1)
rank_columns = [col for col in feat_imp_xgb if 'Rank' in col]
feat_imp_xgb = feat_imp_xgb[rank_columns]
feat_imp_xgb['Overall_Rank'] = feat_imp_xgb.sum(axis=1)
feat_imp_xgb = feat_imp_xgb.sort_values('Overall_Rank', ascending=True)
feat_imp_xgb.head(30)

Unnamed: 0,Xgb_Rank,Perm_Xgb_Rank,Overall_Rank
Sex_male,1.0,1.0,2.0
Pclass,2.0,3.0,5.0
Embarked_enc_%,4.0,6.0,10.0
Age,9.0,2.0,11.0
SibSp_enc_%,6.0,5.0,11.0
Age_Bucket_enc_%,3.0,9.0,12.0
Fare_Bucket_enc_%,8.0,4.0,12.0
Embarked_Q,7.0,7.0,14.0
Fare_Bucket_10-15,5.0,12.0,17.0
Parch_enc_%,13.0,8.0,21.0


In [52]:
kn_selected_features = perm_imp_kn[perm_imp_kn['Perm_Kn_Rank']<=10].index.tolist()
print(kn_selected_features)

['Sex_male', 'SibSp_3andMore', 'Embarked_Q', 'Fare_Bucket_10-15', 'Parch_3andMore', 'Age', 'Fare_Bucket_enc_%', 'Fare_Bucket_40-75', 'Age_Bucket_enc_%', 'SibSp_enc_%']


In [56]:
rf_selected_features = feat_imp_rf[feat_imp_rf['Overall_Rank']<=15].index.tolist()
print(rf_selected_features)

['Sex_male', 'Age', 'Pclass', 'Fare_Bucket_enc_%', 'Embarked_enc_%', 'SibSp_enc_%', 'Parch_enc_%']


In [57]:
xgb_selected_features = feat_imp_xgb[feat_imp_xgb['Overall_Rank']<=15].index.tolist()
print(xgb_selected_features)

['Sex_male', 'Pclass', 'Embarked_enc_%', 'Age', 'SibSp_enc_%', 'Age_Bucket_enc_%', 'Fare_Bucket_enc_%', 'Embarked_Q']


In [58]:
with open(os.path.join(project_path, r'data\kn_selected_features.txt'), "wb") as f:
      pickle.dump(kn_selected_features, f)
with open(os.path.join(project_path, r'data\rf_selected_features.txt'), "wb") as f:
      pickle.dump(rf_selected_features, f)
with open(os.path.join(project_path, r'data\xgb_selected_features.txt'), "wb") as f:
      pickle.dump(xgb_selected_features, f)