In [1]:
import import_ipynb
import pandas as pd
from data_process import X_train, y_train, X_test, y_test
from functions import *

importing Jupyter notebook from data_process.ipynb
importing Jupyter notebook from functions.ipynb


# Feature Selection

## Features separating

From the data description, the features can be classified into two categories: histogram and numerical.

For the features, the prefix is the Identifier and the suffix is the bin_id. aOnly the histogram features will have bin value greater than 0 (if all features follow the same naming convention).

In [None]:
hist_col = []
for i in X_train.columns:
  if i[-1] != '0':
    hist_col.append(i)


In [None]:
hist_features = X_train[hist_col]
num_features = X_train.drop(hist_col,axis=1)

In [None]:
hist_features.head()

### Histogram features analysis

Using Recursive feature elimination to select the top features

In [None]:
top_feature_hist = get_top(hist_features,y_train,15)

In [None]:
top_feature_hist_hardcode = ['ag_001',
 'ag_002',
 'ag_003',
 'ag_006',
 'ay_005',
 'ay_006',
 'ba_003',
 'ba_004',
 'ba_005',
 'cn_001',
 'cn_004',
 'cs_002',
 'cs_004',
 'ee_002',
 'ee_005']

In [None]:
top_features = pd.DataFrame(data=X_train[top_feature_hist],columns=top_feature_hist)
top_features['class'] = y_train

### Numerical feature analysis

In [None]:
top_feature_num = get_top(num_features,y_train,15) # this will take very long to run, refer to the hardcode below

In [None]:
top_feature_num_hardcode = ['aa_000',
 'ai_000',
 'al_000',
 'am_0',
 'aq_000',
 'bb_000',
 'bj_000',
 'bt_000',
 'bv_000',
 'ci_000',
 'ck_000',
 'cl_000',
 'cn_000',
 'cq_000',
 'dn_000']

In [None]:
top_features = pd.DataFrame(data=X_train[top_feature_num],columns=top_feature_num)
top_features['class'] = y_train
top_features.head()

# Experimenting with different imputation strategy

In [4]:
#ignore
strat = ['mean','median','most_frequent']

for i in strat:
    X_train_imputed = impute(X_train,i)
    X_test_imputed = impute(X_test,i)
    X_train_smote, y_train_smote = balance_data(X_train_imputed,y_train)

    iter = 1
    cost_lgbm = 0
    cost_gradient = 0

    #while iter <= 5:
        
    cost_lgbm = cost_lgbm + LGBM(X_train_smote,y_train_smote,X_test_imputed,y_test)
    cost_gradient = cost_gradient + gradient(X_train_smote,y_train_smote,X_test_imputed,y_test)

        #iter += 1
    
    #cost_lgbm = cost_lgbm/(iter-1)
    #cost_gradient = cost_gradient/(iter-1)

    print('Strategy:',i,',', 'Model: LGBM',cost_lgbm)
    print('Strategy:',i,',','Model: Gradient Boosted Decision Tree',cost_gradient)

Strategy: mean , Model: LGBM 23490
Strategy: mean , Model: Gradient Boosted Decision Tree 19730
Strategy: median , Model: LGBM 25120
Strategy: median , Model: Gradient Boosted Decision Tree 18990
Strategy: most_frequent , Model: LGBM 24430
Strategy: most_frequent , Model: Gradient Boosted Decision Tree 20230


Since we know median is the best, will use median for the imputation strategy. Splitting the training data into train and test.

In [2]:
X_train_imputed = impute(X_train,'median')
from sklearn.model_selection import train_test_split
X_train_feature, X_test_feature, y_train_feature, y_test_feature = train_test_split(X_train_imputed,y_train, test_size = 0.2, random_state = 42)

# Scaling

Do not need scaling for random forests and decision trees as these models are scale-invariant

In [2]:
#ignore
strat = 'median'
X_train_imputed = impute(X_train,strat)
X_test_imputed = impute(X_test,strat)
X_train_smote, y_train_smote = balance_data(X_train_imputed,y_train)

## MinMax

In [4]:
X_train_scale,scaler = scale_data_minmax(X_train_feature)
X_test_scale = pd.DataFrame(data = scaler.transform(X_test_feature) , columns=X_test_feature.columns)
print(X_train_scale.shape,X_test_scale.shape)

cost_lgbm = 0
cost_gradient = 0
iter = 1

X_train_smote, y_train_smote = balance_data(X_train_scale,y_train_feature)
#while iter <= 5:
        
cost_lgbm = cost_lgbm + LGBM(X_train_smote,y_train_smote,X_test_scale,y_test_feature)
cost_gradient = cost_gradient + gradient(X_train_smote,y_train_smote,X_test_scale,y_test_feature)
    #print(iter)
    #iter += 1

#cost_lgbm = cost_lgbm/(iter-1)
#cost_gradient = cost_gradient/(iter-1)

print('Model: LGBM',cost_lgbm)
print('Model: Gradient Boosted Decision Tree',cost_gradient)


(48000, 170) (12000, 170)
Model: LGBM 4717710
Model: Gradient Boosted Decision Tree 4664520


## Standard

In [5]:
X_train_scale,scaler = scale_data_standard(X_train_feature)
X_test_scale = pd.DataFrame(data = scaler.transform(X_test_feature) , columns=X_test_feature.columns)
print(X_train_scale.shape,X_test_scale.shape)

cost_lgbm = 0
cost_gradient = 0
iter = 1

X_train_smote, y_train_smote = balance_data(X_train_scale,y_train_feature)

#while iter <= 5:
        
cost_lgbm = cost_lgbm + LGBM(X_train_smote,y_train_smote,X_test_scale,y_test_feature)
cost_gradient = cost_gradient + gradient(X_train_smote,y_train_smote,X_test_scale,y_test_feature)
    #print(iter)
    #iter += 1

#cost_lgbm = cost_lgbm/(iter-1)
#cost_gradient = cost_gradient/(iter-1)

print('Model: LGBM',cost_lgbm)
print('Model: Gradient Boosted Decision Tree',cost_gradient)


(48000, 170) (12000, 170)
Model: LGBM 4719170
Model: Gradient Boosted Decision Tree 4660650
