# Feature Selection Reduction using LDA and PCA

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler

In [3]:
data = pd.read_csv('santander-train.csv', nrows = 20000)
data.head()

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
0,1,2,23,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39205.17,0
1,3,2,34,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49278.03,0
2,4,2,23,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67333.77,0
3,8,2,37,0.0,195.0,195.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64007.97,0
4,10,2,39,0.0,0.0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117310.979016,0


In [4]:
X = data.drop('TARGET', axis = 1)
y = data['TARGET']
X.shape, y.shape

((20000, 370), (20000,))

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state =0, stratify = y)


# Remove Constant, Quasi, and Duplicated

In [6]:

constant_filter = VarianceThreshold(threshold=0.01)
constant_filter.fit(X_train)
X_train_filter = constant_filter.transform(X_train)
X_test_filter = constant_filter.transform(X_test)

In [7]:
X_train_T = X_train_filter.T
X_test_T = X_test_filter.T

In [8]:

X_train_T = pd.DataFrame(X_train_T)
X_test_T = pd.DataFrame(X_test_T)

In [9]:
X_train_T.duplicated().sum()

18

In [10]:
duplicated_features = X_train_T.duplicated()
features_to_keep = [not index for index in duplicated_features]


In [11]:
X_train_unique = X_train_T[features_to_keep].T
X_test_unique = X_test_T[features_to_keep].T

In [12]:
X_train_unique.shape, X_test_unique.shape


((16000, 227), (4000, 227))

# Remove correlated feature

In [13]:
corrmat = X_train_unique.corr()

In [14]:
corrmat.shape

(227, 227)

In [15]:
def get_correlation(data, threshold):
    corr_col = set()
    corrmat = data.corr()
    for i in range(len(corrmat.columns)):
        for j in range(i):
            if abs(corrmat.iloc[i, j] > threshold):
                colname = corrmat.columns[i]
                corr_col.add(colname)
    return corr_col

In [16]:
corr_features = get_correlation(X_train_unique, 0.70)
print('correlated features: ', len(set(corr_features)))

correlated features:  147


In [17]:
X_train_uncorr = X_train_unique.drop(labels=corr_features, axis=1)
X_test_uncorr = X_test_unique.drop(labels=corr_features, axis=1)

In [18]:
X_train_uncorr.shape, X_test_uncorr.shape

((16000, 80), (4000, 80))

# Feature Dimension Reduction with LDA

In [19]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

In [20]:
lda = LDA(n_components = 1)
X_train_lda = lda.fit_transform(X_train_uncorr, y_train)

In [21]:
X_train_lda.shape

(16000, 1)

In [22]:
X_test_lda = lda.transform(X_test_uncorr)

In [23]:
def run_randomforest(X_train, X_test, y_train, y_test):
    clf = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print("Acc on test set")
    print(accuracy_score(y_test, y_pred))

In [24]:
%%time
run_randomforest(X_train_lda, X_test_lda, y_train, y_test)

Acc on test set
0.922
CPU times: user 2.86 s, sys: 101 ms, total: 2.96 s
Wall time: 670 ms


In [25]:
%%time
run_randomforest(X_train, X_test, y_train, y_test)

Acc on test set
0.9585
CPU times: user 8.9 s, sys: 181 ms, total: 9.08 s
Wall time: 1.66 s


# Feature reduction by PCA

In [32]:
from sklearn.decomposition import PCA

In [33]:
pca = PCA(n_components=3, random_state=42)
pca.fit(X_test_uncorr)

PCA(copy=True, iterated_power='auto', n_components=3, random_state=42,
    svd_solver='auto', tol=0.0, whiten=False)

In [29]:
X_train_pca = pca.transform(X_train_uncorr)
X_test_pca = pca.transform(X_test_uncorr)

In [34]:
X_train_pca.shape, X_train_uncorr.shape

((16000, 3), (16000, 80))

In [36]:
%%time
run_randomforest(X_train_pca, X_test_pca, y_train, y_test)

Acc on test set
0.95925
CPU times: user 411 ms, sys: 85.8 ms, total: 497 ms
Wall time: 371 ms


In [37]:
for component in range (1, 79):
    pca = PCA(n_components=component, random_state=42)
    pca.fit(X_test_uncorr)
    X_train_pca = pca.transform(X_train_uncorr)
    X_test_pca = pca.transform(X_test_uncorr)
    print('Selected Comp', component)
    run_randomforest(X_train_pca, X_test_pca, y_train, y_test)
    print()

Selected Comp 1
Acc on test set
0.95925

Selected Comp 2
Acc on test set
0.95925

Selected Comp 3
Acc on test set
0.95925

Selected Comp 4
Acc on test set
0.959

Selected Comp 5
Acc on test set
0.9585

Selected Comp 6
Acc on test set
0.95875

Selected Comp 7
Acc on test set
0.95825

Selected Comp 8
Acc on test set
0.9195

Selected Comp 9
Acc on test set
0.9535

Selected Comp 10
Acc on test set
0.95625

Selected Comp 11
Acc on test set
0.95575

Selected Comp 12
Acc on test set
0.95625

Selected Comp 13
Acc on test set
0.95625

Selected Comp 14
Acc on test set
0.9565

Selected Comp 15
Acc on test set
0.9565

Selected Comp 16
Acc on test set
0.95675

Selected Comp 17
Acc on test set
0.95675

Selected Comp 18
Acc on test set
0.9575

Selected Comp 19
Acc on test set
0.95725

Selected Comp 20
Acc on test set
0.95675

Selected Comp 21
Acc on test set
0.95725

Selected Comp 22
Acc on test set
0.95775

Selected Comp 23
Acc on test set
0.95725

Selected Comp 24
Acc on test set
0.95725

Selected 