In [75]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
 
from sklearn.metrics import roc_auc_score

In [33]:
# NOTE: absolute, machine-specific location of the training CSV;
# point DATA_PATH at your own copy when running elsewhere.
DATA_PATH = r'E:\programming\dataset\Santander Customer Satisfaction\train.csv'
data = pd.read_csv(DATA_PATH)
print(data.shape)
data.head()

(76020, 371)


Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
0,1,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39205.17,0
1,3,2,34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49278.03,0
2,4,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67333.77,0
3,8,2,37,0.0,195.0,195.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64007.97,0
4,10,2,39,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117310.979016,0


In [35]:
# List every column that contains at least one missing value.
data.columns[data.isnull().any().to_numpy()].tolist()

[]

In [37]:
# Split into train and test sets (70% / 30%, fixed seed for reproducibility).
features = data.drop(columns=['TARGET'])
target = data['TARGET']
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.3, random_state=0
)

In [38]:
print(X_train.shape,X_test.shape)

(53214, 370) (22806, 370)


In [39]:
## Keep copies of the original (unfiltered) datasets to compare against at the end
X_train_orig = X_train.copy()
X_test_orig = X_test.copy()

In [40]:
## Constant features: columns holding a single distinct value in the training set
distinct_counts = X_train.nunique(dropna=False)  # dropna=False so NaN counts as a value, as unique() does
const_features = distinct_counts[distinct_counts == 1].index.tolist()
len(const_features)

38

In [41]:
# Remove the constant columns from both splits (rebinding rather than mutating in place).
X_train = X_train.drop(columns=const_features)
X_test = X_test.drop(columns=const_features)

In [42]:
print(X_train.shape,X_test.shape)

(53214, 332) (22806, 332)


In [56]:
# Quasi-constant features: the most frequent value accounts for more than
# QUASI_CONSTANT_SHARE of the training rows.
QUASI_CONSTANT_SHARE = 0.9999
n_rows = len(X_train)
quasi_features = []
for col in X_train.columns:
    # value_counts() sorts descending, so the first entry is the dominant value's count
    top_count = X_train[col].value_counts().iloc[0]
    if top_count / n_rows > QUASI_CONSTANT_SHARE:
        quasi_features.append(col)
len(quasi_features)

65

In [57]:
# Remove the quasi-constant columns from both splits (rebinding rather than mutating in place).
X_train = X_train.drop(columns=quasi_features)
X_test = X_test.drop(columns=quasi_features)

In [58]:
print(X_train.shape,X_test.shape)

(53214, 267) (22806, 267)


In [62]:
# Duplicated features: for every column, look for later columns with identical content.
# Each duplicate is recorded exactly once. The previous version appended the same
# column again for every earlier twin it had (e.g. A == B == C yielded [B, C, C]),
# which inflated len(dub_features). The set of dropped columns is unchanged.
dub_features = []
already_flagged = set()
columns = X_train.columns
tick = 0
for i in range(len(columns)):
    # progress indicator: print every 10th column processed
    tick += 1
    if tick % 10 == 0:
        print(tick)

    col_1 = columns[i]
    if col_1 in already_flagged:
        # col_1 duplicates an earlier column, so anything equal to it
        # was already found when that earlier column was scanned
        continue

    reference = X_train[col_1]  # hoisted: invariant for the inner loop
    for j in range(i + 1, len(columns)):
        col_2 = columns[j]
        if col_2 in already_flagged:
            continue
        if reference.equals(X_train[col_2]):
            already_flagged.add(col_2)
            dub_features.append(col_2)
   

10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260


In [63]:
len(dub_features)

13

In [64]:
# Remove the duplicated columns from both splits (rebinding rather than mutating in place).
X_train = X_train.drop(columns=dub_features)
X_test = X_test.drop(columns=dub_features)

In [65]:
X_train_basic = X_train.copy()
X_test_basic = X_test.copy()

In [66]:
# Removing correlated features.
# Pairwise correlation matrix of the remaining training columns.
# NOTE(review): no later cell visible in this notebook reads this variable;
# correlation() computes its own matrix from the dataset it is given, so this
# computation may be redundant - confirm before removing.
corr_matrix = X_train.corr()


In [69]:
def correlation(dataset, threshold):
    """Return the columns that are highly correlated with an earlier column.

    For every pair of columns in the correlation matrix of ``dataset`` whose
    absolute correlation is strictly greater than ``threshold``, the column
    that appears later is flagged. The earlier column of each pair is never
    flagged because of that pair, so one representative is kept.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Data whose columns are compared via ``dataset.corr()``.
    threshold : float
        Absolute correlation above which the later column is flagged.

    Returns
    -------
    set
        Labels of the flagged columns.
    """
    corr_matrix = dataset.corr()
    abs_corr = corr_matrix.abs().to_numpy()

    # Only look strictly below the diagonal, i.e. at pairs (i, j) with j < i,
    # so each pair is visited once and a column is never compared to itself.
    n_cols = abs_corr.shape[0]
    below_diagonal = np.tri(n_cols, k=-1, dtype=bool)

    # NaN correlations compare as False and are therefore never flagged.
    flagged_rows = ((abs_corr > threshold) & below_diagonal).any(axis=1)

    # Take the labels from the matrix itself, not from dataset.columns:
    # the two can differ if corr() leaves any column out.
    return set(corr_matrix.columns[flagged_rows])

In [70]:
corr_features = correlation(X_train,0.8)
len(corr_features)

143

In [72]:
# Remove the correlated columns from both splits (rebinding rather than mutating in place).
X_train = X_train.drop(columns=corr_features)
X_test = X_test.drop(columns=corr_features)

In [73]:
print(X_train.shape,X_test.shape)

(53214, 111) (22806, 111)


In [74]:
# We now have three versions of the training data: X_train, X_train_basic and X_train_orig.
# We will compare their performance on different machine learning models.

In [76]:
# create a function to build random forests and compare performance in train and test set
 
def run_randomForests(X_train, X_test, y_train, y_test):
    """Fit a random forest on the training split and print train/test ROC-AUC.

    The forest uses 200 trees of maximum depth 4 with a fixed random state.
    ROC-AUC is computed from the predicted probability of the second class
    (column 1 of ``predict_proba``). Nothing is returned; results are printed.
    """
    forest = RandomForestClassifier(max_depth=4, n_estimators=200, random_state=39)
    forest.fit(X_train, y_train)

    splits = (
        ('Train set', X_train, y_train),
        ('Test set', X_test, y_test),
    )
    for title, features, labels in splits:
        print(title)
        positive_proba = forest.predict_proba(features)[:, 1]
        score = roc_auc_score(labels, positive_proba)
        print('Random Forests roc-auc: {}'.format(score))

In [78]:
# original feature set (ID is an identifier, not a predictor, so it is excluded)
rf_train_orig = X_train_orig.drop(columns=['ID'])
rf_test_orig = X_test_orig.drop(columns=['ID'])
run_randomForests(rf_train_orig, rf_test_orig, y_train, y_test)

Train set
Random Forests roc-auc: 0.8012314741948454
Test set
Random Forests roc-auc: 0.7900499757912425


In [80]:
# filter methods - basic (constant, quasi-constant and duplicated features removed)
rf_train_basic = X_train_basic.drop(columns=['ID'])
rf_test_basic = X_test_basic.drop(columns=['ID'])
run_randomForests(rf_train_basic, rf_test_basic, y_train, y_test)

Train set
Random Forests roc-auc: 0.8011308532239815
Test set
Random Forests roc-auc: 0.7902379634897554


In [81]:
# filter methods - correlation (correlated features removed as well)
rf_train_corr = X_train.drop(columns=['ID'])
rf_test_corr = X_test.drop(columns=['ID'])
run_randomForests(rf_train_corr, rf_test_corr, y_train, y_test)


Train set
Random Forests roc-auc: 0.8103617086152701
Test set
Random Forests roc-auc: 0.7982409469042577


In [82]:
# we can see for random forests, the performance becomes better as we reduce the dimensionality to less than half

In [83]:
# create a function to build logistic regression and compare performance in train and test set
 
def run_logistic(X_train, X_test, y_train, y_test):
    """Fit a logistic regression on the training split and print train/test ROC-AUC.

    The model is created with a fixed random state and otherwise default
    settings. ROC-AUC is computed from the predicted probability of the
    second class (column 1 of ``predict_proba``). Nothing is returned;
    results are printed.
    """
    model = LogisticRegression(random_state=44)
    model.fit(X_train, y_train)

    splits = (
        ('Train set', X_train, y_train),
        ('Test set', X_test, y_test),
    )
    for title, features, labels in splits:
        print(title)
        positive_proba = model.predict_proba(features)[:, 1]
        score = roc_auc_score(labels, positive_proba)
        print('Logistic Regression roc-auc: {}'.format(score))

In [86]:
# original
# logistic regression needs the features on the same scale, so standardise them first
logit_train_orig = X_train_orig.drop(columns=['ID'])
logit_test_orig = X_test_orig.drop(columns=['ID'])

# the scaler is fitted on the training split only and then applied to both splits
scaler = StandardScaler().fit(logit_train_orig)
run_logistic(scaler.transform(logit_train_orig),
             scaler.transform(logit_test_orig),
             y_train, y_test)



Train set
Logistic Regression roc-auc: 0.8069992623682241
Test set
Logistic Regression roc-auc: 0.7936268147072499


In [87]:
# filter methods - basic
logit_train_basic = X_train_basic.drop(columns=['ID'])
logit_test_basic = X_test_basic.drop(columns=['ID'])

# the scaler is fitted on the training split only and then applied to both splits
scaler = StandardScaler().fit(logit_train_basic)
run_logistic(scaler.transform(logit_train_basic),
             scaler.transform(logit_test_basic),
             y_train, y_test)



Train set
Logistic Regression roc-auc: 0.8069983095004282
Test set
Logistic Regression roc-auc: 0.7936733430994646


In [88]:
# filter methods - correlation
logit_train_corr = X_train.drop(columns=['ID'])
logit_test_corr = X_test.drop(columns=['ID'])

# the scaler is fitted on the training split only and then applied to both splits
scaler = StandardScaler().fit(logit_train_corr)
run_logistic(scaler.transform(logit_train_corr),
             scaler.transform(logit_test_corr),
             y_train, y_test)



Train set
Logistic Regression roc-auc: 0.7981905694487417
Test set
Logistic Regression roc-auc: 0.7925021619959691


In [89]:
# For logistic regression, even after reducing the feature set to less than half, the scores barely change.