In [48]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import roc_auc_score

In [2]:
# Load the training CSV into a DataFrame.
# NOTE(review): this is an absolute, machine-specific Windows path; the notebook
# only runs elsewhere if it is pointed at a local copy of train.csv.
data = pd.read_csv(r'E:\programming\dataset\Santander Customer Satisfaction\train.csv')
print(data.shape)
# Preview the first rows (rendered as the cell output).
data.head()

(76020, 371)


Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
0,1,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39205.17,0
1,3,2,34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49278.03,0
2,4,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67333.77,0
3,8,2,37,0.0,195.0,195.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64007.97,0
4,10,2,39,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117310.979016,0


In [3]:
# Missing values: how many columns contain at least one null entry
sum(1 for col in data.columns if data[col].isnull().any())

0

### Split into train and test

In [4]:
# Hold out 30% of the rows for testing; ID and TARGET are kept out of the feature matrix.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=['ID', 'TARGET']),
    data['TARGET'],
    test_size=0.3,
    random_state=0,
)
print(X_train.shape, X_test.shape)

(53214, 369) (22806, 369)


In [5]:
# Snapshot the unfiltered splits so they can be evaluated later as the baseline.
X_train_original, X_test_original = X_train.copy(), X_test.copy()

## Removing constant features

In [6]:
# A feature is constant when the training split holds exactly one distinct value for it.
const_features = [
    col
    for col in X_train.columns
    if X_train[col].nunique(dropna=False) == 1
]
len(const_features)

38

In [7]:
# Remove the constant columns from both splits (list was derived from the training split only).
X_train.drop(columns=const_features, inplace=True)
X_test.drop(columns=const_features, inplace=True)

In [8]:
# Sanity check: both splits should report the same number of columns after the drop.
print(X_train.shape,X_test.shape)

(53214, 331) (22806, 331)


## Removing quasi-constant features

In [14]:
# Quasi-constant features: the most frequent value accounts for more than
# 99.97% of the training rows (value_counts() lists the dominant value first).
quasi_const = [
    col
    for col in X_train.columns
    if X_train[col].value_counts().iloc[0] / len(X_train) > 0.9997
]

X_train.drop(columns=quasi_const, inplace=True)
X_test.drop(columns=quasi_const, inplace=True)

print(X_train.shape, X_test.shape)

(53214, 255) (22806, 255)


## Duplicate features

In [19]:
# Collect columns that are exact duplicates of an earlier column in X_train.
# For every pair (col_1, col_2) with col_2 to the right of col_1, col_2 is
# recorded when Series.equals reports the two columns as identical.
#
# Fix: the previous version incremented `i` inside `for i in range(...)`, so
# the first column was never used as a comparison source and any duplicate of
# it went undetected.
dub_features = set()
all_columns = X_train.columns

for i, col_1 in enumerate(all_columns[:-1]):
    # Progress indicator: fraction of columns processed, printed every 10 columns.
    if i > 0 and i % 10 == 0:
        print(i / float(len(all_columns)))
    # A column already known to duplicate an earlier one cannot reveal new
    # duplicates: anything equal to it was flagged when the earlier column was scanned.
    if col_1 in dub_features:
        continue
    for col_2 in all_columns[i + 1:]:
        if X_train[col_1].equals(X_train[col_2]):
            dub_features.add(col_2)

0.0392156862745098
0.0784313725490196
0.11764705882352941
0.1568627450980392
0.19607843137254902
0.23529411764705882
0.27450980392156865
0.3137254901960784
0.35294117647058826
0.39215686274509803
0.43137254901960786
0.47058823529411764
0.5098039215686274
0.5490196078431373
0.5882352941176471
0.6274509803921569
0.6666666666666666
0.7058823529411765
0.7450980392156863
0.7843137254901961
0.8235294117647058
0.8627450980392157
0.9019607843137255
0.9411764705882353
0.9803921568627451


In [21]:
# Display the set of column names flagged as duplicates.
dub_features

{'delta_num_reemb_var13_1y3',
 'ind_var25',
 'ind_var26',
 'ind_var32',
 'ind_var37',
 'ind_var39',
 'num_var25',
 'num_var26',
 'num_var32',
 'num_var37',
 'num_var39'}

In [22]:
# Remove the duplicated columns from both splits, then confirm the shapes still agree.
X_train.drop(columns=dub_features, inplace=True)
X_test.drop(columns=dub_features, inplace=True)

print(X_train.shape, X_test.shape)

(53214, 244) (22806, 244)


In [23]:
# Snapshot the splits after the constant / quasi-constant / duplicate filters.
X_train_basic_filters, X_test_basic_filters = X_train.copy(), X_test.copy()

## Correlated features

In [25]:
def correlation(dataset, threshold):
    """Return the names of columns that are highly correlated with an earlier column.

    The correlation matrix is computed with ``dataset.corr()``. For each column,
    its correlation with every column to its left in the matrix is inspected;
    the column is flagged as soon as one absolute value exceeds ``threshold``.
    The earlier column of a correlated pair is never flagged by that pair.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose columns are compared.
    threshold : float
        Absolute correlation above which the later column is flagged.

    Returns
    -------
    set
        Column names of ``dataset`` that were flagged.
    """
    corr_matrix = dataset.corr()
    # Take names from the matrix itself, not from a global frame, so the
    # function works for whichever dataset is passed in.
    names = corr_matrix.columns
    corr_features = set()
    for i in range(len(names)):
        for j in range(i):
            # Single positional lookup instead of chained iloc[i][j].
            if abs(corr_matrix.iloc[i, j]) > threshold:
                corr_features.add(names[i])
                break  # one hit is enough to flag column i
    return corr_features

In [26]:
# Flag features whose absolute correlation with an earlier feature exceeds 0.8.
corr_features = correlation(dataset=X_train, threshold=0.8)

In [27]:
# Number of features flagged by the correlation filter.
len(corr_features)

140

In [28]:
# Remove the correlated columns from both splits, then confirm the shapes still agree.
X_train.drop(columns=corr_features, inplace=True)
X_test.drop(columns=corr_features, inplace=True)

print(X_train.shape, X_test.shape)

(53214, 104) (22806, 104)


In [29]:
# Snapshot the splits after the correlation filter.
X_train_corr, X_test_corr = X_train.copy(), X_test.copy()

## Remove features using univariate roc_auc

In [32]:
# Score each remaining feature on its own: fit a decision tree on that single
# column of the training split and record the ROC-AUC of its predicted
# positive-class probabilities on the test split.
# NOTE(review): features are ranked on X_test, which is also used for the final
# model comparison below — consider a separate validation split.
roc_values = []
for col in X_train.columns:
    clf = DecisionTreeClassifier().fit(X_train[[col]], y_train)
    prob = clf.predict_proba(X_test[[col]])[:, 1]
    roc_values.append(roc_auc_score(y_test, prob))
    

In [33]:
# Label each score with its feature name and show them from best to worst.
roc_values = pd.Series(list(roc_values), index=X_train.columns)
roc_values.sort_values(ascending=False)

var15                            0.697065
num_var4                         0.696372
ind_var5                         0.663066
saldo_var30                      0.657865
saldo_var5                       0.656267
saldo_medio_var5_hace2           0.649150
var36                            0.648116
saldo_medio_var5_hace3           0.647765
num_var30_0                      0.540032
ind_var12_0                      0.527923
ind_var39_0                      0.527372
ind_var13_0                      0.524055
saldo_var12                      0.519468
num_var45_hace3                  0.518231
saldo_var13_corto                0.517414
ind_var5_0                       0.514479
num_var22_ult1                   0.514352
delta_imp_aport_var13_1y3        0.512962
num_med_var45_ult3               0.512698
num_var22_hace3                  0.511082
num_var43_emit_ult1              0.511045
num_aport_var13_hace3            0.510760
imp_aport_var13_hace3            0.510517
num_var22_ult3                   0

In [34]:
# Features whose single-feature ROC-AUC is at or below 0.5.
random_vars = roc_values.loc[roc_values.le(0.5)].index
random_vars

Index(['imp_ent_var16_ult1', 'imp_op_var40_efect_ult1', 'imp_sal_var16_ult1',
       'ind_var1_0', 'ind_var37_cte', 'num_op_var40_hace2',
       'num_op_var41_hace2', 'num_op_var41_ult1', 'saldo_var1', 'saldo_var8',
       'saldo_var26', 'saldo_var37', 'delta_imp_aport_var17_1y3',
       'delta_imp_reemb_var13_1y3', 'imp_aport_var17_ult1',
       'imp_var43_emit_ult1', 'ind_var7_recib_ult1', 'num_ent_var16_ult1',
       'saldo_medio_var8_hace2', 'saldo_medio_var8_hace3'],
      dtype='object')

In [35]:
# Remove the low-scoring columns from both splits, then confirm the shapes still agree.
X_train.drop(columns=random_vars, inplace=True)
X_test.drop(columns=random_vars, inplace=True)

print(X_train.shape, X_test.shape)

(53214, 84) (22806, 84)


In [37]:
# Snapshot the splits after the univariate ROC-AUC filter.
X_train_roc, X_test_roc = X_train.copy(), X_test.copy()

## Select features using random forests

In [44]:
# Embedded selection: fit a 400-tree random forest on the training split and
# keep the columns that SelectFromModel marks as supported.
# random_state is fixed so the same features are selected on every run;
# without it the forest, and therefore the selection, changes between runs.
sel = SelectFromModel(RandomForestClassifier(n_estimators=400, random_state=0))
sel.fit(X_train, y_train)
selected_features = X_train.columns[sel.get_support()]

In [45]:
# Display the column names kept by SelectFromModel.
selected_features

Index(['var15', 'imp_op_var39_comer_ult1', 'num_var4', 'saldo_var5',
       'saldo_var30', 'var36', 'num_var22_hace2', 'num_var22_hace3',
       'num_var22_ult3', 'num_med_var45_ult3', 'num_var45_hace3',
       'saldo_medio_var5_hace2', 'saldo_medio_var5_hace3', 'var38'],
      dtype='object')

In [46]:
# Restrict both splits to the columns chosen by the random-forest selector.
X_train = X_train.loc[:, selected_features]
X_test = X_test.loc[:, selected_features]

In [47]:
# Sanity check: both splits should now have only the selected columns.
print(X_train.shape,X_test.shape)

(53214, 14) (22806, 14)


# Comparing the performances on different models

In [51]:
# create a function to build random forests and compare performance in train and test set

def run_randomForests(X_train, X_test, y_train, y_test):
    """Fit a random forest on the training data and print its ROC-AUC on both splits.

    The forest uses 200 trees, a maximum depth of 4 and a fixed random_state of 39.
    Scores are printed (train first, then test); nothing is returned.
    """
    forest = RandomForestClassifier(n_estimators=200, random_state=39, max_depth=4)
    forest.fit(X_train, y_train)

    splits = (
        ('Train set', X_train, y_train),
        ('Test set', X_test, y_test),
    )
    for heading, features, labels in splits:
        print(heading)
        # Column 1 of predict_proba holds the positive-class probability.
        positive_proba = forest.predict_proba(features)[:, 1]
        print('Random Forests roc-auc: {}'.format(roc_auc_score(labels, positive_proba)))

In [53]:
# Baseline: random forest on the unfiltered feature set
run_randomForests(
    X_train=X_train_original,
    X_test=X_test_original,
    y_train=y_train,
    y_test=y_test,
)

Train set
Random Forests roc-auc: 0.8012314741948454
Test set
Random Forests roc-auc: 0.7900499757912425


In [55]:
# Filter methods (basic): constant, quasi-constant and duplicate features removed
run_randomForests(
    X_train=X_train_basic_filters,
    X_test=X_test_basic_filters,
    y_train=y_train,
    y_test=y_test,
)

Train set
Random Forests roc-auc: 0.803209833259346
Test set
Random Forests roc-auc: 0.7895046040618736


In [56]:
# Filter methods (correlation): correlated features removed as well
run_randomForests(
    X_train=X_train_corr,
    X_test=X_test_corr,
    y_train=y_train,
    y_test=y_test,
)

Train set
Random Forests roc-auc: 0.8079665586376158
Test set
Random Forests roc-auc: 0.7962279817252867


In [57]:
# Filter methods (univariate ROC-AUC): low-scoring features removed as well
run_randomForests(
    X_train=X_train_roc,
    X_test=X_test_roc,
    y_train=y_train,
    y_test=y_test,
)

Train set
Random Forests roc-auc: 0.8093546487990877
Test set
Random Forests roc-auc: 0.7983575427571437


In [58]:
# Embedded methods: features chosen by the random-forest selector
run_randomForests(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

Train set
Random Forests roc-auc: 0.819324565268081
Test set
Random Forests roc-auc: 0.8077074256715181


In [60]:
# create a function to build logistic regression and compare performance in train and test set

def run_logistic(X_train, X_test, y_train, y_test):
    """Fit a logistic regression on the training data and print its ROC-AUC on both splits.

    The model is created with random_state=44 and otherwise default settings.
    Scores are printed (train first, then test); nothing is returned.
    """
    model = LogisticRegression(random_state=44)
    model.fit(X_train, y_train)

    splits = (
        ('Train set', X_train, y_train),
        ('Test set', X_test, y_test),
    )
    for heading, features, labels in splits:
        print(heading)
        # Column 1 of predict_proba holds the positive-class probability.
        positive_proba = model.predict_proba(features)[:, 1]
        print('Logistic Regression roc-auc: {}'.format(roc_auc_score(labels, positive_proba)))

In [61]:
# Baseline: logistic regression on the unfiltered feature set.
# The scaler is fitted on the training split only and applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train_original)

run_logistic(
    X_train=scaler.transform(X_train_original),
    X_test=scaler.transform(X_test_original),
    y_train=y_train,
    y_test=y_test,
)



Train set
Logistic Regression roc-auc: 0.8069992623682241
Test set
Logistic Regression roc-auc: 0.7936268147072499


In [62]:
# Filter methods (basic): constant, quasi-constant and duplicate features removed.
# The scaler is fitted on the training split only and applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train_basic_filters)

run_logistic(
    X_train=scaler.transform(X_train_basic_filters),
    X_test=scaler.transform(X_test_basic_filters),
    y_train=y_train,
    y_test=y_test,
)



Train set
Logistic Regression roc-auc: 0.806998225423858
Test set
Logistic Regression roc-auc: 0.7936837382719465


In [63]:
# Filter methods (correlation): correlated features removed as well.
# The scaler is fitted on the training split only and applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train_corr)

run_logistic(
    X_train=scaler.transform(X_train_corr),
    X_test=scaler.transform(X_test_corr),
    y_train=y_train,
    y_test=y_test,
)



Train set
Logistic Regression roc-auc: 0.798176117620506
Test set
Logistic Regression roc-auc: 0.7925062101160221


In [64]:
# Filter methods (univariate ROC-AUC): low-scoring features removed as well.
# The scaler is fitted on the training split only and applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train_roc)

run_logistic(
    X_train=scaler.transform(X_train_roc),
    X_test=scaler.transform(X_test_roc),
    y_train=y_train,
    y_test=y_test,
)



Train set
Logistic Regression roc-auc: 0.7943995382141091
Test set
Logistic Regression roc-auc: 0.7941927268976288


In [65]:
# Embedded methods: features chosen by random-forest importance.
# The scaler is fitted on the training split only and applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)

run_logistic(
    X_train=scaler.transform(X_train),
    X_test=scaler.transform(X_test),
    y_train=y_train,
    y_test=y_test,
)



Train set
Logistic Regression roc-auc: 0.7712695272505243
Test set
Logistic Regression roc-auc: 0.7710125417755993
