In [75]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score

In [2]:
# Path to the training CSV (absolute local path, kept exactly as the author used it).
train_csv_path = r'E:\programming\dataset\Santander Customer Satisfaction\train.csv'
data = pd.read_csv(train_csv_path)
# Display (rows, columns) of the loaded frame.
data.shape

(76020, 371)

In [3]:
# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
0,1,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39205.17,0
1,3,2,34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49278.03,0
2,4,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67333.77,0
3,8,2,37,0.0,195.0,195.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64007.97,0
4,10,2,39,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117310.979016,0


In [12]:
# Split into train and test sets (33% of the rows held out, fixed seed).
# 'ID' and 'TARGET' are removed from the feature matrix; 'TARGET' is the label.
predictors = data.drop(columns=['ID', 'TARGET'])
labels = data['TARGET']
X_train, X_test, y_train, y_test = train_test_split(
    predictors, labels, test_size=0.33, random_state=0
)
print(X_train.shape, X_test.shape)

(50933, 369) (25087, 369)


In [13]:
# Keep untouched copies of the split: the frames below are modified in place by later cells.
X_train_orig, X_test_orig = X_train.copy(), X_test.copy()

## Removing constant features

In [16]:
# Columns that hold a single distinct value (NaN counted as a value) in the training set.
const_features = [
    name for name in X_train.columns
    if X_train[name].nunique(dropna=False) == 1
]
len(const_features)

48

In [17]:
# Remove the constant columns from both splits (in place) and report the new shapes.
for frame in (X_train, X_test):
    frame.drop(columns=const_features, inplace=True)
print(X_train.shape, X_test.shape)

(50933, 321) (25087, 321)


## Quasi-constant features

In [30]:
# Flag columns whose most frequent value accounts for more than 99.96% of the training rows.
# value_counts() sorts by frequency (descending), so the first entry is the dominant value.
n_rows = len(X_train)
quasi_const = [
    col for col in X_train.columns
    if X_train[col].value_counts().iloc[0] / n_rows > 0.9996
]
len(quasi_const)

73

In [31]:
# Remove the quasi-constant columns from both splits (in place) and report the new shapes.
for frame in (X_train, X_test):
    frame.drop(columns=quasi_const, inplace=True)
print(X_train.shape, X_test.shape)

(50933, 248) (25087, 248)


## Duplicate Features

In [32]:
# Collect every column that Series.equals reports as identical to a column located earlier in X_train.
dub_features = set()
tick = 0
all_columns = X_train.columns
for position, col_1 in enumerate(all_columns):
    # Progress indicator: print the running count every tenth column.
    tick += 1
    if tick % 10 == 0:
        print(tick)

    # Only compare against columns to the right, so each pair is examined once
    # and the left-most member of a duplicate group is the one that is kept.
    later_columns = all_columns[position + 1:]
    dub_features.update(
        col_2 for col_2 in later_columns
        if X_train[col_1].equals(X_train[col_2])
    )

10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240


In [33]:
# Report how many duplicated columns were found.
n_duplicates = len(dub_features)
print('Number of dub features are {}'.format(n_duplicates))

Number of dub features are 11


In [34]:
# Remove the duplicated columns from both splits (in place).
for frame in (X_train, X_test):
    frame.drop(columns=dub_features, inplace=True)

In [35]:
# Shapes of both splits after the duplicate columns were dropped.
print(X_train.shape,X_test.shape)

(50933, 237) (25087, 237)


In [36]:
# Snapshot of the data after the basic filters (constant, quasi-constant, duplicate columns).
X_train_basic_filter, X_test_basic_filter = X_train.copy(), X_test.copy()

## Correlated features

In [78]:
def correlation(dataset, threshold):
    """Return the columns that are highly correlated with an earlier column.

    The pairwise correlation matrix of ``dataset`` is computed with
    ``DataFrame.corr()`` (default settings). For every pair of columns whose
    absolute correlation is strictly greater than ``threshold``, the column
    that appears later in ``dataset`` is flagged, so the first column of a
    correlated group is never flagged by that group.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose columns are compared pairwise.
    threshold : float
        Cut-off applied to the absolute correlation.

    Returns
    -------
    set
        Labels of the flagged columns (empty if nothing exceeds the threshold).
    """
    corr_matrix = dataset.corr()
    abs_corr = corr_matrix.abs().values

    # Only entries strictly below the diagonal are inspected: this checks each
    # pair once (column i against every earlier column j < i) and skips the
    # trivial self-correlation on the diagonal.
    below_diagonal = np.tril(np.ones(abs_corr.shape, dtype=bool), k=-1)

    # NaN correlations compare as False here, so they are never flagged.
    # Working on the array avoids `corr_matrix.iloc[i][j]`, which indexes a
    # label-indexed Series with an integer position (deprecated in pandas).
    flagged_rows = ((abs_corr > threshold) & below_diagonal).any(axis=1)
    return set(corr_matrix.columns[flagged_rows])

# Flag training columns whose absolute correlation with an earlier column exceeds 0.8.
corr_features = correlation(dataset=X_train, threshold=0.8)
len(corr_features)

0

In [79]:
# Remove the correlated columns from both splits (in place) and report the new shapes.
for frame in (X_train, X_test):
    frame.drop(columns=corr_features, inplace=True)
print(X_train.shape, X_test.shape)

(50933, 98) (25087, 98)


In [81]:
# Snapshot of the data after the correlation filter.
X_train_corr, X_test_corr = X_train.copy(), X_test.copy()

(50933, 98)

## ROC AUC

In [82]:
# Univariate screening: fit one decision tree per feature and score it by test-set ROC-AUC.
roc_values = []
for feature in X_train.columns:
    # Single-column frames, because the estimator expects 2-D input.
    single_train = X_train[feature].to_frame()
    single_test = X_test[feature].to_frame()
    tree = DecisionTreeClassifier()
    tree.fit(single_train, y_train)
    # Column 1 of predict_proba is the probability of the positive class.
    positive_proba = tree.predict_proba(single_test)[:, 1]
    roc_values.append(roc_auc_score(y_test, positive_proba))
    

In [83]:
# Label each score with its feature name and list the features from best to worst.
roc_values = pd.Series(roc_values, index=X_train.columns)
roc_values.sort_values(ascending=False)

var15                            0.699158
num_var4                         0.696358
ind_var5                         0.663285
saldo_var30                      0.660496
saldo_var5                       0.657735
saldo_medio_var5_hace3           0.650620
saldo_medio_var5_hace2           0.647028
var36                            0.646219
num_var30_0                      0.541427
ind_var12_0                      0.528749
ind_var39_0                      0.526881
ind_var13_0                      0.524216
saldo_var12                      0.519732
num_var45_hace3                  0.519547
saldo_var13_corto                0.517600
ind_var5_0                       0.514093
num_med_var45_ult3               0.513720
delta_imp_aport_var13_1y3        0.513125
num_var22_ult1                   0.512692
num_var22_hace3                  0.511921
ind_var43_recib_ult1             0.511888
num_aport_var13_hace3            0.511022
imp_aport_var13_hace3            0.510765
num_var43_recib_ult1             0

In [84]:
# Features whose single-feature ROC-AUC is 0.5 or lower, i.e. no better than chance.
no_better_than_chance = roc_values <= 0.5
random_vars = roc_values.loc[no_better_than_chance].index
len(random_vars)

22

In [70]:
# Remove the uninformative features from both splits (in place) and report the new shapes.
for frame in (X_train, X_test):
    frame.drop(columns=random_vars, inplace=True)
print(X_train.shape, X_test.shape)

(50933, 76) (25087, 76)


In [85]:
# Snapshot of the data after the univariate ROC-AUC filter.
X_train_roc, X_test_roc = X_train.copy(), X_test.copy()

## Lasso

In [86]:
# Fit the standardiser on the training split only; it is reused to transform data for the Lasso step.
sc = StandardScaler().fit(X_train)
sc

StandardScaler(copy=True, with_mean=True, with_std=True)

In [87]:
# Embedded selection with an L1-penalised logistic regression fitted on the standardised training data.
# The solver is named explicitly: the L1 penalty is not supported by the 'lbfgs' solver that newer
# scikit-learn releases use by default, so omitting it makes this cell raise on those versions.
# 'liblinear' supports L1 and is what the old default resolved to.
sel = SelectFromModel(LogisticRegression(C=1, penalty='l1', solver='liblinear'))
sel.fit(sc.transform(X_train), y_train)



SelectFromModel(estimator=LogisticRegression(C=1, class_weight=None, dual=False,
                                             fit_intercept=True,
                                             intercept_scaling=1, l1_ratio=None,
                                             max_iter=100, multi_class='warn',
                                             n_jobs=None, penalty='l1',
                                             random_state=None, solver='warn',
                                             tol=0.0001, verbose=0,
                                             warm_start=False),
                max_features=None, norm_order=1, prefit=False, threshold=None)

In [88]:
# Names of the columns retained by the selector (get_support() returns a boolean mask over the columns).
support_mask = sel.get_support()
selected_features = X_train.columns[support_mask]
len(selected_features)

84

In [89]:
# Restrict both splits to the columns kept by the selector and report the new shapes.
X_train = X_train.loc[:, selected_features]
X_test = X_test.loc[:, selected_features]
print(X_train.shape, X_test.shape)

(50933, 84) (25087, 84)


## Comparison of performance across machine learning algorithms

In [90]:
# create a function to build random forests and compare performance in train and test set

def run_randomForests(X_train, X_test, y_train, y_test):
    """Fit a random forest on the training split and print its ROC-AUC on both splits.

    The forest uses 200 trees of maximum depth 4 with a fixed random state (39).
    The scores are computed from the predicted probability of the positive class.
    Nothing is returned; the results are printed.
    """
    forest = RandomForestClassifier(n_estimators=200, random_state=39, max_depth=4)
    forest.fit(X_train, y_train)

    # Same report for each split, training split first.
    for heading, features, labels in (
        ('Train set', X_train, y_train),
        ('Test set', X_test, y_test),
    ):
        print(heading)
        positive_proba = forest.predict_proba(features)[:, 1]
        print('Random Forests roc-auc: {}'.format(roc_auc_score(labels, positive_proba)))

In [94]:
# original: all features from the initial split, before any filtering
run_randomForests(X_train_orig, X_test_orig, y_train, y_test)

Train set
Random Forests roc-auc: 0.7967685008560546
Test set
Random Forests roc-auc: 0.786253373188857


In [95]:
# filter methods - basic: snapshot taken after constant, quasi-constant and duplicate columns were dropped
run_randomForests(X_train_basic_filter, X_test_basic_filter, y_train, y_test)

Train set
Random Forests roc-auc: 0.8006651116535392
Test set
Random Forests roc-auc: 0.7910516045999917


In [96]:
# filter methods - correlation: snapshot taken after the correlated columns were dropped
run_randomForests(X_train_corr, X_test_corr, y_train, y_test)

Train set
Random Forests roc-auc: 0.8075785819420327
Test set
Random Forests roc-auc: 0.7966668534894341


In [97]:
# filter methods - univariate roc-auc: snapshot taken after the ROC-AUC filter
# NOTE(review): the recorded scores for this run are identical to the correlation-filter run,
# and the cell that drops `random_vars` carries an out-of-order execution count. This suggests
# the snapshot was taken without the drop being applied; confirm with Restart & Run All.
run_randomForests(X_train_roc, X_test_roc, y_train, y_test)

Train set
Random Forests roc-auc: 0.8075785819420327
Test set
Random Forests roc-auc: 0.7966668534894341


In [98]:
# embedded methods - Lasso: X_train / X_test now hold only the features kept by the selector
run_randomForests(X_train, X_test, y_train, y_test)

Train set
Random Forests roc-auc: 0.8092740433259298
Test set
Random Forests roc-auc: 0.7983699713538424


### Even after reducing the dimensions from 371 to 65, the performance is not affected

In [99]:
# create a function to build logistic regression and compare performance in train and test set

def run_logistic(X_train, X_test, y_train, y_test):
    """Fit a logistic regression on the training split and print its ROC-AUC on both splits.

    The model is created with default settings apart from a fixed random state (44).
    The scores are computed from the predicted probability of the positive class.
    Nothing is returned; the results are printed.
    """
    model = LogisticRegression(random_state=44)
    model.fit(X_train, y_train)

    # Same report for each split, training split first.
    for heading, features, labels in (
        ('Train set', X_train, y_train),
        ('Test set', X_test, y_test),
    ):
        print(heading)
        positive_proba = model.predict_proba(features)[:, 1]
        print('Logistic Regression roc-auc: {}'.format(roc_auc_score(labels, positive_proba)))

In [103]:
# original: all features from the initial split, standardised with statistics from the training split
scaler = StandardScaler()
scaler.fit(X_train_orig)
train_scaled = scaler.transform(X_train_orig)
test_scaled = scaler.transform(X_test_orig)
run_logistic(train_scaled, test_scaled, y_train, y_test)



Train set
Logistic Regression roc-auc: 0.8062326117095236
Test set
Logistic Regression roc-auc: 0.7958048739984225


In [104]:
# filter methods - basic: standardise with training-split statistics, then evaluate
scaler = StandardScaler()
scaler.fit(X_train_basic_filter)
train_scaled = scaler.transform(X_train_basic_filter)
test_scaled = scaler.transform(X_test_basic_filter)
run_logistic(train_scaled, test_scaled, y_train, y_test)



Train set
Logistic Regression roc-auc: 0.8062266366318069
Test set
Logistic Regression roc-auc: 0.7959336571594636


In [105]:
# filter methods - correlation: standardise with training-split statistics, then evaluate
scaler = StandardScaler()
scaler.fit(X_train_corr)
train_scaled = scaler.transform(X_train_corr)
test_scaled = scaler.transform(X_test_corr)
run_logistic(train_scaled, test_scaled, y_train, y_test)



Train set
Logistic Regression roc-auc: 0.7976855429584677
Test set
Logistic Regression roc-auc: 0.7935033212936439


In [106]:
# filter methods - univariate roc-auc: standardise with training-split statistics, then evaluate
scaler = StandardScaler()
scaler.fit(X_train_roc)
train_scaled = scaler.transform(X_train_roc)
test_scaled = scaler.transform(X_test_roc)
run_logistic(train_scaled, test_scaled, y_train, y_test)



Train set
Logistic Regression roc-auc: 0.7976855429584677
Test set
Logistic Regression roc-auc: 0.7935033212936439


In [107]:
# embedded method - lasso: X_train / X_test now hold only the features kept by the selector
scaler = StandardScaler()
scaler.fit(X_train)
train_scaled = scaler.transform(X_train)
test_scaled = scaler.transform(X_test)
run_logistic(train_scaled, test_scaled, y_train, y_test)



Train set
Logistic Regression roc-auc: 0.7976460382282826
Test set
Logistic Regression roc-auc: 0.7934795740440902
