In [2]:
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

import os
os.chdir('/kaggle/input/santander-customer-transaction-prediction/')

In [3]:
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,ID_code,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
0,train_0,0,8.9255,-6.7863,11.9081,5.093,11.4607,-9.2834,5.1187,18.6266,...,4.4354,3.9642,3.1364,1.691,18.5227,-2.3978,7.8784,8.5635,12.7803,-1.0914
1,train_1,0,11.5006,-4.1473,13.8588,5.389,12.3622,7.0433,5.6208,16.5338,...,7.6421,7.7214,2.5837,10.9516,15.4305,2.0339,8.1267,8.7889,18.356,1.9518
2,train_2,0,8.6093,-2.7457,12.0805,7.8928,10.5825,-9.0837,6.9427,14.6155,...,2.9057,9.7905,1.6704,1.6858,21.6042,3.1417,-6.5213,8.2675,14.7222,0.3965
3,train_3,0,11.0604,-2.1518,8.9522,7.1957,12.5846,-1.8361,5.8428,14.925,...,4.4666,4.7433,0.7178,1.4214,23.0347,-1.2706,-2.9275,10.2922,17.9697,-8.9996
4,train_4,0,9.8369,-1.4834,12.8746,6.6375,12.2772,2.4486,5.9405,19.2514,...,-1.4905,9.5214,-0.1508,9.1942,13.2876,-1.5121,3.9267,9.5031,17.9974,-8.8104


In [4]:
df.shape

(200000, 202)

In [5]:
df = df.sample(20000)

In [11]:
df.isnull().any().sum()

0

There are no null values in our dataset which is convenient.

In [12]:
df.dtypes.value_counts()

float64    200
object       1
int64        1
dtype: int64

In [15]:
df.select_dtypes('object').head()

Unnamed: 0,ID_code
178471,train_178471
36607,train_36607
149686,train_149686
1117,train_1117
40849,train_40849


In [16]:
df = df.drop('ID_code', axis=1)

Now all of our features are anonymized, so we have to rely on pure statistics to build our model.

In [19]:
df.corr()['target'].sort_values(ascending=False)

target     1.000000
var_6      0.069652
var_99     0.066368
var_22     0.065495
var_53     0.064583
             ...   
var_76    -0.065915
var_148   -0.068489
var_139   -0.071014
var_12    -0.076848
var_81    -0.079558
Name: target, Length: 201, dtype: float64

In [28]:
target = df['target']
counts = target.value_counts()
print(counts)

ratio = counts[0] / (counts[0] + counts[1])
ratio

0    18016
1     1984
Name: target, dtype: int64


0.9008

Which metric matters the most here? 
- Accuracy will tell us straight up what proportion of predictions were correct
- Precision will tell us how many ...
- Recall will tell us what proportion of the minority was detected

In [33]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, cross_val_score
from sklearn.utils import resample

In [31]:
df_majority = df[df.target == 0]
df_minority = df[df.target == 1]

df_minority_upscaled = resample(df_minority, n_samples=counts[0], random_state=0)
df_minority_upscaled.shape

(18016, 201)

In [37]:
df_comb = pd.concat([df_majority, df_minority_upscaled])

Unnamed: 0,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
178471,0,10.7614,4.3037,10.456,6.7627,11.9034,-0.6043,6.5466,9.9414,1.1601,...,5.571,9.8422,0.2577,-2.5075,22.2664,3.101,-4.2487,9.4405,19.011,-10.7113
36607,0,12.6224,-2.6422,7.9529,1.8974,13.4356,0.3812,5.2895,19.8614,-0.7113,...,-4.3995,2.0334,3.1227,1.6966,17.376,-2.1311,4.0612,8.8405,15.5782,0.8693
149686,0,13.9051,4.4288,8.638,8.5513,12.7183,0.5743,6.7361,16.5582,0.4578,...,6.6234,3.3257,4.4323,0.744,18.3795,-0.1314,12.2985,9.4374,14.6194,-3.4668
1117,0,12.8058,-6.5805,11.238,3.4815,8.1225,2.7216,4.1572,16.2814,-4.574,...,5.2624,12.3323,-0.1639,6.86,16.5619,0.2527,-8.2213,9.6763,17.6293,0.2062
40849,0,11.6183,0.1253,12.1024,9.3638,11.4353,-15.8996,5.9184,22.2114,6.1681,...,1.884,9.7733,4.6251,-1.1309,20.7768,2.8402,-0.0666,9.7559,15.4779,5.1799


In [43]:
X_train, X_test, y_train, y_test = train_test_split(df_comb.drop('target', axis=1), df_comb['target'], test_size=.2, random_state=0)

In [48]:
logreg = LogisticRegression()

logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.78      0.77      0.78      3571
           1       0.78      0.78      0.78      3636

    accuracy                           0.78      7207
   macro avg       0.78      0.78      0.78      7207
weighted avg       0.78      0.78      0.78      7207



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [49]:
from sklearn.decomposition import PCA

In [50]:
pca = PCA(n_components=20)
components = pca.fit(X_train, y_train)

PCA(copy=True, iterated_power='auto', n_components=20, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)

In [52]:
pca = PCA(.9)
components = pca.fit(X_train, y_train)

In [53]:
components.n_components_

89

In [57]:
train_pca = pca.transform(X_train)
test_pca = pca.transform(X_test)

In [58]:
logreg2 = LogisticRegression()

logreg2.fit(train_pca, y_train)
y_pred = logreg2.predict(test_pca)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.71      0.71      0.71      3571
           1       0.71      0.71      0.71      3636

    accuracy                           0.71      7207
   macro avg       0.71      0.71      0.71      7207
weighted avg       0.71      0.71      0.71      7207



In [51]:
pca.singular_values_

array([3652.63369869, 2429.34449737, 2254.41012569, 2234.30359273,
       2161.54576123, 2075.10747537, 2053.43922118, 1972.08188055,
       1963.59122832, 1930.3646377 , 1864.32019288, 1806.68701668,
       1776.56247344, 1764.39864031, 1749.84659886, 1618.39826426,
       1540.4523876 , 1533.61828673, 1531.18811324, 1510.22924116])

Downsampling

In [59]:
df_majority_downsampled = resample(df_majority, n_samples=counts[1], random_state=0)
df_comb2 = pd.concat([df_minority, df_majority_downsampled])

df_comb2.shape

(3968, 201)

In [60]:
X_train, X_test, y_train, y_test = train_test_split(df_comb2.drop('target', axis=1), df_comb2['target'], test_size=.2, random_state=0)

In [61]:
logreg = LogisticRegression()

logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.77      0.75      0.76       383
           1       0.77      0.79      0.78       411

    accuracy                           0.77       794
   macro avg       0.77      0.77      0.77       794
weighted avg       0.77      0.77      0.77       794



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [63]:
from sklearn.metrics import roc_auc_score

In [67]:
probs = logreg.predict_proba(X_train)
probs = [p[1] for p in probs]

In [68]:
roc_auc_score(y_train, probs)

0.8786271930329622

Tree-Based Model

In [70]:
from sklearn.ensemble import RandomForestClassifier

In [71]:
rfc = RandomForestClassifier()

rfc.fit(X_train, y_train)

rfc_pred = rfc.predict(X_test)

print(classification_report(y_test, rfc_pred))

              precision    recall  f1-score   support

           0       0.73      0.78      0.75       383
           1       0.78      0.73      0.75       411

    accuracy                           0.75       794
   macro avg       0.75      0.75      0.75       794
weighted avg       0.76      0.75      0.75       794



Kappa