In [1]:
import pandas as pd
import numpy as np
train_identity = pd.read_csv('Data/train_identity.csv')
train_transaction = pd.read_csv('Data/train_transaction.csv')

In [2]:
#Storing variable names
identity_vars = list(train_identity)
transaction_vars = list(train_transaction)

#Storing id, fraud and separating from explanatory variables
trans_id = train_transaction['TransactionID']
fraud = train_transaction['isFraud']
x_trans = train_transaction.drop(['TransactionID','isFraud'],axis=1)

In [3]:
#Getting dummies from strings
strings = train_transaction.select_dtypes(include='object')
numerics = train_transaction.select_dtypes(exclude='object')
dummies = pd.get_dummies(strings)
x_trans = pd.concat([dummies, numerics],axis=1)
x_trans = x_trans.drop(['TransactionID','isFraud'],axis=1)

In [4]:
#Getting a count of NaNs
transaction_na_count = x_trans.isnull().sum()
transaction_na_prop = x_trans.isnull().sum()/x_trans.shape[0]*100
transaction_na = pd.concat([transaction_na_count, transaction_na_prop], axis=1)
transaction_na.columns = ['Count','Percentage']

In [5]:
#Filling nas with mean
x_trans = x_trans.fillna(x_trans.mean())
corrs = x_trans.corrwith(fraud)
corrs = abs(corrs.sort_values(ascending=False))
print(corrs)

V257           0.262946
V246           0.251838
V244           0.249951
V242           0.247522
V45            0.236688
V201           0.234520
V200           0.227926
V86            0.224530
V87            0.224450
V189           0.220374
V44            0.218669
V188           0.217058
V258           0.203975
V52            0.201111
V51            0.187440
V228           0.184556
V170           0.178601
V40            0.178413
V79            0.173097
V39            0.170565
V94            0.167984
V38            0.167128
V43            0.166514
V33            0.165534
V199           0.164959
V17            0.164800
V18            0.164689
V74            0.164684
V34            0.162660
V81            0.162608
                 ...   
M8_F           0.043108
V75            0.046516
D5             0.046812
V12            0.047279
M6_F           0.048760
D2             0.051839
D4             0.056450
V36            0.058682
D7             0.063238
M7_F           0.063570
V35            0

Just copying stuff from the EDA notebook, nothing new. We'll first train a logistic regression on the entire dataset and get a ballpark figure for the AUC score, then we'll:
1. Properly set up procedures for cross-validation and calculation of the AUC score
2. Train a regluarized (l1) logistic regression
3. Evaluate it using cross-validation

In [6]:
strong_vars = corrs.index.values[:20]
print(strong_vars)

['V257' 'V246' 'V244' 'V242' 'V45' 'V201' 'V200' 'V86' 'V87' 'V189' 'V44'
 'V188' 'V258' 'V52' 'V51' 'V228' 'V170' 'V40' 'V79' 'V39']


In [7]:
x_subset = x_trans[strong_vars]

In [8]:
from sklearn.metrics import roc_auc_score
from sklearn import model_selection as ms
from sklearn.metrics import make_scorer
from sklearn.linear_model import LogisticRegression

In [9]:
fraud_logit = LogisticRegression(penalty='l1')

fraud_logit_fit = fraud_logit.fit(X=x_subset,y=fraud)



In [10]:
fitted = fraud_logit_fit.predict_proba(X=x_subset)
fraud_prob = pd.DataFrame(fitted[:,1])
print(fraud_prob)
print(fraud)

               0
0       0.025443
1       0.021101
2       0.021101
3       0.021101
4       0.024571
5       0.021101
6       0.021101
7       0.021101
8       0.024571
9       0.021101
10      0.025903
11      0.059644
12      0.025118
13      0.021101
14      0.021101
15      0.025443
16      0.024571
17      0.024571
18      0.021101
19      0.021101
20      0.021101
21      0.021101
22      0.025761
23      0.021101
24      0.021101
25      0.061239
26      0.021101
27      0.021101
28      0.025443
29      0.021101
...          ...
590510  0.021101
590511  0.021101
590512  0.021101
590513  0.021101
590514  0.021101
590515  0.104315
590516  0.021101
590517  0.021101
590518  0.021101
590519  0.021101
590520  0.021101
590521  0.001546
590522  0.021101
590523  0.021101
590524  0.021101
590525  0.021101
590526  0.025761
590527  0.034847
590528  0.035845
590529  0.059644
590530  0.021101
590531  0.025761
590532  0.021101
590533  0.021101
590534  0.062902
590535  0.025443
590536  0.0211

In [11]:
print(roc_auc_score(fraud,fraud_prob))

0.7066583427623736


Initial estimates of the AUC score, not that great considering the Kaggle leaderboard have scores of 0.94+. This is probably higher than what the actual score will be given that I didn't do any cross-validation. Now I'm going to evaluate it using cross-validation to get a proper feel for our out of sample AUC score.

In [13]:
cv_preds = ms.cross_val_predict(fraud_logit, X=x_subset, y=fraud, cv=3,method='predict_proba')



In [14]:
cv_score = roc_auc_score(fraud,cv_preds[:,1])
print(cv_score)

0.7072765140773767


Ok, seems like it generalises reasonably well. Note that using cross_val_predict isn't the recommended way to evaluate generalisation error (at least according to the documentation), I'm doing it this way because cross_val_score does not allow me to use predict_proba as the predicted values but uses the predict() method and returns either 0 or 1. This is not desirable as the competition metric is the roc_auc_score based on the predicted probabilities, not the \[0,1\] classification.

Next, we'll try playing around with the number of variables included in the explanatory variable set. I cut it down to 20 variables based on their absolute correlation with the dependent variable. Let's see how increasing/decreasing it has an effect on the cross-validated ROC/AUC score

In [15]:
var_count = np.arange(20,30,2)
cv_scores = []
for var in var_count:
    strong_vars = corrs.index.values[:var]
    x_subset = x_trans[strong_vars]
    cv_preds = ms.cross_val_predict(fraud_logit, X=x_subset, y=fraud, cv=3,method='predict_proba')
    cv_score = roc_auc_score(fraud,cv_preds[:,1])
    cv_scores.append(cv_score)



In [16]:
print(cv_scores)

[0.7073550353571013, 0.7113295167667276, 0.7126862469248098, 0.7125710671743295, 0.7108850213335886]


Not much difference in the AUC scores, seems like the optimum is around 20-30 variables. 

We'll also play around with the regularisation parameter, however I don't think that there will be much potential for improvement but we'll see how we go. The logistic regression has a few options for regularisation, l1, l2 and elastic net (a combination of l1 and l2 I think). We'll try the three different options and different values for C, the inverse of regularisation strength.

In [20]:
print(x_subset)

            V257      V246      V244      V242       V45      V201      V200  \
0       1.250993  1.183723  1.118562  1.113463  1.120779  1.159106  1.119977   
1       1.250993  1.183723  1.118562  1.113463  1.000000  1.159106  1.119977   
2       1.250993  1.183723  1.118562  1.113463  1.000000  1.159106  1.119977   
3       1.250993  1.183723  1.118562  1.113463  1.000000  1.159106  1.119977   
4       1.000000  1.000000  1.000000  1.000000  1.120779  1.000000  1.000000   
5       1.250993  1.183723  1.118562  1.113463  1.000000  1.159106  1.119977   
6       1.250993  1.183723  1.118562  1.113463  1.000000  1.159106  1.119977   
7       1.250993  1.183723  1.118562  1.113463  1.000000  1.159106  1.119977   
8       1.000000  1.000000  1.000000  1.000000  1.120779  1.000000  1.000000   
9       1.250993  1.183723  1.118562  1.113463  1.000000  1.159106  1.119977   
10      1.000000  1.000000  1.000000  1.000000  1.000000  1.000000  1.000000   
11      1.000000  1.000000  1.000000  1.

In [24]:
reg_params = [10, 5, 2, 1, 0.5, 0.1, 0.05, 0.01, 0.001, 0.0001]

cv_scores = []
for c in reg_params:
    strong_vars = corrs.index.values[:25]
    x_subset = x_trans[strong_vars]
    fraud_logit = LogisticRegression(penalty='l1',solver='saga', C=c)
    cv_preds = ms.cross_val_predict(fraud_logit, X=x_subset, y=fraud, cv=3,method='predict_proba')
    cv_score = roc_auc_score(fraud,cv_preds[:,1])
    cv_scores.append(cv_score)



In [25]:
print(cv_scores)

[0.7149914231163619, 0.7150168693689567, 0.7150039055283233, 0.715050968514276, 0.715104815576696, 0.7143748083980866, 0.7145054272832961, 0.7129334684883738, 0.7116303422156077, 0.5909718872880813]


In all honesty, very little difference at all. We'll try using l2 and elastic net regularisation as well, but I think this is a dead end. We'll try some other algorithms such as random forests, kNN support and boosting methods (XGB/LGB/GB).

In [26]:
reg_params = [10, 5, 2, 1, 0.5, 0.1, 0.05, 0.01, 0.001]

cv_scores = []
for c in reg_params:
    strong_vars = corrs.index.values[:25]
    x_subset = x_trans[strong_vars]
    fraud_logit = LogisticRegression(penalty='l2',solver='saga', C=c)
    cv_preds = ms.cross_val_predict(fraud_logit, X=x_subset, y=fraud, cv=3,method='predict_proba')
    cv_score = roc_auc_score(fraud,cv_preds[:,1])
    cv_scores.append(cv_score)



In [27]:
print(cv_scores)

[0.7147704652745053, 0.7148037511968637, 0.7147992965591834, 0.71479571662874, 0.7148091787128064, 0.7148363747195667, 0.7139170667552304, 0.7137825881606463, 0.7115933563665607]
