In [1]:
import pandas as pd
import numpy as np
train_identity = pd.read_csv('Data/train_identity.csv')
train_transaction = pd.read_csv('Data/train_transaction.csv')

In [2]:
#Storing variable names
identity_vars = list(train_identity)
transaction_vars = list(train_transaction)

#Storing id, fraud and separating from explanatory variables
trans_id = train_transaction['TransactionID']
fraud = train_transaction['isFraud']
x_trans = train_transaction.drop(['TransactionID','isFraud'],axis=1)

In [3]:
#Getting dummies from strings
strings = train_transaction.select_dtypes(include='object')
numerics = train_transaction.select_dtypes(exclude='object')
dummies = pd.get_dummies(strings)
x_trans = pd.concat([dummies, numerics],axis=1)
x_trans = x_trans.drop(['TransactionID','isFraud'],axis=1)

In [4]:
#Getting a count of NaNs
transaction_na_count = x_trans.isnull().sum()
transaction_na_prop = x_trans.isnull().sum()/x_trans.shape[0]*100
transaction_na = pd.concat([transaction_na_count, transaction_na_prop], axis=1)
transaction_na.columns = ['Count','Percentage']

In [5]:
#Filling nas with mean and calculating correlations with dependent variable
x_trans = x_trans.fillna(x_trans.mean())
corrs = x_trans.corrwith(fraud)
corrs = abs(corrs.sort_values(ascending=False))
print(corrs)

V257           0.262946
V246           0.251838
V244           0.249951
V242           0.247522
V45            0.236688
V201           0.234520
V200           0.227926
V86            0.224530
V87            0.224450
V189           0.220374
V44            0.218669
V188           0.217058
V258           0.203975
V52            0.201111
V51            0.187440
V228           0.184556
V170           0.178601
V40            0.178413
V79            0.173097
V39            0.170565
V94            0.167984
V38            0.167128
V43            0.166514
V33            0.165534
V199           0.164959
V17            0.164800
V18            0.164689
V74            0.164684
V34            0.162660
V81            0.162608
                 ...   
M8_F           0.043108
V75            0.046516
D5             0.046812
V12            0.047279
M6_F           0.048760
D2             0.051839
D4             0.056450
V36            0.058682
D7             0.063238
M7_F           0.063570
V35            0

So in this notebook we're going to play around with the random forest classifier. This will be my first time using it, so hopefully things go well. I will also work on a random subset of data (<20%) to speed up computation time. 

In [6]:
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.metrics import roc_auc_score
from sklearn import model_selection as ms
from sklearn.metrics import make_scorer

Initial evaluation with all default settings

In [9]:
data = pd.concat([fraud,x_trans],axis=1)
print(data)

        isFraud  ProductCD_C  ProductCD_H  ProductCD_R  ProductCD_S  \
0             0            0            0            0            0   
1             0            0            0            0            0   
2             0            0            0            0            0   
3             0            0            0            0            0   
4             0            0            1            0            0   
5             0            0            0            0            0   
6             0            0            0            0            0   
7             0            0            0            0            0   
8             0            0            1            0            0   
9             0            0            0            0            0   
10            0            1            0            0            0   
11            0            1            0            0            0   
12            0            0            0            0            0   
13    

In [11]:
print(data.iloc[:,0])

        ProductCD_C  ProductCD_H  ProductCD_R  ProductCD_S  ProductCD_W  \
0                 0            0            0            0            1   
1                 0            0            0            0            1   
2                 0            0            0            0            1   
3                 0            0            0            0            1   
4                 0            1            0            0            0   
5                 0            0            0            0            1   
6                 0            0            0            0            1   
7                 0            0            0            0            1   
8                 0            1            0            0            0   
9                 0            0            0            0            1   
10                1            0            0            0            0   
11                1            0            0            0            0   
12                0      

In [14]:
data_sub = data.sample(frac=0.1)

fraud_RFC = RFC()
fraud_RFC_fit = fraud_RFC.fit(X=data_sub.iloc[:,1:],y=data_sub.iloc[:,0])

pred_prob = fraud_RFC_fit.predict_proba(X=data_sub.iloc[:,1:])
sample_score = roc_auc_score(data_sub.iloc[:,0],pred_prob[:,1])
print(sample_score)



0.9999221112338066


Obviously we're overfitting here. But it does seem to point us in the right direction, before we were underfitting, now we're overfitting. Imo, it's easier to regularise and simplify a flexible model than it is to upgrade a simple one. 

Let's see what the CV score is to get a feel for the out of sample roc/auc score.

In [21]:
data_sub = data.sample(frac=1)
cv_preds = ms.cross_val_predict(fraud_RFC, X=data_sub.iloc[:,1:], y=data_sub.iloc[:,0], cv=3,method='predict_proba')
cv_score = roc_auc_score(data_sub.iloc[:,0],cv_preds[:,1])
print(cv_score)

0.8723673461043703


Let's start regularising now, starting with the number of estimators


In [26]:
n_estimators = [15,30,50,100,200,500,1000,2000,5000]
data_sub = data.sample(frac=0.01)

cv_scores = []
for n in n_estimators:
    fraud_RFC = RFC(n_estimators=n)
    cv_preds = ms.cross_val_predict(fraud_RFC, X=data_sub.iloc[:,1:], y=data_sub.iloc[:,0], cv=3,method='predict_proba')
    cv_score = roc_auc_score(data_sub.iloc[:,0],cv_preds[:,1])
    cv_scores.append(cv_score)

print(cv_scores)

[0.7855546727975009, 0.7942716874548228, 0.8051488499909247, 0.8145309848077161, 0.8200517607573533, 0.8254809881512286, 0.8221000601832245, 0.8229247932594789, 0.8238398807798982]


Benefit of additional estimators seems to tapier off after around 500. We'll start tuning the other parameters as well