In [1]:
import pandas as pd
import numpy as np
train_identity = pd.read_csv('Data/train_identity.csv')
train_transaction = pd.read_csv('Data/train_transaction.csv')

In [2]:
#Storing variable names
identity_vars = list(train_identity)
transaction_vars = list(train_transaction)

#Storing id, fraud and separating from explanatory variables
trans_id = train_transaction['TransactionID']
fraud = train_transaction['isFraud']
x_trans = train_transaction.drop(['TransactionID','isFraud'],axis=1)

In [3]:
#Getting dummies from strings
strings = train_transaction.select_dtypes(include='object')
numerics = train_transaction.select_dtypes(exclude='object')
dummies = pd.get_dummies(strings)
x_trans = pd.concat([dummies, numerics],axis=1)
x_trans = x_trans.drop(['TransactionID','isFraud'],axis=1)

In [4]:
#Getting a count of NaNs
transaction_na_count = x_trans.isnull().sum()
transaction_na_prop = x_trans.isnull().sum()/x_trans.shape[0]*100
transaction_na = pd.concat([transaction_na_count, transaction_na_prop], axis=1)
transaction_na.columns = ['Count','Percentage']

In [5]:
#Filling nas with mean and calculating correlations with dependent variable
x_trans = x_trans.fillna(x_trans.mean())
corrs = x_trans.corrwith(fraud)
corrs = abs(corrs.sort_values(ascending=False))
print(corrs)

V257           0.262946
V246           0.251838
V244           0.249951
V242           0.247522
V45            0.236688
V201           0.234520
V200           0.227926
V86            0.224530
V87            0.224450
V189           0.220374
V44            0.218669
V188           0.217058
V258           0.203975
V52            0.201111
V51            0.187440
V228           0.184556
V170           0.178601
V40            0.178413
V79            0.173097
V39            0.170565
V94            0.167984
V38            0.167128
V43            0.166514
V33            0.165534
V199           0.164959
V17            0.164800
V18            0.164689
V74            0.164684
V34            0.162660
V81            0.162608
                 ...   
M8_F           0.043108
V75            0.046516
D5             0.046812
V12            0.047279
M6_F           0.048760
D2             0.051839
D4             0.056450
V36            0.058682
D7             0.063238
M7_F           0.063570
V35            0

So in this notebook we're going to play around with the random forest classifier. This will be my first time using it, so hopefully things go well. I will also work on a random subset of data (<20%) to speed up computation time. 

In [6]:
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.metrics import roc_auc_score
from sklearn import model_selection as ms
from sklearn.metrics import make_scorer

Initial evaluation with all default settings

In [7]:
data = pd.concat([fraud,x_trans],axis=1)
print(data)

        isFraud  ProductCD_C  ProductCD_H  ProductCD_R  ProductCD_S  \
0             0            0            0            0            0   
1             0            0            0            0            0   
2             0            0            0            0            0   
3             0            0            0            0            0   
4             0            0            1            0            0   
5             0            0            0            0            0   
6             0            0            0            0            0   
7             0            0            0            0            0   
8             0            0            1            0            0   
9             0            0            0            0            0   
10            0            1            0            0            0   
11            0            1            0            0            0   
12            0            0            0            0            0   
13    

In [8]:
print(data.iloc[:,0])

0         0
1         0
2         0
3         0
4         0
5         0
6         0
7         0
8         0
9         0
10        0
11        0
12        0
13        0
14        0
15        0
16        0
17        0
18        0
19        0
20        0
21        0
22        0
23        0
24        0
25        0
26        0
27        0
28        0
29        0
         ..
590510    0
590511    0
590512    0
590513    0
590514    0
590515    0
590516    0
590517    0
590518    0
590519    0
590520    0
590521    0
590522    0
590523    0
590524    0
590525    0
590526    1
590527    0
590528    0
590529    0
590530    0
590531    0
590532    0
590533    0
590534    0
590535    0
590536    0
590537    0
590538    0
590539    0
Name: isFraud, Length: 590540, dtype: int64


In [9]:
data_sub = data.sample(frac=0.1)

fraud_RFC = RFC()
fraud_RFC_fit = fraud_RFC.fit(X=data_sub.iloc[:,1:],y=data_sub.iloc[:,0])

pred_prob = fraud_RFC_fit.predict_proba(X=data_sub.iloc[:,1:])
sample_score = roc_auc_score(data_sub.iloc[:,0],pred_prob[:,1])
print(sample_score)



0.9999188200056541


Obviously we're overfitting here. But it does seem to point us in the right direction, before we were underfitting, now we're overfitting. Imo, it's easier to regularise and simplify a flexible model than it is to upgrade a simple one. 

Let's see what the CV score is to get a feel for the out of sample roc/auc score.

In [21]:
data_sub = data.sample(frac=1)
cv_preds = ms.cross_val_predict(fraud_RFC, X=data_sub.iloc[:,1:], y=data_sub.iloc[:,0], cv=3,method='predict_proba')
cv_score = roc_auc_score(data_sub.iloc[:,0],cv_preds[:,1])
print(cv_score)

0.8723673461043703


Let's start regularising now, starting with the number of estimators


In [26]:
n_estimators = [15,30,50,100,200,500,1000,2000,5000]
data_sub = data.sample(frac=0.01)

cv_scores = []
for n in n_estimators:
    fraud_RFC = RFC(n_estimators=n)
    cv_preds = ms.cross_val_predict(fraud_RFC, X=data_sub.iloc[:,1:], y=data_sub.iloc[:,0], cv=3,method='predict_proba')
    cv_score = roc_auc_score(data_sub.iloc[:,0],cv_preds[:,1])
    cv_scores.append(cv_score)

print(cv_scores)

[0.7855546727975009, 0.7942716874548228, 0.8051488499909247, 0.8145309848077161, 0.8200517607573533, 0.8254809881512286, 0.8221000601832245, 0.8229247932594789, 0.8238398807798982]


Benefit of additional estimators seems to taper off after around 500. We'll start tuning the other parameters as well, but I will assume that they can be tuned in parallel (at least temporarily) to reduce computational burden.

In [31]:
max_depth = np.arange(15,30)
data_sub = data.sample(frac=0.1)

cv_scores = []
for max_d in max_depth:
    fraud_RFC = RFC(n_estimators=200, max_depth=max_d)
    cv_preds = ms.cross_val_predict(fraud_RFC, X=data_sub.iloc[:,1:], y=data_sub.iloc[:,0], cv=3,method='predict_proba')
    cv_score = roc_auc_score(data_sub.iloc[:,0],cv_preds[:,1])
    cv_scores.append(cv_score)

print(cv_scores)

[0.8709675864714834, 0.8725975663472798, 0.8748197621951953, 0.8758425734285644, 0.8770501349457891, 0.8779217146894642, 0.8770893493756097, 0.877948696299363, 0.877764457885277, 0.880177229789089, 0.8809409438234813, 0.8784754493333821, 0.8784669427518689, 0.8804884986636685, 0.8777492319285656]


Seems that about 20-25 is the optimal max depth for each of the trees in the RF model. Let's make predictions on the test set and make an initial submission to Kaggle. Then we'll consider tuning the other hyperparameters with a bit more precision.

In [51]:
test_transaction = pd.read_csv('Data/test_transaction.csv')

In [52]:
test_transaction = test_transaction.drop(['TransactionID'],axis=1)
strings = test_transaction.select_dtypes(include='object')
numerics = test_transaction.select_dtypes(exclude='object')
dummies = pd.get_dummies(strings)
test_x_trans = pd.concat([dummies, numerics],axis=1)
test_x_trans = test_x_trans.drop(['P_emaildomain_scranton.edu'],axis=1) 

In [59]:
debit_credit = pd.Series(0, index=range(506691))
test_x_trans['card6_debit or credit'] = debit_credit
test_x_trans = test_x_trans[x_trans.columns]

In [65]:
test_x_trans = test_x_trans.fillna(test_x_trans.mean())

In [13]:
data_sub = data.sample(frac=1)
fraud_RFC = RFC(n_estimators=500, max_depth=25)
fraud_RFC_fit = fraud_RFC.fit(X=x_trans,y=fraud)


ValueError: Number of features of the model must match the input. Model n_features is 529 and input n_features is 530 

In [60]:
test_x_trans_cols = list(test_x_trans.columns)
x_trans_cols = list(x_trans.columns)

x_cols = list(set(test_x_trans_cols) & set(x_trans_cols))
print(set(test_x_trans_cols) - set(x_trans_cols))
print(set(x_trans_cols)-set(test_x_trans_cols))

set()
set()


In [66]:
preds = fraud_RFC_fit.predict_proba(X=test_x_trans)

In [68]:
#Creating submission
submission = pd.read_csv('Data/sample_submission.csv')
submission['isFraud'] = preds[:,1]
submission.to_csv('Data/submission.csv',index=False)

Score on Kaggle was: 0.8949, bit lower than what I was expecting, but not too bad. However, in actually calculating the test predictions, I realised that the approach I used to generate dummies for the training dataset gives slightly different results for the test data set because some categories for the categorical variables do not show up in the test and vice versa. I'll need to do some manual manipulations to deal with that. 

I also think that perhaps the CV approach may not be the best idea in this context because of time series dependencies. I will try using a time series split 

We also want to consider other hyper parameters as well. We could continue to do so in parallel, but this might not give us the best combination as each dimension may be interrelated. Doing a grid search would be madness, we could try a random search, but I think that using Bayesian optimisation might be the way to go.

# Back again after a long time

So I'm coming back to this workbook after a while to investigate the features I created and pruned in Phase II. I trained them an xgboost model, so it's hard to tell whether the improved performance was due to the new features, or switching to xgboost. So I'm going to retrain a random forests ensemble with the processed data, calculate a validation score and submit predictions to the leaderboard.

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.metrics import roc_auc_score

In [2]:
train = pd.read_csv('Data/train_preprocessed_1.csv')
test = pd.read_csv('Data/test_preprocessed_1.csv')

In [3]:
y = train['isFraud']
train.drop('isFraud',axis=1,inplace=True)

X_train, X_val, y_train, y_val = train_test_split(train, y, shuffle=False)

In [4]:
fraud_RFC = RFC(n_estimators=500, max_depth=25)
fraud_RFC.fit(X=X_train,y=y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=25, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [5]:
val_preds = fraud_RFC.predict_proba(X=X_val)
val_auc = roc_auc_score(y_val,val_preds[:,1])

In [7]:
val_auc

0.8825471487223295

In [10]:
test_preds = fraud_RFC.predict_proba(test)
submission = pd.read_csv('Data/sample_submission.csv')
submission['isFraud'] = test_preds[:,1]
submission.to_csv('Data/submission.csv',index=False)

This gives us a public AUC of 0.897861, a slight improvement by 0.03. While this doesn't appear very large, this is pretty good news because despite cutting about two hundred variables, our new features actually give us a net increase. 