In [1]:
import pandas as pd
import numpy as np
# Only the transaction tables are loaded; the identity tables stay disabled:
#   train_identity = pd.read_csv('Data/train_identity.csv')
#   test_identity = pd.read_csv('Data/test_identity.csv')
train_transaction, test_transaction = (
    pd.read_csv(csv_path)
    for csv_path in ('Data/train_transaction.csv', 'Data/test_transaction.csv')
)

In [2]:
# Remember how many rows each split has so the combined frame can be cut apart again later.
train_trans_rows, test_trans_rows = len(train_transaction), len(test_transaction)

One mistake I made in the previous notebook was not combining the train and test matrices and applying the operations to both simultaneously. It led to some dummies appearing in the training set but not in the test set, and vice versa. We'll try to correct that here. We'll also create a separate dummy variable for categorical variables with NaNs.

In [3]:
# Keep the target aside, then strip the label/ID columns from both splits.
fraud = train_transaction['isFraud']
train_x_trans = train_transaction.drop(columns=['isFraud', 'TransactionID'])
test_x_trans = test_transaction.drop(columns='TransactionID')

# Stack train on top of test so every later transformation sees both splits at once.
X_trans = pd.concat([train_x_trans, test_x_trans])
del train_transaction, test_transaction

print(train_x_trans.shape, test_x_trans.shape, sep='\n')

(590540, 392)
(506691, 392)


We'll extract the numerics and strings and convert the strings to dummies. We'll convert the NaNs to dummies as well.  I don't think multicollinearity will be an issue with random forests. Any remaining NaNs, we'll fill with the mean.

In [4]:
# Separate object-typed (string) columns from everything else.
text_part = X_trans.select_dtypes(include='object')
numeric_part = X_trans.select_dtypes(exclude='object')

# One-hot encode the strings; dummy_na=True adds an explicit NaN indicator per column.
one_hot_part = pd.get_dummies(text_part, dummy_na=True)

# Reassemble (numeric columns first) and fill any remaining NaNs with the column mean.
X_trans = pd.concat([numeric_part, one_hot_part], axis=1, sort=False)
column_means = X_trans.mean()
X_trans = X_trans.fillna(column_means)

del text_part, numeric_part, one_hot_part, column_means

Now we'll separate back into training and test sets and start optimising our random forest by tuning the hyperparameters.  

In [5]:
# Undo the earlier concatenation: the first train_trans_rows rows are the training
# split, everything after them is the test split.
train_x_trans = X_trans.iloc[:train_trans_rows,:]
test_x_trans = X_trans.iloc[train_trans_rows:,:]
del X_trans

# Each split must have exactly as many rows as the file it was loaded from;
# fail loudly here rather than train on misaligned data.
assert train_x_trans.shape[0] == train_trans_rows, "training split lost or gained rows"
assert test_x_trans.shape[0] == test_trans_rows, "test split lost or gained rows"
print(train_x_trans.shape)
print(test_x_trans.shape)

# Training frame used for tuning: the isFraud label is column 0, features follow.
data = pd.concat([fraud, train_x_trans], axis=1)
del train_x_trans

(590540, 544)
(506691, 544)


In [35]:
from bayes_opt import BayesianOptimization
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.metrics import roc_auc_score
from sklearn import model_selection as ms
import noisyopt as no 


In [52]:
#Creating our black box Random Forests function
#def RF_score(x):
    
    ##Naming hyperparemter inputs
    #n_estimators = x[0]
    #max_depth = x[1]
    #min_samples_split = x[2]
    #min_samples_leaf = x[3]
    #max_features = x[4]

def RF_score(n_estimators,max_depth,min_samples_split,min_samples_leaf,max_features):
    """Black-box objective: mean ROC AUC of a random forest under time-series CV.

    Reads the module-level ``data`` frame (label in column 0, features in the
    remaining columns), draws a fresh random ~1% subsample of its rows, and
    evaluates a random forest built with the given hyperparameters on a
    3-split ``TimeSeriesSplit``. ``data`` itself is never modified.

    Parameters
    ----------
    n_estimators, max_depth, min_samples_split, min_samples_leaf, max_features
        Hyperparameter values; each is truncated to ``int`` before being
        passed to the classifier, so continuous proposals are accepted.

    Returns
    -------
    float
        Mean ROC AUC over the three validation folds. The value is noisy
        because a different random subsample is drawn on every call.
    """
    # The optimiser proposes continuous values, but these hyperparameters must be
    # integers (e.g. the number of decision trees can't be fractional).
    n_estimators = int(n_estimators)
    max_depth = int(max_depth)
    min_samples_split = int(min_samples_split)
    min_samples_leaf = int(min_samples_leaf)
    max_features = int(max_features)

    # Setting up Random Forest with input parameters
    fraud_RFC = RFC(
                    n_estimators=n_estimators,
                    max_depth=max_depth,
                    min_samples_split=min_samples_split,
                    min_samples_leaf=min_samples_leaf,
                    max_features=max_features)

    # Subsample ~1% of the rows with a boolean mask. Unlike writing a helper
    # column into the frame, this leaves the shared `data` untouched and keeps
    # the random draw out of the feature matrix. Row order is preserved, which
    # the time-series split below relies on.
    keep = np.random.random_sample(data.shape[0]) <= 0.01
    data_sub = data[keep]
    X = data_sub.iloc[:,1:]
    y = data_sub.iloc[:,0]

    # Evaluating configuration using time series split
    tscv = ms.TimeSeriesSplit(n_splits=3)
    score = []

    for train_index, test_index in tscv.split(X):
        X_train, X_test = X.iloc[train_index,:], X.iloc[test_index,:]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        fraud_RFC_fit = fraud_RFC.fit(X=X_train, y=y_train)
        pred_probs = fraud_RFC_fit.predict_proba(X=X_test)
        # Column 1 holds the predicted probability of the positive (fraud) class.
        score.append(roc_auc_score(y_test, pred_probs[:,1]))

    return(np.mean(score))
    
    

# Search space: (lower, upper) limits for every hyperparameter to be tuned
bounds_RF = dict(
    n_estimators=(10, 3000),
    max_depth=(1, 100),
    min_samples_split=(2, 200),
    min_samples_leaf=(1, 200),
    max_features=(1, 544),
)

# The same limits as a plain list of [lower, upper] pairs, in the order above
bounds = [list(limits) for limits in bounds_RF.values()]


RF_BO = BayesianOptimization(f=RF_score, pbounds=bounds_RF)

In [None]:
# Configure the Gaussian-process surrogate explicitly instead of passing GP
# keyword arguments through maximize(); alpha is forwarded to the underlying
# regressor. Then run 100 optimisation iterations.
RF_BO.set_gp_params(alpha=1e-3)
RF_BO.maximize(n_iter=100)

|   iter    |  target   | max_depth | max_fe... | min_sa... | min_sa... | n_esti... |
-------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.8575  [0m | [0m 92.26   [0m | [0m 87.93   [0m | [0m 53.84   [0m | [0m 35.29   [0m | [0m 2.44e+03[0m |
| [0m 2       [0m | [0m 0.8362  [0m | [0m 27.3    [0m | [0m 497.8   [0m | [0m 41.61   [0m | [0m 93.92   [0m | [0m 1.677e+0[0m |


Bayesian optimisation seems to do a reasonable job of finding good hyperparameter combinations. It's clear that having a large number of estimators helps the model to generalise well, but the relationship with the other hyperparameters is not so clear.