## Imports

In [2]:
import pandas as pd
import numpy as np
from datetime import date
import datetime as dt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import QuantileTransformer
from xgboost import XGBClassifier
from sklearn import metrics
from sklearn.metrics import precision_recall_curve, confusion_matrix, accuracy_score
from imblearn import over_sampling

In [3]:
fraud = pd.read_csv('data/fraudTrain.csv')
fraud_test = pd.read_csv('data/fraudTest.csv')

## Data Preprocessing

In [4]:
print('% of data in the train set:', len(fraud)*100/(len(fraud)+len(fraud_test)))

% of data in the train set: 69.99995681264353


In [5]:
# No null values in train
fraud.isna().sum()

Unnamed: 0               0
trans_date_trans_time    0
cc_num                   0
merchant                 0
category                 0
amt                      0
first                    0
last                     0
gender                   0
street                   0
city                     0
state                    0
zip                      0
lat                      0
long                     0
city_pop                 0
job                      0
dob                      0
trans_num                0
unix_time                0
merch_lat                0
merch_long               0
is_fraud                 0
dtype: int64

In [6]:
# No null values in test
fraud_test.isna().sum()

Unnamed: 0               0
trans_date_trans_time    0
cc_num                   0
merchant                 0
category                 0
amt                      0
first                    0
last                     0
gender                   0
street                   0
city                     0
state                    0
zip                      0
lat                      0
long                     0
city_pop                 0
job                      0
dob                      0
trans_num                0
unix_time                0
merch_lat                0
merch_long               0
is_fraud                 0
dtype: int64

In [7]:
# High class imbalance in both train and test set 

print('Train set class balance:')
print(fraud.is_fraud.value_counts(normalize=True))
print('-------------------------')
print('Test set class balance:')
print(fraud_test.is_fraud.value_counts(normalize=True))

Train set class balance:
0    0.994211
1    0.005789
Name: is_fraud, dtype: float64
-------------------------
Test set class balance:
0    0.99614
1    0.00386
Name: is_fraud, dtype: float64


In [8]:
# Drop columns with little significance to determining fraud 
fraud.drop(['cc_num', 'first', 'last', 'street', 'trans_num'], axis=1, inplace=True)
fraud.drop(fraud.iloc[:,[0]], axis=1, inplace=True)
fraud_test.drop(['cc_num', 'first', 'last', 'street', 'trans_num'], axis=1, inplace=True)
fraud_test.drop(fraud_test.iloc[:,[0]], axis=1, inplace=True)

In [9]:
# Converting date of birth (dob) to age
fraud['dob'] = pd.to_datetime(fraud['dob'])
fraud['age'] = (pd.to_datetime('now') - fraud['dob'])/ np.timedelta64(1, 'Y')
fraud['age'] = fraud['age'].astype(int)
fraud.drop(['dob'], axis=1, inplace=True)

fraud_test['dob'] = pd.to_datetime(fraud_test['dob'])
fraud_test['age'] = (pd.to_datetime('now') - fraud_test['dob'])/ np.timedelta64(1, 'Y')
fraud_test['age'] = fraud_test['age'].astype(int)
fraud_test.drop(['dob'], axis=1, inplace=True)

In [10]:
# Splitting trans_date_trans_time column into trans_date and trans_time
fraud['trans_date'] = pd.DatetimeIndex(fraud['trans_date_trans_time']).date
fraud['trans_time'] = pd.DatetimeIndex(fraud['trans_date_trans_time']).time
fraud.drop(['trans_date_trans_time'], axis=1, inplace=True)

fraud_test['trans_date'] = pd.DatetimeIndex(fraud_test['trans_date_trans_time']).date
fraud_test['trans_time'] = pd.DatetimeIndex(fraud_test['trans_date_trans_time']).time
fraud_test.drop(['trans_date_trans_time'], axis=1, inplace=True)

In [11]:
# Transform "merchant" into numeric variable
label_encoder = LabelEncoder()
fraud.merchant = label_encoder.fit_transform(fraud.merchant)
fraud_test.merchant = label_encoder.fit_transform(fraud_test.merchant)

In [12]:
# Transform "city" into numeric variable
fraud.city = label_encoder.fit_transform(fraud.city)
fraud_test.city = label_encoder.fit_transform(fraud_test.city)

In [13]:
# Transform "category" into numeric variable
fraud.category = label_encoder.fit_transform(fraud.category)
fraud_test.category = label_encoder.fit_transform(fraud_test.category)

In [14]:
# Transform "gender" into numeric variable
fraud.gender = fraud.gender.map({'M': 1, "F": 0})
fraud_test.gender = fraud_test.gender.map({'M': 1, "F": 0})

In [15]:
# Transform "state" into numeric variable
fraud.state = label_encoder.fit_transform(fraud.state)
fraud_test.state = label_encoder.fit_transform(fraud_test.state)

In [16]:
# Transform "job" into numeric variable
fraud.job = label_encoder.fit_transform(fraud.job)
fraud_test.job = label_encoder.fit_transform(fraud_test.job)

In [17]:
# Convert trans_time into seconds
fraud['trans_date'] =  pd.to_datetime(fraud['trans_date'])
fraud.trans_date = fraud.trans_date.map(dt.datetime.toordinal)
fraud.trans_time = pd.to_datetime(fraud.trans_time,format='%H:%M:%S')
fraud.trans_time = 3600 * pd.DatetimeIndex(fraud.trans_time).hour + 60 * pd.DatetimeIndex(fraud.trans_time).minute + pd.DatetimeIndex(fraud.trans_time).second

fraud_test['trans_date'] =  pd.to_datetime(fraud_test['trans_date'])
fraud_test.trans_date = fraud_test.trans_date.map(dt.datetime.toordinal)
fraud_test.trans_time = pd.to_datetime(fraud_test.trans_time,format='%H:%M:%S')
fraud_test.trans_time = 3600 * pd.DatetimeIndex(fraud_test.trans_time).hour + 60 * pd.DatetimeIndex(fraud_test.trans_time).minute + pd.DatetimeIndex(fraud_test.trans_time).second

In [18]:
# Seperate target from variables
X_train = fraud.drop('is_fraud', axis=1)
y_train = fraud['is_fraud']

X_test = fraud_test.drop('is_fraud', axis=1)
y_test = fraud_test['is_fraud']

In [19]:
# Variables to be scaled
vars_to_scale = ['merchant', 'category', 'amt', 'city', 'state', 'zip', 'lat', 'long', 'city_pop', 'job', 'unix_time', 'merch_lat', 'merch_long', 'age', 'trans_date', 'trans_time']

# Scale the variables
scaler = QuantileTransformer(output_distribution='normal')

X_train[vars_to_scale] = scaler.fit_transform(X_train[vars_to_scale])
X_test[vars_to_scale] = scaler.transform(X_test[vars_to_scale])

In [20]:
# Address imbalance using over sampling
ro = over_sampling.RandomOverSampler(random_state=100)
X_train_ro, y_train_ro = ro.fit_resample(X_train, y_train)
print (X_train_ro.shape)
print (y_train_ro.shape)
print (y_train_ro.value_counts())

(2578338, 17)
(2578338,)
0    1289169
1    1289169
Name: is_fraud, dtype: int64


## Model (XGBoost)

In [21]:
# Fit the model
xgb = XGBClassifier(learning_rate=0.5, max_depth=10, n_estimators=15, max_features = 14)
xgb.fit(X_train_ro, y_train_ro)

XGBClassifier(learning_rate=0.5, max_depth=10, max_features=14, n_estimators=15)

In [22]:
# Function to return various standard metrics for a model
def model_metrics(a, p):
    confusion = confusion_matrix(a, p)
    TP = confusion[1,1] # true positive 
    TN = confusion[0,0] # true negatives
    FP = confusion[0,1] # false positives
    FN = confusion[1,0] # false negatives
    print ('Accuracy    : ', metrics.accuracy_score(a, p ))
    print ('Sensitivity : ', TP / float(TP+FN))
    print ('Specificity : ', TN / float(TN+FP))
    print ('Pricision   : ', TP / float(TP + FP))
    print ('Recall      : ', TP / float(TP + FN))
    print(confusion)

    return None

In [23]:
# Performance of model on original train (not oversampled) data
y_train_pred = xgb.predict(X_train)
print ('AUC         : ', metrics.roc_auc_score(y_train, y_train_pred))
model_metrics(y_train, y_train_pred)

AUC         :  0.997641503945565
Accuracy    :  0.9953103129157268
Sensitivity :  1.0
Specificity :  0.9952830078911299
Pricision   :  0.552439832192537
Recall      :  1.0
[[1283088    6081]
 [      0    7506]]


In [24]:
# Performance of model on test data
y_test_pred = xgb.predict(X_test)
print ('AUC         : ', metrics.roc_auc_score(y_test, y_test_pred))
model_metrics(y_test, y_test_pred)

AUC         :  0.9427826087571668
Accuracy    :  0.9956434816876875
Sensitivity :  0.8895104895104895
Specificity :  0.9960547280038441
Pricision   :  0.4662756598240469
Recall      :  0.8895104895104895
[[551390   2184]
 [   237   1908]]


In [25]:
# Feature importances
imp_df = pd.DataFrame({
    "Varname": X_train.columns,
    "Imp": xgb.feature_importances_})

imp_df.sort_values(by="Imp", ascending=False)

Unnamed: 0,Varname,Imp
2,amt,0.54616
16,trans_time,0.158121
1,category,0.111523
14,age,0.041383
9,city_pop,0.018153
11,unix_time,0.013385
6,zip,0.013239
8,long,0.012753
3,gender,0.012549
12,merch_lat,0.012103
