# IEEE Card Fraud Detection

## Contents
1. Load Data
2. Exploratory Data Analysis
3. Building the base model
4. Building the tuned model

## 1. Load Data

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import math

In [2]:
train_id = pd.read_csv("data/train_identity.csv")
train_txn = pd.read_csv("data/train_transaction.csv")

print(f"Shape of train identity: {train_id.shape}")
print(f"Shape of train transaction: {train_txn.shape}")

Shape of train identity: (144233, 41)
Shape of train transaction: (590540, 394)


In [3]:
test_id = pd.read_csv("data/test_identity.csv")
test_txn = pd.read_csv("data/test_transaction.csv")

In [4]:
# primary key is TransactionID
train_df = pd.merge(train_txn,train_id, how = 'left', on = 'TransactionID')
test_df = pd.merge(test_txn,test_id, how = 'left', on = 'TransactionID')

In [5]:
train_df.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,,,,,,,,,,
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,...,,,,,,,,,,
4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,...,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M


## Build the Model

In [6]:
# Importing the Keras libraries and packages
from sklearn.metrics import auc
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, KFold

#### Split data

In [7]:
train_df_clean = train_df.copy(deep=True)
X_train = train_df_clean.drop("isFraud", axis=1)
y_train = train_df_clean["isFraud"]

In [8]:
X_train.shape

(590540, 433)

In [9]:
y_train.shape

(590540,)

#### Remove irrelevant columns

In [10]:
col_to_drop = ['TransactionID']
for col in col_to_drop:
    X_train.drop(col, axis=1, inplace=True)

#### Handle categorical variables
We will onehotencode all categorical data that are strings (e.g. product type), and label encode the rest. `cat_cols` comes from the data description: https://www.kaggle.com/c/ieee-fraud-detection/discussion/101203#latest-614363

In [11]:
cat_cols = ['ProductCD',
            'card1', 'card2','card3','card4','card5','card6', 
            'addr1', 'addr2', 
            'Pemaildomain', 'Remaildomain', 
            'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9']

In [12]:
cat_cols_to_encode = [col for col in cat_cols if col in list(X_train)]

In [13]:
onehot_cols = [col for col in cat_cols_to_encode if X_train[col].dtype==object]
# Check that the cols for one hot encoding dont have too many unique values (to avoid sparse matrix)
for col in onehot_cols:
    num_unique = len(X_train[col].unique())
    print(f"{col} has {num_unique} unique values")
    
label_cols = [col for col in cat_cols_to_encode if col not in onehot_cols]

ProductCD has 5 unique values
card4 has 5 unique values
card6 has 5 unique values
M1 has 3 unique values
M2 has 3 unique values
M3 has 3 unique values
M4 has 4 unique values
M5 has 3 unique values
M6 has 3 unique values
M7 has 3 unique values
M8 has 3 unique values
M9 has 3 unique values


#### Label encode

In [14]:
def encode_features(data):
    for f in data.columns:
        if data[f].dtype=='object': 
            lbl = LabelEncoder()
            lbl.fit(list(data[f].values))
            data[f] = lbl.transform(list(data[f].values))
    
    print(f"Shape after encoding categorical variables: {data.shape}]")
    
    return data

In [15]:
X_train = encode_features(X_train)

Shape after encoding categorical variables: (590540, 432)]


### Train

In [16]:
import xgboost as xgb

In [17]:
# Check lenggth of x and y
if X_train.shape[0] == y_train.shape[0]:
    print("Number of rows MATCH between x and y")
else:
    print("Number of rows DO NOT MATCH between x and y")

Number of rows MATCH between x and y


In [20]:
model = xgb.XGBClassifier(
    n_estimators=300,
#     max_depth=10,
#     min_child_weight=3,
#     max_delta_step=1, # for imbalanced class
#     learning_rate=0.1
)

model.fit(X_train, y_train)

MemoryError: 

In [None]:
# Save model
import pickle
pickle.dump(model, open("ieee_fraud_model.pickle", "wb"))

### Validate

In [None]:
from sklearn.metrics import roc_curve
y_pred = model.predict_proba(X_test)
fpr_keras, tpr_keras, thresholds_keras = roc_curve(y_test, y_pred)

### Test

In [None]:
X_test = encode_features(test_df)

In [None]:
X_test

In [None]:
txn_id_list = X_test['TransactionID']
col_to_drop = ['TransactionID']
for col in col_to_drop:
    X_test.drop(col, axis=1, inplace=True)
    
X_test = X_test[list(X_train)]

In [None]:
y_pred = model.predict_proba(X_test)[:,1] 

In [None]:
my_submission = pd.DataFrame({"TransactionID": txn_id_list,
                              "isFraud": y_pred})

In [None]:
my_submission

In [None]:
my_submission.to_csv("submission.csv", index=False)

## Feature Engineering
Potential Features
* Time of day compared to normal
* Time between transactions
* If the same transaction was just made/attempted