<a href="https://colab.research.google.com/github/jpandersen61/Machine-Learning/blob/main/ieee_xgboost_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# XGBoost for IEEE-CIS Fraud Detection


In [2]:
import pandas as pd
import numpy as np
import datetime
import os
import joblib
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score, roc_curve
from pandas.api.types import is_numeric_dtype

In [2]:
##import os
##dataset_path = r'C:\Users\dalia\OneDrive\Desktop\PhD\Teaching (Zealand)\Tuesday\ieee-fraud-detection'
##for dirname, _, filenames in os.walk(dataset_path):
##    for filename in filenames:
##        print(os.path.join(dirname, filename))
##train_transaction = pd.read_csv(dataset_path+"/train_transaction.csv")
##train_identity = pd.read_csv(dataset_path+"/train_identity.csv")

In [4]:
import pandas as pd
from pathlib import Path

from google.colab import drive

In [5]:
##Loading from Google drive
from google.colab import drive
drive.mount('/content/drive')
train_transaction = pd.read_csv("drive/My Drive/Colab Notebooks/train_transaction.csv")
train_identity = pd.read_csv("drive/My Drive/Colab Notebooks/train_identity.csv")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:

df = train_transaction.merge(train_identity, on="TransactionID", how="left")
print("Train Shape:",df.shape)


Train Shape: (590540, 434)


In [7]:
#column names lower case
df.columns = df.columns.str.lower()
df.columns = df.columns.str.replace("-", "_")
df = df.sort_values(by="transactiondt")

In [8]:
target_column = 'isfraud'

features = ['transactionamt', 'productcd', 'card1', 'card2', 'card3', 'card5', 'card6', 'addr1', 'dist1', 'p_emaildomain', 'r_emaildomain',
              'c1', 'c2', 'c4', 'c5', 'c6', 'c7', 'c8', 'c9', 'c10', 'c11', 'c12', 'c13', 'c14',
              'v62', 'v70', 'v76', 'v78', 'v82', 'v91', 'v127', 'v130', 'v139', 'v160', 'v165', 'v187', 'v203',
              'v207', 'v209', 'v210', 'v221', 'v234', 'v257', 'v258', 'v261', 'v264', 'v266', 'v267', 'v271', 'v274',
              'v277', 'v283', 'v285', 'v289', 'v291', 'v294', 'id_01', 'id_02', 'id_05', 'id_06', 'id_09', 'id_13', 'id_17', 'id_19', 'id_20', 'devicetype', 'deviceinfo']

In [9]:
df.head(5)

Unnamed: 0,transactionid,isfraud,transactiondt,transactionamt,productcd,card1,card2,card3,card4,card5,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,devicetype,deviceinfo
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,,,,,,,,,,
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,...,,,,,,,,,,
4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,...,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M


In [10]:
# split time based on transactiondt. First 80% of the data is used for training and the rest for testing
train = df[df['transactiondt'] < df['transactiondt'].quantile(0.8)][features + [target_column]]
test = df[df['transactiondt'] >= df['transactiondt'].quantile(0.8)][features + [target_column]]

print("Train Shape:",train.shape)
print("Test Shape:",test.shape)

Train Shape: (472432, 68)
Test Shape: (118108, 68)


In [11]:
# ### Identify categorical and numerical features
numeric_features = [f for f in features if is_numeric_dtype(train[f])]
categorical_features = [f for f in features if f not in numeric_features]

print("Categorical features:", categorical_features)
print("Numerical features:", numeric_features)

# ### Handle missing values and categorical encoding
# Force all categorical_features to be treated as object first
train[categorical_features] = train[categorical_features].astype('object')
test[categorical_features] = test[categorical_features].astype('object')

# Now fill missing values
train[categorical_features] = train[categorical_features].fillna('<nan>')
test[categorical_features] = test[categorical_features].fillna('<nan>')

# Then convert to category
train[categorical_features] = train[categorical_features].astype('category')
test[categorical_features] = test[categorical_features].astype('category')

train[numeric_features] = train[numeric_features].fillna(-999)
test[numeric_features] = test[numeric_features].fillna(-999)

# ### Train/Test split
X_train = train[features]
y_train = train[target_column]
X_test = test[features]
y_test = test[target_column]


from sklearn.preprocessing import OrdinalEncoder

encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
X_train[categorical_features] = encoder.fit_transform(X_train[categorical_features])
X_test[categorical_features] = encoder.transform(X_test[categorical_features])

# ### Fit XGBoost Model
model = XGBClassifier( eval_metric='logloss')
print("Fitting the model...")
model.fit(X_train, y_train)

# ### Predict and evaluate
y_pred_prob = model.predict_proba(X_test)[:, 1]
auc_score = roc_auc_score(y_test, y_pred_prob)
print("AUC on training data:", auc_score)

# ### Get TPR at 1% FPR
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
recall_at_fpr_001 = np.interp(0.01, fpr, tpr)
print("TPR at 1% FPR:", recall_at_fpr_001)



Categorical features: ['productcd', 'card6', 'p_emaildomain', 'r_emaildomain', 'devicetype', 'deviceinfo']
Numerical features: ['transactionamt', 'card1', 'card2', 'card3', 'card5', 'addr1', 'dist1', 'c1', 'c2', 'c4', 'c5', 'c6', 'c7', 'c8', 'c9', 'c10', 'c11', 'c12', 'c13', 'c14', 'v62', 'v70', 'v76', 'v78', 'v82', 'v91', 'v127', 'v130', 'v139', 'v160', 'v165', 'v187', 'v203', 'v207', 'v209', 'v210', 'v221', 'v234', 'v257', 'v258', 'v261', 'v264', 'v266', 'v267', 'v271', 'v274', 'v277', 'v283', 'v285', 'v289', 'v291', 'v294', 'id_01', 'id_02', 'id_05', 'id_06', 'id_09', 'id_13', 'id_17', 'id_19', 'id_20']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[categorical_features] = encoder.fit_transform(X_train[categorical_features])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[categorical_features] = encoder.transform(X_test[categorical_features])


Fitting the model...
AUC on training data: 0.8921648689968948
TPR at 1% FPR: 0.44709645669291337


In [12]:
# print F1 precision and recall
from sklearn.metrics import f1_score, precision_score, recall_score
y_pred = model.predict(X_test)
f1 = f1_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
accuracy = (y_pred == y_test).mean()
print(y_test.mean())
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)

0.034409184813899145
Accuracy: 0.9743201137941545
F1 Score: 0.4939095611546805
Precision: 0.767236910316226
Recall: 0.3641732283464567


In [13]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(df[features + [target_column]], test_size=0.2, random_state=42)

print("Train Shape:", train.shape)
print("Test Shape:", test.shape)

Train Shape: (472432, 68)
Test Shape: (118108, 68)


In [14]:
# ### Identify categorical and numerical features
numeric_features = [f for f in features if is_numeric_dtype(train[f])]
categorical_features = [f for f in features if f not in numeric_features]

print("Categorical features:", categorical_features)
print("Numerical features:", numeric_features)

# ### Handle missing values and categorical encoding
# Force all categorical_features to be treated as object first
train[categorical_features] = train[categorical_features].astype('object')
test[categorical_features] = test[categorical_features].astype('object')

# Now fill missing values
train[categorical_features] = train[categorical_features].fillna('<nan>')
test[categorical_features] = test[categorical_features].fillna('<nan>')

# Then convert to category
train[categorical_features] = train[categorical_features].astype('category')
test[categorical_features] = test[categorical_features].astype('category')

train[numeric_features] = train[numeric_features].fillna(-999)
test[numeric_features] = test[numeric_features].fillna(-999)

# ### Train/Test split
X_train = train[features]
y_train = train[target_column]
X_test = test[features]
y_test = test[target_column]


from sklearn.preprocessing import OrdinalEncoder

encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
X_train[categorical_features] = encoder.fit_transform(X_train[categorical_features])
X_test[categorical_features] = encoder.transform(X_test[categorical_features])

# ### Fit XGBoost Model
model = XGBClassifier( eval_metric='logloss')
print("Fitting the model...")
model.fit(X_train, y_train)

# ### Predict and evaluate
y_pred_prob = model.predict_proba(X_test)[:, 1]
auc_score = roc_auc_score(y_test, y_pred_prob)
print("AUC on training data:", auc_score)

# ### Get TPR at 1% FPR
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
recall_at_fpr_001 = np.interp(0.01, fpr, tpr)
print("TPR at 1% FPR:", recall_at_fpr_001)

Categorical features: ['productcd', 'card6', 'p_emaildomain', 'r_emaildomain', 'devicetype', 'deviceinfo']
Numerical features: ['transactionamt', 'card1', 'card2', 'card3', 'card5', 'addr1', 'dist1', 'c1', 'c2', 'c4', 'c5', 'c6', 'c7', 'c8', 'c9', 'c10', 'c11', 'c12', 'c13', 'c14', 'v62', 'v70', 'v76', 'v78', 'v82', 'v91', 'v127', 'v130', 'v139', 'v160', 'v165', 'v187', 'v203', 'v207', 'v209', 'v210', 'v221', 'v234', 'v257', 'v258', 'v261', 'v264', 'v266', 'v267', 'v271', 'v274', 'v277', 'v283', 'v285', 'v289', 'v291', 'v294', 'id_01', 'id_02', 'id_05', 'id_06', 'id_09', 'id_13', 'id_17', 'id_19', 'id_20']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[categorical_features] = encoder.fit_transform(X_train[categorical_features])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[categorical_features] = encoder.transform(X_test[categorical_features])


Fitting the model...
AUC on training data: 0.9382106953813876
TPR at 1% FPR: 0.6278576478906435


In [15]:
# print F1 precision and recall
from sklearn.metrics import f1_score, precision_score, recall_score
y_pred = model.predict(X_test)
f1 = f1_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
accuracy = (y_pred == y_test).mean()
print(y_test.mean())
print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Precision:", precision)
print("Recall:", recall)

0.03592474684187354
Accuracy: 0.979950553730484
F1 Score: 0.6332094175960347
Precision: 0.9236330772706733
Recall: 0.4817346217299081
