In [26]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sb
import pandas as pd
import scipy.stats as stats

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.utils import resample

In [27]:
import warnings
warnings.filterwarnings('ignore')

In [55]:
import os
print(os.cpu_count())

32


In [66]:
test_url = 'test.csv'
train_url = 'train.csv'

test = pd.read_csv(test_url)
train = pd.read_csv(train_url)

test_submission = test.copy()

In [67]:
print(train.columns)

Index(['id', 'trans_num', 'trans_date', 'trans_time', 'unix_time', 'category',
       'amt', 'cc_num', 'first', 'last', 'gender', 'street', 'city', 'state',
       'zip', 'lat', 'long', 'city_pop', 'job', 'dob', 'merchant', 'merch_lat',
       'merch_long', 'is_fraud'],
      dtype='object')


In [68]:
train['is_train'] = 1
test['is_train'] = 0
data = pd.concat([train, test], sort=False)

In [69]:
data['trans_date_trans_time'] = pd.to_datetime(data['trans_date'] + ' ' + data['trans_time'])
data['dob'] = pd.to_datetime(data['dob'])

data['age'] = data['trans_date_trans_time'].dt.year - data['dob'].dt.year

data['trans_hour'] = data['trans_date_trans_time'].dt.hour

le = LabelEncoder()
for col in ['gender', 'category']:
    data[col] = le.fit_transform(data[col])

for col in ['merchant', 'category', 'job', 'city', 'state', 'street']:
    freq_enc = data[col].value_counts().to_dict()
    data[col + '_freq_enc'] = data[col].map(freq_enc)

data['amt'] = data['amt'].astype(float)

data.drop(
  ['time_since_last_trans', 'trans_date_trans_time', 'trans_num', 'first', 'last', 'trans_time', 'dob', 'trans_date', 'merchant', 'job', 'city', 'state', 'street', 'zip', 'lat', 'long', 'merch_lat', 'merch_long'], 
  axis=1, 
  inplace=True, 
  errors='ignore'
)

In [70]:
train = data[data['is_train'] == 1]
test = data[data['is_train'] == 0]

# Drop helper column
train.drop(['is_train'], axis=1, inplace=True)
test.drop(['is_train', 'is_fraud'], axis=1, inplace=True)

In [71]:
from sklearn.utils import resample
from sklearn.preprocessing import StandardScaler

X = train.drop('is_fraud', axis=1)
y = train['is_fraud']

scaler = StandardScaler()

X_resampled_scaled = scaler.fit_transform(X)

test_scaled = scaler.transform(test)

In [72]:
X_train, X_val, y_train, y_val = train_test_split(X_resampled_scaled, y, test_size=0.2, random_state=42)

In [81]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

xgb = XGBClassifier(
    tree_method='gpu_hist', 
    n_jobs=-1, 
    verbosity=2, 
    random_state=42,
    n_estimators= 1000,
    max_depth= 10,
    learning_rate= 0.2,
    subsample= 1.0,
    colsample_bytree= 0.8
)

In [82]:
xgb.fit(X_resampled_scaled, y)

In [83]:
best_hgb = xgb

y_pred = best_hgb.predict(X_val)
y_pred_proba = best_hgb.predict_proba(X_val)[:, 1]

print(classification_report(y_val, y_pred))
print(confusion_matrix(y_val, y_pred))
print(roc_auc_score(y_val, y_pred_proba))

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     65592
         1.0       1.00      1.00      1.00      8549

    accuracy                           1.00     74141
   macro avg       1.00      1.00      1.00     74141
weighted avg       1.00      1.00      1.00     74141

[[65592     0]
 [    0  8549]]
1.0


In [79]:
# Predict on test data
test_predictions = best_hgb.predict(test_scaled)

# Prepare submission
submission = pd.DataFrame({
    'id': test_submission['id'],
    'is_fraud': test_predictions
})

# make them integers
submission['is_fraud'] = submission['is_fraud'].astype(int)

# Save submission to CSV
submission.to_csv('submission.csv', index=False)