In [12]:
!pip install openpyxl

In [13]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively print the full path of every file below the Kaggle input directory
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [14]:
# Load the sample transactions workbook into a DataFrame
data = pd.read_excel(
    io='/kaggle/input/fraud-gaming/sample_dataset_data_scientist.xlsx'
)

In [15]:
# Display the freshly loaded frame (rendered as the cell output)
data

In [16]:
# Print the column names, non-null counts and dtypes of the raw frame
data.info()

In [17]:
# Remove the columns that the rest of the notebook does not use
unused_columns = ['Email_Id', 'Transaction_Id', 'FirstEmailDate']
data = data.drop(columns=unused_columns)

In [18]:
# Frequency-encode the hashed columns: every value is replaced by the number
# of rows in which that value occurs.
for hashed_col in ('ClientIP', 'User_Id', 'UserAgent'):
    occurrences = data[hashed_col].value_counts().to_dict()
    data[hashed_col] = data[hashed_col].map(occurrences)

In [19]:
# Derive the transaction date and the transaction duration from the two timestamps.
#
# Txn_date: the initiation timestamp truncated to midnight. normalize() keeps the
# column as a datetime dtype directly, avoiding the round trip through Python
# `date` objects and the unit-less astype('datetime64') cast, which pandas 2.x rejects.
# NOTE(review): assumes TxnInitTime is timezone-naive (as read from the workbook) — confirm.
data['Txn_date'] = data['TxnInitTime'].dt.normalize()

# TxnTime: elapsed seconds between initiation and completion. total_seconds() covers
# the whole timedelta; `.dt.seconds` would return only the seconds component
# (0..86399), dropping whole days and wrapping negative durations.
data['TxnTime'] = (data['TxnCompleteTime'] - data['TxnInitTime']).dt.total_seconds()

In [20]:
# The raw timestamps have been turned into features above and are no longer needed
consumed_columns = ['TxnInitTime', 'TxnCompleteTime']
data = data.drop(columns=consumed_columns)

In [21]:
# Keep only the first run of digits found in each label.
# - The pattern is a raw string so that "\d" is not treated as an invalid
#   string escape (a SyntaxWarning on recent Python versions).
# - expand=False makes str.extract return a Series instead of a one-column
#   DataFrame, which is what a single-column assignment expects.
# The extracted values are still strings here; labels without digits become NaN.
data['PaymentChannel'] = data['PaymentChannel'].str.extract(r'(\d+)', expand=False)
data['ItemName'] = data['ItemName'].str.extract(r'(\d+)', expand=False)

In [22]:
# Whole days elapsed between the first transaction date and the date of this
# transaction; the first-transaction column is dropped once it has been used.
data['daysbtwfirstactual'] = data['Txn_date'].sub(data['FirstTransactionDate']).dt.days
data = data.drop(columns=['FirstTransactionDate'])

In [23]:
# Split the transaction date into day / month / year columns, then drop the date itself
data = data.assign(
    TxnDay=lambda frame: frame['Txn_date'].dt.day,
    TxnMonth=lambda frame: frame['Txn_date'].dt.month,
    TxnYear=lambda frame: frame['Txn_date'].dt.year,
).drop(columns=['Txn_date'])

In [24]:
import seaborn as sns
# Use a very wide default figure (40 x 8) for the plots that follow
sns.set(rc={'figure.figsize': (40, 8)})
# Mean of Flag for each country code
sns.barplot(data=data, x="Alpha2Code", y="Flag")

In [25]:
# Ordinal-encode the country code: 1 = lowest ... 7 = highest fraud probability,
# in the order chosen from the bar plot above. Codes that are not listed become NaN.
# NOTE(review): the ranking was chosen from Flag rates on the full dataset, i.e.
# before the train/test split — possible target leakage, confirm this is acceptable.
country_order = ['TH', 'PH', 'KH', 'MY', 'SG', 'ID', 'IN']
vals_to_replace = {code: rank for rank, code in enumerate(country_order, start=1)}
data['Alpha2Code'] = data['Alpha2Code'].map(vals_to_replace)

# The remaining country columns are not used further
data = data.drop(columns=['GeoIpCountry', 'CountryCode'])

In [26]:
# Cast the digit strings extracted earlier to integers
data = data.astype({'PaymentChannel': int, 'ItemName': int})

In [27]:
# Standardise the continuous features to zero mean and unit (sample) standard deviation
cols_norm = ['TxnTime', 'daysbtwfirstactual']
continuous = data[cols_norm]
data[cols_norm] = (continuous - continuous.mean()) / continuous.std()

In [28]:
# Mean of Flag for each channel type
sns.barplot(data=data, x="ChannelType", y="Flag")

In [29]:
# Ordinal-encode the channel type by increasing fraud probability: every type
# from 1 to 9 starts at rank 1, and the three highest-ranked types are raised.
vals_to_replace = {channel: 1 for channel in range(1, 10)}
vals_to_replace.update({6: 2, 2: 3, 5: 4})
data['ChannelType'] = data['ChannelType'].map(vals_to_replace)

In [30]:
# Mean of Flag for each payment channel
sns.barplot(data=data, x="PaymentChannel", y="Flag")

In [31]:
# Ordinal-encode the payment channel by increasing fraud probability.
# Channels are listed from rank 1 (lowest) to rank 9 (highest).
channels_by_rank = [9, 3, 7, 8, 4, 6, 1, 5, 2]
vals_to_replace = {channel: rank for rank, channel in enumerate(channels_by_rank, start=1)}
data['PaymentChannel'] = data['PaymentChannel'].map(vals_to_replace)

In [32]:
# Display the frame after feature engineering (rendered as the cell output)
data

In [33]:
# Print the column names, non-null counts and dtypes of the processed frame
data.info()

# XGBoost

In [34]:
import imblearn
from imblearn.pipeline import Pipeline as imbpipeline
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from imblearn.under_sampling import RandomUnderSampler

# Split off the target: pop() returns the 'Flag' column and removes it from `data` in place
y = data.pop('Flag')

# Hold out 30% of the rows as a test set, stratified on the target
data_train, data_test, y_train, y_test = train_test_split(
    data,
    y,
    test_size=0.3,
    stratify=y,
    random_state=11,
)

# Single-step pipeline around the XGBoost classifier.
# NOTE(review): tree_method='gpu_hist' needs a GPU runtime, and newer XGBoost
# releases deprecate it in favour of tree_method='hist' with device='cuda' —
# confirm against the installed version.
pipeline = imbpipeline(
    steps=[
        ['classifier', XGBClassifier(objective="binary:logistic", tree_method='gpu_hist')],
    ]
)

# Shuffled, stratified 5-fold splitter used for cross-validation
stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=11)

# Hyper-parameter grid; the 'classifier__' prefix routes each entry to the pipeline step
param_grid = {
    'classifier__n_estimators': [50, 500, 1000],
    'classifier__learning_rate': [0.5, 0.1, 0.01],
    'classifier__min_child_weight': [1, 5, 10],
    'classifier__gamma': [0.5, 2, 5],
    'classifier__max_depth': [3, 5],
}

In [35]:
# Grid-search the pipeline's hyper-parameters with stratified cross-validation,
# selecting on F1, then predict on the held-out test set.
xgb = GridSearchCV(estimator=pipeline, param_grid=param_grid, scoring='f1', cv=stratified_kfold)
xgb.fit(data_train, y_train)
pred_xgb = xgb.predict(data_test)

# scikit-learn metrics expect (y_true, y_pred). With the arguments swapped the
# confusion matrix is transposed and precision/recall are exchanged per class.
print(confusion_matrix(y_test, pred_xgb))
print(classification_report(y_test, pred_xgb))