In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from xgboost.sklearn import XGBClassifier
import pickle

# Read the dataset
df = pd.read_csv('paysim_dataset.csv')

# Rename columns for consistency
df = df.rename(columns={'oldbalanceOrg':'oldBalanceOrig', 'newbalanceOrig':'newBalanceOrig', \
                        'oldbalanceDest':'oldBalanceDest', 'newbalanceDest':'newBalanceDest'})

In [26]:
import numpy as np

In [34]:
# Convert object columns to string
object_columns = df.select_dtypes(include=['object']).columns
object_columns.to_list()
df[object_columns] = df[object_columns].astype(str)




In [37]:
X = df

In [35]:
print(df.dtypes)

step                int64
type               object
amount            float64
nameOrig           object
oldBalanceOrig    float64
newBalanceOrig    float64
nameDest           object
oldBalanceDest    float64
newBalanceDest    float64
isFraud             int64
dtype: object


In [38]:
Y = X['isFraud']
del X['isFraud']

X.loc[X.type == 'PAYMENT', 'type'] = 0
X.loc[X.type == 'TRANSFER', 'type'] = 1
X.loc[X.type == 'CASH_OUT', 'type'] = 2
X.loc[X.type == 'DEBIT', 'type'] = 3
X.loc[X.type == 'CASH_IN', 'type'] = 4


X['type'] = X['type'].astype(int)

In [42]:
# Drop irrelevant columns
X = X.drop(['nameOrig', 'nameDest'], axis=1)

In [44]:
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldBalanceOrig,newBalanceOrig,nameDest,oldBalanceDest,newBalanceDest,errorBalanceOrig,errorBalanceDest
0,1,0,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0.0,9839.64
1,1,0,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0.0,1864.28
2,1,1,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,0.0,181.0
3,1,2,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,0.0,21363.0
4,1,0,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0.0,11668.14


In [39]:
# Calculate errorBalanceOrig and errorBalanceDest
X['errorBalanceOrig'] = X.newBalanceOrig + X.amount - X.oldBalanceOrig
X['errorBalanceDest'] = X.oldBalanceDest + X.amount - X.newBalanceDest
print(X.columns)

Index(['step', 'type', 'amount', 'nameOrig', 'oldBalanceOrig',
       'newBalanceOrig', 'nameDest', 'oldBalanceDest', 'newBalanceDest',
       'errorBalanceOrig', 'errorBalanceDest'],
      dtype='object')


In [43]:
# Split data into train and test sets
randomState = 5
trainX, testX, trainY, testY = train_test_split(X, Y, test_size=0.2, random_state=randomState)

# Train XGBoost classifier
weights = (trainY == 0).sum() / (1.0 * (trainY == 1).sum())
clf = XGBClassifier(max_depth=3, scale_pos_weight=weights, n_jobs=4)
clf.fit(trainX, trainY)

pickle.dump(clf,open('model.pkl','wb'))

In [48]:
X.columns

Index(['step', 'type', 'amount', 'oldBalanceOrig', 'newBalanceOrig',
       'oldBalanceDest', 'newBalanceDest', 'errorBalanceOrig',
       'errorBalanceDest'],
      dtype='object')

In [51]:
input_df = X.head(20)
input_df.to_csv("input_df.csv",index=False)
