

Links de Referência:
- Código: https://www.kaggle.com/code/arjunjoshua/predicting-fraud-in-financial-payment-services
- Dataset: https://www.kaggle.com/datasets/ealaxi/paysim1

In [1]:
import pickle
import sys
from pathlib import Path

import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.metrics import average_precision_score
from sklearn.metrics import classification_report

In [2]:
# Read the transactions CSV and give the four balance columns consistent
# camelCase names (the raw header mixes 'Org'/'Orig' and lower-case 'balance').
balance_column_names = {
    'oldbalanceOrg': 'oldBalanceOrig',
    'newbalanceOrig': 'newBalanceOrig',
    'oldbalanceDest': 'oldBalanceDest',
    'newbalanceDest': 'newBalanceDest',
}
df = pd.read_csv('../../datasets/fraud.csv').rename(columns=balance_column_names)
print(df.head())

   step      type    amount     nameOrig  oldBalanceOrig  newBalanceOrig  \
0     1   PAYMENT   9839.64  C1231006815        170136.0       160296.36   
1     1   PAYMENT   1864.28  C1666544295         21249.0        19384.72   
2     1  TRANSFER    181.00  C1305486145           181.0            0.00   
3     1  CASH_OUT    181.00   C840083671           181.0            0.00   
4     1   PAYMENT  11668.14  C2048537720         41554.0        29885.86   

      nameDest  oldBalanceDest  newBalanceDest  isFraud  isFlaggedFraud  
0  M1979787155             0.0             0.0        0               0  
1  M2044282225             0.0             0.0        0               0  
2   C553264065             0.0             0.0        1               0  
3    C38997010         21182.0             0.0        1               0  
4  M1230701703             0.0             0.0        0               0  


In [3]:
# Build the feature matrix X and the label vector Y from the raw frame df.

# Keep only TRANSFER and CASH_OUT transactions.
X = df.loc[df['type'].isin(['TRANSFER', 'CASH_OUT'])]

# Seed shared by NumPy here and by the train/test split further down.
randomState = 5
np.random.seed(randomState)

# Optional: uncomment to work on a 100,000-row random subsample instead.
#X = X.loc[np.random.choice(X.index, 100000, replace = False)]

# Split off the target; pop() returns the column and removes it from X.
Y = X.pop('isFraud')

# Eliminate columns shown to be irrelevant for analysis in the EDA
X = X.drop(columns=['nameOrig', 'nameDest', 'isFlaggedFraud'])

# Binary-encoding of labelled data in 'type' (TRANSFER -> 0, CASH_OUT -> 1).
# map() builds a fresh integer column in one step instead of writing ints into
# the string column in place, which depends on that column being object dtype.
X['type'] = X['type'].map({'TRANSFER': 0, 'CASH_OUT': 1}).astype(int)

# Replace any remaining missing values with the mean of their column.
X = X.fillna(X.mean())


# Engenharia de atributos e divisão treino/teste

In [4]:
# Add two bookkeeping-error features: how far each account's balances are from
# what the transaction amount alone would explain. Both are zero when the
# recorded old/new balances are consistent with `amount`.
X['errorBalanceOrig'] = X['newBalanceOrig'] + X['amount'] - X['oldBalanceOrig']
X['errorBalanceDest'] = X['oldBalanceDest'] + X['amount'] - X['newBalanceDest']

In [5]:
# Hold out 20% of the rows for testing; seeding with randomState keeps the
# split identical across runs.
split_parts = train_test_split(X, Y, test_size=0.2, random_state=randomState)
trainX, testX, trainY, testY = split_parts

In [6]:
from xgboost.sklearn import XGBClassifier
from xgboost import plot_importance, to_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler




Criação do XGBoost

In [79]:
# Train an XGBoost classifier and score it with the area under the
# precision-recall curve (AUPRC) on the test split.

# Ratio of non-fraud to fraud labels, passed to XGBoost as scale_pos_weight.
# NOTE(review): the ratio is computed on the full label vector Y (train and
# test together), not on trainY alone.
n_negative = (Y == 0).sum()
n_positive = (Y == 1).sum()
weights = n_negative / (1.0 * n_positive)

clf = XGBClassifier(max_depth=3, scale_pos_weight=weights, n_jobs=4)
clf.fit(trainX, trainY)
probabilities = clf.predict_proba(testX)

# The second column of predict_proba is used as the fraud score.
auprc = average_precision_score(testY, probabilities[:, 1])
print('AUPRC = {}'.format(auprc))

AUPRC = 0.9883569773548841


Criação do Logistic Regression, do Random Forest e do SVM

In [8]:
# Fit the comparison models: logistic regression, random forest and an SVM.

# LR and RF are used by the evaluation and model-saving cells further down, so
# they must be fitted here for the notebook to run top-to-bottom on a fresh
# kernel. Both are trained on the raw (unscaled) features because those cells
# call predict() on the unscaled testX.
LR = LogisticRegression(random_state=0).fit(trainX, trainY)
RF = RandomForestClassifier(max_depth=2, random_state=0).fit(trainX, trainY)

# The SVM is trained on standardised features. The scaler is fitted on the
# training split only and then applied to the test split.
sc = StandardScaler()
X_train = sc.fit_transform(trainX)
X_test = sc.transform(testX)

# probability=True makes predict_proba available on the fitted SVC.
svc = SVC(kernel='poly', probability=True)
svc.fit(X_train, trainY)

# NOTE(review): these are predictions on the *training* data (X_train), not on
# X_test - confirm that is intended before using Y_pred for evaluation.
Y_pred = svc.predict(X_train)

In [80]:
# Evaluate the XGBoost classifier on the held-out test split: per-class
# precision / recall / F1, followed by the label values the model knows.
y_pred = clf.predict(testX)
xgb_report = classification_report(testY, y_pred)
print(xgb_report)
print(clf.classes_)

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00    135409
         1.0       0.85      0.98      0.91       340

    accuracy                           1.00    135749
   macro avg       0.92      0.99      0.96    135749
weighted avg       1.00      1.00      1.00    135749

[0. 1.]


In [82]:
# Evaluate the logistic-regression model on the held-out test split: per-class
# precision / recall / F1, followed by the label values the model knows.
y_pred_LR = LR.predict(testX)
lr_report = classification_report(testY, y_pred_LR)
print(lr_report)
print(LR.classes_)

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00    135409
         1.0       0.69      0.44      0.54       340

    accuracy                           1.00    135749
   macro avg       0.84      0.72      0.77    135749
weighted avg       1.00      1.00      1.00    135749

[0. 1.]


In [84]:
# Evaluate the random-forest model on the held-out test split: per-class
# precision / recall / F1, followed by the label values the model knows.
y_pred_RF = RF.predict(testX)
rf_report = classification_report(testY, y_pred_RF)
print(rf_report)
print(RF.classes_)

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00    135409
         1.0       1.00      0.16      0.27       340

    accuracy                           1.00    135749
   macro avg       1.00      0.58      0.64    135749
weighted avg       1.00      1.00      1.00    135749

[0. 1.]


In [None]:
# Save Models
# Pickle every fitted estimator to its own file, one sub-directory per model.
# NOTE(review): svc was fitted on features transformed by the scaler `sc`,
# which is not saved here - whoever loads SVC.pkl needs the same scaling.
models_to_save = {
    'XGBoost/XGboost.pkl': clf,
    'RandomForest/RandomForest.pkl': RF,
    'LogisticRegression/LogisticRegression.pkl': LR,
    'SVM/SVC.pkl': svc,
}
for model_path, fitted_model in models_to_save.items():
    target = Path(model_path)
    # Create the output directory first so open() cannot fail on a fresh checkout.
    target.parent.mkdir(parents=True, exist_ok=True)
    with target.open('wb') as f:
        pickle.dump(fitted_model, f)