# Track A: SWIFT-only example baseline model

This notebook, provided by SWIFT, contains two example baseline models for Track A: Financial Crime Prevention for the [PETs Prize Challenge](https://petsprizechallenges.com/).

These two models are intended as an example to help you get started on developing a model for this use case. Note that these example models only incorporate features from the SWIFT transactions dataset. These models are not examples of full centralized solutions in the context of the challenge, as they do not incorporate the account data from the banks. 

## Imports

In [1]:
### Libraries for Data Handling

from pathlib import Path

import numpy as np
import pandas as pd

pd.set_option("display.max_columns", None)

In [2]:
### Libraries for Algorithms

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import ShuffleSplit, StratifiedKFold, StratifiedShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import sklearn.utils

from xgboost import XGBClassifier

# Fetch Data

In [3]:
DATA_DIR = Path.cwd().parent / "data"

In [5]:
train = pd.read_csv(
    DATA_DIR / "swift_transaction_train_dataset.csv", index_col="MessageId"
)
train["Timestamp"] = train["Timestamp"].astype("datetime64[ns]")
test = pd.read_csv(DATA_DIR / "swift_transaction_test_dataset.csv", index_col="MessageId")
test["Timestamp"] = test["Timestamp"].astype("datetime64[ns]")

In [6]:
train.head(3)

Unnamed: 0_level_0,Timestamp,UETR,Sender,Receiver,TransactionReference,OrderingAccount,OrderingName,OrderingStreet,OrderingCountryCityZip,BeneficiaryAccount,BeneficiaryName,BeneficiaryStreet,BeneficiaryCountryCityZip,SettlementDate,SettlementCurrency,SettlementAmount,InstructedCurrency,InstructedAmount,Label
MessageId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
TRA7CGN3FF,2022-01-01,f474fdb3-4675-4fff-ab7e-3469f82bd6a7,DPSUFRPP,ABVVUS6S,PETX22-FXIDA-7054,FR90714755422956984353,PHACELIA HETEROPHYLLA,3| RUE HAMON,FR/42859 SAINTE AURÉLIE,611024064274704358,PAPAVER CALIFORNICUM,2584 CHARLES PLACE,US/ROJASLAND| DC 58442,220101,USD,1746319000.0,EUR,1560189000.0,0
TRPNEMZIR7,2022-01-01,c9158def-dab1-4bfb-a31f-7f51c6679d60,BRRGPTPL,CBLHESMM,PETX22-NO-FX-1736,PT8895792452733129969,GONOLOBUS STEPHANOTRICHUS,AV RITA ALVES| 60,PT/5863-752 CANTANHEDE,ES61897100852916932423,MINUARTIA NUTTALLII-GREGARIA,ACCESO DE CARMINA ARAGÓN 83 PUERTA 4,ES/ÁVILA| 02281,220101,EUR,4711420.0,EUR,4711420.0,0
TR6S6A5JYL,2022-01-01,d371ba0a-823f-4243-98ba-94ff18523420,BRRGPTPL,CBLHESMM,PETX22-NO-FX-1687,PT92895792452733126420,LECHEA INTERMEDIA-INTERMEDIA,PRAÇA VALENTE| 85,PT/1100-087 BARCELOS,ES31897100852916935097,ASTRAGALUS MAGDALENAE,PASADIZO ANÍBAL LUJÁN 57,ES/SEGOVIA| 40727,220101,EUR,752821.6,EUR,752821.6,0


In [7]:
train.groupby("Label").size()

Label
0    4686825
1       4900
dtype: int64

In [8]:
test.groupby("Label").size()

Label
0    704347
1       761
dtype: int64

In [9]:
train.columns

Index(['Timestamp', 'UETR', 'Sender', 'Receiver', 'TransactionReference',
       'OrderingAccount', 'OrderingName', 'OrderingStreet',
       'OrderingCountryCityZip', 'BeneficiaryAccount', 'BeneficiaryName',
       'BeneficiaryStreet', 'BeneficiaryCountryCityZip', 'SettlementDate',
       'SettlementCurrency', 'SettlementAmount', 'InstructedCurrency',
       'InstructedAmount', 'Label'],
      dtype='object')

## Add Features for Model Training

In [10]:
%%time
# Hour
train["hour"] = train["Timestamp"].dt.hour
test["hour"] = test["Timestamp"].dt.hour

# Hour frequency for each sender
senders = train["Sender"].unique()
train["sender_hour"] = train["Sender"] + train["hour"].astype(str)
test["sender_hour"] = test["Sender"] + test["hour"].astype(str)
sender_hour_frequency = {}
for s in senders:
    sender_rows = train[train["Sender"] == s]
    for h in range(24):
        sender_hour_frequency[s + str(h)] = len(sender_rows[sender_rows["hour"] == h])

train["sender_hour_freq"] = train["sender_hour"].map(sender_hour_frequency)
test["sender_hour_freq"] = test["sender_hour"].map(sender_hour_frequency)

CPU times: total: 8.31 s
Wall time: 8.37 s


In [12]:
%%time
# Sender-Currency Frequency and Average Amount per Sender-Currency
train["sender_currency"] = train["Sender"] + train["InstructedCurrency"]
test["sender_currency"] = test["Sender"] + test["InstructedCurrency"]

sender_currency_freq = {}
sender_currency_avg = {}

for sc in set(
    list(train["sender_currency"].unique()) + list(test["sender_currency"].unique())
):
    sender_currency_freq[sc] = len(train[train["sender_currency"] == sc])
    sender_currency_avg[sc] = train[train["sender_currency"] == sc][
        "InstructedAmount"
    ].mean()

train["sender_currency_freq"] = train["sender_currency"].map(sender_currency_freq)
test["sender_currency_freq"] = test["sender_currency"].map(sender_currency_freq)

train["sender_currency_amount_average"] = train["sender_currency"].map(
    sender_currency_avg
)
test["sender_currency_amount_average"] = test["sender_currency"].map(sender_currency_avg)

CPU times: total: 19.2 s
Wall time: 19.2 s


In [13]:
%%time
# Sender-Receiver Frequency
train["sender_receiver"] = train["Sender"] + train["Receiver"]
test["sender_receiver"] = test["Sender"] + test["Receiver"]

sender_receiver_freq = {}

for sr in set(
    list(train["sender_receiver"].unique()) + list(test["sender_receiver"].unique())
):
    sender_receiver_freq[sr] = len(train[train["sender_receiver"] == sr])

train["sender_receiver_freq"] = train["sender_receiver"].map(sender_receiver_freq)
test["sender_receiver_freq"] = test["sender_receiver"].map(sender_receiver_freq)

CPU times: total: 22.3 s
Wall time: 22.4 s


In [12]:
# Account previously used

In [14]:
train.head(3)

Unnamed: 0_level_0,Timestamp,UETR,Sender,Receiver,TransactionReference,OrderingAccount,OrderingName,OrderingStreet,OrderingCountryCityZip,BeneficiaryAccount,BeneficiaryName,BeneficiaryStreet,BeneficiaryCountryCityZip,SettlementDate,SettlementCurrency,SettlementAmount,InstructedCurrency,InstructedAmount,Label,hour,sender_hour,sender_hour_freq,sender_currency,sender_currency_freq,sender_currency_amount_average,sender_receiver,sender_receiver_freq
MessageId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1
TRA7CGN3FF,2022-01-01,f474fdb3-4675-4fff-ab7e-3469f82bd6a7,DPSUFRPP,ABVVUS6S,PETX22-FXIDA-7054,FR90714755422956984353,PHACELIA HETEROPHYLLA,3| RUE HAMON,FR/42859 SAINTE AURÉLIE,611024064274704358,PAPAVER CALIFORNICUM,2584 CHARLES PLACE,US/ROJASLAND| DC 58442,220101,USD,1746319000.0,EUR,1560189000.0,0,0,DPSUFRPP0,16519,DPSUFRPPEUR,1598751,169246500.0,DPSUFRPPABVVUS6S,1708051
TRPNEMZIR7,2022-01-01,c9158def-dab1-4bfb-a31f-7f51c6679d60,BRRGPTPL,CBLHESMM,PETX22-NO-FX-1736,PT8895792452733129969,GONOLOBUS STEPHANOTRICHUS,AV RITA ALVES| 60,PT/5863-752 CANTANHEDE,ES61897100852916932423,MINUARTIA NUTTALLII-GREGARIA,ACCESO DE CARMINA ARAGÓN 83 PUERTA 4,ES/ÁVILA| 02281,220101,EUR,4711420.0,EUR,4711420.0,0,0,BRRGPTPL0,4214,BRRGPTPLEUR,36690,1667354.0,BRRGPTPLCBLHESMM,36690
TR6S6A5JYL,2022-01-01,d371ba0a-823f-4243-98ba-94ff18523420,BRRGPTPL,CBLHESMM,PETX22-NO-FX-1687,PT92895792452733126420,LECHEA INTERMEDIA-INTERMEDIA,PRAÇA VALENTE| 85,PT/1100-087 BARCELOS,ES31897100852916935097,ASTRAGALUS MAGDALENAE,PASADIZO ANÍBAL LUJÁN 57,ES/SEGOVIA| 40727,220101,EUR,752821.6,EUR,752821.6,0,0,BRRGPTPL0,4214,BRRGPTPLEUR,36690,1667354.0,BRRGPTPLCBLHESMM,36690


In [15]:
# Exclude below categorical columns for training and testing

columns_to_drop = [
    "UETR",
    "Sender",
    "Receiver",
    "TransactionReference",
    "OrderingAccount",
    "OrderingName",
    "OrderingStreet",
    "OrderingCountryCityZip",
    "BeneficiaryAccount",
    "BeneficiaryName",
    "BeneficiaryStreet",
    "BeneficiaryCountryCityZip",
    "SettlementDate",
    "SettlementCurrency",
    "InstructedCurrency",
    "Timestamp",
    "sender_hour",
    "sender_currency",
    "sender_receiver",
]

train = train.drop(columns_to_drop, axis=1)
test = test.drop(columns_to_drop, axis=1)
train.head(3)

Unnamed: 0_level_0,SettlementAmount,InstructedAmount,Label,hour,sender_hour_freq,sender_currency_freq,sender_currency_amount_average,sender_receiver_freq
MessageId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
TRA7CGN3FF,1746319000.0,1560189000.0,0,0,16519,1598751,169246500.0,1708051
TRPNEMZIR7,4711420.0,4711420.0,0,0,4214,36690,1667354.0,36690
TR6S6A5JYL,752821.6,752821.6,0,0,4214,36690,1667354.0,36690


In [16]:
train[train["Label"] == 1]

Unnamed: 0_level_0,SettlementAmount,InstructedAmount,Label,hour,sender_hour_freq,sender_currency_freq,sender_currency_amount_average,sender_receiver_freq
MessageId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
TRHBNUNW24,1.516912e+08,1.697910e+08,1,0,44,366914,1.102004e+08,240731
TR5PG7D9PB,2.433016e+06,2.433016e+06,1,1,65,529744,1.674064e+06,267752
TRO5CCVOGF,3.768829e+06,3.367130e+06,1,1,29223,1598751,1.692465e+08,1708051
TRKMGKXV70,9.325149e+05,9.325149e+05,1,1,65,529744,1.674064e+06,267752
TR93KY2K2Z,1.959181e+06,1.959181e+06,1,1,65,529744,1.674064e+06,267752
...,...,...,...,...,...,...,...,...
TR3XXUN0LT,1.802557e+06,1.802557e+06,1,23,93,529744,1.674064e+06,267752
TR2NJMCD81,4.121070e+06,3.311958e+06,1,23,68571,218987,3.691763e+06,1708051
TR0LV8DHE3,6.291664e+06,5.056388e+06,1,23,68571,218987,3.691763e+06,1708051
TREJEA6AT3,4.345422e+05,4.345422e+05,1,23,93,529744,1.674064e+06,267752


In [17]:
test[test["Label"] == 1]

Unnamed: 0_level_0,SettlementAmount,InstructedAmount,Label,hour,sender_hour_freq,sender_currency_freq,sender_currency_amount_average,sender_receiver_freq
MessageId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
TEKE9CP94L,2.544254e+06,2.273076e+06,1,22,4174,1598751,1.692465e+08,1708051
TE25TS2GOM,1.454529e+08,1.628083e+08,1,4,43,366914,1.102004e+08,240731
TE43OHEG32,2.768278e+06,2.224767e+06,1,11,552300,58,2.861585e+08,1708051
TEHRG80QLN,4.762671e+06,4.255044e+06,1,15,63231,1598751,1.692465e+08,126189
TEGAOGU1QK,5.506012e+06,4.919156e+06,1,8,169510,1598751,1.692465e+08,1708051
...,...,...,...,...,...,...,...,...
TEIFN9H1VG,3.658144e+06,2.939922e+06,1,4,113181,218987,3.691763e+06,1708051
TEDKORH5M2,2.723516e+06,2.188794e+06,1,11,552300,218987,3.691763e+06,1708051
TEFKDNVAKL,3.186411e+06,2.560806e+06,1,22,4174,218987,3.691763e+06,1708051
TE3DIV307H,4.485390e+06,4.007317e+06,1,6,139377,1598751,1.692465e+08,1708051


# Separate Label and Normalize

In [18]:
Y_train = train["Label"].values
X_train = train.drop(["Label"], axis=1).values
Y_test = test["Label"].values
X_test = test.drop(["Label"], axis=1).values

In [19]:
# Normalize

scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## Random Forest

In [32]:
%%time
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(max_depth=7, random_state=0, n_estimators=100)
kfold = StratifiedKFold(n_splits=5, random_state=0, shuffle=True)
cv_results = cross_val_score(clf, X_train, Y_train, cv=kfold, scoring="f1")

clf.fit(X_train, Y_train)
print("Minimum:", cv_results.min())
print("Maximum:", cv_results.max())
print("StanDev:", cv_results.std())

y_pred = clf.predict(X_test)
print("Classification Report=\n\n", classification_report(Y_test, y_pred))
print("Confusion Matrix=\n\n", confusion_matrix(Y_test, y_pred))
y_pred_proba = clf.predict_proba(X_test)[:, 1]
print("AUPRC:", metrics.average_precision_score(y_true=Y_test, y_score=y_pred_proba))

Minimum: 0.45561665357423414
Maximum: 0.5265519820493643
StanDev: 0.024935986949270916
Classification Report=

               precision    recall  f1-score   support

           0       1.00      1.00      1.00    704347
           1       0.99      0.34      0.51       761

    accuracy                           1.00    705108
   macro avg       0.99      0.67      0.75    705108
weighted avg       1.00      1.00      1.00    705108

Confusion Matrix=

 [[704344      3]
 [   501    260]]
AUPRC: 0.5009128295564667
CPU times: total: 1min 55s
Wall time: 1min 57s


KeyboardInterrupt: 

## Naive Bayes

In [36]:
%%time

from sklearn.naive_bayes import GaussianNB

clf = GaussianNB(var_smoothing = 1e-4)
kfold = StratifiedKFold(n_splits=5, random_state=0, shuffle=True)
cv_results = cross_val_score(clf, X_train, Y_train, cv=kfold, scoring="f1")

clf.fit(X_train, Y_train)
print("Minimum:", cv_results.min())
print("Maximum:", cv_results.max())
print("StanDev:", cv_results.std())

y_pred = clf.predict(X_test)
print("Classification Report=\n\n", classification_report(Y_test, y_pred))
print("Confusion Matrix=\n\n", confusion_matrix(Y_test, y_pred))
y_pred_proba = clf.predict_proba(X_test)[:, 1]
print("AUPRC:", metrics.average_precision_score(y_true=Y_test, y_score=y_pred_proba))

Minimum: 0.002029060755597025
Maximum: 0.0020713328831694433
StanDev: 1.5577268839513897e-05
Classification Report=

               precision    recall  f1-score   support

           0       1.00      0.02      0.04    704347
           1       0.00      0.95      0.00       761

    accuracy                           0.02    705108
   macro avg       0.50      0.49      0.02    705108
weighted avg       1.00      0.02      0.04    705108

Confusion Matrix=

 [[ 13704 690643]
 [    35    726]]
AUPRC: 0.002104982486001808
CPU times: total: 6.69 s
Wall time: 6.69 s


# XGBoost

In [23]:
%%time

xgb = XGBClassifier(n_estimators=100)
kfold = StratifiedKFold(n_splits=5, random_state=0, shuffle=True)
cv_results = cross_val_score(xgb, X_train, Y_train, cv=kfold, scoring="f1")

xgb.fit(X_train, Y_train)
print("Minimum:", cv_results.min())
print("Maximum:", cv_results.max())
print("StanDev:", cv_results.std())

Minimum: 0.5997304582210243
Maximum: 0.6382699868938402
StanDev: 0.016072356051374286
CPU times: user 1h 18min 52s, sys: 2min 23s, total: 1h 21min 15s
Wall time: 15min 31s


In [24]:
pred_xgb = xgb.predict(X_test)
print("XGBoost Classification Report=\n\n", classification_report(Y_test, pred_xgb))

XGBoost Classification Report=

               precision    recall  f1-score   support

           0       1.00      1.00      1.00    704347
           1       0.91      0.47      0.62       761

    accuracy                           1.00    705108
   macro avg       0.95      0.73      0.81    705108
weighted avg       1.00      1.00      1.00    705108



In [25]:
print("XGBoost Confusion Matrix=\n\n", confusion_matrix(Y_test, pred_xgb))

XGBoost Confusion Matrix=

 [[704310     37]
 [   404    357]]


In [26]:
pred_proba_xgb = xgb.predict_proba(X_test)[:, 1]

print("AUPRC:", metrics.average_precision_score(y_true=Y_test, y_score=pred_proba_xgb))

AUPRC: 0.5982779139029439
