In [1]:
import pandas as pd
import utils.data_utils as data_utils
from pathlib import Path

## Preparing Data

In [2]:
# Loading data: every path is resolved relative to the parent of the current working directory.
DATA_DIR = Path.cwd().parent / "data"
train_path = DATA_DIR / "swift_HA_train.csv"
test_path = DATA_DIR / "swift_HA_test.csv"
train, test = data_utils.load_swift_data(train_path, test_path)
bank_data = data_utils.load_bank_data(DATA_DIR)

In [3]:
# Show (rows, columns) of the freshly loaded train and test frames.
print(train.shape, test.shape)

(4690135, 19) (704856, 19)


In [4]:
# Build the merged train/test frames, then remove the source objects
# (bank_data, train, test) from the namespace in a single statement.
train_merged = data_utils.merge_swift_bank_data(train, bank_data)
test_merged = data_utils.merge_swift_bank_data(test, bank_data)
del bank_data, train, test

In [5]:
# Missing flag values become their own category, '12', in both splits.
for _frame in (train_merged, test_merged):
    for _flag_col in ('Flag_ordering', 'Flag_beneficiary'):
        _frame[_flag_col] = _frame[_flag_col].fillna('12')
print(train_merged.shape, test_merged.shape)

(4690135, 21) (704856, 21)


In [6]:
# Show the merged column names and the shape of each split.
print(train_merged.columns)
print(train_merged.shape)
print(test_merged.shape)

Index(['Timestamp', 'UETR', 'Sender', 'Receiver', 'TransactionReference',
       'OrderingAccount', 'OrderingName', 'OrderingStreet',
       'OrderingCountryCityZip', 'BeneficiaryAccount', 'BeneficiaryName',
       'BeneficiaryStreet', 'BeneficiaryCountryCityZip', 'SettlementDate',
       'SettlementCurrency', 'SettlementAmount', 'InstructedCurrency',
       'InstructedAmount', 'Label', 'Flag_ordering', 'Flag_beneficiary'],
      dtype='object')
(4690135, 21)
(704856, 21)


## Features

In [7]:
# util funcs
def generate_feature(train, test, pivot_name, new_feature_name, func, agg_col=None):
    """Attach a per-group statistic, computed on ``train`` only, to both frames.

    The statistic is calculated over the groups of ``train[pivot_name]`` and
    left-joined onto ``train`` and ``test`` through ``pivot_name``. Keys that
    never occur in ``train`` receive NaN.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames that both contain ``pivot_name``.
    pivot_name : str
        Column whose values define the groups and the join key.
    new_feature_name : str
        Name of the column added to both frames.
    func : {'value_count', 'mean', 'n_unique'}
        'value_count' counts rows per key, 'mean' averages ``agg_col`` per
        key, 'n_unique' counts distinct ``agg_col`` values per key (a missing
        value counts as one distinct value).
    agg_col : str, optional
        Column to aggregate; required for 'mean' and 'n_unique'.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        New train and test frames carrying ``new_feature_name``.

    Raises
    ------
    ValueError
        If ``func`` is not one of the supported options, or ``agg_col`` is
        missing for an aggregation that needs it.
    """
    if func == 'value_count':
        lookup = train[pivot_name].value_counts()
    elif func in ('mean', 'n_unique'):
        if agg_col is None:
            raise ValueError("agg_col is required when func is '%s'." % func)
        grouped = train.groupby(pivot_name)[agg_col]
        if func == 'mean':
            lookup = grouped.mean()
        else:
            # Built-in distinct count; dropna=False keeps NaN as one distinct
            # value, like len(x.unique()), without a Python call per group.
            lookup = grouped.nunique(dropna=False)
    else:
        raise ValueError("func is not a valid option.")
    lookup = lookup.rename(new_feature_name)

    def _attach(frame):
        # Replace a stale copy of the feature so that re-running a cell does
        # not leave `<name>_x` / `<name>_y` columns behind.
        if new_feature_name in frame.columns:
            frame = frame.drop(columns=[new_feature_name])
        return frame.merge(lookup, left_on=pivot_name, right_index=True, how='left')

    return _attach(train), _attach(test)

### Hops

In [8]:
# Rows per UETR, counted separately within each split, stored as 'num_hops'.
d_train = train_merged['UETR'].value_counts()
d_train.name = 'num_hops'
d_test = test_merged['UETR'].value_counts()
d_test.name = 'num_hops'

# how='left' keeps every transaction row: with the default inner join, a row whose
# UETR is missing would be dropped silently instead of receiving NaN.
train_merged = train_merged.merge(d_train, left_on='UETR', right_index=True, how='left')
test_merged = test_merged.merge(d_test, left_on='UETR', right_index=True, how='left')
print(train_merged.shape)
print(test_merged.shape)

(4690135, 22)
(704856, 22)


### Bank Network Features

- node feature:
    - number of transactions associated with the node
    - number of currencies used and average amount transferred per currency for a node
    - in and out degree
- edge feature:
    - number of transactions between two nodes
    - number of currencies used and average amount transferred per currency between two nodes

#### total number of transactions of sender and receiver bank

In [9]:
# Transaction count per bank, once for the sending and once for the receiving side.
for _bank_col in ('Sender', 'Receiver'):
    train_merged, test_merged = generate_feature(
        train_merged, test_merged,
        pivot_name=_bank_col, new_feature_name=_bank_col + "_freq", func='value_count')
print(train_merged.shape)
print(test_merged.shape)

(4690135, 24)
(704856, 24)


#### total number of currencies and avg amount per currency of sender and receiver bank

In [10]:
def generate_currency_feature(train, test, name):
    """Add ``<name>_freq`` and ``<name>_avg_amount`` to both frames.

    ``name`` is the key column to group on. ``<name>_freq`` is the number of
    training rows per key; ``<name>_avg_amount`` is the mean
    ``InstructedAmount`` per key. Both are produced by ``generate_feature``.

    Returns the updated ``(train, test)`` pair.
    """
    feature_specs = (
        ("_freq", 'value_count', None),
        ("_avg_amount", 'mean', 'InstructedAmount'),
    )
    for suffix, stat, value_col in feature_specs:
        train, test = generate_feature(
            train, test, pivot_name=name, new_feature_name=name + suffix,
            func=stat, agg_col=value_col)
    return train, test

#######################################################################################################
# Bank + currency keys: build the concatenated key, then add its count and mean-amount features.
for name, _bank_col in (("sender_currency", "Sender"), ("receiver_currency", "Receiver")):
    train_merged[name] = train_merged[_bank_col] + train_merged["InstructedCurrency"]
    test_merged[name] = test_merged[_bank_col] + test_merged["InstructedCurrency"]
    train_merged, test_merged = generate_currency_feature(train_merged, test_merged, name)
print(train_merged.shape)
print(test_merged.shape)

(4690135, 30)
(704856, 30)


#### total number of banks sender and receiver connect to (in and out degree)

In [11]:
def generate_in_out_degree(train, test, from_node_name, to_node_name):
    """Add in/out-degree features for the node on each side of a transaction.

    Degrees are computed on ``train`` only:

    * out-degree of a node = number of distinct ``to_node_name`` values it
      appears with when it is in ``from_node_name``;
    * in-degree of a node = number of distinct ``from_node_name`` values it
      appears with when it is in ``to_node_name``.

    Each table is then looked up for the node in the ``from`` column and for
    the node in the ``to`` column, giving four columns appended in this order:
    ``<from>_out_degree``, ``<from>_in_degree``, ``<to>_out_degree``,
    ``<to>_in_degree``. A node missing from the relevant table (e.g. a sender
    that never receives in ``train``) gets NaN.

    The previous implementation grouped and joined on the same column, so
    ``<from>_in_degree`` duplicated ``<to>_in_degree`` and ``<to>_out_degree``
    duplicated ``<from>_out_degree``.

    Returns new ``(train, test)`` frames; the inputs are not modified.
    """
    # Node -> distinct counterparties, indexed by the node's identifier.
    out_degree = train.groupby(from_node_name)[to_node_name].nunique(dropna=False)
    in_degree = train.groupby(to_node_name)[from_node_name].nunique(dropna=False)

    def _with_degrees(frame):
        from_nodes = frame[from_node_name]
        to_nodes = frame[to_node_name]
        return frame.assign(**{
            from_node_name + "_out_degree": from_nodes.map(out_degree),
            from_node_name + "_in_degree": from_nodes.map(in_degree),
            to_node_name + "_out_degree": to_nodes.map(out_degree),
            to_node_name + "_in_degree": to_nodes.map(in_degree),
        })

    return _with_degrees(train), _with_degrees(test)

# Bank-level degrees: Sender is the "from" node, Receiver the "to" node.
train_merged, test_merged = generate_in_out_degree(train_merged, test_merged, 'Sender', 'Receiver')
print(train_merged.shape)
print(test_merged.shape)

(4690135, 34)
(704856, 34)


#### total number of transactions  between sender and receiver bank

In [12]:
# Edge key: sender and receiver bank identifiers concatenated into one string.
# NOTE(review): the two IDs are joined without a separator; presumably bank IDs are
# fixed-width so distinct pairs cannot collide — confirm.
train_merged["sender_receiver"] = train_merged["Sender"] + train_merged["Receiver"]
test_merged["sender_receiver"] = test_merged["Sender"] + test_merged["Receiver"]

# Number of training transactions per (sender, receiver) pair.
train_merged, test_merged = generate_feature(
    train_merged, test_merged, pivot_name='sender_receiver', new_feature_name="sender_receiver_freq", func = 'value_count')
print(train_merged.shape)
print(test_merged.shape)

(4690135, 36)
(704856, 36)


#### total number of currencies and avg amount of money between sender and receiver bank

In [13]:
# Edge key per currency: sender bank + receiver bank + instructed currency.
train_merged["sender_receiver_currency"] = (
    train_merged["Sender"] + train_merged["Receiver"] + train_merged['InstructedCurrency'])
test_merged["sender_receiver_currency"] = (
    test_merged["Sender"] + test_merged["Receiver"] + test_merged['InstructedCurrency'])

# generate_currency_feature adds "<key>_freq" (row count) and "<key>_avg_amount"
# (mean InstructedAmount), in that order.
train_merged, test_merged = generate_currency_feature(
    train_merged, test_merged, "sender_receiver_currency")
print(train_merged.shape)
print(test_merged.shape)

(4690135, 39)
(704856, 39)


### Account Network Features

#### total number of transactions of ordering and beneficiary account

In [14]:
sender = 'OrderingAccount'
receiver = 'BeneficiaryAccount'

# Transaction count per account, first for the ordering and then for the beneficiary side.
for _account_col in (sender, receiver):
    train_merged, test_merged = generate_feature(
        train_merged, test_merged,
        pivot_name=_account_col, new_feature_name=_account_col + "_freq", func='value_count')
print(train_merged.shape)
print(test_merged.shape)

(4690135, 41)
(704856, 41)


#### total number of currencies and avg amount of money per currency of ordering and beneficiary account

In [15]:
def generate_currency_feature(train, test, name):
    """Add ``<name>_freq`` and ``<name>_avg_amount`` to both frames.

    NOTE(review): this re-defines a function of the same name and signature
    that already appears in an earlier cell of this notebook, with the same
    behavior; the later definition shadows the earlier one.

    ``name`` is the key column to group on. Returns the updated
    ``(train, test)`` pair.
    """
    
    # currency freq: number of training rows per key
    train, test = generate_feature(
        train, test, pivot_name=name, new_feature_name=name + "_freq", func = 'value_count')

    # currency avg amount: mean InstructedAmount per key
    train, test = generate_feature(
        train, test, pivot_name=name, new_feature_name=name + "_avg_amount", func = 'mean', agg_col='InstructedAmount')
    
    return train, test

#######################################################################################################
# Account + currency keys: build the concatenated key, then add its count and mean-amount features.
for _account_col in ("OrderingAccount", "BeneficiaryAccount"):
    name = _account_col + "_currency"
    train_merged[name] = train_merged[_account_col] + train_merged["InstructedCurrency"]
    test_merged[name] = test_merged[_account_col] + test_merged["InstructedCurrency"]
    train_merged, test_merged = generate_currency_feature(train_merged, test_merged, name)
print(train_merged.shape)
print(test_merged.shape)

(4690135, 47)
(704856, 47)


#### total number of accounts ordering and beneficiary account connect to (in and out degree)

In [16]:
# Account-level degrees: OrderingAccount is the "from" node, BeneficiaryAccount the "to" node.
train_merged, test_merged = generate_in_out_degree(train_merged, test_merged, 'OrderingAccount', 'BeneficiaryAccount')
print(train_merged.shape)
print(test_merged.shape)

(4690135, 51)
(704856, 51)


#### total number of transactions between sender and receiver account

In [17]:
%%time
# Edge key: ordering and beneficiary account identifiers concatenated into one string,
# stored in the column 'OrderingAccount_BeneficiaryAccount'.
sender = 'OrderingAccount'
receiver = 'BeneficiaryAccount'
sender_receiver = sender + "_" + receiver
train_merged[sender_receiver] = train_merged[sender] + train_merged[receiver]
test_merged[sender_receiver] = test_merged[sender] + test_merged[receiver]

# Number of training transactions per (ordering account, beneficiary account) pair.
train_merged, test_merged = generate_feature(
    train_merged, test_merged, pivot_name=sender_receiver, 
    new_feature_name=sender_receiver + "_freq", func = 'value_count')
print(train_merged.shape)
print(test_merged.shape)

(4690135, 53)
(704856, 53)
CPU times: total: 11 s
Wall time: 11.3 s


#### total number of currencies and avg amount of money between ordering and beneficiary account

In [18]:
%%time
# number of currency
sender = 'OrderingAccount'
receiver = 'BeneficiaryAccount'
sender_receiver_currency = sender + "_" + receiver + "_currency"
train_merged[sender_receiver_currency] = train_merged[sender] + train_merged[receiver] + train_merged['InstructedCurrency']
test_merged[sender_receiver_currency] = test_merged[sender] + test_merged[receiver] + test_merged['InstructedCurrency']

train_merged, test_merged = generate_feature(
    train_merged, test_merged, pivot_name=sender_receiver_currency, 
    new_feature_name=sender_receiver_currency + "_freq", func = 'value_count')

# avg amount currency
train_merged, test_merged = generate_feature(
    train_merged, test_merged, pivot_name=sender_receiver_currency, 
    new_feature_name=sender_receiver_currency + "_avg_amount", func = "mean", agg_col="InstructedAmount")
print(train_merged.shape)
print(test_merged.shape)

(4690135, 56)
(704856, 56)
CPU times: total: 41 s
Wall time: 45.8 s


### Account and bank network features over time (hour, day, week)

In [19]:
# Derive string-typed hour, day and ISO-week keys from Timestamp for both splits.
for _frame in (train_merged, test_merged):
    _stamp = _frame["Timestamp"].dt
    _frame["hour"] = _stamp.hour.astype(str)
    _frame["day"] = _stamp.day.astype(str)
    _frame["week"] = _stamp.isocalendar().week.astype(str)

In [20]:
# Sender-per-hour transaction count, built by hand.
# NOTE(review): the next cell drops both 'sender_hour' and 'sender_hour_freq' again, so this
# looks like a superseded experiment. 'hour' is already a string column, which makes the
# extra .astype(str) a no-op.
train_merged["sender_hour"] = train_merged["Sender"] + train_merged["hour"].astype(str)
test_merged["sender_hour"] = test_merged["Sender"] + test_merged["hour"].astype(str)
train_merged, test_merged = generate_feature(
        train_merged, test_merged, pivot_name='sender_hour', new_feature_name='sender_hour_freq', func = 'value_count')

In [30]:
# Remove the hand-built sender/hour key and its count feature created in the previous cell.
train_merged = train_merged.drop(['sender_hour', 'sender_hour_freq'], axis = 1)
test_merged = test_merged.drop(['sender_hour', 'sender_hour_freq'], axis = 1)

In [31]:
def generate_features_based_on_time(train, test, from_node_name, to_node_name, time_col):
    """Recompute the node and edge network features within each time bucket.

    Every node identifier is suffixed with the value of ``time_col`` so that
    statistics are grouped per (node, time bucket) instead of per node. All
    statistics are derived from ``train`` and joined onto both frames.

    With ``F = from_node_name + "_" + time_col`` and
    ``T = to_node_name + "_" + time_col``, the appended columns are, in order:

    * ``F_freq``, ``T_freq``: transaction counts per time-bucketed node
    * ``<from>_currency_<time>_freq`` / ``_avg_amount`` and the same for
      ``<to>``: count and mean ``InstructedAmount`` per node, currency and
      time bucket
    * the four degree columns produced by ``generate_in_out_degree(F, T)``
    * ``F_T_freq``: transaction count per time-bucketed edge
    * ``F_T_currency_freq`` / ``F_T_currency_avg_amount``: count and mean
      ``InstructedAmount`` per time-bucketed edge and currency

    All intermediate key columns are dropped before returning.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Must contain ``from_node_name``, ``to_node_name``, ``time_col``,
        ``InstructedCurrency`` and ``InstructedAmount``.
    from_node_name, to_node_name : str
        String columns identifying the two ends of a transaction.
    time_col : str
        String column holding the time bucket (it is concatenated to the
        node identifiers).

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        New train and test frames; the frames passed in are not modified.
    """
    currency_col = "InstructedCurrency"
    from_ = from_node_name + "_" + time_col
    to_ = to_node_name + "_" + time_col

    ##############################################################################################
    # construct the time-bucketed node keys
    # assign() returns new frames, so an interrupted run leaves no stray key
    # columns on the caller's data.
    train = train.assign(**{
        from_: train[from_node_name] + train[time_col],
        to_: train[to_node_name] + train[time_col],
    })
    test = test.assign(**{
        from_: test[from_node_name] + test[time_col],
        to_: test[to_node_name] + test[time_col],
    })

    ##############################################################################################
    # node - total number of transactions (from side, then to side)
    for node_key in (from_, to_):
        train, test = generate_feature(
            train, test, pivot_name=node_key, new_feature_name=node_key + "_freq",
            func='value_count')

    ##############################################################################################
    # node - currency count and average amount (from side, then to side)
    for node_name in (from_node_name, to_node_name):
        feature_name = node_name + "_currency_" + time_col
        train[feature_name] = train[node_name] + train[currency_col] + train[time_col]
        test[feature_name] = test[node_name] + test[currency_col] + test[time_col]

        train, test = generate_currency_feature(train, test, feature_name)

        train = train.drop([feature_name], axis=1)
        test = test.drop([feature_name], axis=1)

    ##############################################################################################
    # node - in and out degree
    train, test = generate_in_out_degree(train, test, from_, to_)

    ##############################################################################################
    # edge - number of transactions
    edge_key = from_ + "_" + to_
    train[edge_key] = train[from_] + train[to_]
    test[edge_key] = test[from_] + test[to_]

    train, test = generate_feature(
        train, test, pivot_name=edge_key, new_feature_name=edge_key + "_freq",
        func='value_count')

    train = train.drop([edge_key], axis=1)
    test = test.drop([edge_key], axis=1)

    ##############################################################################################
    # edge - currency count and average amount
    edge_currency_key = edge_key + "_currency"
    train[edge_currency_key] = train[from_] + train[to_] + train[currency_col]
    # The test key must use test's own currency column. It was previously built
    # from train's column, which is index-aligned against a different set of
    # rows and so produced wrong or missing keys.
    test[edge_currency_key] = test[from_] + test[to_] + test[currency_col]

    train, test = generate_feature(
        train, test, pivot_name=edge_currency_key,
        new_feature_name=edge_currency_key + "_freq", func='value_count')

    train, test = generate_feature(
        train, test, pivot_name=edge_currency_key,
        new_feature_name=edge_currency_key + "_avg_amount", func="mean",
        agg_col="InstructedAmount")

    train = train.drop([edge_currency_key], axis=1)
    test = test.drop([edge_currency_key], axis=1)

    ##############################################################################################
    # drop the time-bucketed node keys
    train = train.drop([from_, to_], axis=1)
    test = test.drop([from_, to_], axis=1)

    return train, test

#### Fine-grained by hour of day 0 - 23

In [32]:
%%time
# bank
# Adds the per-hour bank features, e.g. 'Sender_hour_freq', which the baseline section uses later.
train_merged, test_merged = generate_features_based_on_time(train_merged, test_merged, 'Sender', 'Receiver', 'hour')

CPU times: total: 19min 23s
Wall time: 33min 41s


#### Fine-grained by day of month 1 - 31

In [24]:
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt, so the per-day
# features were not produced in the saved session.
train_merged, test_merged = generate_features_based_on_time(train_merged, test_merged, 'Sender', 'Receiver', 'day')


KeyboardInterrupt



#### Fine-grained by ISO week of year 1 - 53

In [None]:
# Per-week bank features; 'week' holds the ISO calendar week as a string.
# NOTE(review): this cell has no execution count, i.e. it was not run in the saved session.
train_merged, test_merged = generate_features_based_on_time(train_merged, test_merged, 'Sender', 'Receiver', 'week')

## Drop unnecessary columns and save data

In [33]:
# List every column currently present before choosing which ones to drop.
train_merged.columns

Index(['Timestamp', 'UETR', 'Sender', 'Receiver', 'TransactionReference',
       'OrderingAccount', 'OrderingName', 'OrderingStreet',
       'OrderingCountryCityZip', 'BeneficiaryAccount', 'BeneficiaryName',
       'BeneficiaryStreet', 'BeneficiaryCountryCityZip', 'SettlementDate',
       'SettlementCurrency', 'SettlementAmount', 'InstructedCurrency',
       'InstructedAmount', 'Label', 'Flag_ordering', 'Flag_beneficiary',
       'num_hops', 'Sender_freq', 'Receiver_freq', 'sender_currency',
       'sender_currency_freq', 'sender_currency_avg_amount',
       'receiver_currency', 'receiver_currency_freq',
       'receiver_currency_avg_amount', 'Sender_out_degree', 'Sender_in_degree',
       'Receiver_out_degree', 'Receiver_in_degree', 'sender_receiver',
       'sender_receiver_freq', 'sender_receiver_currency',
       'sender_receiver_currency_freq', 'sender_receiver_currency_avg_amount',
       'OrderingAccount_freq', 'BeneficiaryAccount_freq',
       'OrderingAccount_currency', 'Order

In [None]:
# Raw identifiers, free-text fields and concatenated helper keys that should not
# be part of the saved feature set.
columns_to_drop = [
    "UETR",
    "Sender",
    "Receiver",
    "TransactionReference",
    "OrderingAccount",
    "OrderingName",
    "OrderingStreet",
    "OrderingCountryCityZip",
    "BeneficiaryAccount",
    "BeneficiaryName",
    "BeneficiaryStreet",
    "BeneficiaryCountryCityZip",
    "SettlementDate",
    "SettlementCurrency",
    "InstructedCurrency",
    "Timestamp",
    "sender_receiver",
    # helper key of the bank edge/currency features; its account-level
    # counterpart below was already listed, this one had been left out
    "sender_receiver_currency",
    'OrderingAccount_BeneficiaryAccount_currency',
    'OrderingAccount_BeneficiaryAccount',
    'BeneficiaryAccount_currency',
    'OrderingAccount_currency',
    'receiver_currency',
    'sender_currency',
    'sender_hour'
]

# errors="ignore": 'sender_hour' is already removed by an earlier cell, so a strict
# drop raises KeyError on a top-to-bottom run. Ignoring absent labels also makes
# this cell safe to re-run.
train_merged = train_merged.drop(columns=columns_to_drop, errors="ignore")
test_merged = test_merged.drop(columns=columns_to_drop, errors="ignore")

## baseline verification

In [22]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import ShuffleSplit, StratifiedKFold, StratifiedShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import sklearn.utils

In [35]:
# Columns used for the baseline check. 'Label' is the target and is split off in a later cell;
# 'Sender_hour_freq' is produced by generate_features_based_on_time(..., 'Sender', 'Receiver', 'hour').
retained_feature = ['SettlementAmount', 'InstructedAmount', 'Label', 'Sender_hour_freq', 'hour', 
                    'sender_currency_freq', 'sender_currency_avg_amount', 'sender_receiver_freq']
# .copy() so the later fillna/astype steps do not touch the full feature frames.
train_rt = train_merged[retained_feature].copy()
test_rt = test_merged[retained_feature].copy()

In [36]:
# Replace missing feature values with 0 and turn the string 'hour' key back into an integer.
train_rt = train_rt.fillna(0).astype({'hour': 'int64'})
test_rt = test_rt.fillna(0).astype({'hour': 'int64'})

In [37]:
# Write the engineered features to disk before the frames are removed from the
# namespace; once deleted they can no longer be exported.
train_merged.to_csv('train_features.csv')
test_merged.to_csv('test_features.csv')
del train_merged
del test_merged

In [38]:
# Split target and features into NumPy arrays.
Y_train = train_rt["Label"].to_numpy()
X_train = train_rt.drop(columns=["Label"]).to_numpy()
Y_test = test_rt["Label"].to_numpy()
X_test = test_rt.drop(columns=["Label"]).to_numpy()

# Normalize: the scaler is fitted on the training features only and then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [39]:
%%time
# NOTE(review): this import sits mid-notebook rather than with the other imports.
from xgboost import XGBClassifier
xgb = XGBClassifier(n_estimators=100)
# 5-fold stratified cross-validation on the training set, scored with F1; shuffling is seeded.
kfold = StratifiedKFold(n_splits=5, random_state=0, shuffle=True)
cv_results = cross_val_score(xgb, X_train, Y_train, cv=kfold, scoring="f1")

# Final fit on all training rows; this is the model evaluated on the test split below.
xgb.fit(X_train, Y_train)
print("Minimum:", cv_results.min())
print("Maximum:", cv_results.max())
print("StanDev:", cv_results.std())

Minimum: 0.7734241908006814
Maximum: 0.8044406490179333
StanDev: 0.010830083353441794
CPU times: total: 1h 46min 7s
Wall time: 7min 59s


In [40]:
# Hard class predictions for the report and confusion matrix.
pred_xgb = xgb.predict(X_test)
print("XGBoost Classification Report=\n\n", classification_report(Y_test, pred_xgb))
print("XGBoost Confusion Matrix=\n\n", confusion_matrix(Y_test, pred_xgb))
# Probability of class 1, used as the ranking score for average precision (AUPRC).
pred_proba_xgb = xgb.predict_proba(X_test)[:, 1]
print("AUPRC:", metrics.average_precision_score(y_true=Y_test, y_score=pred_proba_xgb))

XGBoost Classification Report=

               precision    recall  f1-score   support

           0       1.00      1.00      1.00    704357
           1       0.90      0.68      0.78       499

    accuracy                           1.00    704856
   macro avg       0.95      0.84      0.89    704856
weighted avg       1.00      1.00      1.00    704856

XGBoost Confusion Matrix=

 [[704320     37]
 [   158    341]]
AUPRC: 0.8689991047437784


In [None]:
# NOTE(review): train_merged and test_merged are removed by a `del` cell earlier in the
# notebook, so on a top-to-bottom run these lines raise NameError. The export has to
# happen before that deletion.
train_merged.to_csv('train_features.csv')
test_merged.to_csv('test_features.csv')