In [1]:
import pandas as pd
import utils.data_utils as data_utils
from pathlib import Path

## Preparing Data

In [2]:
# Loading data: the data directory is the "data" folder next to the current
# working directory; both project loaders read from it.
DATA_DIR = Path.cwd().parent.joinpath("data")
train, test = data_utils.load_swift_data(DATA_DIR)
bank_data = data_utils.load_bank_data(DATA_DIR)

In [3]:
# Build the merged train/test frames with the project helper (train first,
# then test), then release the inputs that are not used again in this cell.
train_merged, test_merged = (
    data_utils.merge_swift_bank_data(split, bank_data) for split in (train, test)
)
del bank_data, train, test

In [4]:
# Missing flag values become their own category, the string '12', in both
# splits and for both the ordering-side and beneficiary-side flag columns.
for frame in (train_merged, test_merged):
    for flag_col in ('Flag_ordering', 'Flag_beneficiary'):
        frame[flag_col] = frame[flag_col].fillna('12')

In [5]:
# Display the columns available on the merged training frame at this point
train_merged.columns

Index(['Timestamp', 'UETR', 'Sender', 'Receiver', 'TransactionReference',
       'OrderingAccount', 'OrderingName', 'OrderingStreet',
       'OrderingCountryCityZip', 'BeneficiaryAccount', 'BeneficiaryName',
       'BeneficiaryStreet', 'BeneficiaryCountryCityZip', 'SettlementDate',
       'SettlementCurrency', 'SettlementAmount', 'InstructedCurrency',
       'InstructedAmount', 'Label', 'Flag_ordering', 'Flag_beneficiary'],
      dtype='object')

## Features

### Hops

In [24]:
# 'num_hops' = number of rows that share a UETR. Each split is counted on
# its own rows (train from train, test from test).
d_train = train_merged['UETR'].value_counts().rename('num_hops')
d_test = test_merged['UETR'].value_counts().rename('num_hops')

# Attach the per-UETR count back onto every row of the matching split.
train_merged = train_merged.merge(d_train, how='inner', left_on='UETR', right_index=True)
test_merged = test_merged.merge(d_test, how='inner', left_on='UETR', right_index=True)

### Bank Network Features

- node features:
    - number of transactions associated with the node
    - number of currencies used and average amount transferred per currency for a node
    - in-degree and out-degree
- edge features:
    - number of transactions between two nodes
    - number of currencies used and average amount transferred per currency between two nodes

#### total number of transactions of sender and receiver bank

In [7]:
sender = 'Sender'
receiver = 'Receiver'
sender_feature_name = f"{sender}_freq"
receiver_feature_name = f"{receiver}_freq"

# Rows per sending bank, counted on train only and joined onto both splits.
# NOTE(review): how='inner' is the pandas default (spelled out here); it drops
# any test row whose bank does not occur in train -- confirm this is intended.
sender_freq = train_merged[sender].value_counts().rename(sender_feature_name)
train_merged = train_merged.merge(sender_freq, how='inner', left_on=sender, right_index=True)
test_merged = test_merged.merge(sender_freq, how='inner', left_on=sender, right_index=True)

# Rows per receiving bank, counted on train as it stands after the sender merge.
receiver_freq = train_merged[receiver].value_counts().rename(receiver_feature_name)
train_merged = train_merged.merge(receiver_freq, how='inner', left_on=receiver, right_index=True)
test_merged = test_merged.merge(receiver_freq, how='inner', left_on=receiver, right_index=True)

#### total number of currencies and avg amount per currency of sender and receiver bank

In [8]:
# ---- sender bank x currency ------------------------------------------------
# Key column: the Sender value concatenated with InstructedCurrency.
sender = 'sender_currency'
train_merged[sender] = train_merged["Sender"] + train_merged["InstructedCurrency"]
test_merged[sender] = test_merged["Sender"] + test_merged["InstructedCurrency"]

# Rows per key, counted on train. The 'currecy' spelling is part of the
# column name and is kept as is.
sender_freq = train_merged[sender].value_counts().rename('sender_currecy_freq')
train_merged = train_merged.merge(sender_freq, how='inner', left_on=sender, right_index=True)
test_merged = test_merged.merge(sender_freq, how='inner', left_on=sender, right_index=True)

# Mean InstructedAmount per key, computed on train and joined onto both splits.
sender_avg_amount = (
    train_merged.groupby(sender)["InstructedAmount"]
    .mean()
    .to_frame("sender_currecy_avg_amount")
)
train_merged = train_merged.merge(sender_avg_amount, how='inner', left_on=sender, right_index=True)
test_merged = test_merged.merge(sender_avg_amount, how='inner', left_on=sender, right_index=True)

# ---- receiver bank x currency ----------------------------------------------
# Key column: the Receiver value concatenated with InstructedCurrency.
receiver = 'receiver_currency'
train_merged[receiver] = train_merged["Receiver"] + train_merged["InstructedCurrency"]
test_merged[receiver] = test_merged["Receiver"] + test_merged["InstructedCurrency"]

# Rows per key, counted on train.
receiver_freq = train_merged[receiver].value_counts().rename('receiver_currecy_freq')
train_merged = train_merged.merge(receiver_freq, how='inner', left_on=receiver, right_index=True)
test_merged = test_merged.merge(receiver_freq, how='inner', left_on=receiver, right_index=True)

# Mean InstructedAmount per key, computed on train.
receiver_avg_amount = (
    train_merged.groupby(receiver)["InstructedAmount"]
    .mean()
    .to_frame("receiver_currecy_avg_amount")
)
train_merged = train_merged.merge(receiver_avg_amount, how='inner', left_on=receiver, right_index=True)
test_merged = test_merged.merge(receiver_avg_amount, how='inner', left_on=receiver, right_index=True)

#### total number of banks sender and receiver connect to (in and out degree)

In [9]:
sender = 'Sender'
receiver = 'Receiver'


def _n_distinct(values):
    """Return how many distinct entries a group holds (a NaN, if present, is counted too)."""
    return len(values.unique())


# Out-degree of the row's sender: distinct receivers that bank sends to in train.
sender_out_degree = train_merged.groupby(sender).agg(
    sender_out_degree=(receiver, _n_distinct))
train_merged = train_merged.merge(sender_out_degree, how='inner', left_on=sender, right_index=True)
test_merged = test_merged.merge(sender_out_degree, how='inner', left_on=sender, right_index=True)

# In-degree of the row's sender: grouped by receiver (distinct banks sending to
# it), then looked up with the row's sender. The inner merge drops rows whose
# sender never occurs as a receiver in train.
sender_in_degree = train_merged.groupby(receiver).agg(
    sender_in_degree=(sender, _n_distinct))
train_merged = train_merged.merge(sender_in_degree, how='inner', left_on=sender, right_index=True)
test_merged = test_merged.merge(sender_in_degree, how='inner', left_on=sender, right_index=True)

# Out-degree of the row's receiver: grouped by sender, looked up with the row's
# receiver. Rows whose receiver never occurs as a sender in train are dropped.
receiver_out_degree = train_merged.groupby(sender).agg(
    receiver_out_degree=(receiver, _n_distinct))
train_merged = train_merged.merge(receiver_out_degree, how='inner', left_on=receiver, right_index=True)
test_merged = test_merged.merge(receiver_out_degree, how='inner', left_on=receiver, right_index=True)

# In-degree of the row's receiver: distinct senders that reach that bank in train.
receiver_in_degree = train_merged.groupby(receiver).agg(
    receiver_in_degree=(sender, _n_distinct))
train_merged = train_merged.merge(receiver_in_degree, how='inner', left_on=receiver, right_index=True)
test_merged = test_merged.merge(receiver_in_degree, how='inner', left_on=receiver, right_index=True)

#### total number of transactions  between sender and receiver bank

In [10]:
%%time
train_merged["sender_receiver"] = train_merged["Sender"] + train_merged["Receiver"]
test_merged["sender_receiver"] = test_merged["Sender"] + test_merged["Receiver"]

sender_receiver_freq = train_merged['sender_receiver'].value_counts()
sender_receiver_freq.name = 'sender_receiver_freq'

train_merged = train_merged.merge(sender_receiver_freq, left_on = 'sender_receiver', right_index = True)
test_merged = test_merged.merge(sender_receiver_freq, left_on = 'sender_receiver', right_index = True)

CPU times: total: 10.4 s
Wall time: 10.4 s


#### total number of currencies and avg amount of money between sender and receiver bank

In [11]:
%%time
# number of currency
train_merged["sender_receiver_currency"] = train_merged["Sender"] + train_merged["Receiver"] + train_merged['InstructedCurrency']
test_merged["sender_receiver_currency"] = test_merged["Sender"] + test_merged["Receiver"] + train_merged['InstructedCurrency']

sender_receiver_freq = train_merged['sender_receiver_currency'].value_counts()
sender_receiver_freq.name = 'sender_receiver_currency_freq'

train_merged = train_merged.merge(sender_receiver_freq, left_on = 'sender_receiver_currency', right_index = True)
test_merged = test_merged.merge(sender_receiver_freq, left_on = 'sender_receiver_currency', right_index = True)

# avg amount currency
sender_receiver_currency = "sender_receiver_currency"
sender_receiver_avg_amount = train_merged.groupby(sender_receiver_currency).agg(
    sender_receiver_currecy_avg_amount=pd.NamedAgg(column="InstructedAmount", aggfunc='mean'))
train_merged = train_merged.merge(sender_receiver_avg_amount, left_on = sender_receiver_currency, right_index = True) # merge to transaction level data
test_merged = test_merged.merge(sender_receiver_avg_amount, left_on = sender_receiver_currency, right_index = True)

CPU times: total: 19.2 s
Wall time: 19.3 s


### Account Network Features

#### total number of transactions of ordering and beneficiary account

In [12]:
sender = 'OrderingAccount'
receiver = 'BeneficiaryAccount'
sender_feature_name = f"{sender}_freq"
receiver_feature_name = f"{receiver}_freq"

# Rows per ordering account, counted on train and joined onto both splits
# (inner merge: rows with an account unseen in train are dropped).
sender_freq = train_merged[sender].value_counts().rename(sender_feature_name)
train_merged = train_merged.merge(sender_freq, how='inner', left_on=sender, right_index=True)
test_merged = test_merged.merge(sender_freq, how='inner', left_on=sender, right_index=True)

# Rows per beneficiary account, counted on train after the merge above.
receiver_freq = train_merged[receiver].value_counts().rename(receiver_feature_name)
train_merged = train_merged.merge(receiver_freq, how='inner', left_on=receiver, right_index=True)
test_merged = test_merged.merge(receiver_freq, how='inner', left_on=receiver, right_index=True)

#### total number of currencies and avg amount of money per currency of ordering and beneficiary account

In [13]:
# ---- ordering account x currency -------------------------------------------
# Key column: the OrderingAccount value concatenated with InstructedCurrency.
sender = 'OrderingAccount_currency'
train_merged[sender] = train_merged["OrderingAccount"] + train_merged["InstructedCurrency"]
test_merged[sender] = test_merged["OrderingAccount"] + test_merged["InstructedCurrency"]

# Rows per key, counted on train. The 'currecy' spelling is part of the
# column name and is kept as is.
sender_freq = train_merged[sender].value_counts().rename('OrderingAccount_currecy_freq')
train_merged = train_merged.merge(sender_freq, how='inner', left_on=sender, right_index=True)
test_merged = test_merged.merge(sender_freq, how='inner', left_on=sender, right_index=True)

# Mean InstructedAmount per key, computed on train.
sender_avg_amount = (
    train_merged.groupby(sender)["InstructedAmount"]
    .mean()
    .to_frame("OrderingAccount_currecy_avg_amount")
)
train_merged = train_merged.merge(sender_avg_amount, how='inner', left_on=sender, right_index=True)
test_merged = test_merged.merge(sender_avg_amount, how='inner', left_on=sender, right_index=True)

# ---- beneficiary account x currency ----------------------------------------
# Key column: the BeneficiaryAccount value concatenated with InstructedCurrency.
receiver = 'BeneficiaryAccount_currency'
train_merged[receiver] = train_merged["BeneficiaryAccount"] + train_merged["InstructedCurrency"]
test_merged[receiver] = test_merged["BeneficiaryAccount"] + test_merged["InstructedCurrency"]

# Rows per key, counted on train.
receiver_freq = train_merged[receiver].value_counts().rename('BeneficiaryAccount_currecy_freq')
train_merged = train_merged.merge(receiver_freq, how='inner', left_on=receiver, right_index=True)
test_merged = test_merged.merge(receiver_freq, how='inner', left_on=receiver, right_index=True)

# Mean InstructedAmount per key, computed on train.
receiver_avg_amount = (
    train_merged.groupby(receiver)["InstructedAmount"]
    .mean()
    .to_frame("BeneficiaryAccount_currecy_avg_amount")
)
train_merged = train_merged.merge(receiver_avg_amount, how='inner', left_on=receiver, right_index=True)
test_merged = test_merged.merge(receiver_avg_amount, how='inner', left_on=receiver, right_index=True)

#### total number of accounts ordering and beneficiary account connect to (in and out degree)

In [14]:
sender = 'OrderingAccount'
receiver = 'BeneficiaryAccount'

# NOTE(review): the four output names below are identical to the bank-level
# degree columns created earlier in this notebook, so each merge here makes
# pandas rename the pair to <name>_x (bank level) and <name>_y (account level).
# Consider account-specific names once downstream usage has been checked.

# Out-degree of the row's ordering account: distinct beneficiary accounts it pays in train.
sender_out_degree = train_merged.groupby(sender).agg(
    sender_out_degree=(receiver, lambda accounts: len(accounts.unique())))
train_merged = train_merged.merge(sender_out_degree, how='inner', left_on=sender, right_index=True)
test_merged = test_merged.merge(sender_out_degree, how='inner', left_on=sender, right_index=True)

# In-degree of the row's ordering account: grouped by beneficiary account, looked
# up with the ordering account. The inner merge drops rows whose ordering account
# never occurs as a beneficiary in train.
sender_in_degree = train_merged.groupby(receiver).agg(
    sender_in_degree=(sender, lambda accounts: len(accounts.unique())))
train_merged = train_merged.merge(sender_in_degree, how='inner', left_on=sender, right_index=True)
test_merged = test_merged.merge(sender_in_degree, how='inner', left_on=sender, right_index=True)

# Out-degree of the row's beneficiary account: grouped by ordering account, looked
# up with the beneficiary account.
receiver_out_degree = train_merged.groupby(sender).agg(
    receiver_out_degree=(receiver, lambda accounts: len(accounts.unique())))
train_merged = train_merged.merge(receiver_out_degree, how='inner', left_on=receiver, right_index=True)
test_merged = test_merged.merge(receiver_out_degree, how='inner', left_on=receiver, right_index=True)

# In-degree of the row's beneficiary account: distinct ordering accounts paying it in train.
receiver_in_degree = train_merged.groupby(receiver).agg(
    receiver_in_degree=(sender, lambda accounts: len(accounts.unique())))
train_merged = train_merged.merge(receiver_in_degree, how='inner', left_on=receiver, right_index=True)
test_merged = test_merged.merge(receiver_in_degree, how='inner', left_on=receiver, right_index=True)

#### total number of transactions between sender and receiver account

In [15]:
%%time
sender = 'OrderingAccount'
receiver = 'BeneficiaryAccount'
sender_receiver = sender + "_" + receiver
feature_name = sender_receiver + '_freq'

train_merged[sender_receiver] = train_merged[sender] + train_merged[receiver]
test_merged[sender_receiver] = test_merged[sender] + test_merged[receiver]

sender_receiver_freq = train_merged[sender_receiver].value_counts()
sender_receiver_freq.name = feature_name

train_merged = train_merged.merge(sender_receiver_freq, left_on = sender_receiver, right_index = True)
test_merged = test_merged.merge(sender_receiver_freq, left_on = sender_receiver, right_index = True)

CPU times: total: 21.6 s
Wall time: 21.6 s


#### total number of currencies and avg amount of money between ordering and beneficiary account

In [16]:
%%time
# number of currency
sender = 'OrderingAccount'
receiver = 'BeneficiaryAccount'
sender_receiver_currency = sender + "_" + receiver + "_currency"
train_merged[sender_receiver_currency] = train_merged[sender] + train_merged[receiver] + train_merged['InstructedCurrency']
test_merged[sender_receiver_currency] = test_merged[sender] + test_merged[receiver] + train_merged['InstructedCurrency']

sender_receiver_freq = train_merged[sender_receiver_currency].value_counts()
sender_receiver_freq.name = sender_receiver_currency + "_freq"

train_merged = train_merged.merge(sender_receiver_freq, left_on = sender_receiver_currency, right_index = True)
test_merged = test_merged.merge(sender_receiver_freq, left_on = sender_receiver_currency, right_index = True)

# avg amount currency
sender_receiver_avg_amount = train_merged.groupby(sender_receiver_currency).agg(
    OrderingAccount_BeneficiaryAccount_currecy_avg_amount=pd.NamedAgg(column="InstructedAmount", aggfunc='mean'))
train_merged = train_merged.merge(sender_receiver_avg_amount, left_on = sender_receiver_currency, right_index = True) # merge to transaction level data
test_merged = test_merged.merge(sender_receiver_avg_amount, left_on = sender_receiver_currency, right_index = True)

CPU times: total: 41.3 s
Wall time: 41.4 s


### Account and Bank network features over time (hour, day, week)

In [22]:
# Count test rows per ISO calendar week of their Timestamp
test_merged["Timestamp"].dt.isocalendar().week.value_counts()

4    214
Name: week, dtype: Int64

#### Fine-grained by hour of day 0 - 23

In [None]:
%%time
# Hour
train_merged["hour"] = train_merged["Timestamp"].dt.hour
test_merged["hour"] = test_merged["Timestamp"].dt.hour

# Hour frequency for each sender
senders = train_merged["Sender"].unique()
train_merged["sender_hour"] = train_merged["Sender"] + train_merged["hour"].astype(str)
test_merged["sender_hour"] = test_merged["Sender"] + test_merged["hour"].astype(str)
sender_hour_frequency = {}
for s in senders:
    sender_rows = train_merged[train_merged["Sender"] == s]
    for h in range(24):
        sender_hour_frequency[s + str(h)] = len(sender_rows[sender_rows["hour"] == h])

train_merged["sender_hour_freq"] = train_merged["sender_hour"].map(sender_hour_frequency)
test_merged["sender_hour_freq"] = test_merged["sender_hour"].map(sender_hour_frequency)

#### Fine-grained by day of month 1 - 31