In [1]:
import pandas as pd
import utils.data_utils as data_utils
from pathlib import Path

## Preparing Data

In [2]:
# Load the raw inputs from the "data" folder that sits beside the current
# working directory (i.e. <cwd>/../data).
DATA_DIR = Path.cwd().parent.joinpath("data")
# The SWIFT loader hands back the train and test splits as a pair.
train, test = data_utils.load_swift_data(DATA_DIR)
bank_data = data_utils.load_bank_data(DATA_DIR)

In [3]:
# Combine each SWIFT split (train first, then test) with the bank data, then
# release the three input frames, which are not referenced again in this cell.
train_merged, test_merged = (
    data_utils.merge_swift_bank_data(split, bank_data) for split in (train, test)
)
del bank_data, train, test

In [4]:
# A missing flag is treated as its own category, encoded as the string '12'.
# NOTE(review): the preview of train_merged renders the flag values as 0.0;
# if these columns are numeric, filling with a string leaves mixed types in
# the column - confirm this is what downstream encoding expects.
for frame in (train_merged, test_merged):
    for flag_col in ('Flag_ordering', 'Flag_beneficiary'):
        frame[flag_col] = frame[flag_col].fillna('12')

In [5]:
# Preview the first rows of the merged training frame (displayed as the cell's last expression).
train_merged.head()

Unnamed: 0,Timestamp,UETR,Sender,Receiver,TransactionReference,OrderingAccount,OrderingName,OrderingStreet,OrderingCountryCityZip,BeneficiaryAccount,...,BeneficiaryStreet,BeneficiaryCountryCityZip,SettlementDate,SettlementCurrency,SettlementAmount,InstructedCurrency,InstructedAmount,Label,Flag_ordering,Flag_beneficiary
0,2022-01-01,f474fdb3-4675-4fff-ab7e-3469f82bd6a7,DPSUFRPP,ABVVUS6S,PETX22-FXIDA-7054,FR90714755422956984353,PHACELIA HETEROPHYLLA,3| RUE HAMON,FR/42859 SAINTE AURÉLIE,611024064274704358,...,2584 CHARLES PLACE,US/ROJASLAND| DC 58442,220101,USD,1746319000.0,EUR,1560189000.0,0,0.0,0.0
1,2022-01-01,c9158def-dab1-4bfb-a31f-7f51c6679d60,BRRGPTPL,CBLHESMM,PETX22-NO-FX-1736,PT8895792452733129969,GONOLOBUS STEPHANOTRICHUS,AV RITA ALVES| 60,PT/5863-752 CANTANHEDE,ES61897100852916932423,...,ACCESO DE CARMINA ARAGÓN 83 PUERTA 4,ES/ÁVILA| 02281,220101,EUR,4711420.0,EUR,4711420.0,0,0.0,0.0
2,2022-01-01,d371ba0a-823f-4243-98ba-94ff18523420,BRRGPTPL,CBLHESMM,PETX22-NO-FX-1687,PT92895792452733126420,LECHEA INTERMEDIA-INTERMEDIA,PRAÇA VALENTE| 85,PT/1100-087 BARCELOS,ES31897100852916935097,...,PASADIZO ANÍBAL LUJÁN 57,ES/SEGOVIA| 40727,220101,EUR,752821.6,EUR,752821.6,0,0.0,0.0
3,2022-01-01,5a53a257-4dc9-4800-abb2-4cd1d55c8345,DPSUFRPP,ABVVUS6S,DPSU22-FXIYA-517,358727697099645998,SCLERANTHUS,341 4 CHOME 4 BAN 2 GO,JP/FUKUOKA PREFECTURE|ŌKAWA,611024064274698543,...,7864 MORRIS MEWS APT. 464,US/DPO AE 78549,220101,USD,6371209.0,JPY,649048700.0,0,0.0,0.0
4,2022-01-01,f27867ac-35e2-46af-8248-0a2d0d9bf00d,DPSUFRPP,ABVVUS6S,PETX22-FXIDA-11878,FR71714755422956985471,SELAGINELLA ASPRELLA,28| BOULEVARD LÉVÊQUE,FR/36357 TURPIN,611024064274707099,...,363 ROBERT GARDENS,US/NEW KAREN| MS 49461,220101,USD,5179423.0,EUR,4627377.0,0,0.0,0.0


## Features

### transaction frequency of sender bank per hour

In [28]:
%%time
# Hour of day of each transaction.
train_merged["hour"] = train_merged["Timestamp"].dt.hour
test_merged["hour"] = test_merged["Timestamp"].dt.hour

# Composite "<sender><hour>" key used to look up the per-hour frequency.
senders = train_merged["Sender"].unique()
train_merged["sender_hour"] = train_merged["Sender"] + train_merged["hour"].astype(str)
test_merged["sender_hour"] = test_merged["Sender"] + test_merged["hour"].astype(str)

# Lookup table: key -> number of training transactions of that sender in that hour.
# Every (training sender, hour) pair starts at 0 so that an hour without any
# traffic maps to 0 instead of NaN; the observed counts then come from a single
# groupby instead of re-filtering the whole frame once per sender and hour.
sender_hour_frequency = {s + str(h): 0 for s in senders for h in range(24)}
pair_counts = train_merged.groupby(["Sender", "hour"]).size()
sender_hour_frequency.update(
    {s + str(int(h)): int(count) for (s, h), count in pair_counts.items()}
)

# Senders that never occur in the training data have no key in the table,
# so their rows in the test frame get NaN here.
train_merged["sender_hour_freq"] = train_merged["sender_hour"].map(sender_hour_frequency)
test_merged["sender_hour_freq"] = test_merged["sender_hour"].map(sender_hour_frequency)

CPU times: total: 11.8 s
Wall time: 11.9 s


### Bank Network Features

- node feature:
    - number of transactions associated with the node
    - number of currencies used and avg amount of money transferred per currency for a node
    - in and out degree
- edge feature:
    - number of transactions between two nodes

#### total number of transactions of sender and receiver bank

In [7]:
sender = 'Sender'
receiver = 'Receiver'
sender_feature_name = sender + '_freq'
receiver_feature_name = receiver + "_freq"

# Number of training transactions each bank takes part in, once as sender and
# once as receiver.
sender_freq = train_merged[sender].value_counts()
sender_freq.name = sender_feature_name
receiver_freq = train_merged[receiver].value_counts()
receiver_freq.name = receiver_feature_name

# Attach the counts by lookup instead of DataFrame.merge: merge defaults to an
# inner join, which silently drops every row whose bank has no entry in the
# training counts (e.g. a bank that only shows up in the test data). Those rows
# are kept here and receive a frequency of 0.
for frame in (train_merged, test_merged):
    frame[sender_feature_name] = frame[sender].map(sender_freq).fillna(0).astype("int64")
    frame[receiver_feature_name] = frame[receiver].map(receiver_freq).fillna(0).astype("int64")

#### total number of currencies and avg amount per currency of sender and receiver bank

In [11]:
# Per (bank, instructed currency) features, learned from the training data only.
# The misspelled "currecy" in the feature column names is kept on purpose so
# that existing references to these columns keep working.
#
# Features are attached by lookup instead of DataFrame.merge: merge defaults to
# an inner join, which silently drops every row whose (bank, currency) pair is
# absent from the training data. Such rows are kept here; their count is 0 and
# their average amount is NaN (there is nothing to average), so downstream
# code must be prepared to impute it.

# ---- sender bank + currency ----
train_merged["sender_currency"] = train_merged["Sender"] + train_merged["InstructedCurrency"]
test_merged["sender_currency"] = test_merged["Sender"] + test_merged["InstructedCurrency"]
sender = 'sender_currency'

# number of training transactions per pair
sender_freq = train_merged[sender].value_counts()
sender_freq.name = 'sender_currecy_freq'
# mean instructed amount per pair
sender_avg_amount = train_merged.groupby(sender).agg(
    sender_currecy_avg_amount=pd.NamedAgg(column="InstructedAmount", aggfunc='mean'))

for frame in (train_merged, test_merged):
    frame['sender_currecy_freq'] = frame[sender].map(sender_freq).fillna(0).astype("int64")
    frame['sender_currecy_avg_amount'] = frame[sender].map(
        sender_avg_amount['sender_currecy_avg_amount'])

# ---- receiver bank + currency ----
train_merged["receiver_currency"] = train_merged["Receiver"] + train_merged["InstructedCurrency"]
test_merged["receiver_currency"] = test_merged["Receiver"] + test_merged["InstructedCurrency"]
receiver = 'receiver_currency'

# number of training transactions per pair
receiver_freq = train_merged[receiver].value_counts()
receiver_freq.name = 'receiver_currecy_freq'
# mean instructed amount per pair
receiver_avg_amount = train_merged.groupby(receiver).agg(
    receiver_currecy_avg_amount=pd.NamedAgg(column="InstructedAmount", aggfunc='mean'))

for frame in (train_merged, test_merged):
    frame['receiver_currecy_freq'] = frame[receiver].map(receiver_freq).fillna(0).astype("int64")
    frame['receiver_currecy_avg_amount'] = frame[receiver].map(
        receiver_avg_amount['receiver_currecy_avg_amount'])

#### total number of banks sender and receiver connect to (in and out degree)

In [12]:
sender = 'Sender'
receiver = 'Receiver'

# Degrees are counted on the training data only.
#   out-degree of a bank: distinct banks it sends to
#   in-degree of a bank:  distinct banks it receives from
# nunique(dropna=False) counts the same thing as len(x.unique()) but runs as a
# single vectorised group operation instead of one Python call per group.
out_degree = train_merged.groupby(sender)[receiver].nunique(dropna=False)
in_degree = train_merged.groupby(receiver)[sender].nunique(dropna=False)

# One single-column table per feature, indexed by bank.
sender_out_degree = out_degree.to_frame('sender_out_degree')
sender_in_degree = in_degree.to_frame('sender_in_degree')
receiver_out_degree = out_degree.to_frame('receiver_out_degree')
receiver_in_degree = in_degree.to_frame('receiver_in_degree')

# Attach each degree to the bank named in `bank_col` by lookup. DataFrame.merge
# defaults to an inner join and would silently drop rows whose bank is missing
# from the table - for example a sender that never appears as a receiver has no
# in-degree entry. Those rows are kept here with a degree of 0.
for degree_table, bank_col in (
    (sender_out_degree, sender),
    (sender_in_degree, sender),
    (receiver_out_degree, receiver),
    (receiver_in_degree, receiver),
):
    degree_name = degree_table.columns[0]
    degree = degree_table[degree_name]
    for frame in (train_merged, test_merged):
        frame[degree_name] = frame[bank_col].map(degree).fillna(0).astype("int64")

#### total number of transactions between sender and receiver bank

In [13]:
%%time
# Composite "<sender><receiver>" key identifying a directed bank pair.
train_merged["sender_receiver"] = train_merged["Sender"] + train_merged["Receiver"]
test_merged["sender_receiver"] = test_merged["Sender"] + test_merged["Receiver"]

# Number of training transactions per bank pair.
sender_receiver_freq = train_merged['sender_receiver'].value_counts()
sender_receiver_freq.name = 'sender_receiver_freq'

# Attach by lookup instead of DataFrame.merge: merge defaults to an inner join,
# which silently drops rows whose bank pair never occurs in the training data.
# Those rows are kept here with a frequency of 0.
for frame in (train_merged, test_merged):
    frame['sender_receiver_freq'] = (
        frame['sender_receiver'].map(sender_receiver_freq).fillna(0).astype("int64")
    )

CPU times: total: 5.42 s
Wall time: 5.45 s


### Account Network Features

#### total number of transactions of ordering and beneficiary account

In [15]:
sender = 'OrderingAccount'
receiver = 'BeneficiaryAccount'
sender_feature_name = sender + '_freq'
receiver_feature_name = receiver + "_freq"

# Number of training transactions each account takes part in, once as ordering
# account and once as beneficiary account.
sender_freq = train_merged[sender].value_counts()
sender_freq.name = sender_feature_name
receiver_freq = train_merged[receiver].value_counts()
receiver_freq.name = receiver_feature_name

# Attach the counts by lookup instead of DataFrame.merge: merge defaults to an
# inner join, which silently drops every row whose account has no entry in the
# training counts (e.g. an account that only shows up in the test data). Those
# rows are kept here and receive a frequency of 0.
for frame in (train_merged, test_merged):
    frame[sender_feature_name] = frame[sender].map(sender_freq).fillna(0).astype("int64")
    frame[receiver_feature_name] = frame[receiver].map(receiver_freq).fillna(0).astype("int64")

#### total number of currencies and avg amount of money per currency of ordering and beneficiary account

In [17]:
# Per (account, instructed currency) features, learned from the training data only.
# The misspelled "currecy" in the feature column names is kept on purpose so
# that existing references to these columns keep working.
#
# Features are attached by lookup instead of DataFrame.merge: merge defaults to
# an inner join, which silently drops every row whose (account, currency) pair
# is absent from the training data. Such rows are kept here; their count is 0
# and their average amount is NaN (there is nothing to average), so downstream
# code must be prepared to impute it.

# ---- ordering account + currency ----
train_merged["OrderingAccount_currency"] = train_merged["OrderingAccount"] + train_merged["InstructedCurrency"]
test_merged["OrderingAccount_currency"] = test_merged["OrderingAccount"] + test_merged["InstructedCurrency"]
sender = 'OrderingAccount_currency'

# number of training transactions per pair
sender_freq = train_merged[sender].value_counts()
sender_freq.name = 'OrderingAccount_currecy_freq'
# mean instructed amount per pair
sender_avg_amount = train_merged.groupby(sender).agg(
    OrderingAccount_currecy_avg_amount=pd.NamedAgg(column="InstructedAmount", aggfunc='mean'))

for frame in (train_merged, test_merged):
    frame['OrderingAccount_currecy_freq'] = frame[sender].map(sender_freq).fillna(0).astype("int64")
    frame['OrderingAccount_currecy_avg_amount'] = frame[sender].map(
        sender_avg_amount['OrderingAccount_currecy_avg_amount'])

# ---- beneficiary account + currency ----
train_merged["BeneficiaryAccount_currency"] = train_merged["BeneficiaryAccount"] + train_merged["InstructedCurrency"]
test_merged["BeneficiaryAccount_currency"] = test_merged["BeneficiaryAccount"] + test_merged["InstructedCurrency"]
receiver = 'BeneficiaryAccount_currency'

# number of training transactions per pair
receiver_freq = train_merged[receiver].value_counts()
receiver_freq.name = 'BeneficiaryAccount_currecy_freq'
# mean instructed amount per pair
receiver_avg_amount = train_merged.groupby(receiver).agg(
    BeneficiaryAccount_currecy_avg_amount=pd.NamedAgg(column="InstructedAmount", aggfunc='mean'))

for frame in (train_merged, test_merged):
    frame['BeneficiaryAccount_currecy_freq'] = frame[receiver].map(receiver_freq).fillna(0).astype("int64")
    frame['BeneficiaryAccount_currecy_avg_amount'] = frame[receiver].map(
        receiver_avg_amount['BeneficiaryAccount_currecy_avg_amount'])

#### total number of accounts ordering and beneficiary account connect to (in and out degree)

In [18]:
sender = 'OrderingAccount'
receiver = 'BeneficiaryAccount'

# Degrees are counted on the training data only.
#   out-degree of an account: distinct accounts it sends to
#   in-degree of an account:  distinct accounts it receives from
# nunique(dropna=False) counts the same thing as len(x.unique()) but runs as a
# single vectorised group operation instead of one Python call per group.
out_degree = train_merged.groupby(sender)[receiver].nunique(dropna=False)
in_degree = train_merged.groupby(receiver)[sender].nunique(dropna=False)

# One single-column table per feature, indexed by account.
sender_out_degree = out_degree.to_frame('sender_out_degree')
sender_in_degree = in_degree.to_frame('sender_in_degree')
receiver_out_degree = out_degree.to_frame('receiver_out_degree')
receiver_in_degree = in_degree.to_frame('receiver_in_degree')

# NOTE(review): these feature names are the same as the bank-level degree
# features. If those columns already exist in the frames, pandas resolves the
# clash by suffixing the existing column with "_x" and the account-level one
# added here with "_y". The names are left as they are because code outside
# this cell may already reference them; giving the account-level features
# their own names (e.g. "OrderingAccount_out_degree") would be clearer and
# should be done together with any consumers.
#
# how="left" keeps every row: the default inner join silently drops rows whose
# account has no entry in the table (e.g. an ordering account that never
# appears as a beneficiary, or an account only present in the test data).
# Rows without a match get a degree of 0.
for degree_table, account_col in (
    (sender_out_degree, sender),
    (sender_in_degree, sender),
    (receiver_out_degree, receiver),
    (receiver_in_degree, receiver),
):
    degree_name = degree_table.columns[0]
    train_merged = train_merged.merge(
        degree_table, left_on=account_col, right_index=True, how="left")
    test_merged = test_merged.merge(
        degree_table, left_on=account_col, right_index=True, how="left")
    for frame in (train_merged, test_merged):
        # Name of the column that was just added (suffixed when it clashed).
        added_col = degree_name if degree_name in frame.columns else degree_name + "_y"
        frame[added_col] = frame[added_col].fillna(0).astype("int64")

#### total number of transactions between sender and receiver account

In [19]:
%%time
sender = 'OrderingAccount'
receiver = 'BeneficiaryAccount'
sender_receiver = sender + "_" + receiver
feature_name = sender_receiver + '_freq'

# Composite "<ordering account><beneficiary account>" key identifying a
# directed account pair.
train_merged[sender_receiver] = train_merged[sender] + train_merged[receiver]
test_merged[sender_receiver] = test_merged[sender] + test_merged[receiver]

# Number of training transactions per account pair.
sender_receiver_freq = train_merged[sender_receiver].value_counts()
sender_receiver_freq.name = feature_name

# Attach by lookup instead of DataFrame.merge: merge defaults to an inner join,
# which silently drops rows whose account pair never occurs in the training
# data. Those rows are kept here with a frequency of 0.
for frame in (train_merged, test_merged):
    frame[feature_name] = frame[sender_receiver].map(sender_receiver_freq).fillna(0).astype("int64")

CPU times: total: 10.1 s
Wall time: 10.2 s
