In [3]:
import pandas as pd
import utils.data_utils as data_utils
from pathlib import Path

## Preparing Data

In [4]:
# laoding data
DATA_DIR = Path.cwd().parent / "data"
train, test = data_utils.load_swift_data(DATA_DIR)
bank_data = data_utils.load_bank_data(DATA_DIR)

In [5]:
# Getting merged dataset
train_merged = data_utils.merge_swift_bank_data(train, bank_data)
test_merged = data_utils.merge_swift_bank_data(test, bank_data)
del bank_data
del train
del test

In [6]:
# fill missing data for flag as another category '12'
train_merged['Flag_ordering'] = train_merged['Flag_ordering'].fillna('12')
train_merged['Flag_beneficiary'] = train_merged['Flag_beneficiary'].fillna('12')
test_merged['Flag_ordering'] = test_merged['Flag_ordering'].fillna('12')
test_merged['Flag_beneficiary'] = test_merged['Flag_beneficiary'].fillna('12')

In [7]:
train_merged.columns

Index(['Timestamp', 'UETR', 'Sender', 'Receiver', 'TransactionReference',
       'OrderingAccount', 'OrderingName', 'OrderingStreet',
       'OrderingCountryCityZip', 'BeneficiaryAccount', 'BeneficiaryName',
       'BeneficiaryStreet', 'BeneficiaryCountryCityZip', 'SettlementDate',
       'SettlementCurrency', 'SettlementAmount', 'InstructedCurrency',
       'InstructedAmount', 'Label', 'Flag_ordering', 'Flag_beneficiary'],
      dtype='object')

## Features

In [8]:
# util funcs
def generate_feature(train, test, pivot_name, new_feature_name, func, agg_col=None):
    if func == 'value_count':
        d = train[pivot_name].value_counts()
        d.name = new_feature_name
        train = train.merge(d, left_on=pivot_name, right_index=True)
        test = test.merge(d, left_on=pivot_name, right_index=True)
    elif func == 'mean':
        d = train.groupby(pivot_name).agg(**{
                new_feature_name: pd.NamedAgg(column=agg_col, aggfunc='mean')})
        train=train.merge(d, left_on=pivot_name, right_index=True)
        test=test.merge(d, left_on=pivot_name, right_index=True)
    elif func == 'n_unique':
        d=train.groupby(pivot_name).agg(**{
                new_feature_name: pd.NamedAgg(column=agg_col, aggfunc=lambda x: len(x.unique()))})
        train=train.merge(d, left_on=pivot_name, right_index=True)
        test=test.merge(d, left_on=pivot_name, right_index=True)
    else:
        raise ValueError("func is not a valid option.")


    return train, test

### Hops

In [9]:
d_train = train_merged['UETR'].value_counts()
d_train.name = 'num_hops'
d_test = test_merged['UETR'].value_counts()
d_test.name = 'num_hops'

train_merged = train_merged.merge(d_train, left_on = 'UETR', right_index = True)
test_merged = test_merged.merge(d_test, left_on = 'UETR', right_index = True)

### Bank Network Features

- node feature:
    - number of transaction assoicate to the node
    - number of currecy used and avg amount money transferred per currency for a node
    - in and out degree
- edge feature:
    - number of transaction between two nodes
    - number of currency used and avg amount money transferred per currency between two nodes

#### total number of transactions of sender and receiver bank

In [10]:
# sender freq
train_merged, test_merged = generate_feature(
    train_merged, test_merged, pivot_name='Sender', new_feature_name="Sender_freq", func = 'value_count')

# receiver freq
train_merged, test_merged = generate_feature(
    train_merged, test_merged, pivot_name='Receiver', new_feature_name="Receiver_freq", func = 'value_count')

#### total number of currecy and avg amount per currecy of sender and receiver bank

In [11]:
def generate_currency_feature(train, test, name):
    
    # currecy freq
    train, test = generate_feature(
        train, test, pivot_name=name, new_feature_name=name + "_freq", func = 'value_count')

    # currecy avg amount
    train, test = generate_feature(
        train, test, pivot_name=name, new_feature_name=name + "_avg_amount", func = 'mean', agg_col='InstructedAmount')
    
    return train, test

#######################################################################################################
# sender currecy
train_merged["sender_currency"] = train_merged["Sender"] + train_merged["InstructedCurrency"]
test_merged["sender_currency"] = test_merged["Sender"] + test_merged["InstructedCurrency"]

name = 'sender_currency'
train_merged, test_merged = generate_currency_feature(train_merged, test_merged, name)


#######################################################################################################
# receiver currency
train_merged["receiver_currency"] = train_merged["Receiver"] + train_merged["InstructedCurrency"]
test_merged["receiver_currency"] = test_merged["Receiver"] + test_merged["InstructedCurrency"]

name = 'receiver_currency'
train_merged, test_merged = generate_currency_feature(train_merged, test_merged, name)

#### total number of banks sender and receiver connect to (in and out degree)

In [12]:
def generate_in_out_degree(train, test, from_node_name, to_node_name):
    
    # sender out degree
    train, test = generate_feature(
        train, test, pivot_name=from_node_name, new_feature_name = from_node_name + "_out_degree", 
        func = 'n_unique', agg_col = to_node_name)

    # sender in degree
    train, test = generate_feature(
        train, test, pivot_name=to_node_name, new_feature_name = from_node_name + "_in_degree", 
        func = 'n_unique', agg_col = from_node_name)

    # receiver out degree
    train, test = generate_feature(
        train, test, pivot_name=from_node_name, new_feature_name = to_node_name + "_out_degree", 
        func = 'n_unique', agg_col = to_node_name)

    # receiver in degree
    train, test = generate_feature(
        train, test, pivot_name=to_node_name, new_feature_name = to_node_name + "_in_degree", 
        func = 'n_unique', agg_col = from_node_name)
    
    return train, test

train_merged, test_merged = generate_in_out_degree(train_merged, test_merged, 'Sender', 'Receiver')

#### total number of transactions  between sender and receiver bank

In [None]:
train_merged["sender_receiver"] = train_merged["Sender"] + train_merged["Receiver"]
test_merged["sender_receiver"] = test_merged["Sender"] + test_merged["Receiver"]

train_merged, test_merged = generate_feature(
    train_merged, test_merged, pivot_name='sender_receiver', new_feature_name="sender_receiver_freq", func = 'value_count')

#### total number of currecy and avg amount money between sender and receiver bank

In [None]:
# number of currency
train_merged["sender_receiver_currency"] = train_merged["Sender"] + train_merged["Receiver"] + train_merged['InstructedCurrency']
test_merged["sender_receiver_currency"] = test_merged["Sender"] + test_merged["Receiver"] + train_merged['InstructedCurrency']

train_merged, test_merged = generate_feature(
    train_merged, test_merged, pivot_name='sender_receiver_currency', 
    new_feature_name="sender_receiver_currency_freq", func = 'value_count')

# avg amount currency
train_merged, test_merged = generate_feature(
    train_merged, test_merged, pivot_name="sender_receiver_currency", 
    new_feature_name="sender_receiver_currency_avg_amount", func = "mean", agg_col="InstructedAmount")

### Acount Network Features

#### total number of transaction of ordering and beneficary account

In [None]:
sender = 'OrderingAccount'
receiver = 'BeneficiaryAccount'

# sender freq
train_merged, test_merged = generate_feature(
    train_merged, test_merged, pivot_name=sender, new_feature_name=sender + '_freq', func = 'value_count')

# receiver freq
train_merged, test_merged = generate_feature(
    train_merged, test_merged, pivot_name=receiver, new_feature_name=receiver + "_freq", func = 'value_count')

#### total number of currecy and avg amount money per currecy of ordering and beneficiary account

In [None]:
def generate_currency_feature(train, test, name):
    
    # currecy freq
    train, test = generate_feature(
        train, test, pivot_name=name, new_feature_name=name + "_freq", func = 'value_count')

    # currecy avg amount
    train, test = generate_feature(
        train, test, pivot_name=name, new_feature_name=name + "_avg_amount", func = 'mean', agg_col='InstructedAmount')
    
    return train, test

#######################################################################################################
# sender currecy
train_merged["OrderingAccount_currency"] = train_merged["OrderingAccount"] + train_merged["InstructedCurrency"]
test_merged["OrderingAccount_currency"] = test_merged["OrderingAccount"] + test_merged["InstructedCurrency"]
name = 'OrderingAccount_currency'

train_merged, test_merged = generate_currency_feature(train_merged, test_merged, name)


#######################################################################################################
# receiver currency
train_merged["BeneficiaryAccount_currency"] = train_merged["BeneficiaryAccount"] + train_merged["InstructedCurrency"]
test_merged["BeneficiaryAccount_currency"] = test_merged["BeneficiaryAccount"] + test_merged["InstructedCurrency"]
name = 'BeneficiaryAccount_currency'

train_merged, test_merged = generate_currency_feature(train_merged, test_merged, name)

#### total number of accounts ordering and beneficiary account connect to (in and out degree)

In [None]:
train_merged, test_merged = generate_in_out_degree(train_merged, test_merged, 'OrderingAccount', 'BeneficiaryAccount')

#### total number of transactions between sender and receiver account

In [None]:
%%time
sender = 'OrderingAccount'
receiver = 'BeneficiaryAccount'
sender_receiver = sender + "_" + receiver
train_merged[sender_receiver] = train_merged[sender] + train_merged[receiver]
test_merged[sender_receiver] = test_merged[sender] + test_merged[receiver]

train_merged, test_merged = generate_feature(
    train_merged, test_merged, pivot_name=sender_receiver, 
    new_feature_name=sender_receiver + "_freq", func = 'value_count')

#### total number of currecy and avg amount money between sender and receiver bank

In [None]:
%%time
# number of currency
sender = 'OrderingAccount'
receiver = 'BeneficiaryAccount'
sender_receiver_currency = sender + "_" + receiver + "_currency"
train_merged[sender_receiver_currency] = train_merged[sender] + train_merged[receiver] + train_merged['InstructedCurrency']
test_merged[sender_receiver_currency] = test_merged[sender] + test_merged[receiver] + train_merged['InstructedCurrency']

train_merged, test_merged = generate_feature(
    train_merged, test_merged, pivot_name=sender_receiver_currency, 
    new_feature_name=sender_receiver_currency + "_freq", func = 'value_count')

# avg amount currency
train_merged, test_merged = generate_feature(
    train_merged, test_merged, pivot_name=sender_receiver_currency, 
    new_feature_name=sender_receiver_currency + "_avg_amount", func = "mean", agg_col="InstructedAmount")

### Account and Bank network features over time feature (hout, day, week)

In [None]:
# get hour day week from timestamp
train_merged["hour"] = train_merged["Timestamp"].dt.hour.astype(str)
test_merged["hour"] = test_merged["Timestamp"].dt.hour.astype(str)
train_merged["day"] = train_merged["Timestamp"].dt.day.astype(str)
test_merged["day"] = test_merged["Timestamp"].dt.day.astype(str)
train_merged["week"] = train_merged["Timestamp"].dt.isocalendar().week.astype(str)
test_merged["week"] = test_merged["Timestamp"].dt.isocalendar().week.astype(str)

In [None]:
def generate_features_based_on_time(train, test, from_node_name, to_node_name, time_col):
    
    ##############################################################################################
    # construct feature
    from_ = from_node_name + "_" + time_col
    to_ = to_node_name + "_" + time_col
    train[from_] = train[from_node_name] + train[time_col]
    test[from_] = test[from_node_name] + test[time_col]
    
    train[to_] = train[to_node_name] + train[time_col]
    test[to_] = test[to_node_name] + test[time_col]
    
    ##############################################################################################
    # total number of transactions
    # sender freq
    train, test = generate_feature(
        train, test, pivot_name=from_, new_feature_name=from_ + '_freq', func = 'value_count')

    # receiver freq
    train, test = generate_feature(
        train, test, pivot_name=to_, new_feature_name=to_ + "_freq", func = 'value_count')
    
    
    ################################################################################################
    # currency and amount
    # sender
    feature_name = from_node_name + "_currency_" + time_col
    train[feature_name] = train[from_node_name] + train["InstructedCurrency"] + train[time_col]
    test[feature_name] = test[from_node_name] + test["InstructedCurrency"] + test[time_col]
    
    train, test = generate_currency_feature(train, test, feature_name)
    
    train = train.drop([feature_name], axis = 1)
    test = test.drop([feature_name], axis = 1)
    
    # receiver
    feature_name = to_node_name + "_currency_" + time_col
    train[feature_name] = train[to_node_name] + train["InstructedCurrency"] + train[time_col]
    test[feature_name] = test[to_node_name] + test["InstructedCurrency"] + test[time_col]
    train, test = generate_currency_feature(train, test, feature_name)
    
    train = train.drop([feature_name], axis = 1)
    test = test.drop([feature_name], axis = 1)
    
    ################################################################################################
    # in and out degree
    train, test = generate_in_out_degree(train, test, from_, to_)
    
    ###############################################################################################
    # edge - number transaction
    train[from_ + "_" + to_] = train[from_] + train[to_]
    test[from_ + "_" + to_] = test[from_] + test[to_]
    
    train, test = generate_feature( train, test, 
               pivot_name=from_ + "_" + to_, new_feature_name=from_ + "_" + to_ + "_freq", func = 'value_count')
    
    train =train.drop([from_ + "_" + to_],axis = 1)
    test = test.drop([from_ + "_" + to_], axis = 1)
    
    ##############################################################################################
    # edge - currency and amount
    # number of currency
    train[from_ + "_" + to_ + "_currency"] = train[from_] + train[to_] + train['InstructedCurrency']
    test[from_ + "_" + to_ + "_currency"] = test[from_] + test[to_] + train['InstructedCurrency']

    train, test = generate_feature(
        train, test, pivot_name=from_ + "_" + to_ + "_currency", 
        new_feature_name=from_ + "_" + to_ + "_currency_freq", func = 'value_count')

    # avg amount currency
    train, test = generate_feature(
        train, test, pivot_name=from_ + "_" + to_ + "_currency", 
        new_feature_name=from_ + "_" + to_ + "_currency_avg_amount", func = "mean", agg_col="InstructedAmount")
    
    train = train.drop([from_ + "_" + to_ + "_currency"], axis = 1)
    test = test.drop([from_ + "_" + to_ + "_currency"], axis = 1)

    
    ###############################################################################################
    # drop columns
    train = train.drop([from_, to_], axis = 1)
    test = test.drop([from_, to_], axis = 1)
    
    
    return train, test

#### Fine-grained by hour of day 1 - 24

In [None]:
%%time
# bank
train_merged, test_merged = generate_features_based_on_time(train_merged, test_merged, 'Sender', 'Receiver', 'hour')

#### Fine-grained by day of month 1 - 31

In [None]:
train_merged, test_merged = generate_features_based_on_time(train_merged, test_merged, 'Sender', 'Receiver', 'day')

#### Fine-grained by week of month 1 - 4

In [None]:
train_merged, test_merged = generate_features_based_on_time(train_merged, test_merged, 'Sender', 'Receiver', 'week')

### Drop unnecessary columns

In [32]:
columns_to_drop = [
    "UETR",
    "Sender",
    "Receiver",
    "TransactionReference",
    "OrderingAccount",
    "OrderingName",
    "OrderingStreet",
    "OrderingCountryCityZip",
    "BeneficiaryAccount",
    "BeneficiaryName",
    "BeneficiaryStreet",
    "BeneficiaryCountryCityZip",
    "SettlementDate",
    "SettlementCurrency",
    "InstructedCurrency",
    "Timestamp",
    "sender_receiver",
    'OrderingAccount_BeneficiaryAccount_currency',
    'OrderingAccount_BeneficiaryAccount',
    'BeneficiaryAccount_currency',
    'OrderingAccount_currency',
    'receiver_currency',
    'sender_currency',
    'num_hops_x'
]

train_merged = train_merged.drop(columns_to_drop, axis=1)
test_merged = test_merged.drop(columns_to_drop, axis=1)
train_merged = train_merged.rename({'num_hops_y': 'num_hops'})
test_merged = test_merged.rename({'num_hops_y': 'num_hops'})

In [34]:
train_merged.columns

Index(['SettlementAmount', 'InstructedAmount', 'Label', 'Flag_ordering',
       'Flag_beneficiary', 'num_hops_y', 'Sender_freq', 'Receiver_freq',
       'sender_currency_freq', 'sender_currency_avg_amount',
       'receiver_currency_freq', 'receiver_currency_avg_amount',
       'Sender_out_degree', 'Sender_in_degree', 'Receiver_out_degree',
       'Receiver_in_degree', 'sender_receiver_freq', 'OrderingAccount_freq',
       'BeneficiaryAccount_freq', 'OrderingAccount_currency_freq',
       'OrderingAccount_currency_avg_amount',
       'BeneficiaryAccount_currency_freq',
       'BeneficiaryAccount_currency_avg_amount', 'OrderingAccount_out_degree',
       'OrderingAccount_in_degree', 'BeneficiaryAccount_out_degree',
       'BeneficiaryAccount_in_degree',
       'OrderingAccount_BeneficiaryAccount_freq',
       'OrderingAccount_BeneficiaryAccount_currency_freq',
       'OrderingAccount_BeneficiaryAccount_currency_avg_amount', 'hour', 'day',
       'week', 'Sender_hour_freq', 'Receiver_hou