In [2]:
import pandas as pd
import utils.data_utils as data_utils
from pathlib import Path

## Preparing Data

In [3]:
# Loading data: the CSVs live in the "data" directory next to the current working directory's parent.
DATA_DIR = Path.cwd().parent.joinpath("data")
train_path, test_path = (
    DATA_DIR / csv_name for csv_name in ("swift_HA_train.csv", "swift_HA_test.csv")
)
train, test = data_utils.load_swift_data(train_path, test_path)
bank_data = data_utils.load_bank_data(DATA_DIR)

In [4]:
# Row/column counts of the raw splits, printed on one line.
print(*(split.shape for split in (train, test)))

(4690135, 19) (704856, 19)


In [5]:
# Merge the bank table into each split (train first, then test), then release the inputs.
train_merged, test_merged = (
    data_utils.merge_swift_bank_data(split, bank_data) for split in (train, test)
)
del bank_data, train, test

In [6]:
# Missing flag values become their own category, encoded as the string '12'.
flag_cols = ['Flag_ordering', 'Flag_beneficiary']
train_merged[flag_cols] = train_merged[flag_cols].fillna('12')
test_merged[flag_cols] = test_merged[flag_cols].fillna('12')
print(*(frame.shape for frame in (train_merged, test_merged)))

(4690135, 21) (704856, 21)


In [7]:
# Column index of the merged training frame, then the shape of each split on its own line.
print(train_merged.columns)
print(train_merged.shape, test_merged.shape, sep="\n")

Index(['Timestamp', 'UETR', 'Sender', 'Receiver', 'TransactionReference',
       'OrderingAccount', 'OrderingName', 'OrderingStreet',
       'OrderingCountryCityZip', 'BeneficiaryAccount', 'BeneficiaryName',
       'BeneficiaryStreet', 'BeneficiaryCountryCityZip', 'SettlementDate',
       'SettlementCurrency', 'SettlementAmount', 'InstructedCurrency',
       'InstructedAmount', 'Label', 'Flag_ordering', 'Flag_beneficiary'],
      dtype='object')
(4690135, 21)
(704856, 21)


## Features

In [None]:
# util funcs
def generate_feature(train, test, pivot_name, new_feature_name, func, agg_col=None):
    """Attach one group-level statistic, computed on ``train`` only, to both frames.

    The statistic is calculated per distinct value of ``train[pivot_name]`` and
    left-joined onto ``train`` and ``test`` through their ``pivot_name`` column,
    so pivot values that never occur in ``train`` yield NaN.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames that both contain ``pivot_name``. Neither input is modified;
        new frames are returned.
    pivot_name : str
        Column whose values define the groups and the join key.
    new_feature_name : str
        Name of the column that is added.
    func : {'value_count', 'mean', 'n_unique', 'count'}
        'value_count' counts rows per pivot value; 'mean' and 'count' apply the
        pandas aggregation of the same name to ``agg_col``; 'n_unique' counts the
        distinct values of ``agg_col`` (a missing value counts as one value).
    agg_col : str, optional
        Column to aggregate. Required for every ``func`` except 'value_count'.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``train`` and ``test`` with ``new_feature_name`` appended.

    Raises
    ------
    ValueError
        If ``func`` is not one of the supported options, or if ``agg_col`` is
        missing for a ``func`` that needs it.
    """
    if func == 'value_count':
        stats = train[pivot_name].value_counts()
        stats.name = new_feature_name
    elif func in ('mean', 'n_unique', 'count'):
        if agg_col is None:
            # Fail early with a clear message instead of an opaque pandas error.
            raise ValueError("agg_col is required when func is '%s'." % func)
        if func == 'n_unique':
            aggfunc = lambda x: len(x.unique())
        else:
            aggfunc = func
        stats = train.groupby(pivot_name).agg(**{
            new_feature_name: pd.NamedAgg(column=agg_col, aggfunc=aggfunc)})
    else:
        raise ValueError("func is not a valid option.")

    # One shared join for every statistic; how='left' keeps every input row.
    train = train.merge(stats, left_on=pivot_name, right_index=True, how='left')
    test = test.merge(stats, left_on=pivot_name, right_index=True, how='left')
    return train, test

### Hops

In [None]:
# num_hops = number of rows sharing a UETR, counted separately inside each split.
d_train = train_merged['UETR'].value_counts().rename('num_hops')
d_test = test_merged['UETR'].value_counts().rename('num_hops')

# Default (inner) join on the UETR column against the count index.
train_merged = train_merged.merge(d_train, left_on='UETR', right_index=True)
test_merged = test_merged.merge(d_test, left_on='UETR', right_index=True)
print(train_merged.shape, test_merged.shape, sep="\n")

### Bank Network Features

- node features:
    - number of transactions associated with the node
    - number of currencies used and average amount of money transferred per currency for a node
    - in and out degree
- edge features:
    - number of transactions between two nodes
    - number of currencies used and average amount of money transferred per currency between two nodes

#### total number of transactions of sender and receiver bank

In [None]:
# Row counts per sending bank and per receiving bank -> Sender_freq, Receiver_freq.
for bank_col in ('Sender', 'Receiver'):
    train_merged, test_merged = generate_feature(
        train_merged, test_merged,
        pivot_name=bank_col, new_feature_name=bank_col + "_freq", func='value_count')
print(train_merged.shape, test_merged.shape, sep="\n")

#### total number of currencies and average amount per currency of sender and receiver bank

In [None]:
def generate_currency_feature(train, test, name, agg_col):
    """Add ``<name>_freq`` and ``<name>_avg_amount`` columns to both frames.

    ``name`` is an existing key column. ``<name>_freq`` is the row count per key
    and ``<name>_avg_amount`` the mean of ``agg_col`` per key, both produced by
    ``generate_feature`` in that order.

    Returns the updated ``(train, test)`` pair.
    """
    feature_specs = (
        ("_freq", {"func": 'value_count'}),
        ("_avg_amount", {"func": 'mean', "agg_col": agg_col}),
    )
    for suffix, options in feature_specs:
        train, test = generate_feature(
            train, test, pivot_name=name, new_feature_name=name + suffix, **options)
    return train, test

# One pass per side: (key column, bank column, currency column, amount column).
# The sender side uses the instructed currency/amount, the receiver side the settlement ones.
for name, bank_col, currency_col, amount_col in (
    ('sender_currency', 'Sender', 'InstructedCurrency', 'InstructedAmount'),
    ('receiver_currency', 'Receiver', 'SettlementCurrency', 'SettlementAmount'),
):
    # Key = bank id concatenated with the currency code.
    train_merged[name] = train_merged[bank_col] + train_merged[currency_col]
    test_merged[name] = test_merged[bank_col] + test_merged[currency_col]
    train_merged, test_merged = generate_currency_feature(
        train_merged, test_merged, name, amount_col)
print(train_merged.shape, test_merged.shape, sep="\n")

#### total number of banks sender and receiver connect to (in and out degree)

In [None]:
def generate_in_out_degree(train, test, from_node_name, to_node_name):
    """Add in-degree and out-degree features for both endpoints of each row.

    Degrees are measured on ``train`` only:

    * out-degree of a node = number of distinct ``to_node_name`` values seen on
      rows where the node is the ``from_node_name`` value;
    * in-degree of a node = number of distinct ``from_node_name`` values seen on
      rows where the node is the ``to_node_name`` value.

    Four columns are appended, in this order: ``<from>_out_degree``,
    ``<from>_in_degree``, ``<to>_out_degree``, ``<to>_in_degree``. Each one is
    looked up by the node it is named after. Previously ``<from>_in_degree`` and
    ``<to>_out_degree`` were joined through the opposite column, which made them
    exact copies of ``<to>_in_degree`` and ``<from>_out_degree``.

    Nodes without a matching entry in the train statistics get NaN. The input
    frames are not modified.

    Returns the updated ``(train, test)`` pair.
    """
    # dropna=False counts a missing counterparty as one distinct value, the same
    # result ``len(x.unique())`` gives.
    out_degree = train.groupby(from_node_name)[to_node_name].nunique(dropna=False)
    in_degree = train.groupby(to_node_name)[from_node_name].nunique(dropna=False)

    feature_plan = (
        (from_node_name + "_out_degree", from_node_name, out_degree),
        (from_node_name + "_in_degree", from_node_name, in_degree),
        (to_node_name + "_out_degree", to_node_name, out_degree),
        (to_node_name + "_in_degree", to_node_name, in_degree),
    )
    for feature_name, key_col, degree in feature_plan:
        lookup = degree.rename(feature_name)
        train = train.merge(lookup, left_on=key_col, right_index=True, how='left')
        test = test.merge(lookup, left_on=key_col, right_index=True, how='left')

    return train, test

# Bank-level degrees: Sender is the "from" node, Receiver the "to" node.
train_merged, test_merged = generate_in_out_degree(
    train_merged, test_merged, from_node_name='Sender', to_node_name='Receiver')
print(train_merged.shape, test_merged.shape, sep="\n")

#### total number of transactions  between sender and receiver bank

In [None]:
# Edge key = sender bank id concatenated with receiver bank id; count rows per edge.
pair_key = "sender_receiver"
train_merged[pair_key] = train_merged["Sender"] + train_merged["Receiver"]
test_merged[pair_key] = test_merged["Sender"] + test_merged["Receiver"]

train_merged, test_merged = generate_feature(
    train_merged, test_merged,
    pivot_name=pair_key, new_feature_name=pair_key + "_freq", func='value_count')
print(train_merged.shape, test_merged.shape, sep="\n")

#### total number of currencies and average amount of money between sender and receiver bank

In [None]:
# Edge + currency key: sender bank, receiver bank and instructed currency concatenated.
edge_currency_key = "sender_receiver_currency"
train_merged[edge_currency_key] = (
    train_merged["Sender"] + train_merged["Receiver"] + train_merged['InstructedCurrency'])
test_merged[edge_currency_key] = (
    test_merged["Sender"] + test_merged["Receiver"] + test_merged['InstructedCurrency'])

# Row count per key first, then the mean instructed amount per key.
for suffix, options in (
    ("_freq", {"func": 'value_count'}),
    ("_avg_amount", {"func": "mean", "agg_col": "InstructedAmount"}),
):
    train_merged, test_merged = generate_feature(
        train_merged, test_merged,
        pivot_name=edge_currency_key, new_feature_name=edge_currency_key + suffix, **options)
print(train_merged.shape, test_merged.shape, sep="\n")

### Account Network Features

#### total number of transactions of ordering and beneficiary account

In [None]:
sender, receiver = 'OrderingAccount', 'BeneficiaryAccount'

# Row counts per ordering account, then per beneficiary account.
for account_col in (sender, receiver):
    train_merged, test_merged = generate_feature(
        train_merged, test_merged,
        pivot_name=account_col, new_feature_name=account_col + "_freq", func='value_count')
print(train_merged.shape, test_merged.shape, sep="\n")

#### total number of currencies and average amount of money per currency of ordering and beneficiary account

In [None]:
def generate_currency_feature(train, test, name, agg_col):
    """Append the row count and the mean of ``agg_col`` per value of column ``name``.

    NOTE(review): identical to the definition earlier in this notebook; this copy
    simply re-binds the same name.

    New columns: ``<name>_freq`` followed by ``<name>_avg_amount``.
    Returns the updated ``(train, test)`` pair.
    """
    count_col = name + "_freq"
    mean_col = name + "_avg_amount"

    train, test = generate_feature(
        train, test, pivot_name=name, new_feature_name=count_col, func='value_count')
    train, test = generate_feature(
        train, test, pivot_name=name, new_feature_name=mean_col, func='mean', agg_col=agg_col)
    return train, test

# One pass per side: (key column, account column, currency column, amount column).
for name, account_col, currency_col, amount_col in (
    ('OrderingAccount_currency', 'OrderingAccount', 'InstructedCurrency', 'InstructedAmount'),
    ('BeneficiaryAccount_currency', 'BeneficiaryAccount', 'SettlementCurrency', 'SettlementAmount'),
):
    # Key = account id concatenated with the currency code.
    train_merged[name] = train_merged[account_col] + train_merged[currency_col]
    test_merged[name] = test_merged[account_col] + test_merged[currency_col]
    train_merged, test_merged = generate_currency_feature(
        train_merged, test_merged, name, amount_col)
print(train_merged.shape, test_merged.shape, sep="\n")

#### total number of accounts ordering and beneficiary account connect to (in and out degree)

In [None]:
# Account-level degrees: OrderingAccount is the "from" node, BeneficiaryAccount the "to" node.
train_merged, test_merged = generate_in_out_degree(
    train_merged, test_merged,
    from_node_name='OrderingAccount', to_node_name='BeneficiaryAccount')
print(train_merged.shape, test_merged.shape, sep="\n")

#### total number of transactions between sender and receiver account

In [None]:
%%time
sender, receiver = 'OrderingAccount', 'BeneficiaryAccount'
sender_receiver = "_".join((sender, receiver))

# Edge key = ordering account id concatenated with beneficiary account id.
train_merged[sender_receiver] = train_merged[sender] + train_merged[receiver]
test_merged[sender_receiver] = test_merged[sender] + test_merged[receiver]

train_merged, test_merged = generate_feature(
    train_merged, test_merged,
    pivot_name=sender_receiver, new_feature_name=sender_receiver + "_freq", func='value_count')
print(train_merged.shape, test_merged.shape, sep="\n")

#### total number of currencies and average amount of money between ordering and beneficiary account

In [None]:
%%time
sender, receiver = 'OrderingAccount', 'BeneficiaryAccount'
sender_receiver_currency = "_".join((sender, receiver, "currency"))

# Edge + currency key: both account ids and the instructed currency concatenated.
train_merged[sender_receiver_currency] = (
    train_merged[sender] + train_merged[receiver] + train_merged['InstructedCurrency'])
test_merged[sender_receiver_currency] = (
    test_merged[sender] + test_merged[receiver] + test_merged['InstructedCurrency'])

# Row count per key first, then the mean instructed amount per key.
for suffix, options in (
    ("_freq", {"func": 'value_count'}),
    ("_avg_amount", {"func": "mean", "agg_col": "InstructedAmount"}),
):
    train_merged, test_merged = generate_feature(
        train_merged, test_merged,
        pivot_name=sender_receiver_currency,
        new_feature_name=sender_receiver_currency + suffix, **options)
print(train_merged.shape, test_merged.shape, sep="\n")

### Account and Bank network features over time (hour, day, week)

In [None]:
# Derive hour, day-of-month and ISO week number from Timestamp, stored as strings
# so they can be concatenated with other string keys.
time_parts = {
    "hour": lambda stamps: stamps.dt.hour,
    "day": lambda stamps: stamps.dt.day,
    "week": lambda stamps: stamps.dt.isocalendar().week,
}
for part_name, extract in time_parts.items():
    train_merged[part_name] = extract(train_merged["Timestamp"]).astype(str)
    test_merged[part_name] = extract(test_merged["Timestamp"]).astype(str)

In [None]:
# Key = sender bank id concatenated with the hour string; count rows per key.
hour_key = "sender_hour"
train_merged[hour_key] = train_merged["Sender"] + train_merged["hour"].astype(str)
test_merged[hour_key] = test_merged["Sender"] + test_merged["hour"].astype(str)
train_merged, test_merged = generate_feature(
    train_merged, test_merged,
    pivot_name=hour_key, new_feature_name=hour_key + '_freq', func='value_count')

In [None]:
# Remove the sender_hour key and its count again.
sender_hour_cols = ['sender_hour', 'sender_hour_freq']
train_merged = train_merged.drop(columns=sender_hour_cols)
test_merged = test_merged.drop(columns=sender_hour_cols)

In [None]:
def generate_features_based_on_time(train, test, from_node_name, to_node_name, time_col):
    """Build node and edge network features restricted to a time bucket.

    Every key used here is the concatenation of node id(s) with the string
    column ``time_col``, so each statistic is computed per (node, time bucket)
    or per (edge, time bucket). Statistics come from ``train`` and are joined
    onto both frames by ``generate_feature`` and its wrappers.

    Features added (``F`` = ``<from_node_name>_<time_col>``,
    ``T`` = ``<to_node_name>_<time_col>``):

    * ``F_freq``, ``T_freq``: row counts per node key;
    * ``<node>_currency_<time_col>_freq`` / ``_avg_amount`` for both nodes;
    * the four degree columns produced by ``generate_in_out_degree(F, T)``;
    * ``F_T_freq``: row count per edge key;
    * ``F_T_currency_freq`` / ``F_T_currency_avg_amount``.

    All temporary key columns are dropped from the returned frames.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Must contain ``from_node_name``, ``to_node_name``, ``time_col``,
        ``InstructedCurrency`` and ``InstructedAmount``. The frames passed in
        receive the two temporary node-key columns in place; all later work is
        done on new frames.
    from_node_name, to_node_name : str
        Columns holding the source and destination node ids (strings).
    time_col : str
        String-typed time bucket column.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The updated ``train`` and ``test`` frames.
    """
    currency_col = "InstructedCurrency"
    # Every currency key below is built from InstructedCurrency, so the matching
    # amount column is passed to generate_currency_feature (previously omitted,
    # which raised TypeError).
    # NOTE(review): other cells use SettlementCurrency/SettlementAmount on the
    # receiver side; confirm the instructed values are intended here.
    amount_col = "InstructedAmount"

    ##############################################################################################
    # node-in-time-bucket keys
    from_ = from_node_name + "_" + time_col
    to_ = to_node_name + "_" + time_col
    train[from_] = train[from_node_name] + train[time_col]
    test[from_] = test[from_node_name] + test[time_col]

    train[to_] = train[to_node_name] + train[time_col]
    test[to_] = test[to_node_name] + test[time_col]

    ##############################################################################################
    # total number of transactions per node key
    train, test = generate_feature(
        train, test, pivot_name=from_, new_feature_name=from_ + '_freq', func='value_count')
    train, test = generate_feature(
        train, test, pivot_name=to_, new_feature_name=to_ + "_freq", func='value_count')

    ################################################################################################
    # currency count and average amount per node, currency and time bucket
    for node_name in (from_node_name, to_node_name):
        feature_name = node_name + "_currency_" + time_col
        train[feature_name] = train[node_name] + train[currency_col] + train[time_col]
        test[feature_name] = test[node_name] + test[currency_col] + test[time_col]

        train, test = generate_currency_feature(train, test, feature_name, amount_col)

        train = train.drop([feature_name], axis=1)
        test = test.drop([feature_name], axis=1)

    ################################################################################################
    # in and out degree
    train, test = generate_in_out_degree(train, test, from_, to_)

    ###############################################################################################
    # edge - number of transactions
    edge = from_ + "_" + to_
    train[edge] = train[from_] + train[to_]
    test[edge] = test[from_] + test[to_]

    train, test = generate_feature(
        train, test, pivot_name=edge, new_feature_name=edge + "_freq", func='value_count')

    train = train.drop([edge], axis=1)
    test = test.drop([edge], axis=1)

    ##############################################################################################
    # edge - currency count and average amount
    edge_currency = edge + "_currency"
    train[edge_currency] = train[from_] + train[to_] + train[currency_col]
    # The test key must come from the test frame's own currency column
    # (previously read from train, which mixed the two splits by index).
    test[edge_currency] = test[from_] + test[to_] + test[currency_col]

    train, test = generate_feature(
        train, test, pivot_name=edge_currency,
        new_feature_name=edge_currency + "_freq", func='value_count')
    train, test = generate_feature(
        train, test, pivot_name=edge_currency,
        new_feature_name=edge_currency + "_avg_amount", func="mean", agg_col=amount_col)

    train = train.drop([edge_currency], axis=1)
    test = test.drop([edge_currency], axis=1)

    ###############################################################################################
    # drop the temporary node keys
    train = train.drop([from_, to_], axis=1)
    test = test.drop([from_, to_], axis=1)

    return train, test

#### Fine-grained by hour of day 0 - 23

In [None]:
%%time
# Bank-level (Sender -> Receiver) features bucketed by hour.
train_merged, test_merged = generate_features_based_on_time(
    train_merged, test_merged,
    from_node_name='Sender', to_node_name='Receiver', time_col='hour')

#### Fine-grained by day of month 1 - 31

In [None]:
# Bank-level (Sender -> Receiver) features bucketed by day of month.
train_merged, test_merged = generate_features_based_on_time(
    train_merged, test_merged,
    from_node_name='Sender', to_node_name='Receiver', time_col='day')

#### Fine-grained by ISO week of year 1 - 53

In [None]:
# Bank-level (Sender -> Receiver) features bucketed by the "week" column.
train_merged, test_merged = generate_features_based_on_time(
    train_merged, test_merged,
    from_node_name='Sender', to_node_name='Receiver', time_col='week')

### Country Features

In [None]:
def get_country(row, name):
    """Return the text before the first '/' in ``row[name]``.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by ``name`` (for example a DataFrame row).
    name : str
        Key of the '/'-separated field.

    Returns
    -------
    str or pandas.NA
        The leading segment of the field; the whole value when it contains no
        '/'; ``pd.NA`` when the value is not a string (None, NaN, ...). The old
        length check could never fail, and a non-string value raised
        AttributeError.
    """
    value = row[name]
    if not isinstance(value, str):
        return pd.NA
    return value.split('/', 1)[0]
# Country = text before the first '/' of the CountryCityZip field.
# Vectorised string operations replace the row-wise DataFrame.apply(axis=1) and
# leave missing values as NaN instead of raising.
train_merged['OrderingCountry'] = train_merged['OrderingCountryCityZip'].str.split('/', n=1).str[0]
train_merged['BeneficiaryCountry'] = train_merged['BeneficiaryCountryCityZip'].str.split('/', n=1).str[0]
test_merged['OrderingCountry'] = test_merged['OrderingCountryCityZip'].str.split('/', n=1).str[0]
test_merged['BeneficiaryCountry'] = test_merged['BeneficiaryCountryCityZip'].str.split('/', n=1).str[0]

#### how many countries order account/Sender or beneficiary account/Receiver associated with

In [None]:
# Per account / bank: count of non-null country values on that node's rows.
# NOTE(review): func='count' counts rows, not distinct countries; if the goal is
# "how many countries a node is associated with", 'n_unique' may be intended - confirm.
country_count_groups = (
    # account level
    (('OrderingAccount', 'OrderingCountry'), ('BeneficiaryAccount', 'BeneficiaryCountry')),
    # bank level
    (('Sender', 'OrderingCountry'), ('Receiver', 'BeneficiaryCountry')),
)
for group in country_count_groups:
    for node_col, country_col in group:
        train_merged, test_merged = generate_feature(
            train_merged, test_merged,
            pivot_name=node_col, new_feature_name=node_col + "_Country_freq",
            func='count', agg_col=country_col)
    print(train_merged.shape, test_merged.shape, sep="\n")

#### Number of transactions

##### Number of transactions for ordering country and beneficiary country

In [None]:
# Row counts per ordering country, then per beneficiary country.
for country_col in ('OrderingCountry', 'BeneficiaryCountry'):
    train_merged, test_merged = generate_feature(
        train_merged, test_merged,
        pivot_name=country_col, new_feature_name=country_col + "_freq", func='value_count')
print(train_merged.shape, test_merged.shape, sep="\n")

##### Number of transactions of sender/orderingAccount-country and receiver/BeneficiaryAccount-country

In [None]:
# Row counts per (node, country) key. Each key is the node id concatenated with
# the country; the temporary key column is dropped once its count is attached.
# Banks come first, then accounts, so the *_freq columns keep the same order.
for key_col, node_col, country_col in (
    ("sender_country", "Sender", "OrderingCountry"),
    ("receiver_country", "Receiver", "BeneficiaryCountry"),
    ("ordering_country", "OrderingAccount", "OrderingCountry"),
    ("beneficiary_country", "BeneficiaryAccount", "BeneficiaryCountry"),
):
    train_merged[key_col] = train_merged[node_col] + train_merged[country_col]
    test_merged[key_col] = test_merged[node_col] + test_merged[country_col]

    train_merged, test_merged = generate_feature(
        train_merged, test_merged,
        pivot_name=key_col, new_feature_name=key_col + "_freq", func='value_count')

    train_merged = train_merged.drop(columns=[key_col])
    test_merged = test_merged.drop(columns=[key_col])

##### Number of transactions between ordering country or beneficiary country

In [None]:
# Row counts per country pair, per (account, country) pair and per (bank, country)
# pair. Each temporary key column is dropped once its count is attached.

# country -> country
train_merged["ordering_beneficiary_country"] = train_merged["OrderingCountry"] + train_merged["BeneficiaryCountry"]
test_merged["ordering_beneficiary_country"] = test_merged["OrderingCountry"] + test_merged["BeneficiaryCountry"]

train_merged, test_merged = generate_feature(
    train_merged, test_merged, pivot_name='ordering_beneficiary_country',
    new_feature_name="ordering_beneficiary_country_freq", func='value_count')

train_merged = train_merged.drop(["ordering_beneficiary_country"], axis=1)
test_merged = test_merged.drop(["ordering_beneficiary_country"], axis=1)

# account - country - account - country
# Every test key part is read from test_merged: the raw `test` frame was deleted
# after the bank merge, so referencing it raised NameError.
train_merged["orderingAccount_beneficiaryAccount_country"] = (
    train_merged["OrderingAccount"] + train_merged["OrderingCountry"]
    + train_merged["BeneficiaryAccount"] + train_merged["BeneficiaryCountry"])
test_merged["orderingAccount_beneficiaryAccount_country"] = (
    test_merged["OrderingAccount"] + test_merged["OrderingCountry"]
    + test_merged["BeneficiaryAccount"] + test_merged["BeneficiaryCountry"])

train_merged, test_merged = generate_feature(
    train_merged, test_merged, pivot_name='orderingAccount_beneficiaryAccount_country',
    new_feature_name="orderingAccount_beneficiaryAccount_country_freq", func='value_count')

train_merged = train_merged.drop(["orderingAccount_beneficiaryAccount_country"], axis=1)
test_merged = test_merged.drop(["orderingAccount_beneficiaryAccount_country"], axis=1)

# sender - country - receiver - country
train_merged["sender_receiver_country"] = (
    train_merged["Sender"] + train_merged["OrderingCountry"]
    + train_merged["Receiver"] + train_merged["BeneficiaryCountry"])
test_merged["sender_receiver_country"] = (
    test_merged["Sender"] + test_merged["OrderingCountry"]
    + test_merged["Receiver"] + test_merged["BeneficiaryCountry"])

train_merged, test_merged = generate_feature(
    train_merged, test_merged, pivot_name='sender_receiver_country',
    new_feature_name="sender_receiver_country_freq", func='value_count')

train_merged = train_merged.drop(["sender_receiver_country"], axis=1)
test_merged = test_merged.drop(["sender_receiver_country"], axis=1)

#### Number of currency and average amount of currency

##### Number of currency and average amount of currency for ordering country and beneficiary country

In [None]:
def generate_currency_feature(train, test, name, agg_col):
    """Attach ``<name>_freq`` (row count) and ``<name>_avg_amount`` (mean of ``agg_col``).

    NOTE(review): same body as the earlier definitions of this function in the
    notebook; this copy re-binds the name.

    Both statistics are grouped by the existing key column ``name`` and are added
    through ``generate_feature``, count first. Returns ``(train, test)``.
    """
    requests = [
        dict(new_feature_name=name + "_freq", func='value_count'),
        dict(new_feature_name=name + "_avg_amount", func='mean', agg_col=agg_col),
    ]
    for request in requests:
        train, test = generate_feature(train, test, pivot_name=name, **request)
    return train, test

# One pass per side: (key column, country column, currency column, amount column).
# Ordering side uses the instructed currency/amount, beneficiary side the settlement ones.
country_currency_keys = ["OrderingCountry_currency", "BeneficiaryCountry_currency"]
for name, country_col, currency_col, amount_col in (
    (country_currency_keys[0], "OrderingCountry", "InstructedCurrency", 'InstructedAmount'),
    (country_currency_keys[1], "BeneficiaryCountry", "SettlementCurrency", "SettlementAmount"),
):
    train_merged[name] = train_merged[country_col] + train_merged[currency_col]
    test_merged[name] = test_merged[country_col] + test_merged[currency_col]
    train_merged, test_merged = generate_currency_feature(
        train_merged, test_merged, name, amount_col)

# Shapes are reported while the temporary key columns are still present.
print(train_merged.shape, test_merged.shape, sep="\n")

train_merged = train_merged.drop(columns=country_currency_keys)
test_merged = test_merged.drop(columns=country_currency_keys)

##### Currency feature for account-country/bank-country pattern

In [None]:
#######################################################################################################
# ordering account - country currency
train_merged["OrderingAccount_Country_currency"] = train_merged["OrderingAccount"] + train_merged["OrderingCountry"] + train_merged["InstructedCurrency"]
test_merged["OrderingAccount_Country_currency"] = test_merged["OrderingAccount"] + test_merged["OrderingCountry"] + test_merged["InstructedCurrency"]
# `name` must match the key column created above (it was 'OrderingAccountCountry_currency',
# a column that does not exist, so the feature call failed with KeyError).
name = 'OrderingAccount_Country_currency'

train_merged, test_merged = generate_currency_feature(train_merged, test_merged, name, 'InstructedAmount')


#######################################################################################################
# beneficiary account - country currency
train_merged["BeneficiaryAccount_Country_currency"] = train_merged["BeneficiaryAccount"] + train_merged["BeneficiaryCountry"] + train_merged["SettlementCurrency"]
test_merged["BeneficiaryAccount_Country_currency"] = test_merged["BeneficiaryAccount"] + test_merged["BeneficiaryCountry"] + test_merged["SettlementCurrency"]
name = 'BeneficiaryAccount_Country_currency'

train_merged, test_merged = generate_currency_feature(train_merged, test_merged, name, "SettlementAmount")
print(train_merged.shape)
print(test_merged.shape)

# the temporary key columns are no longer needed
train_merged = train_merged.drop(["OrderingAccount_Country_currency", "BeneficiaryAccount_Country_currency"], axis=1)
test_merged = test_merged.drop(["OrderingAccount_Country_currency", "BeneficiaryAccount_Country_currency"], axis=1)

#######################################################################################################
# sender - country currency
train_merged["Sender_Country_currency"] = train_merged["Sender"] + train_merged["OrderingCountry"] + train_merged["InstructedCurrency"]
test_merged["Sender_Country_currency"] = test_merged["Sender"] + test_merged["OrderingCountry"] + test_merged["InstructedCurrency"]
# `name` must match the key column created above (it was 'Sender_currency',
# which is not a column of the frame).
name = 'Sender_Country_currency'

train_merged, test_merged = generate_currency_feature(train_merged, test_merged, name, 'InstructedAmount')


#######################################################################################################
# receiver - country currency
train_merged["Receiver_Country_currency"] = train_merged["Receiver"] + train_merged["BeneficiaryCountry"] + train_merged["SettlementCurrency"]
test_merged["Receiver_Country_currency"] = test_merged["Receiver"] + test_merged["BeneficiaryCountry"] + test_merged["SettlementCurrency"]
name = 'Receiver_Country_currency'

train_merged, test_merged = generate_currency_feature(train_merged, test_merged, name, "SettlementAmount")
print(train_merged.shape)
print(test_merged.shape)

# the temporary key columns are no longer needed
train_merged = train_merged.drop(["Sender_Country_currency", "Receiver_Country_currency"], axis=1)
test_merged = test_merged.drop(["Sender_Country_currency", "Receiver_Country_currency"], axis=1)

##### Number of transactions of sender/orderingAccount-country and receiver/BeneficiaryAccount-country

In [41]:
# NOTE(review): this cell repeats the earlier "sender/orderingAccount-country" cell.
# Running both used to merge each *_freq feature a second time, which pandas
# resolves by emitting *_x / *_y duplicate columns. Any existing copy is now
# dropped first, so the cell gives the same result whether or not it ran before.
for key_col, node_col, country_col in (
    ("sender_country", "Sender", "OrderingCountry"),
    ("receiver_country", "Receiver", "BeneficiaryCountry"),
    ("ordering_country", "OrderingAccount", "OrderingCountry"),
    ("beneficiary_country", "BeneficiaryAccount", "BeneficiaryCountry"),
):
    feature_col = key_col + "_freq"
    train_merged = train_merged.drop(columns=[feature_col], errors="ignore")
    test_merged = test_merged.drop(columns=[feature_col], errors="ignore")

    # Key = node id concatenated with the country.
    train_merged[key_col] = train_merged[node_col] + train_merged[country_col]
    test_merged[key_col] = test_merged[node_col] + test_merged[country_col]

    train_merged, test_merged = generate_feature(
        train_merged, test_merged,
        pivot_name=key_col, new_feature_name=feature_col, func='value_count')

    # The temporary key column is not a feature itself.
    train_merged = train_merged.drop(columns=[key_col])
    test_merged = test_merged.drop(columns=[key_col])

##### Number of transactions between ordering country or beneficiary country

In [43]:
# NOTE(review): this cell repeats the earlier "between ordering country or beneficiary
# country" cell. Existing copies of its features are dropped first so that a second
# run cannot create *_x / *_y duplicate columns.
pair_feature_cols = [
    "ordering_beneficiary_country_freq",
    "orderingAccount_beneficiaryAccount_country_freq",
    "sender_receiver_country_freq",
]
train_merged = train_merged.drop(columns=pair_feature_cols, errors="ignore")
test_merged = test_merged.drop(columns=pair_feature_cols, errors="ignore")

# country -> country
train_merged["ordering_beneficiary_country"] = train_merged["OrderingCountry"] + train_merged["BeneficiaryCountry"]
test_merged["ordering_beneficiary_country"] = test_merged["OrderingCountry"] + test_merged["BeneficiaryCountry"]

train_merged, test_merged = generate_feature(
    train_merged, test_merged, pivot_name='ordering_beneficiary_country',
    new_feature_name="ordering_beneficiary_country_freq", func='value_count')

train_merged = train_merged.drop(["ordering_beneficiary_country"], axis=1)
test_merged = test_merged.drop(["ordering_beneficiary_country"], axis=1)

# account - country - account - country
# Every test key part is read from test_merged: the raw `test` frame was deleted
# after the bank merge, so referencing it raised NameError.
train_merged["orderingAccount_beneficiaryAccount_country"] = (
    train_merged["OrderingAccount"] + train_merged["OrderingCountry"]
    + train_merged["BeneficiaryAccount"] + train_merged["BeneficiaryCountry"])
test_merged["orderingAccount_beneficiaryAccount_country"] = (
    test_merged["OrderingAccount"] + test_merged["OrderingCountry"]
    + test_merged["BeneficiaryAccount"] + test_merged["BeneficiaryCountry"])

train_merged, test_merged = generate_feature(
    train_merged, test_merged, pivot_name='orderingAccount_beneficiaryAccount_country',
    new_feature_name="orderingAccount_beneficiaryAccount_country_freq", func='value_count')

train_merged = train_merged.drop(["orderingAccount_beneficiaryAccount_country"], axis=1)
test_merged = test_merged.drop(["orderingAccount_beneficiaryAccount_country"], axis=1)

# sender - country - receiver - country
train_merged["sender_receiver_country"] = (
    train_merged["Sender"] + train_merged["OrderingCountry"]
    + train_merged["Receiver"] + train_merged["BeneficiaryCountry"])
test_merged["sender_receiver_country"] = (
    test_merged["Sender"] + test_merged["OrderingCountry"]
    + test_merged["Receiver"] + test_merged["BeneficiaryCountry"])

train_merged, test_merged = generate_feature(
    train_merged, test_merged, pivot_name='sender_receiver_country',
    new_feature_name="sender_receiver_country_freq", func='value_count')

train_merged = train_merged.drop(["sender_receiver_country"], axis=1)
test_merged = test_merged.drop(["sender_receiver_country"], axis=1)

#### Number of currency and average amount of currency

##### Number of currency and average amount of currency for ordering country and beneficiary country

In [None]:
def generate_currency_feature(train, test, name, agg_col):
    """Add a frequency and an average-amount feature for one key column.

    Delegates to ``generate_feature`` twice, pivoting on ``name`` both times:
    first with ``func='value_count'`` to create ``name + "_freq"``, then with
    ``func='mean'`` over ``agg_col`` to create ``name + "_avg_amount"``.

    Args:
        train: training frame, passed through to ``generate_feature``.
        test: test frame, passed through to ``generate_feature``.
        name: name of the key column to pivot on.
        agg_col: column handed to the ``'mean'`` aggregation.

    Returns:
        The ``(train, test)`` pair returned by the second call.
    """
    feature_specs = (
        # currency frequency
        {"new_feature_name": name + "_freq", "func": "value_count"},
        # currency average amount
        {"new_feature_name": name + "_avg_amount", "func": "mean", "agg_col": agg_col},
    )
    for spec in feature_specs:
        train, test = generate_feature(train, test, pivot_name=name, **spec)
    return train, test

# ---------------------------------------------------------------------------
# Ordering side: ordering country + instructed currency
name = "OrderingCountry_currency"
train_merged[name] = train_merged["OrderingCountry"] + train_merged["InstructedCurrency"]
test_merged[name] = test_merged["OrderingCountry"] + test_merged["InstructedCurrency"]

# Adds "<name>_freq" and "<name>_avg_amount" (mean taken over InstructedAmount)
train_merged, test_merged = generate_currency_feature(
    train_merged, test_merged, name, "InstructedAmount"
)

# ---------------------------------------------------------------------------
# Beneficiary side: beneficiary country + settlement currency
name = "BeneficiaryCountry_currency"
train_merged[name] = train_merged["BeneficiaryCountry"] + train_merged["SettlementCurrency"]
test_merged[name] = test_merged["BeneficiaryCountry"] + test_merged["SettlementCurrency"]

# Adds "<name>_freq" and "<name>_avg_amount" (mean taken over SettlementAmount)
train_merged, test_merged = generate_currency_feature(
    train_merged, test_merged, name, "SettlementAmount"
)
print(train_merged.shape)
print(test_merged.shape)

# The temporary key columns are not features themselves; remove them.
temporary_keys = ["OrderingCountry_currency", "BeneficiaryCountry_currency"]
train_merged = train_merged.drop(columns=temporary_keys)
test_merged = test_merged.drop(columns=temporary_keys)

##### Currency feature for account-country/bank-country pattern

In [None]:
#######################################################################################################
# ordering account - country currency
# `name` is assigned first and reused as the column key, so the pivot column
# passed to generate_currency_feature is guaranteed to be the one created here
# (the previous 'OrderingAccountCountry_currency' did not match the new column).
name = "OrderingAccount_Country_currency"
train_merged[name] = (
    train_merged["OrderingAccount"]
    + train_merged["OrderingCountry"]
    + train_merged["InstructedCurrency"]
)
test_merged[name] = (
    test_merged["OrderingAccount"]
    + test_merged["OrderingCountry"]
    + test_merged["InstructedCurrency"]
)

# Adds "<name>_freq" and "<name>_avg_amount" (mean taken over InstructedAmount)
train_merged, test_merged = generate_currency_feature(train_merged, test_merged, name, "InstructedAmount")


#######################################################################################################
# beneficiary account - country currency
name = "BeneficiaryAccount_Country_currency"
train_merged[name] = (
    train_merged["BeneficiaryAccount"]
    + train_merged["BeneficiaryCountry"]
    + train_merged["SettlementCurrency"]
)
test_merged[name] = (
    test_merged["BeneficiaryAccount"]
    + test_merged["BeneficiaryCountry"]
    + test_merged["SettlementCurrency"]
)

# Adds "<name>_freq" and "<name>_avg_amount" (mean taken over SettlementAmount)
train_merged, test_merged = generate_currency_feature(train_merged, test_merged, name, "SettlementAmount")
print(train_merged.shape)
print(test_merged.shape)

# Drop the temporary key columns from both frames.
train_merged = train_merged.drop(["OrderingAccount_Country_currency", "BeneficiaryAccount_Country_currency"], axis=1)
test_merged = test_merged.drop(["OrderingAccount_Country_currency", "BeneficiaryAccount_Country_currency"], axis=1)

#######################################################################################################
# sender - country currency
# `name` is assigned first and reused as the column key, so the pivot column
# passed to generate_currency_feature is guaranteed to be the one created here
# (the previous 'Sender_currency' did not match the new column).
name = "Sender_Country_currency"
train_merged[name] = (
    train_merged["Sender"]
    + train_merged["OrderingCountry"]
    + train_merged["InstructedCurrency"]
)
test_merged[name] = (
    test_merged["Sender"]
    + test_merged["OrderingCountry"]
    + test_merged["InstructedCurrency"]
)

# Adds "<name>_freq" and "<name>_avg_amount" (mean taken over InstructedAmount)
train_merged, test_merged = generate_currency_feature(train_merged, test_merged, name, "InstructedAmount")


#######################################################################################################
# receiver - country currency
name = "Receiver_Country_currency"
train_merged[name] = (
    train_merged["Receiver"]
    + train_merged["BeneficiaryCountry"]
    + train_merged["SettlementCurrency"]
)
test_merged[name] = (
    test_merged["Receiver"]
    + test_merged["BeneficiaryCountry"]
    + test_merged["SettlementCurrency"]
)

# Adds "<name>_freq" and "<name>_avg_amount" (mean taken over SettlementAmount)
train_merged, test_merged = generate_currency_feature(train_merged, test_merged, name, "SettlementAmount")
print(train_merged.shape)
print(test_merged.shape)

# Drop the temporary key columns from both frames.
train_merged = train_merged.drop(["Sender_Country_currency", "Receiver_Country_currency"], axis=1)
test_merged = test_merged.drop(["Sender_Country_currency", "Receiver_Country_currency"], axis=1)

## Drop unnecessary columns and save data

In [33]:
# Display the current column index of the training frame.
train_merged.columns

Index(['Timestamp', 'UETR', 'Sender', 'Receiver', 'TransactionReference',
       'OrderingAccount', 'OrderingName', 'OrderingStreet',
       'OrderingCountryCityZip', 'BeneficiaryAccount', 'BeneficiaryName',
       'BeneficiaryStreet', 'BeneficiaryCountryCityZip', 'SettlementDate',
       'SettlementCurrency', 'SettlementAmount', 'InstructedCurrency',
       'InstructedAmount', 'Label', 'Flag_ordering', 'Flag_beneficiary',
       'num_hops', 'Sender_freq', 'Receiver_freq', 'sender_currency',
       'sender_currency_freq', 'sender_currency_avg_amount',
       'receiver_currency', 'receiver_currency_freq',
       'receiver_currency_avg_amount', 'Sender_out_degree', 'Sender_in_degree',
       'Receiver_out_degree', 'Receiver_in_degree', 'sender_receiver',
       'sender_receiver_freq', 'sender_receiver_currency',
       'sender_receiver_currency_freq', 'sender_receiver_currency_avg_amount',
       'OrderingAccount_freq', 'BeneficiaryAccount_freq',
       'OrderingAccount_currency', 'Order

In [None]:
# Columns removed from both frames before modelling.
columns_to_drop = [
    # columns present in the merged frame from the start
    "UETR",
    "Sender",
    "Receiver",
    "TransactionReference",
    "OrderingAccount",
    "OrderingName",
    "OrderingStreet",
    "OrderingCountryCityZip",
    "BeneficiaryAccount",
    "BeneficiaryName",
    "BeneficiaryStreet",
    "BeneficiaryCountryCityZip",
    "SettlementDate",
    "SettlementCurrency",
    "InstructedCurrency",
    "Timestamp",
    # columns added during feature engineering
    "sender_receiver",
    "OrderingAccount_BeneficiaryAccount_currency",
    "OrderingAccount_BeneficiaryAccount",
    "BeneficiaryAccount_currency",
    "OrderingAccount_currency",
    "receiver_currency",
    "sender_currency",
    "sender_hour",
]

# Apply the same column removal to train and test so their schemas stay aligned.
train_merged = train_merged.drop(columns=columns_to_drop)
test_merged = test_merged.drop(columns=columns_to_drop)

## baseline verification

In [22]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import ShuffleSplit, StratifiedKFold, StratifiedShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import sklearn.utils

In [35]:
# Feature subset (plus the "Label" target) used for the baseline model.
retained_feature = [
    "SettlementAmount",
    "InstructedAmount",
    "Label",
    "Sender_hour_freq",
    "hour",
    "sender_currency_freq",
    "sender_currency_avg_amount",
    "sender_receiver_freq",
]

# Independent copies, so later edits do not touch the merged frames.
train_rt = train_merged.loc[:, retained_feature].copy()
test_rt = test_merged.loc[:, retained_feature].copy()

In [36]:
# Replace missing values with 0, then store the hour column as int64.
train_rt = train_rt.fillna(0).astype({"hour": "int64"})
test_rt = test_rt.fillna(0).astype({"hour": "int64"})

In [37]:
# train_merged / test_merged are intentionally kept alive here: the final cell
# writes them to CSV, so deleting them at this point made that cell fail with
# a NameError on a top-to-bottom run. Modelling below only uses the *_rt copies.

In [38]:
# Separate the target ("Label") from the feature matrix for each split.
X_train = train_rt.drop(columns=["Label"]).values
Y_train = train_rt["Label"].values
X_test = test_rt.drop(columns=["Label"]).values
Y_test = test_rt["Label"].values

# Standardize: the scaler is fitted on the training features only and the
# same fitted transform is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [39]:
%%time
from xgboost import XGBClassifier

# Stratified 5-fold cross-validation scored with F1, using a fixed shuffle seed.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
xgb = XGBClassifier(n_estimators=100)
cv_results = cross_val_score(xgb, X_train, Y_train, scoring="f1", cv=kfold)

# Fit on the full training set; this fitted model is the one used afterwards.
xgb.fit(X_train, Y_train)

# Spread of the per-fold F1 scores.
print("Minimum:", cv_results.min())
print("Maximum:", cv_results.max())
print("StanDev:", cv_results.std())

Minimum: 0.7734241908006814
Maximum: 0.8044406490179333
StanDev: 0.010830083353441794
CPU times: total: 1h 46min 7s
Wall time: 7min 59s


In [40]:
# Hard class predictions and positive-class probabilities on the test split.
pred_xgb = xgb.predict(X_test)
pred_proba_xgb = xgb.predict_proba(X_test)[:, 1]

# Report based on the hard predictions.
print("XGBoost Classification Report=\n\n", classification_report(Y_test, pred_xgb))
print("XGBoost Confusion Matrix=\n\n", confusion_matrix(Y_test, pred_xgb))

# Average precision computed from the positive-class probabilities.
print("AUPRC:", metrics.average_precision_score(y_true=Y_test, y_score=pred_proba_xgb))

XGBoost Classification Report=

               precision    recall  f1-score   support

           0       1.00      1.00      1.00    704357
           1       0.90      0.68      0.78       499

    accuracy                           1.00    704856
   macro avg       0.95      0.84      0.89    704856
weighted avg       1.00      1.00      1.00    704856

XGBoost Confusion Matrix=

 [[704320     37]
 [   158    341]]
AUPRC: 0.8689991047437784


In [None]:
# Write the engineered feature frames to CSV in the current working directory.
# Requires train_merged / test_merged to still be in scope when this cell runs.
train_merged.to_csv(path_or_buf="train_features.csv")
test_merged.to_csv(path_or_buf="test_features.csv")