In [14]:
import mmh3
import BitVector
import redis
import math
import time


class BloomFilter():
    SEEDS = [543, 460, 171, 876, 796, 607, 650, 81, 837, 545, 591, 946, 846, 521, 913, 636, 878, 735, 414, 372,
             344, 324, 223, 180, 327, 891, 798, 933, 493, 293, 836, 10, 6, 544, 924, 849, 438, 41, 862, 648, 338,
             465, 562, 693, 979, 52, 763, 103, 387, 374, 349, 94, 384, 680, 574, 480, 307, 580, 71, 535, 300, 53,
             481, 519, 644, 219, 686, 236, 424, 326, 244, 212, 909, 202, 951, 56, 812, 901, 926, 250, 507, 739, 371,
             63, 584, 154, 7, 284, 617, 332, 472, 140, 605, 262, 355, 526, 647, 923, 199, 518]

    def __init__(self, capacity=1000000000, error_rate=0.00000001, key = 'BF', conn=None):
        """
        cpacity: number of unique keys
        error_rate: false postive rate
        
        """
        self.m = math.ceil(capacity*math.log2(math.e)*math.log2(1/error_rate))      #bitarray size
        self.k = math.ceil(math.log1p(2)*self.m/capacity)                           #hash functions
        self.mem = math.ceil(self.m/8/1024/1024)                                    #memory chunck
        self.blocknum = math.ceil(self.mem/512)                                     #memory chunck
        self.seeds = self.SEEDS[0:self.k]
        self.key = key
        self.N = 2**31-1
        self.redis = conn
        if not self.redis:
            # if no redis connection, using the memory 
            self.bitset = BitVector.BitVector(size=1<<32)
        print(self.m)
        print(self.mem)
        print(self.k)

    def add(self, value):
        name = self.key + "_" + str(ord(value[0])%self.blocknum)
        hashs = self.get_hashs(value)
        for hash in hashs:
            if self.redis:
                self.redis.setbit(name, hash, 1)
            else:
                self.bitset[hash] = 1

    def is_exist(self, value):
        name = self.key + "_" + str(ord(value[0])%self.blocknum)
        hashs = self.get_hashs(value)
        exist = True
        for hash in hashs:
            if self.redis:
                exist = exist & self.redis.getbit(name, hash)
            else:
                exist = exist & self.bitset[hash]
        return exist

    def get_hashs(self, value):
        hashs = list()
        for seed in self.seeds:
            hash = mmh3.hash(value, seed)
            if hash >= 0:
                hashs.append(hash)
            else:
                hashs.append(self.N - hash)
        return hashs


#pool = redis.ConnectionPool(host='localhost', port=6379, db=0)
#conn = redis.StrictRedis(connection_pool=pool)


In [1]:
import tracemalloc
import pandas as pd
import numpy as np

In [26]:
swift = pd.read_csv('../data/swift.csv')

In [10]:
bank = pd.read_csv('../data/bank_dataset.csv')
bank['keys'] = bank['Account'] + bank['Name'] bank['Street'] + bank['CountryCityZip']

In [13]:
allkeys = list(bank['keys'])

In [23]:
%%time
def my_func():
    bf = BloomFilter(capacity=550000, error_rate = 0.0001)
    for acc in allkeys:
        bf.add(acc)

    return bf

tracemalloc.start()
gloabl_boom = my_func()

current, peak = tracemalloc.get_traced_memory()
print(f"Current memory usage is {current / 10**6}KB; Peak was {peak / 10**6}KB; Diff = {(peak - current) / 10**6}KB")
 
# stopping the library
tracemalloc.stop()

10543565
2
22
Current memory usage is 541191.978KB; Peak was 3225547.711KB; Diff = 2684355.733KB
CPU times: user 30.8 s, sys: 453 ms, total: 31.2 s
Wall time: 31.4 s


In [25]:
print(f"Current memory usage is {current / 10**6}MB; Peak was {peak / 10**6}MB; Diff = {(peak - current) / 10**6}MB")

Current memory usage is 541.191978MB; Peak was 3225.547711MB; Diff = 2684.355733MB


# Seperate the anomalies

>1) Collect all the anomalies from the data I provided (i.e., data without any simple anomalies). These are statistical anomalies<br>
>2) Find all the anomalies that are not in 1), these are simple anomalies.<br>
>3) We label 1) as 1 and 2) as 0. Train xgboost and measure its performace.<br>

In [2]:
import pandas as pd
import numpy as np
### Libraries for Algorithms

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import ShuffleSplit, StratifiedKFold, StratifiedShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import sklearn.utils
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from xgboost import XGBClassifier

In [4]:
swift = pd.read_csv('../data/swift.csv')

In [5]:
swift.shape

(5396833, 21)

In [6]:
train = pd.read_csv('../data/swift_HA_train.csv')
test = pd.read_csv('../data/swift_HA_test.csv')

In [7]:
filtered = pd.concat([train, test])

In [8]:
filtered.shape

(5394991, 20)

In [11]:
all_ano = swift[swift['Label']==1]
f_ano = filtered[filtered['Label']==1]
normal_list = list(all_ano[~all_ano.UETR.isin(f_ano.UETR)].UETR)
normal_ano = all_ano[all_ano.UETR.isin(normal_list)]

In [38]:
print(all_ano.shape)
print(f_ano.shape)
print(normal_ano.shape)

(5661, 21)
(3819, 8)
(1842, 8)


# Extract features

Features in simple anomalies (normal_ano) are based on the entire (unfiltered data)<br>
Features in statistical anomalies (f_ano) are also based on the entire data<br>

In [58]:
all_ano = swift[swift['Label']==1]
f_ano = filtered[filtered['Label']==1] # statistical anomalies
normal_list = list(all_ano[~all_ano.UETR.isin(f_ano.UETR)].UETR)
normal_ano = all_ano[all_ano.UETR.isin(normal_list)] # simple anomalies

In [59]:
swift["Timestamp"] = swift["Timestamp"].astype("datetime64[ns]")
f_ano["Timestamp"] = f_ano["Timestamp"].astype("datetime64[ns]")
normal_ano["Timestamp"] = normal_ano["Timestamp"].astype("datetime64[ns]")
#bank = pd.read_csv('../data/bank_dataset.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  f_ano["Timestamp"] = f_ano["Timestamp"].astype("datetime64[ns]")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  normal_ano["Timestamp"] = normal_ano["Timestamp"].astype("datetime64[ns]")


In [60]:
%%time
# Hour
swift["hour"] = swift["Timestamp"].dt.hour
f_ano["hour"] = f_ano["Timestamp"].dt.hour
normal_ano["hour"] = normal_ano["Timestamp"].dt.hour

# Hour frequency for each sender
senders = swift["Sender"].unique()
swift["sender_hour"] = swift["Sender"] + swift["hour"].astype(str)
f_ano["sender_hour"] = f_ano["Sender"] + f_ano["hour"].astype(str)
normal_ano["sender_hour"] = normal_ano["Sender"] + normal_ano["hour"].astype(str)

sender_hour_frequency = {}
for s in senders:
    sender_rows = swift[swift["Sender"] == s]
    for h in range(24):
        sender_hour_frequency[s + str(h)] = len(sender_rows[sender_rows["hour"] == h])

#swift["sender_hour_freq"] = swift["sender_hour"].map(sender_hour_frequency)
f_ano["sender_hour_freq"] = f_ano["sender_hour"].map(sender_hour_frequency)
normal_ano["sender_hour_freq"] = normal_ano["sender_hour"].map(sender_hour_frequency)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

CPU times: user 7.6 s, sys: 1.26 s, total: 8.86 s
Wall time: 9.08 s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [62]:
%%time
# Sender-Currency Frequency and Average Amount per Sender-Currency
swift["sender_currency"] = swift["Sender"] + swift["InstructedCurrency"]
f_ano["sender_currency"] = f_ano["Sender"] + f_ano["InstructedCurrency"]
normal_ano["sender_currency"] = normal_ano["Sender"] + normal_ano["InstructedCurrency"]


sender_currency_freq = {}
sender_currency_avg = {}

for sc in set(
    list(swift["sender_currency"].unique())
):
    sender_currency_freq[sc] = len(swift[swift["sender_currency"] == sc])
    sender_currency_avg[sc] = swift[swift["sender_currency"] == sc][
        "InstructedAmount"
    ].mean()

#swift["sender_currency_freq"] = swift["sender_currency"].map(sender_currency_freq)
f_ano["sender_currency_freq"] = f_ano["sender_currency"].map(sender_currency_freq)
normal_ano["sender_currency_freq"] = normal_ano["sender_currency"].map(sender_currency_freq)

#swift["sender_currency_amount_average"] = swift["sender_currency"].map(sender_currency_avg)
f_ano["sender_currency_amount_average"] = f_ano["sender_currency"].map(sender_currency_avg)
normal_ano["sender_currency_amount_average"] = normal_ano["sender_currency"].map(sender_currency_avg)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


CPU times: user 19.7 s, sys: 844 ms, total: 20.6 s
Wall time: 20.7 s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

In [63]:
%%time
# Sender-Receiver Frequency
swift["sender_receiver"] = swift["Sender"] + swift["Receiver"]
f_ano["sender_receiver"] = f_ano["Sender"] + f_ano["Receiver"]
normal_ano["sender_receiver"] = normal_ano["Sender"] + normal_ano["Receiver"]

sender_receiver_freq = {}

for sr in set(
    list(swift["sender_receiver"].unique())
):
    sender_receiver_freq[sr] = len(swift[swift["sender_receiver"] == sr])

#swift["sender_receiver_freq"] = swift["sender_receiver"].map(sender_receiver_freq)
f_ano["sender_receiver_freq"] = f_ano["sender_receiver"].map(sender_receiver_freq)
normal_ano["sender_receiver_freq"] = normal_ano["sender_receiver"].map(sender_receiver_freq)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


CPU times: user 22.9 s, sys: 823 ms, total: 23.8 s
Wall time: 24 s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [64]:
normal_ano

Unnamed: 0,Timestamp,UETR,Sender,Receiver,TransactionReference,OrderingAccount,OrderingName,OrderingStreet,OrderingCountryCityZip,BeneficiaryAccount,...,order_flag,bene_flag,hour,sender_hour,sender_hour_freq,sender_currency,sender_currency_freq,sender_currency_amount_average,sender_receiver,sender_receiver_freq
1579,2022-01-01 01:04:00,b60d7297-c127-4fcb-8cd7-bac07f49c148,DPSUFRPP,ABVVUS6S,PETX22-FXIDA-34626,FR37714755422957005677,AVICENNIA MARINA-RESINIFERA,99| AVENUE NICOLAS,FR/21079 REGNIER,61102406474705654,...,0.0,,1,DPSUFRPP1,33584,DPSUFRPPEUR,1840201,1.690104e+08,DPSUFRPPABVVUS6S,1966056
11900,2022-01-01 04:34:00,db455a50-fa0c-4876-8a7e-62b23dd26ff8,DPSUFRPP,ABVVUS6S,PETX22-FXIDA-8891,FR90714755422957006954,ELAPHOGLOSSUM ALATUM,21| BOULEVARD DUMAS,FR/90759 ILLET,611024064274691631,...,0.0,0.0,4,DPSUFRPP4,130291,DPSUFRPPEUR,1840201,1.690104e+08,DPSUFRPPABVVUS6S,1966056
12153,2022-01-01 04:35:00,9492b334-8b4c-45c1-92e8-bd129b4a8de4,DPSUFRPP,ABVVUS6S,PETX22-FXIDA-392,FR94714755422957004845,CHAMAECYTISUS PROLIFERA,420| AVENUE GUY GRÉGOIRE,FR/56898 ROBERT-SUR-FAURE,611024064274714223,...,0.0,0.0,4,DPSUFRPP4,130291,DPSUFRPPEUR,1840201,1.690104e+08,DPSUFRPPABVVUS6S,1966056
13672,2022-01-01 04:52:00,d61bb80a-eb39-487e-9711-70ad6d82d5c0,DPSUFRPP,ABVVUS6S,PETX22-FXIDA-8830,FR43714755422956984132,ERIGERON ELATIOR,3| RUE DE MARION,FR/60210 BODINVILLE,6110240642747035XX,...,0.0,1.0,4,DPSUFRPP4,130291,DPSUFRPPGBP,251926,3.699564e+06,DPSUFRPPABVVUS6S,1966056
14155,2022-01-01 04:53:00,bfedc7b2-fdb9-4f25-8772-fd549401d6ea,DPSUFRPP,ABVVUS6S,PETX22-FXIDA-2832,FR92714755422956974326,PEDILANTHUS,16| RUE ADÉLAÏDE BAZIN,FR/95397 THIBAULT,6110240642746928XX,...,0.0,10.0,4,DPSUFRPP4,130291,DPSUFRPPEUR,1840201,1.690104e+08,DPSUFRPPABVVUS6S,1966056
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5384166,2022-01-25 11:52:00,c06a7851-2fd6-4235-a411-388b6fbd63aa,DPSUFRPP,ABVVUS6S,PETX22-FXIDA-3623,FR83714755422956980405,SCHKUHRIA PINNATA-PINNATA,RUE MARÉCHAL,FR/27925 ÉTIENNE,6110240642746921XX,...,0.0,1.0,11,DPSUFRPP11,635743,DPSUFRPPEUR,1840201,1.690104e+08,DPSUFRPPABVVUS6S,1966056
5385966,2022-01-30 10:15:00,ce51f37a-a106-4583-a294-8a99e259a794,DPSUFRPP,ABVVUS6S,PETX22-FXIDA-23777,FR21714755422956993549,PLEUROTHALLIS APPENDICULATA,6| AVENUE DE LÉVY,FR/04139 MARÉCHAL,611024064274696896,...,0.0,0.0,10,DPSUFRPP10,92216,DPSUFRPPEUR,1840201,1.690104e+08,DPSUFRPPABVVUS6S,1966056
5391548,2022-01-28 04:53:00,c87a8c77-63e3-43fb-9586-52b9a9a514be,DPSUFRPP,ABVVUS6S,PETX22-FXIDA-260,FR71714755422956974025,CYNOGLOSSUM FURCATUM,43| AVENUE VINCENT,FR/88504 SAINT NOÉMI,6110240642746969XX,...,0.0,4.0,4,DPSUFRPP4,130291,DPSUFRPPGBP,251926,3.699564e+06,DPSUFRPPABVVUS6S,1966056
5393691,2022-01-29 11:27:00,a1a6a27e-6b58-47c6-b7a8-80972e3bb9b7,DPSUFRPP,ABVVUS6S,PETX22-FXIDA-8525,FR2114755422956996847,LESQUERELLA UTAHENSIS,24| RUE DE RAYMOND,FR/62344 SAINT JULES,611024064274710012,...,,0.0,11,DPSUFRPP11,635743,DPSUFRPPGBP,251926,3.699564e+06,DPSUFRPPABVVUS6S,1966056


In [65]:
columns_to_drop = [
    "UETR",
    "Sender",
    "Receiver",
    "TransactionReference",
    "OrderingAccount",
    "OrderingName",
    "OrderingStreet",
    "OrderingCountryCityZip",
    "BeneficiaryAccount",
    "BeneficiaryName",
    "BeneficiaryStreet",
    "BeneficiaryCountryCityZip",
    "SettlementDate",
    "SettlementCurrency",
    "InstructedCurrency",
    "Timestamp",
    "sender_hour",
    "sender_currency",
    "sender_receiver",
]

f_ano = f_ano.drop(columns_to_drop, axis=1)
normal_ano = normal_ano.drop(columns_to_drop, axis=1)

In [66]:
f_ano = f_ano.drop(['MessageId'], axis=1)
normal_ano = normal_ano.drop(['order_flag', 'bene_flag'], axis=1)
normal_ano['Label'] = normal_ano['Label'].map({1:0}) # map the label 1 to 0
ano = pd.concat([f_ano, normal_ano])

In [67]:
ano

Unnamed: 0,SettlementAmount,InstructedAmount,Label,hour,sender_hour_freq,sender_currency_freq,sender_currency_amount_average,sender_receiver_freq
439,5.696691e+06,5.089512e+06,1,11,635743,1840201,1.690104e+08,19
534,9.865503e+06,8.813994e+06,1,8,195031,1840201,1.690104e+08,1966056
636,5.523044e+06,4.934373e+06,1,11,635743,75,3.468684e+08,1966056
2463,1.792749e+09,1.601670e+09,1,22,4810,1840201,1.690104e+08,1966056
2730,9.203884e+08,8.222893e+08,1,5,50010,1840201,1.690104e+08,24
...,...,...,...,...,...,...,...,...
5384166,4.417685e+06,3.946828e+06,0,11,635743,1840201,1.690104e+08,1966056
5385966,4.817090e+06,4.303663e+06,0,10,92216,1840201,1.690104e+08,1966056
5391548,3.658144e+06,2.939922e+06,0,4,130291,251926,3.699564e+06,1966056
5393691,2.723516e+06,2.188794e+06,0,11,635743,251926,3.699564e+06,1966056


# Training

In [68]:
Y = ano["Label"].values
X = ano.drop(["Label"], axis=1).values

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, stratify=Y, random_state = 0)
# train and test split

scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [69]:
%%time

xgb = XGBClassifier(n_estimators=100)
kfold = StratifiedKFold(n_splits=5, random_state=0, shuffle=True)
cv_results = cross_val_score(xgb, X_train, Y_train, cv=kfold, scoring="f1")

xgb.fit(X_train, Y_train)

pred_xgb = xgb.predict(X_test)
print("XGBoost Classification Report=\n\n", classification_report(Y_test, pred_xgb))

print("XGBoost Confusion Matrix=\n\n", confusion_matrix(Y_test, pred_xgb))

pred_proba_xgb = xgb.predict_proba(X_test)[:, 1]

print("AUPRC:", metrics.average_precision_score(y_true=Y_test, y_score=pred_proba_xgb))

XGBoost Classification Report=

               precision    recall  f1-score   support

           0       0.96      1.00      0.98       461
           1       1.00      0.98      0.99       955

    accuracy                           0.98      1416
   macro avg       0.98      0.99      0.98      1416
weighted avg       0.98      0.98      0.98      1416

XGBoost Confusion Matrix=

 [[459   2]
 [ 20 935]]
AUPRC: 0.9975559598451311
CPU times: user 5.52 s, sys: 2.14 s, total: 7.66 s
Wall time: 2.3 s


In [70]:
%%time
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(max_depth=7, random_state=0, n_estimators=10)
kfold = StratifiedKFold(n_splits=5, random_state=0, shuffle=True)
cv_results = cross_val_score(rf, X_train, Y_train, cv=kfold, scoring="f1")

rf.fit(X_train, Y_train)

pred_rf = rf.predict(X_test)
print("Random Forest Classification Report=\n\n", classification_report(Y_test, pred_rf))
print("Random Forest Confusion Matrix=\n\n", confusion_matrix(Y_test, pred_rf))
pred_proba_rf = rf.predict_proba(X_test)[:, 1]

print("AUPRC:", metrics.average_precision_score(y_true=Y_test, y_score=pred_proba_rf))

Random Forest Classification Report=

               precision    recall  f1-score   support

           0       0.84      0.98      0.90       461
           1       0.99      0.91      0.95       955

    accuracy                           0.93      1416
   macro avg       0.91      0.94      0.92      1416
weighted avg       0.94      0.93      0.93      1416

Random Forest Confusion Matrix=

 [[450  11]
 [ 86 869]]
AUPRC: 0.9937134479542321
CPU times: user 217 ms, sys: 123 ms, total: 340 ms
Wall time: 96.3 ms


# Extract features

Features in simple anomalies (normal_ano) are based on the entire (unfiltered data)<br>
Features in statistical anomalies (f_ano) are based on the filtered data (dataset after removing the simple anomalies<br>

In [71]:
all_ano = swift[swift['Label']==1]
f_ano = filtered[filtered['Label']==1]
normal_list = list(all_ano[~all_ano.UETR.isin(f_ano.UETR)].UETR)
normal_ano = all_ano[all_ano.UETR.isin(normal_list)]

In [72]:
swift["Timestamp"] = swift["Timestamp"].astype("datetime64[ns]")
filtered["Timestamp"] = filtered["Timestamp"].astype("datetime64[ns]")

f_ano["Timestamp"] = f_ano["Timestamp"].astype("datetime64[ns]")
normal_ano["Timestamp"] = normal_ano["Timestamp"].astype("datetime64[ns]")
#bank = pd.read_csv('../data/bank_dataset.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  f_ano["Timestamp"] = f_ano["Timestamp"].astype("datetime64[ns]")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  normal_ano["Timestamp"] = normal_ano["Timestamp"].astype("datetime64[ns]")


In [73]:
%%time
# Hour
swift["hour"] = swift["Timestamp"].dt.hour
filtered["hour"] = filtered["Timestamp"].dt.hour


f_ano["hour"] = f_ano["Timestamp"].dt.hour
normal_ano["hour"] = normal_ano["Timestamp"].dt.hour

# Hour frequency for each sender
senders = swift["Sender"].unique()

swift["sender_hour"] = swift["Sender"] + swift["hour"].astype(str)
filtered["sender_hour"] = filtered["Sender"] + filtered["hour"].astype(str)

f_ano["sender_hour"] = f_ano["Sender"] + f_ano["hour"].astype(str)
normal_ano["sender_hour"] = normal_ano["Sender"] + normal_ano["hour"].astype(str)

sender_hour_frequency = {}
sender_hour_frequency_f = {}

for s in senders:
    sender_rows = swift[swift["Sender"] == s]
    sender_rows_f = filtered[filtered["Sender"] == s]
    
    for h in range(24):
        sender_hour_frequency[s + str(h)] = len(sender_rows[sender_rows["hour"] == h])
        sender_hour_frequency_f[s + str(h)] = len(sender_rows_f[sender_rows_f["hour"] == h])


f_ano["sender_hour_freq"] = f_ano["sender_hour"].map(sender_hour_frequency_f)
normal_ano["sender_hour_freq"] = normal_ano["sender_hour"].map(sender_hour_frequency)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

CPU times: user 16.9 s, sys: 5.66 s, total: 22.6 s
Wall time: 24.7 s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [74]:
%%time
# Sender-Currency Frequency and Average Amount per Sender-Currency
swift["sender_currency"] = swift["Sender"] + swift["InstructedCurrency"]
filtered["sender_currency"] = filtered["Sender"] + filtered["InstructedCurrency"]

f_ano["sender_currency"] = f_ano["Sender"] + f_ano["InstructedCurrency"]
normal_ano["sender_currency"] = normal_ano["Sender"] + normal_ano["InstructedCurrency"]


sender_currency_freq = {}
sender_currency_avg = {}
sender_currency_freq_f = {}
sender_currency_avg_f = {}

for sc in set(
    list(swift["sender_currency"].unique())
):
    sender_currency_freq[sc] = len(swift[swift["sender_currency"] == sc])
    sender_currency_avg[sc] = swift[swift["sender_currency"] == sc][
        "InstructedAmount"
    ].mean()
    
    sender_currency_freq_f[sc] = len(filtered[filtered["sender_currency"] == sc])
    sender_currency_avg_f[sc] = filtered[filtered["sender_currency"] == sc][
        "InstructedAmount"
    ].mean()
    
#swift["sender_currency_freq"] = swift["sender_currency"].map(sender_currency_freq)
f_ano["sender_currency_freq"] = f_ano["sender_currency"].map(sender_currency_freq_f)
normal_ano["sender_currency_freq"] = normal_ano["sender_currency"].map(sender_currency_freq)

#swift["sender_currency_amount_average"] = swift["sender_currency"].map(sender_currency_avg)
f_ano["sender_currency_amount_average"] = f_ano["sender_currency"].map(sender_currency_avg_f)
normal_ano["sender_currency_amount_average"] = normal_ano["sender_currency"].map(sender_currency_avg)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


CPU times: user 40.8 s, sys: 3.77 s, total: 44.6 s
Wall time: 45.5 s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

In [75]:
%%time
# Sender-Receiver Frequency
swift["sender_receiver"] = swift["Sender"] + swift["Receiver"]
filtered["sender_receiver"] = filtered["Sender"] + filtered["Receiver"]

f_ano["sender_receiver"] = f_ano["Sender"] + f_ano["Receiver"]
normal_ano["sender_receiver"] = normal_ano["Sender"] + normal_ano["Receiver"]

sender_receiver_freq = {}
sender_receiver_freq_f = {}
for sr in set(
    list(swift["sender_receiver"].unique())
):
    sender_receiver_freq[sr] = len(swift[swift["sender_receiver"] == sr])
    sender_receiver_freq_f[sr] = len(filtered[filtered["sender_receiver"] == sr])

#swift["sender_receiver_freq"] = swift["sender_receiver"].map(sender_receiver_freq)
f_ano["sender_receiver_freq"] = f_ano["sender_receiver"].map(sender_receiver_freq_f)
normal_ano["sender_receiver_freq"] = normal_ano["sender_receiver"].map(sender_receiver_freq)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


CPU times: user 46.1 s, sys: 2.69 s, total: 48.8 s
Wall time: 49.3 s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [76]:
columns_to_drop = [
    "UETR",
    "Sender",
    "Receiver",
    "TransactionReference",
    "OrderingAccount",
    "OrderingName",
    "OrderingStreet",
    "OrderingCountryCityZip",
    "BeneficiaryAccount",
    "BeneficiaryName",
    "BeneficiaryStreet",
    "BeneficiaryCountryCityZip",
    "SettlementDate",
    "SettlementCurrency",
    "InstructedCurrency",
    "Timestamp",
    "sender_hour",
    "sender_currency",
    "sender_receiver",
]

f_ano = f_ano.drop(columns_to_drop, axis=1)
normal_ano = normal_ano.drop(columns_to_drop, axis=1)

In [77]:
f_ano = f_ano.drop(['MessageId'], axis=1)
normal_ano = normal_ano.drop(['order_flag', 'bene_flag'], axis=1)
normal_ano['Label'] = normal_ano['Label'].map({1:0}) # map the label 1 to 0
ano = pd.concat([f_ano, normal_ano])

# Training

In [78]:
Y = ano["Label"].values
X = ano.drop(["Label"], axis=1).values

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, stratify=Y, random_state = 0)
# train and test split

scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [79]:
%%time

xgb = XGBClassifier(n_estimators=100)
kfold = StratifiedKFold(n_splits=5, random_state=0, shuffle=True)
cv_results = cross_val_score(xgb, X_train, Y_train, cv=kfold, scoring="f1")

xgb.fit(X_train, Y_train)

pred_xgb = xgb.predict(X_test)
print("XGBoost Classification Report=\n\n", classification_report(Y_test, pred_xgb))

print("XGBoost Confusion Matrix=\n\n", confusion_matrix(Y_test, pred_xgb))

pred_proba_xgb = xgb.predict_proba(X_test)[:, 1]

print("AUPRC:", metrics.average_precision_score(y_true=Y_test, y_score=pred_proba_xgb))

XGBoost Classification Report=

               precision    recall  f1-score   support

           0       1.00      1.00      1.00       461
           1       1.00      1.00      1.00       955

    accuracy                           1.00      1416
   macro avg       1.00      1.00      1.00      1416
weighted avg       1.00      1.00      1.00      1416

XGBoost Confusion Matrix=

 [[461   0]
 [  0 955]]
AUPRC: 1.0
CPU times: user 2.52 s, sys: 1.12 s, total: 3.64 s
Wall time: 918 ms


In [80]:
%%time
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(max_depth=7, random_state=0, n_estimators=10)
kfold = StratifiedKFold(n_splits=5, random_state=0, shuffle=True)
cv_results = cross_val_score(rf, X_train, Y_train, cv=kfold, scoring="f1")

rf.fit(X_train, Y_train)

pred_rf = rf.predict(X_test)
print("Random Forest Classification Report=\n\n", classification_report(Y_test, pred_rf))
print("Random Forest Confusion Matrix=\n\n", confusion_matrix(Y_test, pred_rf))
pred_proba_rf = rf.predict_proba(X_test)[:, 1]

print("AUPRC:", metrics.average_precision_score(y_true=Y_test, y_score=pred_proba_rf))

Random Forest Classification Report=

               precision    recall  f1-score   support

           0       1.00      1.00      1.00       461
           1       1.00      1.00      1.00       955

    accuracy                           1.00      1416
   macro avg       1.00      1.00      1.00      1416
weighted avg       1.00      1.00      1.00      1416

Random Forest Confusion Matrix=

 [[461   0]
 [  0 955]]
AUPRC: 1.0
CPU times: user 186 ms, sys: 89.8 ms, total: 275 ms
Wall time: 84.9 ms


# Extrac features

Features in simple anomalies (normal_ano) are based on the filtered data<br>
Features in statistical anomalies (f_ano) are also based on the filtered data<br>

In [81]:
all_ano = swift[swift['Label']==1]
f_ano = filtered[filtered['Label']==1]
normal_list = list(all_ano[~all_ano.UETR.isin(f_ano.UETR)].UETR)
normal_ano = all_ano[all_ano.UETR.isin(normal_list)]

#swift["Timestamp"] = swift["Timestamp"].astype("datetime64[ns]")
filtered["Timestamp"] = filtered["Timestamp"].astype("datetime64[ns]")

f_ano["Timestamp"] = f_ano["Timestamp"].astype("datetime64[ns]")
normal_ano["Timestamp"] = normal_ano["Timestamp"].astype("datetime64[ns]")
#bank = pd.read_csv('../data/bank_dataset.csv')

# Hour
#swift["hour"] = swift["Timestamp"].dt.hour
filtered["hour"] = filtered["Timestamp"].dt.hour


f_ano["hour"] = f_ano["Timestamp"].dt.hour
normal_ano["hour"] = normal_ano["Timestamp"].dt.hour

# Hour frequency for each sender
senders = swift["Sender"].unique()

#swift["sender_hour"] = swift["Sender"] + swift["hour"].astype(str)
filtered["sender_hour"] = filtered["Sender"] + filtered["hour"].astype(str)

f_ano["sender_hour"] = f_ano["Sender"] + f_ano["hour"].astype(str)
normal_ano["sender_hour"] = normal_ano["Sender"] + normal_ano["hour"].astype(str)

#sender_hour_frequency = {}
sender_hour_frequency_f = {}

for s in senders:
 #   sender_rows = swift[swift["Sender"] == s]
    sender_rows_f = filtered[filtered["Sender"] == s]
    
    for h in range(24):
  #      sender_hour_frequency[s + str(h)] = len(sender_rows[sender_rows["hour"] == h])
        sender_hour_frequency_f[s + str(h)] = len(sender_rows_f[sender_rows_f["hour"] == h])


f_ano["sender_hour_freq"] = f_ano["sender_hour"].map(sender_hour_frequency_f)
normal_ano["sender_hour_freq"] = normal_ano["sender_hour"].map(sender_hour_frequency_f)

# Sender-Currency Frequency and Average Amount per Sender-Currency
#swift["sender_currency"] = swift["Sender"] + swift["InstructedCurrency"]
filtered["sender_currency"] = filtered["Sender"] + filtered["InstructedCurrency"]

f_ano["sender_currency"] = f_ano["Sender"] + f_ano["InstructedCurrency"]
normal_ano["sender_currency"] = normal_ano["Sender"] + normal_ano["InstructedCurrency"]


#sender_currency_freq = {}
#sender_currency_avg = {}
sender_currency_freq_f = {}
sender_currency_avg_f = {}

for sc in set(
    list(swift["sender_currency"].unique())
):
  #  sender_currency_freq[sc] = len(swift[swift["sender_currency"] == sc])
  #  sender_currency_avg[sc] = swift[swift["sender_currency"] == sc]["InstructedAmount"].mean()
    
    sender_currency_freq_f[sc] = len(filtered[filtered["sender_currency"] == sc])
    sender_currency_avg_f[sc] = filtered[filtered["sender_currency"] == sc][
        "InstructedAmount"
    ].mean()
    
#swift["sender_currency_freq"] = swift["sender_currency"].map(sender_currency_freq)
f_ano["sender_currency_freq"] = f_ano["sender_currency"].map(sender_currency_freq_f)
normal_ano["sender_currency_freq"] = normal_ano["sender_currency"].map(sender_currency_freq_f)

#swift["sender_currency_amount_average"] = swift["sender_currency"].map(sender_currency_avg)
f_ano["sender_currency_amount_average"] = f_ano["sender_currency"].map(sender_currency_avg_f)
normal_ano["sender_currency_amount_average"] = normal_ano["sender_currency"].map(sender_currency_avg_f)


# Sender-Receiver Frequency
#swift["sender_receiver"] = swift["Sender"] + swift["Receiver"]
filtered["sender_receiver"] = filtered["Sender"] + filtered["Receiver"]

f_ano["sender_receiver"] = f_ano["Sender"] + f_ano["Receiver"]
normal_ano["sender_receiver"] = normal_ano["Sender"] + normal_ano["Receiver"]

#sender_receiver_freq = {}
sender_receiver_freq_f = {}
for sr in set(
    list(swift["sender_receiver"].unique())
):
#    sender_receiver_freq[sr] = len(swift[swift["sender_receiver"] == sr])
    sender_receiver_freq_f[sr] = len(filtered[filtered["sender_receiver"] == sr])

#swift["sender_receiver_freq"] = swift["sender_receiver"].map(sender_receiver_freq)
f_ano["sender_receiver_freq"] = f_ano["sender_receiver"].map(sender_receiver_freq_f)
normal_ano["sender_receiver_freq"] = normal_ano["sender_receiver"].map(sender_receiver_freq_f)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  f_ano["Timestamp"] = f_ano["Timestamp"].astype("datetime64[ns]")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  normal_ano["Timestamp"] = normal_ano["Timestamp"].astype("datetime64[ns]")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  f_ano["hour"] = f_ano["Timestamp"].dt.hour
A value is trying to b

In [82]:
columns_to_drop = [
    "UETR",
    "Sender",
    "Receiver",
    "TransactionReference",
    "OrderingAccount",
    "OrderingName",
    "OrderingStreet",
    "OrderingCountryCityZip",
    "BeneficiaryAccount",
    "BeneficiaryName",
    "BeneficiaryStreet",
    "BeneficiaryCountryCityZip",
    "SettlementDate",
    "SettlementCurrency",
    "InstructedCurrency",
    "Timestamp",
    "sender_hour",
    "sender_currency",
    "sender_receiver",
]

f_ano = f_ano.drop(columns_to_drop, axis=1)
normal_ano = normal_ano.drop(columns_to_drop, axis=1)

In [83]:
f_ano = f_ano.drop(['MessageId'], axis=1)
normal_ano = normal_ano.drop(['order_flag', 'bene_flag'], axis=1)
normal_ano['Label'] = normal_ano['Label'].map({1:0}) # map the label 1 to 0
ano = pd.concat([f_ano, normal_ano])

In [84]:
Y = ano["Label"].values
X = ano.drop(["Label"], axis=1).values

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, stratify=Y, random_state = 0)
# train and test split

scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [85]:
%%time

xgb = XGBClassifier(n_estimators=100)
kfold = StratifiedKFold(n_splits=5, random_state=0, shuffle=True)
cv_results = cross_val_score(xgb, X_train, Y_train, cv=kfold, scoring="f1")

xgb.fit(X_train, Y_train)

pred_xgb = xgb.predict(X_test)
print("XGBoost Classification Report=\n\n", classification_report(Y_test, pred_xgb))

print("XGBoost Confusion Matrix=\n\n", confusion_matrix(Y_test, pred_xgb))

pred_proba_xgb = xgb.predict_proba(X_test)[:, 1]

print("AUPRC:", metrics.average_precision_score(y_true=Y_test, y_score=pred_proba_xgb))

XGBoost Classification Report=

               precision    recall  f1-score   support

           0       0.96      1.00      0.98       461
           1       1.00      0.98      0.99       955

    accuracy                           0.98      1416
   macro avg       0.98      0.99      0.98      1416
weighted avg       0.98      0.98      0.98      1416

XGBoost Confusion Matrix=

 [[459   2]
 [ 20 935]]
AUPRC: 0.9975559598451311
CPU times: user 5.41 s, sys: 2.23 s, total: 7.64 s
Wall time: 2.04 s


In [86]:
%%time
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(max_depth=7, random_state=0, n_estimators=10)
kfold = StratifiedKFold(n_splits=5, random_state=0, shuffle=True)
cv_results = cross_val_score(rf, X_train, Y_train, cv=kfold, scoring="f1")

rf.fit(X_train, Y_train)

pred_rf = rf.predict(X_test)
print("Random Forest Classification Report=\n\n", classification_report(Y_test, pred_rf))
print("Random Forest Confusion Matrix=\n\n", confusion_matrix(Y_test, pred_rf))
pred_proba_rf = rf.predict_proba(X_test)[:, 1]

print("AUPRC:", metrics.average_precision_score(y_true=Y_test, y_score=pred_proba_rf))

Random Forest Classification Report=

               precision    recall  f1-score   support

           0       0.84      0.98      0.90       461
           1       0.99      0.91      0.95       955

    accuracy                           0.93      1416
   macro avg       0.91      0.94      0.92      1416
weighted avg       0.94      0.93      0.93      1416

Random Forest Confusion Matrix=

 [[450  11]
 [ 86 869]]
AUPRC: 0.9937134479542321
CPU times: user 175 ms, sys: 73.7 ms, total: 248 ms
Wall time: 101 ms
