In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import PowerTransformer

Timestamp Handling and Related Feature Engineering

In [None]:
# Load the raw transaction table.
train_df = pd.read_csv("/content/train.csv")

# `Date` and `Time` are separate string columns; join them with a space and
# parse the result with one explicit format.
train_df['DateTime'] = pd.to_datetime(
    train_df['Date'] + ' ' + train_df['Time'],
    format='%Y-%m-%d %H:%M:%S',
)

# Seconds since the Unix epoch. Floor-dividing the offset from the epoch by a
# one-second Timedelta does not depend on the storage unit of the datetime
# column, whereas `.astype('int64') // 10**9` is only correct when the column
# is stored at nanosecond resolution.
train_df['Timestamp'] = (
    (train_df['DateTime'] - pd.Timestamp('1970-01-01')) // pd.Timedelta(seconds=1)
)


In [5]:
# Calendar features derived from the parsed `DateTime` column.
datetime_parts = train_df['DateTime'].dt

train_df['hour'] = datetime_parts.hour
train_df['day_of_week'] = datetime_parts.dayofweek  # pandas convention: Monday=0 ... Sunday=6
# Saturday (5) and Sunday (6) count as the weekend.
train_df['is_weekend'] = train_df['day_of_week'].ge(5)


In [6]:
# Month number and ISO calendar week of each transaction.
transaction_times = train_df['DateTime']

train_df['month'] = transaction_times.dt.month
train_df['week'] = transaction_times.dt.isocalendar()['week']

Sender & Receiver Embedding for Graph Analysis

In [12]:
# Preview the first five rows, including the time-derived columns added above.
train_df.head(n=5)

Unnamed: 0,Time,Date,Sender_account,Receiver_account,Amount,Payment_currency,Received_currency,Sender_bank_location,Receiver_bank_location,Payment_type,Is_laundering,Laundering_type,DateTime,Timestamp,hour,day_of_week,is_weekend,month,week
0,16:06:46,2023-07-02,7483018893,6829969150,229.88,UK pounds,UK pounds,UK,UK,Cash Withdrawal,0,Normal_Cash_Withdrawal,2023-07-02 16:06:46,1688314006,16,6,True,7,26
1,22:13:20,2023-03-19,9446578208,7077419779,5134.16,UK pounds,UK pounds,UK,UK,Credit card,0,Normal_Fan_Out,2023-03-19 22:13:20,1679264000,22,6,True,3,11
2,01:35:17,2023-03-12,604583965,2509698744,11733.34,UK pounds,UK pounds,UK,UK,Debit card,0,Normal_Fan_Out,2023-03-12 01:35:17,1678584917,1,6,True,3,10
3,23:52:14,2023-03-30,5237553101,8569687956,3902.44,UK pounds,UK pounds,UK,UK,Debit card,0,Normal_Group,2023-03-30 23:52:14,1680220334,23,3,False,3,13
4,08:06:47,2023-07-25,7407074160,3732161109,6467.05,UK pounds,UK pounds,UK,UK,Debit card,0,Normal_Small_Fan_Out,2023-07-25 08:06:47,1690272407,8,1,False,7,30


In [15]:
# Fit one encoder on the union of sender and receiver accounts so that a given
# account number maps to the same integer id in both columns.
account_encoder = LabelEncoder()
all_accounts = pd.concat([train_df['Sender_account'], train_df['Receiver_account']]).unique()
account_encoder.fit(all_accounts)

train_df['Sender_encoded'] = account_encoder.transform(train_df['Sender_account'])
train_df['Receiver_encoded'] = account_encoder.transform(train_df['Receiver_account'])


# for later graph analysis
num_accounts = len(account_encoder.classes_)
embedding_dim = 16

# nn.Embedding initialises its weight matrix from torch's global RNG. Seed it
# first so that Restart & Run All reproduces the same account vectors.
EMBEDDING_SEED = 42
torch.manual_seed(EMBEDDING_SEED)
account_embedding = nn.Embedding(num_accounts, embedding_dim)

# Embedding lookups need integer index tensors; make the dtype explicit rather
# than relying on whatever integer dtype the encoder produced.
sender_ids = torch.tensor(train_df['Sender_encoded'].to_numpy(), dtype=torch.long)
sender_vectors = account_embedding(sender_ids)

receiver_ids = torch.tensor(train_df['Receiver_encoded'].to_numpy(), dtype=torch.long)
receiver_vectors = account_embedding(receiver_ids)

# One embedding row per transaction: expected shape is (n_rows, embedding_dim).
print(sender_vectors.shape)
print(receiver_vectors.shape)

torch.Size([1140570, 16])
torch.Size([1140570, 16])


In [16]:
# Preview the first five rows, now with the encoded sender/receiver ids.
train_df.head(n=5)

Unnamed: 0,Time,Date,Sender_account,Receiver_account,Amount,Payment_currency,Received_currency,Sender_bank_location,Receiver_bank_location,Payment_type,...,Laundering_type,DateTime,Timestamp,hour,day_of_week,is_weekend,month,week,Sender_encoded,Receiver_encoded
0,16:06:46,2023-07-02,7483018893,6829969150,229.88,UK pounds,UK pounds,UK,UK,Cash Withdrawal,...,Normal_Cash_Withdrawal,2023-07-02 16:06:46,1688314006,16,6,True,7,26,451984,412455
1,22:13:20,2023-03-19,9446578208,7077419779,5134.16,UK pounds,UK pounds,UK,UK,Credit card,...,Normal_Fan_Out,2023-03-19 22:13:20,1679264000,22,6,True,3,11,569896,427502
2,01:35:17,2023-03-12,604583965,2509698744,11733.34,UK pounds,UK pounds,UK,UK,Debit card,...,Normal_Fan_Out,2023-03-12 01:35:17,1678584917,1,6,True,3,10,36491,151416
3,23:52:14,2023-03-30,5237553101,8569687956,3902.44,UK pounds,UK pounds,UK,UK,Debit card,...,Normal_Group,2023-03-30 23:52:14,1680220334,23,3,False,3,13,316527,517027
4,08:06:47,2023-07-25,7407074160,3732161109,6467.05,UK pounds,UK pounds,UK,UK,Debit card,...,Normal_Small_Fan_Out,2023-07-25 08:06:47,1690272407,8,1,False,7,30,447242,225178




Power transform (Yeo-Johnson) of `Amount` to reduce skewness; the transformed values are also standardized

In [18]:
# Yeo-Johnson power transform of `Amount`, with standardization enabled.
pt = PowerTransformer(method='yeo-johnson', standardize=True)

# The transformer is given a single-column DataFrame (double brackets) rather
# than a Series.
amount_frame = train_df[['Amount']]
train_df['amount_power'] = pt.fit_transform(amount_frame)

Label encoding for categorical variables


In [20]:
# Share one encoder between both currency columns so that a currency gets the
# same integer code whether it is the payment or the received currency.
all_currencies = pd.concat(
    [train_df['Payment_currency'], train_df['Received_currency']]
).unique()
currency_encoder = LabelEncoder().fit(all_currencies)

for currency_col in ('Payment_currency', 'Received_currency'):
    train_df[f'{currency_col}_encoded'] = currency_encoder.transform(train_df[currency_col])

In [None]:
# Integer-encode the payment type; the fitted encoder is kept so the same
# mapping can be reused or inverted later.
payment_type_encoder = LabelEncoder().fit(train_df['Payment_type'])
train_df['Payment_type_encoded'] = payment_type_encoder.transform(train_df['Payment_type'])

In [25]:
# Share one encoder between sender and receiver bank locations so that a
# location gets the same integer code in both columns.
all_locations = pd.concat(
    [train_df['Sender_bank_location'], train_df['Receiver_bank_location']]
).unique()
bank_location_encoder = LabelEncoder().fit(all_locations)

for location_col in ('Sender_bank_location', 'Receiver_bank_location'):
    train_df[f'{location_col}_encoded'] = bank_location_encoder.transform(train_df[location_col])