In [1]:
import pandas as pd
import seaborn as sns
import numpy as np

In [2]:
# Load the CSV dataset; the relative path is resolved against the notebook's working directory.
file_path = './Data/financial_anomaly_data.csv'
df = pd.read_csv(file_path)
# Preview the first six rows.
df.head(6)

Unnamed: 0,Timestamp,TransactionID,AccountID,Amount,Merchant,TransactionType,Location
0,01-01-2023 08:00,TXN1127,ACC4,95071.92,MerchantH,Purchase,Tokyo
1,01-01-2023 08:01,TXN1639,ACC10,15607.89,MerchantH,Purchase,London
2,01-01-2023 08:02,TXN872,ACC8,65092.34,MerchantE,Withdrawal,London
3,01-01-2023 08:03,TXN1438,ACC6,87.87,MerchantE,Purchase,London
4,01-01-2023 08:04,TXN1338,ACC6,716.56,MerchantI,Purchase,Los Angeles
5,01-01-2023 08:05,TXN1083,ACC15,13957.99,MerchantC,Transfer,London


In [3]:
# Summary statistics for the numeric columns of the raw frame.
df.describe()

Unnamed: 0,Amount
count,216960.0
mean,50090.025108
std,29097.905016
min,10.51
25%,25061.2425
50%,50183.98
75%,75080.46
max,978942.26


In [4]:
# Element-wise mask: True wherever a value is missing.
df.isnull()

Unnamed: 0,Timestamp,TransactionID,AccountID,Amount,Merchant,TransactionType,Location
0,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...
217436,True,True,True,True,True,True,True
217437,True,True,True,True,True,True,True
217438,True,True,True,True,True,True,True
217439,True,True,True,True,True,True,True


In [5]:
# Number of missing values in each column.
df.isnull().sum()

Timestamp          481
TransactionID      481
AccountID          481
Amount             481
Merchant           481
TransactionType    481
Location           481
dtype: int64

In [6]:
# Number of distinct (non-missing) values in each column.
df.nunique()

Timestamp          216960
TransactionID        1999
AccountID              15
Amount             214687
Merchant               10
TransactionType         3
Location                5
dtype: int64

In [7]:
# Drop every row that has at least one missing value; `df` itself is left untouched.
df_cleaned = df.dropna()
df_cleaned

Unnamed: 0,Timestamp,TransactionID,AccountID,Amount,Merchant,TransactionType,Location
0,01-01-2023 08:00,TXN1127,ACC4,95071.92,MerchantH,Purchase,Tokyo
1,01-01-2023 08:01,TXN1639,ACC10,15607.89,MerchantH,Purchase,London
2,01-01-2023 08:02,TXN872,ACC8,65092.34,MerchantE,Withdrawal,London
3,01-01-2023 08:03,TXN1438,ACC6,87.87,MerchantE,Purchase,London
4,01-01-2023 08:04,TXN1338,ACC6,716.56,MerchantI,Purchase,Los Angeles
...,...,...,...,...,...,...,...
216955,31-05-2023 23:55,TXN1286,ACC6,62536.88,MerchantA,Withdrawal,San Francisco
216956,31-05-2023 23:56,TXN1015,ACC5,68629.69,MerchantG,Transfer,London
216957,31-05-2023 23:57,TXN1979,ACC15,8203.57,MerchantF,Purchase,London
216958,31-05-2023 23:58,TXN1845,ACC14,77800.36,MerchantF,Purchase,New York


In [8]:
# Sanity check: select the rows of df_cleaned that still contain a missing value.
df_cleaned[df_cleaned.isnull().any(axis=1)]

Unnamed: 0,Timestamp,TransactionID,AccountID,Amount,Merchant,TransactionType,Location


In [9]:
# Distinct-value counts per column after the incomplete rows were dropped.
df_cleaned.nunique()

Timestamp          216960
TransactionID        1999
AccountID              15
Amount             214687
Merchant               10
TransactionType         3
Location                5
dtype: int64

In [10]:
# List the distinct values of every column that has fewer than 20 of them.

columns_to_include = list(filter(
    lambda name: df_cleaned[name].nunique() < 20, df_cleaned.columns))
for column_name in columns_to_include:
  unique_values = np.sort(df_cleaned[column_name].unique())
  # Header first, then one sorted value per line, then a blank separator line.
  print(f"Unique values in column '{column_name}':", *unique_values, sep='\n')
  print()

Unique values in column 'AccountID':
ACC1
ACC10
ACC11
ACC12
ACC13
ACC14
ACC15
ACC2
ACC3
ACC4
ACC5
ACC6
ACC7
ACC8
ACC9

Unique values in column 'Merchant':
MerchantA
MerchantB
MerchantC
MerchantD
MerchantE
MerchantF
MerchantG
MerchantH
MerchantI
MerchantJ

Unique values in column 'TransactionType':
Purchase
Transfer
Withdrawal

Unique values in column 'Location':
London
Los Angeles
New York
San Francisco
Tokyo



In [11]:
# Keep a handle on the frame as loaded, then rebind `df` to the NaN-free copy.
# NOTE(review): this cell is order-sensitive — if it is re-run after `df` has
# been rebound, `df_original` will no longer refer to the frame as loaded.
df_original = df
df = df_cleaned
df

Unnamed: 0,Timestamp,TransactionID,AccountID,Amount,Merchant,TransactionType,Location
0,01-01-2023 08:00,TXN1127,ACC4,95071.92,MerchantH,Purchase,Tokyo
1,01-01-2023 08:01,TXN1639,ACC10,15607.89,MerchantH,Purchase,London
2,01-01-2023 08:02,TXN872,ACC8,65092.34,MerchantE,Withdrawal,London
3,01-01-2023 08:03,TXN1438,ACC6,87.87,MerchantE,Purchase,London
4,01-01-2023 08:04,TXN1338,ACC6,716.56,MerchantI,Purchase,Los Angeles
...,...,...,...,...,...,...,...
216955,31-05-2023 23:55,TXN1286,ACC6,62536.88,MerchantA,Withdrawal,San Francisco
216956,31-05-2023 23:56,TXN1015,ACC5,68629.69,MerchantG,Transfer,London
216957,31-05-2023 23:57,TXN1979,ACC15,8203.57,MerchantF,Purchase,London
216958,31-05-2023 23:58,TXN1845,ACC14,77800.36,MerchantF,Purchase,New York


In [12]:
# Summary statistics again, now that `df` refers to the cleaned frame.
df.describe()

Unnamed: 0,Amount
count,216960.0
mean,50090.025108
std,29097.905016
min,10.51
25%,25061.2425
50%,50183.98
75%,75080.46
max,978942.26


In [13]:
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import (
    FunctionTransformer, LabelEncoder, MinMaxScaler, OrdinalEncoder)

In [14]:
# Remove the TransactionID column before feature engineering.
# NOTE(review): not idempotent — running this cell a second time raises
# KeyError because the column is already gone.
df = df.drop(columns=['TransactionID'])

In [15]:
def date_time_splitter(df: pd.DataFrame):
  """Replace the ``Timestamp`` column with separate date and time columns.

  The caller's frame is not modified; a copy is transformed and returned.

  Args:
    df: Frame with a ``Timestamp`` column of strings formatted as
      ``DD-MM-YYYY HH:MM``.

  Returns:
    A new DataFrame without ``Timestamp`` and with ``Day``, ``Month``,
    ``Year``, ``Hour`` and ``Minute`` columns appended in that order.
  """
  result = df.copy()
  result['Timestamp'] = pd.to_datetime(
      result['Timestamp'], format='%d-%m-%Y %H:%M')

  # (new column name, datetime accessor attribute), in output order.
  parts = (
      ('Day', 'day'),
      ('Month', 'month'),
      ('Year', 'year'),
      ('Hour', 'hour'),
      ('Minute', 'minute'),
  )
  for column, attribute in parts:
    result.loc[:, column] = getattr(result['Timestamp'].dt, attribute)

  return result.drop(columns=['Timestamp'])

In [16]:
# Snapshot the frame that still has the raw Timestamp column, then split it.
# NOTE(review): this overwrites the `df_original` assigned in an earlier cell,
# and a second run fails because `df` no longer has a Timestamp column.
df_original = df
df = date_time_splitter(df)

In [17]:
# Names of the columns stored with object dtype.
categorical_cols = df.select_dtypes(include=['object']).columns
categorical_cols

# Names of the columns with a numeric dtype.
numerical_cols = df.select_dtypes(include=['number']).columns
numerical_cols

Index(['AccountID', 'Merchant', 'TransactionType', 'Location'], dtype='object')

Index(['Amount', 'Day', 'Month', 'Year', 'Hour', 'Minute'], dtype='object')

In [18]:
# Per-column preprocessing steps consumed by the ColumnTransformer.
# OrdinalEncoder is used instead of LabelEncoder: LabelEncoder is designed for
# a 1-D target and its fit/transform accept only `y`, so it cannot serve as a
# ColumnTransformer step over several feature columns. The step keeps the name
# 'label' so lookups by step name stay valid.
transformers = [
    # ('scaler', MinMaxScaler(), ['Amount']),
    ('label', OrdinalEncoder(), [
        'AccountID', 'Merchant', 'TransactionType', 'Location'
    ]),
    # ("date_time_splitter", FunctionTransformer(lambda df_original: date_time_splitter(df_original)), df_original.columns)
]

In [19]:
# Apply the steps listed in `transformers`; columns not named there pass through unchanged.
preprocessor = ColumnTransformer(
    transformers=transformers, remainder='passthrough')

In [20]:
# preprocessor.fit(df_original)

In [21]:
def encode_categorical_features(df, columns=None):
  """Integer-encode categorical data, one ``LabelEncoder`` per column.

  ``LabelEncoder`` only accepts 1-D input, so passing it a multi-column frame
  raises. Two-dimensional input is therefore encoded column by column.

  Args:
    df: A 1-D sequence/Series, or a 2-D DataFrame/array of categorical values.
    columns: Optional columns to encode when ``df`` is 2-D, given as column
      labels or integer positions. ``None`` (the default) encodes every
      column.

  Returns:
    For 1-D input with ``columns=None``: the 1-D array of integer codes
    returned by ``LabelEncoder.fit_transform`` (the original behaviour).
    For 2-D input (including a single-column frame): a new DataFrame in
    which the selected columns hold integer codes and all other columns are
    copied unchanged. The input is never modified.
  """
  # 1-D input (or anything without `ndim`, e.g. a list) keeps the old path.
  if columns is None and getattr(df, 'ndim', 1) == 1:
    return LabelEncoder().fit_transform(df)

  encoded = pd.DataFrame(df).copy()
  if columns is None:
    targets = list(encoded.columns)
  else:
    # Use the entry as a label when it exists, otherwise as a position.
    targets = [
        c if c in encoded.columns else encoded.columns[c] for c in columns]

  for name in targets:
    # A fresh encoder per column keeps each column's codes independent.
    encoded[name] = LabelEncoder().fit_transform(encoded[name])
  return encoded

In [22]:
def scale_numerical_features(df):
  """Min-max scale the ``Amount`` column.

  A new ``MinMaxScaler`` is fitted on every call.

  Args:
    df: Object supporting ``df[['Amount']]``, e.g. a DataFrame that has an
      ``Amount`` column.

  Returns:
    The output of ``MinMaxScaler.fit_transform`` for the ``Amount`` column.
  """
  amount = df[['Amount']]
  return MinMaxScaler().fit_transform(amount)

In [23]:
# The three stages expect a DataFrame that still has the raw `Timestamp`
# column. Columns are selected by name and the ColumnTransformers emit pandas
# output, so no stage depends on the column order left by the previous one
# (a ColumnTransformer places transformed columns before the remainder).
# `set_output` requires scikit-learn >= 1.2.

# Stage 1: split Timestamp into Day/Month/Year/Hour/Minute. The helper already
# carries every other column through, so it is applied to the whole frame.
trf1 = FunctionTransformer(date_time_splitter)

# Stage 2: integer-encode the categorical columns. OrdinalEncoder learns the
# category -> code mapping in fit() and reuses it in transform(), unlike a
# LabelEncoder re-fitted inside a function on every call.
trf2 = ColumnTransformer(
    transformers=[
        ("label_encode_categorical", OrdinalEncoder(),
         ['AccountID', 'Merchant', 'TransactionType', 'Location']),
    ],
    remainder='passthrough',
    verbose_feature_names_out=False,  # keep the original column names
).set_output(transform='pandas')

# Stage 3: scale Amount to [0, 1] using the min/max learned in fit().
trf3 = ColumnTransformer(
    transformers=[
        ("scale_amount", MinMaxScaler(), ['Amount']),
    ],
    remainder='passthrough',
    verbose_feature_names_out=False,
).set_output(transform='pandas')

In [24]:
# Run the three stages in sequence; each stage receives the previous stage's output.
pipeline = Pipeline([
    ('trf1', trf1),
    ('trf2', trf2),
    ('trf3', trf3)
])

In [25]:
# data = pd.DataFrame(df)

In [26]:
# pipeline.fit(data)
# df_transformed = pipeline.transform(data)

# print(df_transformed)