In [None]:
import pandas as pd
from pyod.models.ecod import ECOD
import numpy as np
from sklearn.preprocessing import MinMaxScaler
pd.options.mode.chained_assignment = None 
import matplotlib.pyplot as plt

In [None]:
# Read the customer metadata file and discard every row that contains a missing value.
customer_metadata_path = 'data/synth_customer_metadata_openslava.ftr'
df_customer = pd.read_feather(customer_metadata_path).dropna()
df_customer

In [None]:
# Replace each categorical column with its integer category codes so that
# downstream numeric code (scaling, modelling) can consume it.
for categorical_column in ('CITY_ADDRESS', 'GENDER'):
    as_category = df_customer[categorical_column].astype('category')
    df_customer[categorical_column] = as_category.cat.codes
df_customer

In [None]:
# Read the transactions file and discard every row that contains a missing value.
transactions_path = 'data/synth_transactions_openslava.ftr'
df_transaction = pd.read_feather(transactions_path).dropna()
df_transaction

In [None]:
# The CURRENCY column is not used as a feature; remove it.
df_transaction = df_transaction.drop(columns=['CURRENCY'])

In [None]:
# Derive integer day-of-month and month features from the transaction date.
# The date is parsed once with pandas instead of slicing its string form at
# fixed offsets: slicing fails as soon as the value carries a time component
# (e.g. 'YYYY-MM-DD HH:MM:SS') and runs a Python-level lambda for every row.
txn_dates = pd.to_datetime(df_transaction['DT_TXN'])
df_transaction['DT_TXN_DAY'] = txn_dates.dt.day.astype(int)
df_transaction['DT_TXN_MONTH'] = txn_dates.dt.month.astype(int)

# The raw date is no longer needed once the two numeric features exist.
df_transaction = df_transaction.drop('DT_TXN', axis=1)
df_transaction

In [None]:
# Attach the customer attributes to every transaction of that customer
# (default inner join on the shared 'ID' column).
df_merged = df_customer.merge(df_transaction, on='ID')
df_merged

In [None]:
# Training split: rows whose ID lies below 80% of 4111.
# NOTE(review): 4111 is presumably the number of customer IDs -- confirm.
train_id_cutoff = int(4111*0.8)
is_train_row = df_merged['ID'] < train_id_cutoff
df_merged_train = df_merged.loc[is_train_row]
df_merged_train.shape

In [None]:
# Test split: rows whose ID is at or above 80% of 4111.
# NOTE(review): 4111 is presumably the number of customer IDs -- confirm.
test_id_cutoff = int(4111*0.8)
is_test_row = df_merged['ID'] >= test_id_cutoff
df_merged_test = df_merged.loc[is_test_row]
df_merged_test.shape

In [None]:
# Global anomaly model: one ECOD detector over all customers' transactions.
# The scaler is fitted on the training split only and then applied to the test
# split. np.array(...) hands the scaler a fresh array, so copy=False cannot
# modify the DataFrames in place.
global_scaler = MinMaxScaler(feature_range=(0, 10), copy=False)
global_train_scaled = global_scaler.fit_transform(np.array(df_merged_train.drop('ID', axis=1)))
global_test_scaled = global_scaler.transform(np.array(df_merged_test.drop('ID', axis=1)))

global_model = ECOD(contamination=0.0001, n_jobs=-1)
# Fit on the scaled training matrix (the original referenced the undefined
# name `global_scaled`, which raised NameError on a fresh kernel).
global_model.fit(global_train_scaled)
# Predict with the model fitted above (the original called the undefined name `ecod`).
predictions = global_model.predict(global_test_scaled)

In [None]:
# Count predicted anomalies (label 1) versus the rest and show both counts
# as a bar chart on a logarithmic y axis.
total_predictions = len(predictions)
anomalies = sum(predictions)
non_anomalies = total_predictions - anomalies
print(f'anomalies: {anomalies} non_anomalies: {non_anomalies}')

labels = ['anomalies', 'non-anomalies']
bar_colors = ['tab:red', 'tab:blue']
bar_heights = [anomalies, non_anomalies]

fig, ax = plt.subplots()
ax.bar(labels, bar_heights, color=bar_colors)
ax.set(ylabel='Transaction count', title='Anomaly distribution')
ax.set_yscale("log")
plt.show()

In [None]:
def create_ratio_column(df, column_name):
    """Add a ``<column_name>_ratio`` column holding ``VL_TXN / <column_name>``.

    The frame is modified in place; nothing is returned.

    Args:
        df: DataFrame containing a ``VL_TXN`` column and ``column_name``.
        column_name: Name of the column used as the denominator.

    Raises:
        KeyError: If ``VL_TXN`` or ``column_name`` is not a column of ``df``.
    """
    # Vectorized column division replaces the row-wise apply: it avoids a
    # Python call per row and also works on an empty frame, where
    # DataFrame.apply(axis=1) yields a DataFrame instead of a Series.
    df[f'{column_name}_ratio'] = df['VL_TXN'] / df[column_name]

In [None]:
# Split the merged frame into one sub-frame per customer ID and take the last
# group as the sample for the per-customer ("local") model.
df_list = [customer_rows for _customer_id, customer_rows in df_merged.groupby('ID')]
local_sample = df_list[len(df_list) - 1]
local_sample

In [None]:
# Keep the column names and copy the first row of the sample as a plain list;
# the copy is the template for an artificial transaction.
local_sample_features = local_sample.columns.tolist()
dummy_transaction = local_sample.values[0].tolist()
dummy_transaction

In [None]:
# Overwrite the amount and month of the template row, then put the resulting
# artificial transaction in front of the customer's real transactions.
for feature_name, injected_value in (('VL_TXN', 50000), ('DT_TXN_MONTH', 12)):
    feature_position = local_sample_features.index(feature_name)
    dummy_transaction[feature_position] = injected_value

dummy_transaction_df = pd.DataFrame([dummy_transaction], columns=local_sample_features)
local_sample = pd.concat([dummy_transaction_df, local_sample])
local_sample

In [None]:
# Remove the columns that are not used by the local model.
columns_not_used_locally = [
    'ID', 'GENDER', 'AGE_YEARS', 'CITY_ADDRESS', 'CNT_CARDS',
    'CNT_TXN', 'CNT_TXN_3M', 'CNT_TXN_6M', 'CNT_TXN_12M',
]
local_sample = local_sample.drop(columns=columns_not_used_locally)

# Each of these columns is replaced by the ratio VL_TXN / <column>.
columns_to_ratio = [
    'VL_CURR_BALANCE', 'VL_INCOME', 'VL_TXN_ALL',
    'VL_CURR_BALANCE_3M', 'VL_TXN_ALL_3M', 'VL_INCOME_3M',
    'VL_CURR_BALANCE_6M', 'VL_TXN_ALL_6M', 'VL_INCOME_6M',
    'VL_CURR_BALANCE_12M', 'VL_TXN_ALL_12M', 'VL_INCOME_12M',
]

# First append every ratio column, then drop all the source columns at once.
for column_name in columns_to_ratio:
    create_ratio_column(local_sample, column_name)
local_sample = local_sample.drop(columns=columns_to_ratio)
local_sample

In [None]:
# Number of sample transactions per month.
local_sample.value_counts(subset='DT_TXN_MONTH')

In [None]:
# Local training data: every sample transaction from months 1-11.
is_before_december = local_sample['DT_TXN_MONTH'] < 12
local_sample_train = local_sample.loc[is_before_december]
local_sample_train.shape

In [None]:
# Local test data: the sample transactions from month 12.
is_december = local_sample['DT_TXN_MONTH'] == 12
local_sample_test = local_sample.loc[is_december]
local_sample_test

In [None]:
# Per-customer ("local") anomaly model fitted on the customer's training months.
# np.array(...) gives the scaler its own array, so copy=False cannot rescale
# local_sample_train in place.
local_train_matrix = np.array(local_sample_train)
local_scaler = MinMaxScaler(copy=False, feature_range=(0, 10))
local_sample_train_scaled = local_scaler.fit_transform(local_train_matrix)

local_model = ECOD(n_jobs=-1, contamination=0.01)
local_model.fit(local_sample_train_scaled)

In [None]:
# Score the first month-12 row of the sample with the local model.
dummy_data_local_test = local_sample_test.values[0].tolist()
single_row_batch = [dummy_data_local_test]
dummy_data_local_test_scaled = local_scaler.transform(single_row_batch)
local_model.predict(dummy_data_local_test_scaled)

In [None]:
# Score the artificial transaction with the global model, using the same
# feature set (everything except 'ID') that the global scaler was fitted on.
dummy_global_features = dummy_transaction_df.drop(columns=['ID'])
dummy_data_global_test_scaled = global_scaler.transform(np.array(dummy_global_features))
global_model.predict(dummy_data_global_test_scaled)

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [None]:
# Two-branch Keras model: one dense bottleneck branch over the global feature
# vector and one over the local (ratio) feature vector. The two branch outputs
# are concatenated and fed into a single linear output unit.

# Global branch: 16 -> 8 -> 16 hidden units, output width equals the number of global features.
global_input = keras.Input(shape=(global_train_scaled.shape[1],))
x = layers.Dense(16, activation="relu")(global_input)
x = layers.Dense(8, activation="relu")(x)
x = layers.Dense(16, activation="relu")(x)
global_out = layers.Dense(global_train_scaled.shape[1], activation='sigmoid')(x)

# The original called `scaler.fit_transform`, but no `scaler` is defined
# anywhere in the notebook (NameError). A fresh scaler is used here so the
# already-fitted `local_scaler` (fitted on the training months only) is not
# overwritten by this fit on the whole sample.
local_scaled = MinMaxScaler(feature_range=(0, 10)).fit_transform(np.array(local_sample))

# Local branch: 8 -> 4 -> 8 hidden units, output width equals the number of local features.
local_input = keras.Input(shape=(local_scaled.shape[1],))
x = layers.Dense(8, activation="relu")(local_input)
x = layers.Dense(4, activation="relu")(x)
x = layers.Dense(8, activation="relu")(x)
local_out = layers.Dense(local_scaled.shape[1], activation='sigmoid')(x)

# Combine both branches into one scalar output.
x = layers.concatenate([global_out, local_out])
ensamble = layers.Dense(1)(x)

model = keras.Model(
    inputs=[global_input, local_input],
    outputs=ensamble
)

model.summary()