## Data Acquisition

In [1]:
from os import listdir
from os.path import isfile, join

import pandas as pd

In [2]:
def merge_dataset(data_files: str = "../../../datasets", symbol: str = "btc") -> pd.DataFrame:
    """Concatenate every parquet file for ``symbol`` found directly in ``data_files``.

    A file is selected when it is a regular file whose name starts with
    ``symbol`` and ends with ``.parquet``. Sub-directories are not searched.

    Args:
        data_files: Directory that holds the parquet files.
            NOTE(review): this default differs from ``prepare_dataset``'s
            ``"../../datasets"`` - confirm which one is intended.
        symbol: File-name prefix identifying the asset (e.g. ``"btc"``).

    Returns:
        One DataFrame with the rows of all matching files, appended in
        sorted file-name order. The original per-file indexes are kept.

    Raises:
        ValueError: If no file in ``data_files`` matches ``symbol``.
    """
    # listdir() returns entries in arbitrary order; sorting makes the row
    # order of the merged frame reproducible across machines and runs.
    files = sorted(
        name
        for name in listdir(data_files)
        if name.startswith(symbol)
        and name.endswith('.parquet')
        and isfile(join(data_files, name))
    )

    # pd.concat would raise a bare "No objects to concatenate" here; raise the
    # same exception type with enough context to find the problem.
    if not files:
        raise ValueError(
            f"No '{symbol}*.parquet' files found in {data_files!r}"
        )

    return pd.concat(
        [pd.read_parquet(join(data_files, name), engine='fastparquet') for name in files]
    )

In [4]:
def prepare_dataset(data_files: str = "../../datasets", symbol: str = "btc", datetime_index: bool = True):
    """Load all data for ``symbol`` and return it indexed and sorted by ``date``.

    Args:
        data_files: Directory passed through to ``merge_dataset``.
        symbol: Asset prefix passed through to ``merge_dataset``.
        datetime_index: When True, the ``date`` column is converted with
            ``pd.to_datetime(..., unit='s')`` before it becomes the index.
            When False, the raw ``date`` values are used as the index.

    Returns:
        DataFrame indexed by ``date`` in ascending index order.
    """
    trades = merge_dataset(data_files=data_files, symbol=symbol)

    if datetime_index:
        # 'date' is interpreted as a count of seconds (unit='s').
        trades['date'] = pd.to_datetime(trades['date'], unit='s')

    return trades.set_index(['date']).sort_index()

In [5]:
# Default list of BRL-quoted pairs; create_dataset falls back to it when it
# is called without an explicit list of pairs.
top_10_symbol = [
    'BTC-BRL',
    'ETH-BRL',
    'USDT-BRL',
    'SOL-BRL',
    'XRP-BRL',
    'USDC-BRL',
    'DOGE-BRL',
    'ADA-BRL',
    'AVAX-BRL',
    'SHIB-BRL',
]

In [8]:
def create_dataset(pairs: list):
    """Load one prepared DataFrame per trading pair.

    Args:
        pairs: Pair names such as ``'BTC-BRL'``. A falsy value (``None`` or
            an empty list) selects ``top_10_symbol`` instead.

    Returns:
        Dict mapping the lower-cased symbol with ``'-BRL'`` removed
        (e.g. ``'btc'``) to the frame returned by ``prepare_dataset``.
    """
    selected = pairs if pairs else top_10_symbol
    print(selected)

    # 'BTC-BRL' -> 'btc': the symbol is both the dict key and the file prefix.
    symbols = (market.replace('-BRL', '').lower() for market in selected)
    return {symbol: prepare_dataset(symbol=symbol) for symbol in symbols}

In [25]:
def dataset_by_time_range(start_date='2023-01-01', end_date='2023-12-31', columns=None, pairs: list = None):
    """Load the requested pairs and cut each frame to a date window.

    Args:
        start_date: Lower bound of the index slice (formatted to a string).
        end_date: Upper bound of the index slice (formatted to a string).
        columns: Columns to keep. ``None`` selects ``['price', 'amount']``.
        pairs: Pair names forwarded to ``create_dataset``; ``None`` lets
            ``create_dataset`` pick its default list.

    Returns:
        Dict mapping each symbol to an independent copy of the selected
        rows and columns.
    """
    # Resolve the default here instead of using a mutable list as the
    # parameter default.
    if columns is None:
        columns = ['price', 'amount']

    start, end = str(start_date), str(end_date)
    dataset = create_dataset(pairs=pairs)

    # Select rows and columns in a single .loc call and copy the result:
    # chained indexing (.loc[...][columns]) hands back a frame that pandas may
    # flag with SettingWithCopyWarning when callers add columns to it later.
    return {
        pair: frame.loc[start:end, columns].copy()
        for pair, frame in dataset.items()
    }

In [26]:
# Load only BTC and ETH with the default date window and columns.
# Note: despite its name, `df` is a dict of DataFrames keyed by symbol.
df = dataset_by_time_range(
    pairs=['BTC-BRL', 'ETH-BRL'],
)

['BTC-BRL', 'ETH-BRL']


In [27]:
# Display the loaded dict (one DataFrame per symbol).
df

{'btc':                              price    amount
 date                                        
 2023-01-01 00:06:11   87967.500660  0.001286
 2023-01-01 00:06:11   87999.990000  0.000020
 2023-01-01 00:06:11   88000.000000  0.000020
 2023-01-01 00:06:11   88000.000000  0.000020
 2023-01-01 00:06:11   87999.990000  0.000020
 ...                            ...       ...
 2023-12-31 23:52:07  208937.028757  0.000290
 2023-12-31 23:53:00  208900.061061  0.000005
 2023-12-31 23:54:40  208966.626665  0.000005
 2023-12-31 23:57:07  209254.320000  0.000023
 2023-12-31 23:59:16  209216.773414  0.000047
 
 [1282109 rows x 2 columns],
 'eth':                             price    amount
 date                                       
 2023-01-01 00:03:20   6361.442902  0.001012
 2023-01-01 00:08:22   6380.840349  0.000784
 2023-01-01 00:09:34   6380.726680  0.007836
 2023-01-01 00:15:55   6379.786496  0.015675
 2023-01-01 00:16:40   6360.000000  0.016000
 ...                           ...       .

## Exploratory Data Analysis

In [None]:
# TBD

## Models

### Isolation Forest

In [None]:
import os

import numpy as np
from sklearn.ensemble import IsolationForest
import matplotlib.pyplot as plt

### Plotting Figures

In [None]:
def plot_outliers(model, df, pair, time_range, save_plot=True, save_df=True):
    """Fit ``model`` on price/amount, flag outliers and chart the class counts.

    ``df`` is modified in place: a ``score`` column (``model.decision_function``)
    and an ``anomaly_value`` column (``model.predict``; -1 marks an outlier)
    are added or overwritten.

    Args:
        model: Estimator exposing ``fit``, ``decision_function`` and ``predict``.
        df: Frame with at least the ``price`` and ``amount`` columns.
        pair: Symbol name, used in titles and output paths.
        time_range: Label such as ``'year'``; used in output paths as ``1{time_range}``.
        save_plot: Save the bar chart under ``./iforest/figures/...``.
        save_df: Save the annotated frame as CSV under ``./iforest/datasets/...``.

    Returns:
        Tuple ``(outliers, outliers_index)``: the rows whose ``anomaly_value``
        is -1, and their index values as a list.
    """
    feature_cols = ['price', 'amount']
    features = df[feature_cols]

    # Fit on exactly the columns that are scored below. Fitting on the whole
    # frame fails as soon as it carries extra columns - including the 'score'
    # and 'anomaly_value' columns left behind by a previous call.
    model.fit(features)
    df['score'] = model.decision_function(features)
    df['anomaly_value'] = model.predict(features)

    # Rows predicted as -1 are the anomalous ones.
    outliers = df.loc[df['anomaly_value'] == -1]
    outliers_index = list(outliers.index)

    counts = df['anomaly_value'].value_counts()
    print(f'Total Number of Non-Outliers and Outliers {pair}: ', counts)

    # Draw on a dedicated figure. Series.plot() without ax= reuses the current
    # axes when a figure is already open, so inside a loop the bars would land
    # on the previous pair's detection plot.
    fig, ax = plt.subplots()
    counts.plot(kind='bar',
                ax=ax,
                xlabel='1 Normal Point -1 Anomaly Point',
                ylabel='Total Transactions',
                title=f'Total Number of Non-Outliers and Outliers {pair}')

    if save_plot:
        figures_dir = f'./iforest/figures/1{time_range}/{pair}'
        os.makedirs(figures_dir, exist_ok=True)
        fig.savefig(f'{figures_dir}/{pair}-1{time_range}-outliers-count')

    if save_df:
        datasets_dir = f'./iforest/datasets/1{time_range}/{pair}'
        os.makedirs(datasets_dir, exist_ok=True)
        df.to_csv(f'{datasets_dir}/df_{pair}_anomaly.csv', index=True)

    return outliers, outliers_index

In [None]:
def plot_detection(df, outliers, time_range, pair, save_fig=True):
    """Plot the price series and overlay the detected outliers in red.

    Args:
        df: Frame with a ``price`` column; its index is used as the x axis.
        outliers: Subset of rows to highlight (also needs ``price``).
        time_range: Label such as ``'year'``; used in the output path as ``1{time_range}``.
        pair: Symbol name, used in the title and the output path.
        save_fig: Save the figure under ``./iforest/figures/...`` when True.
    """
    fig, ax = plt.subplots(figsize=(32, 16))
    ax.plot(df[['price']], marker='.')
    ax.plot(outliers[['price']], 'o', color='red', label='outlier')

    ax.set_title(f'Detection By Isolation Forest {pair}')
    ax.set_xlabel('Date')
    ax.set_ylabel('Price')
    ax.legend()

    if not save_fig:
        return

    target_dir = f'./iforest/figures/1{time_range}/{pair}'
    try:
        os.makedirs(target_dir)
    except FileExistsError:
        # The directory is already there from an earlier run.
        pass

    fig.savefig(f'{target_dir}/{pair}-1{time_range}-detection-by-price-and-date')

In [None]:
def plot_figures(dataset, model, time_range):
    """Run outlier detection and both plots for every pair in ``dataset``.

    Args:
        dataset: Mapping of symbol -> DataFrame.
        model: Estimator handed to ``plot_outliers`` for every pair in turn
            (the same instance is reused).
        time_range: Label forwarded to the plotting helpers for output paths.
    """
    for pair, frame in dataset.items():
        # Only the outlier rows are needed for the detection plot.
        outliers, _ = plot_outliers(model, df=frame, pair=pair, time_range=time_range)
        plot_detection(df=frame, outliers=outliers, time_range=time_range, pair=pair)

#### 1 year dataset

In [None]:
# Initialise the model with 0.5% contamination (contamination=0.005).
# NOTE(review): this comment used to say 0.05%, which did not match the
# value - confirm that 0.005 is the intended setting for the one-year run.

model = IsolationForest(
    n_estimators=1_000,
    max_samples='auto',
    contamination=0.005,
    random_state=139,
    n_jobs=-1
)

# `pairs` is left at its default, so the default pair list is loaded.
dataset_by_year = dataset_by_time_range(start_date='2023-01-01',
                                        end_date='2023-12-31',
                                        columns=['price', 'amount'])
plot_figures(dataset_by_year, model, time_range='year')

In [None]:
# Display the per-pair frames of the one-year run.
dataset_by_year

#### 1 month dataset

In [None]:
# Isolation Forest for the one-month window: 0.05% contamination.
model_month = IsolationForest(
    n_estimators=1_000,
    max_samples='auto',
    contamination=0.0005,  # 0.05 %
    random_state=139,
    n_jobs=-1,
)

In [None]:
# One-month window: December 2023, default pair list.
dataset_by_month = dataset_by_time_range(
    start_date='2023-12-01',
    end_date='2023-12-31',
    columns=['price', 'amount'],
)
plot_figures(dataset_by_month, model_month, time_range='month')

#### 1 day dataset

In [None]:
# Isolation Forest for the one-day window: 0.1% contamination.
model_day = IsolationForest(
    n_estimators=1_000,
    max_samples='auto',
    contamination=0.001,  # 0.1 %
    random_state=139,
    n_jobs=-1,
)

In [None]:
# One-day window: both slice bounds are 2023-12-13, default pair list.
dataset_by_day = dataset_by_time_range(
    start_date='2023-12-13',
    end_date='2023-12-13',
    columns=['price', 'amount'],
)
plot_figures(dataset_by_day, model_day, time_range='day')

#### 1 hour dataset

In [None]:
# Initialise the model with 10% contamination (contamination=0.1).
# NOTE(review): this comment used to say 1%, which did not match the value -
# confirm that 0.1 (and not 0.01) is the intended setting for the hourly run.

model_hour = IsolationForest(
    n_estimators=1_000,
    max_samples='auto',
    contamination=0.1,
    random_state=139,
    n_jobs=-1
)

In [None]:
# One-hour window: 2023-12-23 from 17:00:00 to 17:59:59, default pair list.
dataset_by_hour = dataset_by_time_range(
    start_date='2023-12-23 17:00:00',
    end_date='2023-12-23 17:59:59',
    columns=['price', 'amount'],
)
plot_figures(dataset_by_hour, model_hour, time_range='hour')