## Data Acquisition

In [None]:
from os import listdir
from os.path import isfile, join

import pandas as pd

In [None]:
def merge_dataset(data_files: str = "../datasets", symbol: str = "btc") -> pd.DataFrame:
    """Concatenate every parquet file for ``symbol`` found in ``data_files``.

    A file is picked up when it is a regular file directly inside
    ``data_files`` whose name starts with ``symbol`` and ends with
    ``.parquet``.

    Args:
        data_files: Directory that holds the parquet files.
        symbol: File-name prefix selecting which files to merge.

    Returns:
        A single DataFrame with the rows of all matching files, in
        file-name order.

    Raises:
        ValueError: If no file in ``data_files`` matches ``symbol``.
    """
    # listdir() returns names in arbitrary order; sorting keeps the merged
    # row order reproducible across runs and machines.
    files = sorted(
        f for f in listdir(data_files)
        if isfile(join(data_files, f)) and f.startswith(symbol) and f.endswith('.parquet')
    )
    if not files:
        # pd.concat would raise an opaque "No objects to concatenate" here.
        raise ValueError(f"no '{symbol}*.parquet' files found in {data_files!r}")
    return pd.concat(
        pd.read_parquet(join(data_files, f), engine='fastparquet') for f in files
    )

In [None]:
def prepare_dataset(data_files: str = "../datasets", symbol: str = "btc", datetime_index: bool = True):
    """Load the merged data for ``symbol`` and return it indexed and sorted by 'date'.

    Args:
        data_files: Directory passed through to ``merge_dataset``.
        symbol: File-name prefix passed through to ``merge_dataset``.
        datetime_index: When true, the 'date' column is converted with
            ``pd.to_datetime(..., unit='s')`` before it becomes the index;
            otherwise the raw 'date' values are used as the index.

    Returns:
        The merged DataFrame with 'date' as its index, sorted by that index.
    """
    trades = merge_dataset(data_files=data_files, symbol=symbol)

    if datetime_index:
        # 'date' values are interpreted as a number of seconds (unit='s').
        trades['date'] = pd.to_datetime(trades['date'], unit='s')

    return trades.set_index(['date']).sort_index()

In [None]:
# BRL-quoted market pairs loaded by this notebook.
top_10_symbol = [
    'BTC-BRL',
    'ETH-BRL',
    'USDT-BRL',
    'SOL-BRL',
    'XRP-BRL',
    'USDC-BRL',
    'DOGE-BRL',
    'ADA-BRL',
    'AVAX-BRL',
    'SHIB-BRL',
]

In [None]:
# One prepared DataFrame per market, keyed by the lower-cased pair name
# with the '-BRL' part removed (e.g. 'BTC-BRL' -> 'btc').
datasets = {}
for market in top_10_symbol:
    base_symbol = market.replace('-BRL', '').lower()
    datasets[base_symbol] = prepare_dataset(symbol=base_symbol)

In [None]:
# Preview the first rows of every loaded dataset.
for frame in datasets.values():
    print(frame.head())

In [None]:
# Preview the last rows of every loaded dataset.
for frame in datasets.values():
    print(frame.tail())

In [None]:
# Column/dtype/memory summary of every loaded dataset.
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only added a stray "None" line after each report.
for k in datasets:
    datasets[k].info()

In [None]:
# Descriptive statistics of every loaded dataset.
for frame in datasets.values():
    print(frame.describe())

## Exploratory Data Analysis

In [None]:
# TBD

## Models

### Isolation Forest

In [None]:
import numpy as np
from sklearn.ensemble import IsolationForest
import matplotlib.pyplot as plt

In [None]:
# Isolation Forest for the 1-year window, with contamination set to 5%.
# random_state is fixed so repeated fits are reproducible; n_jobs=-1 asks
# scikit-learn to use all available processors.
model_year = IsolationForest(
    n_estimators=1000,
    max_samples='auto',
    contamination=0.05,
    random_state=139,
    n_jobs=-1,
)

#### 1 year dataset

In [None]:
# 2023 pick was arbitrary, nothing special in it.
# Each entry is an explicit copy: columns are added to these frames later on,
# and assigning into a chained slice triggers pandas' SettingWithCopyWarning.
df_year = {
    symbol: frame.loc['2023-01-01':'2023-12-31'][['price', 'amount']].copy()
    for symbol, frame in datasets.items()
}

In [None]:
def plot_outliers(df_year, save_plot=True):
    """Fit ``model_year`` on one frame, label its anomalies and plot the label counts.

    The frame is modified in place: a 'score' column (the model's
    decision_function output) and an 'anomaly_value' column (the model's
    predict output, where -1 marks an anomaly) are added.

    Args:
        df_year: DataFrame with 'price' and 'amount' columns.
        save_plot: Currently unused; kept so existing calls keep working.

    Returns:
        The rows of ``df_year`` whose 'anomaly_value' is -1.
    """
    # Train and score on the same two columns. Fitting on the whole frame
    # would include 'score'/'anomaly_value' on a second call and then fail
    # with a feature-count mismatch when scoring.
    features = df_year[['price', 'amount']]
    model_year.fit(features)
    df_year['score'] = model_year.decision_function(features)
    df_year['anomaly_value'] = model_year.predict(features)

    # datapoints classified -1 are anomalous
    label_counts = df_year['anomaly_value'].value_counts()
    print(label_counts)
    label_counts.plot(kind='bar')

    return df_year.loc[df_year['anomaly_value'] == -1]

In [None]:
# df_year maps each symbol to its own DataFrame, so `df_year.loc` cannot work;
# one symbol's frame has to be selected and scored first.
# NOTE(review): 'btc' is assumed here - confirm which symbol is intended.
btc_year = df_year['btc'][['price', 'amount']].copy()

# plot_outliers adds 'score'/'anomaly_value' to the frame, prints the label
# counts and draws the bar chart. A fresh copy is passed so this cell can be
# re-run safely.
plot_outliers(btc_year)

# datapoints classified -1 are anomalous
outliers_year = btc_year.loc[btc_year['anomaly_value'] == -1]
outlier_index = list(outliers_year.index)

In [None]:
# Display the rows flagged as anomalous (anomaly_value == -1) in the 1-year window.
outliers_year

In [None]:
plt.figure(figsize = (32, 16))

# df_year maps each symbol to its own DataFrame, so a single symbol has to be
# selected before plotting (indexing the dict with ['price'] raises TypeError).
# NOTE(review): 'btc' is assumed - confirm it is the frame outliers_year came from.
plt.plot(df_year['btc'][['price']], marker = '.')
plt.plot(outliers_year[['price']], 'o', color = 'red', label = 'outlier')
plt.title('Detection By Isolation Forest')

#plt.grid()
plt.xlabel('Date')
plt.ylabel('Price')
plt.legend()

In [None]:
import plotly.express as px
import plotly.graph_objects as go

In [None]:
# fig = px.line(df_year_2023, x=df_year_2023.index, y="price")
# fig.add_trace(go.Scatter(mode="markers", x=outliers.index, y=outliers["price"], name="outliers"))

#### 1 month dataset

In [None]:
# January 2024 slice of the BTC trades. `btc_trades_df` is not defined anywhere
# in this notebook; the BTC frame lives in datasets['btc'].
# .copy() avoids SettingWithCopyWarning when columns are added to df_month later.
df_month = datasets['btc'].loc['2024-01-01':'2024-01-31'][['price', 'amount']].copy()

In [None]:
# Isolation Forest for the 1-month window, with contamination set to 5%.
# Same settings as the yearly model: fixed seed, all processors.
model_month = IsolationForest(
    n_estimators=1000,
    max_samples='auto',
    contamination=0.05,
    random_state=139,
    n_jobs=-1,
)

In [None]:
# Fit and score on the same two feature columns. Fitting on the whole frame
# breaks when this cell is re-run: 'score' and 'anomaly_value' would then be
# part of the training data and scoring fails on the feature count.
month_features = df_month[['price', 'amount']]
model_month.fit(month_features)
df_month['score'] = model_month.decision_function(month_features)
df_month['anomaly_value'] = model_month.predict(month_features)

In [None]:
# Rows the monthly model labelled -1 are the anomalies.
month_is_anomaly = df_month['anomaly_value'].eq(-1)
outliers_month = df_month[month_is_anomaly]
outlier_index_month = list(outliers_month.index)

# How many points fell into each label, as text and as a bar chart.
month_label_counts = df_month['anomaly_value'].value_counts()
print(month_label_counts)
month_label_counts.plot(kind='bar')

In [None]:
# Display the rows flagged as anomalous (anomaly_value == -1) in the 1-month window.
outliers_month

In [None]:
# Price series for the month, with the flagged points overlaid in red.
plt.figure(figsize=(32, 16))
month_ax = plt.gca()

month_ax.plot(df_month[['price']], marker='.')
month_ax.plot(outliers_month[['price']], 'o', color='red', label='outlier')

month_ax.set_title('Detection By Isolation Forest')
month_ax.set_xlabel('Date')
month_ax.set_ylabel('Price')
month_ax.legend()

#### 1 day dataset

In [None]:
# Single-day (2024-03-24) slice of the BTC trades. `btc_trades_df` is not
# defined anywhere in this notebook; the BTC frame lives in datasets['btc'].
# .copy() avoids SettingWithCopyWarning when columns are added to df_day later.
df_day = datasets['btc'].loc['2024-03-24':'2024-03-24'][['price', 'amount']].copy()

In [None]:
# Display the single-day slice.
df_day

In [None]:
# Isolation Forest for the 1-day window, with contamination set to 5%.
# Same settings as the other models: fixed seed, all processors.
model_day = IsolationForest(
    n_estimators=1000,
    max_samples='auto',
    contamination=0.05,
    random_state=139,
    n_jobs=-1,
)

In [None]:
# Fit and score on the same two feature columns. Fitting on the whole frame
# breaks when this cell is re-run: 'score' and 'anomaly_value' would then be
# part of the training data and scoring fails on the feature count.
day_features = df_day[['price', 'amount']]
model_day.fit(day_features)
df_day['score'] = model_day.decision_function(day_features)
df_day['anomaly_value'] = model_day.predict(day_features)

In [None]:
# Rows the daily model labelled -1 are the anomalies.
day_is_anomaly = df_day['anomaly_value'].eq(-1)
outliers_day = df_day[day_is_anomaly]
outlier_index_day = list(outliers_day.index)

# How many points fell into each label, as text and as a bar chart.
day_label_counts = df_day['anomaly_value'].value_counts()
print(day_label_counts)
day_label_counts.plot(kind='bar')

In [None]:
# Display the rows flagged as anomalous (anomaly_value == -1) in the 1-day window.
outliers_day

In [None]:
# Price series for the day, with the flagged points overlaid in red.
plt.figure(figsize=(32, 16))
day_ax = plt.gca()

day_ax.plot(df_day[['price']], marker='.')
day_ax.plot(outliers_day[['price']], 'o', color='red', label='outlier')

day_ax.set_title('Detection By Isolation Forest')
day_ax.set_xlabel('Date')
day_ax.set_ylabel('Price')
day_ax.legend()

#### 1 hour dataset

In [None]:
# One-hour (17:00-17:59:59 on 2024-03-24) slice of the BTC trades.
# `btc_trades_df` is not defined anywhere in this notebook; the BTC frame
# lives in datasets['btc'].
# .copy() avoids SettingWithCopyWarning when columns are added to df_hour later.
df_hour = datasets['btc'].loc['2024-03-24 17:00:00':'2024-03-24 17:59:59'][['price', 'amount']].copy()

In [None]:
# Display the one-hour slice.
df_hour

In [None]:
# Isolation Forest for the 1-hour window. Contamination is 1% here, lower
# than the 5% used for the longer windows; seed and parallelism are unchanged.
model_hour = IsolationForest(
    n_estimators=1000,
    max_samples='auto',
    contamination=0.01,
    random_state=139,
    n_jobs=-1,
)

In [None]:
# Fit and score on the same two feature columns. Fitting on the whole frame
# breaks when this cell is re-run: 'score' and 'anomaly_value' would then be
# part of the training data and scoring fails on the feature count.
hour_features = df_hour[['price', 'amount']]
model_hour.fit(hour_features)
df_hour['score'] = model_hour.decision_function(hour_features)
df_hour['anomaly_value'] = model_hour.predict(hour_features)

In [None]:
# Rows the hourly model labelled -1 are the anomalies.
hour_is_anomaly = df_hour['anomaly_value'].eq(-1)
outliers_hour = df_hour[hour_is_anomaly]
outlier_index_h = list(outliers_hour.index)

# How many points fell into each label, as text and as a bar chart.
hour_label_counts = df_hour['anomaly_value'].value_counts()
print(hour_label_counts)
hour_label_counts.plot(kind='bar')

In [None]:
# Display the rows flagged as anomalous (anomaly_value == -1) in the 1-hour window.
outliers_hour

In [None]:
# Price series for the hour, with the flagged points overlaid in red.
plt.figure(figsize = (16, 8))

plt.plot(df_hour[['price']], marker = '.')
plt.plot(outliers_hour[['price']], 'o', color = 'red', label = 'outlier')
# This chart shows the 1-hour window; the title previously said "Day".
plt.title('Isolation Forest - Hour')

#plt.grid()
plt.xlabel('Date')
plt.ylabel('Price')
plt.legend()