### PLAN
- Prepare train data
- Train a regressor model
- Implement an anomaly detection algorithm
- Identify Anomalies
- Visualize Anomalies


##### STEP ONE : PREPARE TRAINING DATA

The (placeholder) data to be used for this model is the close data.

In [10]:
# Import Necessary Libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pandas_ta as ta

In [3]:
# Define some functions
def split_data(data:np.ndarray, split_percent:int) :
    split_index = int(round(len(data) * split_percent))
    
    train_data = data[:split_index]
    test_data = data[split_index:]

    return train_data, test_data

In [5]:
# Import the data
raw_data = pd.read_parquet("/Users/jerryinyang/Code/fourteen/data/prices/BTCUSDT_1D.parquet")
raw_data.shape

(2190, 5)

In [7]:
# Prepare the training data
clean_data = raw_data.copy()
clean_data.index = pd.DatetimeIndex(clean_data.index.values,
                               freq=clean_data.index.inferred_freq)
clean_data.shape

(2190, 5)

In [90]:
# Extra data processing
data = clean_data.copy()

# Feature Engineering
data = data[['high', 'low', 'close', 'volume']]
data['atr'] = ta.atr(data['high'], data['low'], data['close'])
data['target'] = data['close'].shift(-1)

# Normalization
data = data.apply(lambda x: np.log(x))

data = data.dropna()
# data = data.diff()
# data = data.dropna()

# Denoising
data

Unnamed: 0,high,low,close,volume,atr,target
2018-01-12 01:00:00,9.554623,9.433484,9.528067,9.706083,7.620100,9.561701
2018-01-13 01:00:00,9.587406,9.525600,9.561701,9.410953,7.557345,9.508591
2018-01-14 01:00:00,9.570773,9.439005,9.508591,9.742021,7.549555,9.513398
2018-01-15 01:00:00,9.564511,9.484009,9.513398,9.592339,7.506785,9.296518
2018-01-16 01:00:00,9.513620,9.108861,9.296518,11.057238,7.640594,9.304631
...,...,...,...,...,...,...
2023-12-22 01:00:00,10.700956,10.678504,10.691241,10.397671,7.316391,10.685153
2023-12-23 01:00:00,10.691688,10.675702,10.685153,9.714572,7.277322,10.668758
2023-12-24 01:00:00,10.690717,10.657259,10.668758,10.132388,7.277267,10.682265
2023-12-25 01:00:00,10.687442,10.662433,10.682265,10.204378,7.259079,10.657469


In [101]:
# Split data into train and test split
array_data = data.to_numpy()

window_size = 7
data_X, data_y = [], []

for index in range(window_size, len(array_data) - 1):
    start_index = index - window_size
    end_index = index 
    
    data_X.append(array_data[start_index : end_index, :-1]   )
    data_y.append(array_data[index -1, -1])

train_X, test_X = split_data(data_X, .7)
train_y, test_y = split_data(data_y, .7)

In [102]:
xxx = np.array(train_X)
xxx.shape

(1517, 7, 5)

##### STEP TWO : TRAIN A REGRESSOR MODEL

We would use a Gradient Boosting Regressors. Other alternatives are :
- Gradient Boosting Regressors
- LSTM

In [86]:
# Import Necessary Libraries
import plotly.graph_objects as go
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error


In [93]:
# Create and fit GradientBoostingRegressor
model = GradientBoostingRegressor(random_state=0)
model.fit(train_X, train_y)

In [98]:
predictions = model.predict(train_X)
rmse = np.sqrt(mean_squared_error(train_y, predictions))
rmse

0.031762282683569606

In [100]:
# Create a scatter plot for the original data
fig = go.Figure()
fig.add_trace(go.Scatter(x=list(range(len(train_y))), y=train_y, mode='markers', name='Original Data'))

# Add a line plot for the predictions
fig.add_trace(go.Scatter(x=list(range(len(predictions))), y=predictions, mode='lines', name='Predictions'))

# Update layout
fig.update_layout(title='Original Data vs. Predictions',
                  xaxis_title='Index',
                  yaxis_title='Value')

# Show the plot
fig.show()


In [108]:
# Simple LSTM
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset


class LSTMRegression(nn.Module):
    def __init__(self, n_feats, n_period, hidden_size, num_layers, output_size=1):
        super().__init__()
        self.lstm = nn.LSTM(input_size=n_feats, 
                            hidden_size=hidden_size, 
                            num_layers=num_layers, 
                            batch_first=True)  
        self.linear = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        out, _ = self.lstm(x)  # Ignore hidden states
        out = out[:, -1, :]  # Take the last output of the sequence
        out = self.linear(out)
        return out

    def train_model(self, train_loader, optimizer, criterion, n_epochs):
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.to(device)

        for epoch in range(n_epochs):
            for x, y in train_loader:  
                x, y = x.to(device), y.to(device)

                optimizer.zero_grad()
                output = self(x)
                loss = criterion(output, y)
                loss.backward()
                optimizer.step()

    def predict(self, x):
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.to(device)

        x = torch.tensor(x, device=device)  # Convert to tensor
        with torch.no_grad():
            output = self(x.float())  # Ensure float type for prediction
        return output.item() 
    

class TimeSeriesDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

In [106]:
model = LSTMRegression(n_feats=5, 
                       n_period=7, 
                       hidden_size=32,  # Experiment with this value
                       num_layers=1) 
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.MSELoss()

In [110]:
train_dataset = TimeSeriesDataset(np.array(train_X), np.array(train_y))  # Assuming you have your data as NumPy arrays or similar
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True) 

In [111]:
model.train_model(train_loader, optimizer, criterion, n_epochs=100) 


Using a target size (torch.Size([32])) that is different to the input size (torch.Size([32, 1])). This will likely lead to incorrect results due to broadcasting. Please ensure they have the same size.


Using a target size (torch.Size([13])) that is different to the input size (torch.Size([13, 1])). This will likely lead to incorrect results due to broadcasting. Please ensure they have the same size.



In [116]:
predictions = model.predict(train_X[0])

IndexError: too many indices for tensor of dimension 2