In [119]:
import torch
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from pytorch_forecasting import TemporalFusionTransformer, Baseline
from pytorch_forecasting.data import TimeSeriesDataSet
from pytorch_forecasting.metrics import SMAPE
from pytorch_lightning import Trainer
from pytorch_forecasting.data import NaNLabelEncoder
import matplotlib.pyplot as plt


In [120]:
# Example dataframe setup, replace with your actual data
data = pd.read_csv('October.csv')  # Load your dataset


In [121]:
# Ensure the 'time' column is in datetime format
data['time'] = pd.to_datetime(data['time'])

# Set the 'time' column as the index
data.set_index('time', inplace=True)

# Now, resample to 5-minute intervals and forward fill missing values
data_resampled = data.resample('5T').ffill()

# Alternatively, use interpolation
# data_resampled = data.resample('5T').interpolate(method='linear')

# Check the first few rows of the resampled data
print(data_resampled.head())


                     Unnamed: 0     open     high      low    close  \
time                                                                  
2022-01-03 01:00:00           0  1.13753  1.13784  1.13748  1.13782   
2022-01-03 01:05:00           1  1.13782  1.13782  1.13742  1.13763   
2022-01-03 01:10:00           2  1.13762  1.13763  1.13728  1.13730   
2022-01-03 01:15:00           3  1.13729  1.13762  1.13713  1.13718   
2022-01-03 01:20:00           4  1.13718  1.13721  1.13697  1.13705   

                     tick_volume  spread  real_volume  
time                                                   
2022-01-03 01:00:00          192       0            0  
2022-01-03 01:05:00          119       0            0  
2022-01-03 01:10:00          106       0            0  
2022-01-03 01:15:00          115       0            0  
2022-01-03 01:20:00           44       0            0  


  data_resampled = data.resample('5T').ffill()


In [122]:
data_resampled.reset_index(inplace=True)


In [123]:
del data_resampled['Unnamed: 0']

In [124]:
# Check time step differences
time_diff = data_resampled['time'].diff().dropna()
print(time_diff.value_counts())  # Should all be 5 minutes (if resampled correctly)


time
0 days 00:05:00    297204
Name: count, dtype: int64


In [125]:
data_resampled[296200:]

Unnamed: 0,time,open,high,low,close,tick_volume,spread,real_volume
296200,2024-10-27 12:20:00,1.07948,1.07962,1.07948,1.07959,82,0,0
296201,2024-10-27 12:25:00,1.07948,1.07962,1.07948,1.07959,82,0,0
296202,2024-10-27 12:30:00,1.07948,1.07962,1.07948,1.07959,82,0,0
296203,2024-10-27 12:35:00,1.07948,1.07962,1.07948,1.07959,82,0,0
296204,2024-10-27 12:40:00,1.07948,1.07962,1.07948,1.07959,82,0,0
...,...,...,...,...,...,...,...,...
297200,2024-10-30 23:40:00,1.08594,1.08605,1.08593,1.08597,82,0,0
297201,2024-10-30 23:45:00,1.08597,1.08598,1.08576,1.08576,140,0,0
297202,2024-10-30 23:50:00,1.08576,1.08581,1.08563,1.08563,149,0,0
297203,2024-10-30 23:55:00,1.08563,1.08569,1.08550,1.08566,197,0,0


In [126]:
data = data_resampled

In [127]:
# Add technical features (e.g., moving averages, RSI, etc.) if necessary
# For example, moving averages:
data['moving_avg'] = data['close'].rolling(window=14).mean()

# Create target variable, e.g., predict next day's close price
data['target'] = data['close'].shift(-1)

# Drop rows with NaN values
data = data.dropna()

# Normalize the features (standard scaling)
scaler = StandardScaler()
data[['open', 'high', 'low', 'close', 'moving_avg']] = scaler.fit_transform(data[['open', 'high', 'low', 'close', 'moving_avg']])

# Add a time index for TFT
data['time_idx'] = np.arange(len(data))

# Convert data to appropriate format for PyTorch Forecasting
max_prediction_length = 1  # We predict next day's price movement
max_encoder_length = 48  # 48 * 5 minutes = 4 hours


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[['open', 'high', 'low', 'close', 'moving_avg']] = scaler.fit_transform(data[['open', 'high', 'low', 'close', 'moving_avg']])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['time_idx'] = np.arange(len(data))


In [128]:
data[296200:]

Unnamed: 0,time,open,high,low,close,tick_volume,spread,real_volume,moving_avg,target,time_idx
296213,2024-10-27 13:25:00,0.175534,0.174846,0.180258,0.178698,82,0,0,0.178672,1.07959,296200
296214,2024-10-27 13:30:00,0.175534,0.174846,0.180258,0.178698,82,0,0,0.178672,1.07959,296201
296215,2024-10-27 13:35:00,0.175534,0.174846,0.180258,0.178698,82,0,0,0.178672,1.07959,296202
296216,2024-10-27 13:40:00,0.175534,0.174846,0.180258,0.178698,82,0,0,0.178672,1.07959,296203
296217,2024-10-27 13:45:00,0.175534,0.174846,0.180258,0.178698,82,0,0,0.178672,1.07959,296204
...,...,...,...,...,...,...,...,...,...,...,...
297199,2024-10-30 23:35:00,0.355311,0.355088,0.359848,0.359340,81,0,0,0.361292,1.08597,297186
297200,2024-10-30 23:40:00,0.359293,0.357935,0.363542,0.360194,82,0,0,0.360682,1.08576,297187
297201,2024-10-30 23:45:00,0.360146,0.355942,0.358711,0.354220,140,0,0,0.359605,1.08563,297188
297202,2024-10-30 23:50:00,0.354173,0.351102,0.355017,0.350521,149,0,0,0.358569,1.08566,297189


In [129]:
data.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.dropna(inplace=True)


In [130]:
data.reset_index(inplace=True)
del data['index']
data

Unnamed: 0,time,open,high,low,close,tick_volume,spread,real_volume,moving_avg,target,time_idx
0,2022-01-03 02:05:00,1.819128,1.824927,1.821856,1.825810,76,0,0,1.820883,1.13709,0
1,2022-01-03 02:10:00,1.825670,1.823788,1.816457,1.814431,55,0,0,1.819399,1.13690,1
2,2022-01-03 02:15:00,1.814292,1.811829,1.811627,1.809026,62,0,0,1.817916,1.13678,2
3,2022-01-03 02:20:00,1.808887,1.808412,1.807648,1.805613,63,0,0,1.816859,1.13665,3
4,2022-01-03 02:25:00,1.805474,1.806134,1.804807,1.801914,66,0,0,1.815782,1.13620,4
...,...,...,...,...,...,...,...,...,...,...,...
297186,2024-10-30 23:35:00,0.355311,0.355088,0.359848,0.359340,81,0,0,0.361292,1.08597,297186
297187,2024-10-30 23:40:00,0.359293,0.357935,0.363542,0.360194,82,0,0,0.360682,1.08576,297187
297188,2024-10-30 23:45:00,0.360146,0.355942,0.358711,0.354220,140,0,0,0.359605,1.08563,297188
297189,2024-10-30 23:50:00,0.354173,0.351102,0.355017,0.350521,149,0,0,0.358569,1.08566,297189


In [131]:
data['group_id'] = 0
data['time'] = pd.to_datetime(data['time'])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['group_id'] = 0
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['time'] = pd.to_datetime(data['time'])


In [132]:
from sklearn.model_selection import train_test_split
import pandas as pd
from torch.utils.data import DataLoader
from pytorch_forecasting.data import TimeSeriesDataSet
# Step 1: Convert the 'time' column to datetime format and set as index
data.set_index('time', inplace=True)

# Step 2: Resample data to 5-minute intervals
data_resampled = data.resample('5T').ffill()

# Step 3: Add constant group_id (for a single sequence)
data_resampled['group_id'] = 0

# Step 4: Split data into training and validation using sklearn's train_test_split
train_data, val_data = train_test_split(data_resampled, test_size=0.2, shuffle=False)

# Step 5: Create TimeSeriesDataSet for training and validation
training_data = TimeSeriesDataSet(
    data_resampled,
    time_idx="time_idx",  # Ensure time_idx exists in your data
    target="target",
    group_ids=["group_id"],
    min_encoder_length=24,
    max_encoder_length=24,
    min_prediction_length=1,
    max_prediction_length=1,
    time_varying_known_reals=["open", "high", "low", "close", "moving_avg"],
    time_varying_unknown_reals=["target"],
    add_relative_time_idx=True,
    add_target_scales=True,
    add_encoder_length=True,
)

train_dataset = TimeSeriesDataSet.from_dataset(training_data, train_data)
val_dataset = TimeSeriesDataSet.from_dataset(training_data, val_data)

# Step 6: Create DataLoader
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=False)
val_dataloader = DataLoader(val_dataset, batch_size=64, shuffle=False)

print(f"Training DataLoader size: {len(train_dataloader.dataset)}")
print(f"Validation DataLoader size: {len(val_dataloader.dataset)}")


  data_resampled = data.resample('5T').ffill()


Training DataLoader size: 237728
Validation DataLoader size: 59415


In [138]:
from pytorch_forecasting import TemporalFusionTransformer
from pytorch_lightning import Trainer
from pytorch_forecasting.metrics import MAE, SMAPE, PoissonLoss, QuantileLoss
# Initialize the Temporal Fusion Transformer model
tft = TemporalFusionTransformer.from_dataset(
    train_dataset,
    learning_rate=0.001,
    hidden_size=64,
    attention_head_size=4,
    dropout=0.1,
    hidden_continuous_size=32,
    output_size=1,
    loss=MAE(),  # For regression (next day prediction)
)


  super().__init__(loss=loss, logging_metrics=logging_metrics, **kwargs)


In [2]:
import pandas as pd
import pandas_ta as ta

# Sample DataFrame with Open, High, Low, and Close columns.
# Replace with your actual data.
data = {
    'open': [1.0, 1.2, 1.1, 1.3, 1.25],
    'high': [1.2, 1.3, 1.25, 1.35, 1.3],
    'low': [0.95, 1.1, 1.05, 1.15, 1.1],
    'close': [1.15, 1.25, 1.2, 1.3, 1.28]
}
df = pd.DataFrame(data)

# STOCH(9,6)
df['STOCH_k'], df['STOCH_d'] = ta.stoch(df['high'], df['low'], df['close'], k=9, d=6)

# STOCHRSI(14)
df['STOCHRSI'] = ta.stochrsi(df['close'], length=14)

# MACD(12,26)
df['MACD'], df['MACD_signal'], df['MACD_hist'] = ta.macd(df['close'], fast=12, slow=26)

# ADX(14)
df['ADX'] = ta.adx(df['high'], df['low'], df['close'], length=14)['ADX_14']

# Williams %R
df['WILLIAMS_R'] = ta.willr(df['high'], df['low'], df['close'], length=14)

# CCI(14)
df['CCI'] = ta.cci(df['high'], df['low'], df['close'], length=14)

# ATR(14)
df['ATR'] = ta.atr(df['high'], df['low'], df['close'], length=14)

# Highs/Lows(14) - This is calculated as the difference between the current price and the price 14 periods ago.
df['Highs_Lows'] = df['close'] - df['close'].shift(14)

# Ultimate Oscillator
df['ULTIMATE_OSC'] = ta.uo(df['high'], df['low'], df['close'])

# ROC (Rate of Change)
df['ROC'] = ta.roc(df['close'])

# Bull/Bear Power(13)
df['BULL_POWER'] = df['high'] - ta.ema(df['close'], length=13)
df['BEAR_POWER'] = df['low'] - ta.ema(df['close'], length=13)

print(df[['STOCH_k', 'STOCH_d', 'STOCHRSI', 'MACD', 'MACD_signal', 'ADX', 'WILLIAMS_R', 'CCI', 'ATR', 'Highs_Lows', 'ULTIMATE_OSC', 'ROC', 'BULL_POWER', 'BEAR_POWER']])


ImportError: cannot import name 'NaN' from 'numpy' (c:\Users\hevar\AppData\Local\Programs\Python\Python310\lib\site-packages\numpy\__init__.py)

In [144]:
from pytorch_lightning import Trainer
import pytorch_lightning as pl

# Setup PyTorch Forecasting's built-in Trainer
trainer = pl.Trainer(
    max_epochs=20,
    accelerator="gpu" if torch.cuda.is_available() else "cpu",
    devices=1 if torch.cuda.is_available() else 1,
    gradient_clip_val=0.1,
)

# Train the model using the built-in trainer
trainer.fit(tft, train_dataloaders=train_dataloader, val_dataloaders=val_dataloader)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


TypeError: `model` must be a `LightningModule` or `torch._dynamo.OptimizedModule`, got `TemporalFusionTransformer`