In [1]:
# ! .venv\Scripts\pip install darts
# ! .venv\Scripts\pip install dask dask[distributed]
# ! .venv\Scripts\pip install bokeh dask[dataframe] pyarrow

In [2]:
import pandas as pd
df = pd.read_parquet('TRAIN_Reco_2021_2022_2023.parquet.gzip').reset_index()

In [3]:
df['ExecutionTime'] = pd.to_datetime(df['ExecutionTime'])
numerical_columns = ['high', 'low', 'close', 'volume']
df[numerical_columns] = df[numerical_columns].astype('float16')
df.dtypes

ExecutionTime    datetime64[ns, Europe/Berlin]
ID                                      object
high                                   float16
low                                    float16
close                                  float16
volume                                 float16
dtype: object

In [4]:
train_start_date = '2022-01-01'
train_end_date = '2023-06-30'

val_start_date = '2023-07-01'
val_end_date = '2023-12-31'  # Adjust if you have data beyond 2023

# Step 4: Split the data into training and validation sets
train_df = df[(df['ExecutionTime'] >= train_start_date) & (df['ExecutionTime'] <= train_end_date)]
val_df = df[(df['ExecutionTime'] >= val_start_date) & (df['ExecutionTime'] <= val_end_date)]

In [5]:
print(train_df["ID"].nunique(), val_df["ID"].nunique())

672 672


In [6]:
train_df.head()

Unnamed: 0,ExecutionTime,ID,high,low,close,volume
34473,2022-01-01 00:00:00+01:00,Fri00Q1,0.0,0.0,0.0,0.0
34474,2022-01-01 00:15:00+01:00,Fri00Q1,0.0,0.0,0.0,0.0
34475,2022-01-01 00:30:00+01:00,Fri00Q1,0.0,0.0,0.0,0.0
34476,2022-01-01 00:45:00+01:00,Fri00Q1,0.0,0.0,0.0,0.0
34477,2022-01-01 01:00:00+01:00,Fri00Q1,0.0,0.0,0.0,0.0


In [7]:
train_df.set_index('ExecutionTime', inplace=True)
val_df.set_index('ExecutionTime', inplace=True)

def create_lag_rolling_features(df):
    # Lag features
    for column in ['low', 'high', 'close', 'volume']:
        for lag in range(1, 11):  # Create 10 lags
            df[f'{column}_lag_{lag}'] = df[column].shift(lag)
    
    # Rolling window features (rolling mean of the last 10 periods)
    for column in ['low', 'high', 'close', 'volume']:
        df[f'{column}_rolling_mean_10'] = df[column].rolling(window=10).mean()
    
    return df

In [8]:
# Apply the lag and rolling window function to each asset group separately in the training set
train_df = train_df.groupby('ID').apply(create_lag_rolling_features)

# Apply the lag and rolling window function to each asset group separately in the validation set
val_df = val_df.groupby('ID').apply(create_lag_rolling_features)

  train_df = train_df.groupby('ID').apply(create_lag_rolling_features)
  val_df = val_df.groupby('ID').apply(create_lag_rolling_features)


In [9]:
# Handle missing values resulting from lagging
train_df.dropna(inplace=True)
val_df.dropna(inplace=True)

In [10]:
train_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,ID,high,low,close,volume,low_lag_1,low_lag_2,low_lag_3,low_lag_4,low_lag_5,...,volume_lag_5,volume_lag_6,volume_lag_7,volume_lag_8,volume_lag_9,volume_lag_10,low_rolling_mean_10,high_rolling_mean_10,close_rolling_mean_10,volume_rolling_mean_10
ID,ExecutionTime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Fri00Q1,2022-01-01 02:30:00+01:00,Fri00Q1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Fri00Q1,2022-01-01 02:45:00+01:00,Fri00Q1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Fri00Q1,2022-01-01 03:00:00+01:00,Fri00Q1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Fri00Q1,2022-01-01 03:15:00+01:00,Fri00Q1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Fri00Q1,2022-01-01 03:30:00+01:00,Fri00Q1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
train_df.shape

(35150304, 49)

In [12]:
val_df.shape

(11411904, 49)

In [13]:
from sklearn.preprocessing import MinMaxScaler

# List of columns to scale
columns_to_scale = ['high', 'low', 'close', 'volume']

# Create copies of the DataFrames
train_df_scaled = train_df.copy()
val_df_scaled = val_df.copy()

# Dictionary to store scalers for each asset
scalers = {}

# Assets present in training data
assets_in_train = train_df_scaled.index.get_level_values('ID').unique()

for asset in assets_in_train:
    # Training data for this asset
    asset_train_data = train_df_scaled.loc[asset, columns_to_scale]
    
    # Initialize and fit the scaler
    scaler = MinMaxScaler()
    scaled_train_values = scaler.fit_transform(asset_train_data)
    
    # Replace training data with scaled values
    train_df_scaled.loc[asset, columns_to_scale] = scaled_train_values
    
    # Store the scaler
    scalers[asset] = scaler
    
    # Check if the asset exists in validation data
    if asset in val_df_scaled.index.get_level_values('ID'):
        asset_val_data = val_df_scaled.loc[asset, columns_to_scale]
        
        # Transform validation data
        scaled_val_values = scaler.transform(asset_val_data)
        
        # Replace validation data with scaled values
        val_df_scaled.loc[asset, columns_to_scale] = scaled_val_values
    else:
        # Asset not in validation data; no action needed
        pass

# Handle assets present only in validation data
assets_in_val = val_df_scaled.index.get_level_values('ID').unique()
assets_only_in_val = set(assets_in_val) - set(assets_in_train)

for asset in assets_only_in_val:
    print(f"Warning: Asset {asset} is present in validation data but not in training data. Skipping scaling for this asset.")
    # Decide how to handle these assets
    # For example, you could drop them:
    val_df_scaled = val_df_scaled.drop(asset, level='ID')

# # Reset index if necessary
# train_df_scaled = train_df_scaled.reset_index()
# val_df_scaled = val_df_scaled.reset_index()

# Now proceed with your modeling using train_df_scaled and val_df_scaled


In [14]:
train_df_scaled.shape

(35150304, 49)

In [15]:
train_df_scaled["ID"].nunique()

672

In [16]:
new_train_scaled_df = train_df_scaled
new_val_scaled_df = val_df_scaled

In [17]:
new_train_scaled_df['ID_numeric'] = new_train_scaled_df['ID'].astype('category').cat.codes
new_train_scaled_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,ID,high,low,close,volume,low_lag_1,low_lag_2,low_lag_3,low_lag_4,low_lag_5,...,volume_lag_6,volume_lag_7,volume_lag_8,volume_lag_9,volume_lag_10,low_rolling_mean_10,high_rolling_mean_10,close_rolling_mean_10,volume_rolling_mean_10,ID_numeric
ID,ExecutionTime,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Fri00Q1,2022-01-01 02:30:00+01:00,Fri00Q1,0.0,0.04895,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
Fri00Q1,2022-01-01 02:45:00+01:00,Fri00Q1,0.0,0.04895,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
Fri00Q1,2022-01-01 03:00:00+01:00,Fri00Q1,0.0,0.04895,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
Fri00Q1,2022-01-01 03:15:00+01:00,Fri00Q1,0.0,0.04895,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
Fri00Q1,2022-01-01 03:30:00+01:00,Fri00Q1,0.0,0.04895,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [18]:
new_train_scaled_df.columns

Index(['ID', 'high', 'low', 'close', 'volume', 'low_lag_1', 'low_lag_2',
       'low_lag_3', 'low_lag_4', 'low_lag_5', 'low_lag_6', 'low_lag_7',
       'low_lag_8', 'low_lag_9', 'low_lag_10', 'high_lag_1', 'high_lag_2',
       'high_lag_3', 'high_lag_4', 'high_lag_5', 'high_lag_6', 'high_lag_7',
       'high_lag_8', 'high_lag_9', 'high_lag_10', 'close_lag_1', 'close_lag_2',
       'close_lag_3', 'close_lag_4', 'close_lag_5', 'close_lag_6',
       'close_lag_7', 'close_lag_8', 'close_lag_9', 'close_lag_10',
       'volume_lag_1', 'volume_lag_2', 'volume_lag_3', 'volume_lag_4',
       'volume_lag_5', 'volume_lag_6', 'volume_lag_7', 'volume_lag_8',
       'volume_lag_9', 'volume_lag_10', 'low_rolling_mean_10',
       'high_rolling_mean_10', 'close_rolling_mean_10',
       'volume_rolling_mean_10', 'ID_numeric'],
      dtype='object')

In [19]:
import pandas as pd
from darts import TimeSeries
from darts.models import RNNModel

  from .autonotebook import tqdm as notebook_tqdm


In [20]:
# Remove timezone information from the 'ExecutionTime' column
# new_train_scaled_df['ExecutionTime'] = pd.to_datetime(new_train_scaled_df['ExecutionTime']).dt.tz_localize(None)
# new_train_scaled_df.index = new_train_scaled_df.index.tz_localize(None)

# Remove 'ID' from the index, keep 'ExecutionTime' as the index
new_train_scaled_df = new_train_scaled_df.reset_index(level='ID', drop=True)


In [21]:
new_train_scaled_df = new_train_scaled_df.reset_index()
new_train_scaled_df['ExecutionTime'] = pd.to_datetime(new_train_scaled_df['ExecutionTime']).dt.tz_localize(None)

new_train_scaled_df.head()

Unnamed: 0,ExecutionTime,ID,high,low,close,volume,low_lag_1,low_lag_2,low_lag_3,low_lag_4,...,volume_lag_6,volume_lag_7,volume_lag_8,volume_lag_9,volume_lag_10,low_rolling_mean_10,high_rolling_mean_10,close_rolling_mean_10,volume_rolling_mean_10,ID_numeric
0,2022-01-01 02:30:00,Fri00Q1,0.0,0.04895,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,2022-01-01 02:45:00,Fri00Q1,0.0,0.04895,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,2022-01-01 03:00:00,Fri00Q1,0.0,0.04895,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,2022-01-01 03:15:00,Fri00Q1,0.0,0.04895,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,2022-01-01 03:30:00,Fri00Q1,0.0,0.04895,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [23]:
# new_train_scaled_df = new_train_scaled_df.drop(columns=['index'])

In [None]:
# new_train_scaled_df.to_csv("new_train.csv")