In [None]:
import pandas as pd
import datetime as dt
import os
import numpy as np
from scipy.stats import zscore

In [2]:
# Read in the stock price/volume data downloaded from Kaggle

df = pd.read_csv(os.path.join('Stocks','hpq.us.txt'),delimiter=',',usecols=['Date','Open', 'High', 'Low', 'Close', 'Volume'])
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume
0,1970-01-02,0.30627,0.30627,0.30627,0.30627,2441124
1,1970-01-05,0.30627,0.31768,0.30627,0.31385,6219848
2,1970-01-06,0.31385,0.31385,0.30996,0.30996,4447522
3,1970-01-07,0.31385,0.31385,0.31385,0.31385,1872638
4,1970-01-08,0.31385,0.31768,0.31385,0.31385,4280330


In [3]:
# Normalise the prices and volumes by converting prices to log-returns and converting volumes to a z-score

df_norm = (
    df
    .assign(
        ret_open = lambda x: np.log(x['Open']) - np.log(x['Open'].shift(1)),
        ret_high = lambda x: np.log(x['High']) - np.log(x['High'].shift(1)),
        ret_low = lambda x: np.log(x['Low']) - np.log(x['Low'].shift(1)),
        ret_close = lambda x: np.log(x['Close']) - np.log(x['Close'].shift(1)),
        z_volume = lambda x: x['Volume'].transform(zscore)
    )
)

df_norm.drop(columns=['Open', 'High', 'Low', 'Close', 'Volume'], inplace=True)
df_norm = df_norm.iloc[1:]
df_norm.head()

Unnamed: 0,Date,ret_open,ret_high,ret_low,ret_close,z_volume
1,1970-01-05,0.0,0.036578,0.0,0.024448,-0.656026
2,1970-01-06,0.024448,-0.012129,0.011976,-0.012472,-0.739312
3,1970-01-07,0.0,0.0,0.012472,0.012472,-0.860314
4,1970-01-08,0.0,0.012129,0.0,0.0,-0.747169
5,1970-01-09,0.0,0.0,0.0,0.012129,-0.872885


In [4]:
# Create the EWM versions of the log-returns and volumes using different half lives

for col in ['ret_open', 'ret_high', 'ret_low', 'ret_close', 'z_volume']:
    for halflife in [1, 5, 10, 20]:
        df_norm[f'{col}_ewm_{halflife}'] = df_norm[col].ewm(halflife=halflife).mean()

df_ewm = df_norm.drop(columns=['ret_open', 'ret_high', 'ret_low', 'z_volume'])
df_ewm.head()

Unnamed: 0,Date,ret_close,ret_open_ewm_1,ret_open_ewm_5,ret_open_ewm_10,ret_open_ewm_20,ret_high_ewm_1,ret_high_ewm_5,ret_high_ewm_10,ret_high_ewm_20,...,ret_low_ewm_10,ret_low_ewm_20,ret_close_ewm_1,ret_close_ewm_5,ret_close_ewm_10,ret_close_ewm_20,z_volume_ewm_1,z_volume_ewm_5,z_volume_ewm_10,z_volume_ewm_20
1,1970-01-05,0.024448,0.0,0.0,0.0,0.0,0.036578,0.036578,0.036578,0.036578,...,0.0,0.0,0.024448,0.024448,0.024448,0.024448,-0.656026,-0.656026,-0.656026,-0.656026
2,1970-01-06,-0.012472,0.016299,0.01307,0.012648,0.012436,0.004106,0.010539,0.01138,0.011802,...,0.006196,0.006092,-0.000165,0.004711,0.005349,0.005668,-0.71155,-0.700551,-0.699112,-0.698391
3,1970-01-07,0.012472,0.006985,0.008097,0.008136,0.008146,0.00176,0.006529,0.007321,0.007731,...,0.008434,0.008293,0.007056,0.007663,0.007889,0.008015,-0.796558,-0.761334,-0.75661,-0.754246
4,1970-01-08,0.0,0.00326,0.005635,0.005886,0.006003,0.00729,0.008232,0.008651,0.008888,...,0.006102,0.006111,0.003293,0.005333,0.005707,0.005906,-0.770217,-0.757026,-0.753999,-0.752384
5,1970-01-09,0.012129,0.001577,0.004176,0.00454,0.004717,0.003528,0.006101,0.006673,0.006985,...,0.004707,0.004802,0.007854,0.007092,0.007176,0.007238,-0.823207,-0.787022,-0.781181,-0.778183


In [5]:
# Change the date string to a datetime and apply it as the index of the dataframe

def str_to_datetime(s):
    split = s.split('-')
    year, month, day = int(split[0]), int(split[1]), int(split[2])
    return dt.datetime(year=year, month=month, day=day)

df_ewm['Date'] = df_ewm['Date'].apply(str_to_datetime)
df_ewm.index = df_ewm.pop('Date')
df_ewm.head()

Unnamed: 0_level_0,ret_close,ret_open_ewm_1,ret_open_ewm_5,ret_open_ewm_10,ret_open_ewm_20,ret_high_ewm_1,ret_high_ewm_5,ret_high_ewm_10,ret_high_ewm_20,ret_low_ewm_1,...,ret_low_ewm_10,ret_low_ewm_20,ret_close_ewm_1,ret_close_ewm_5,ret_close_ewm_10,ret_close_ewm_20,z_volume_ewm_1,z_volume_ewm_5,z_volume_ewm_10,z_volume_ewm_20
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1970-01-05,0.024448,0.0,0.0,0.0,0.0,0.036578,0.036578,0.036578,0.036578,0.0,...,0.0,0.0,0.024448,0.024448,0.024448,0.024448,-0.656026,-0.656026,-0.656026,-0.656026
1970-01-06,-0.012472,0.016299,0.01307,0.012648,0.012436,0.004106,0.010539,0.01138,0.011802,0.007984,...,0.006196,0.006092,-0.000165,0.004711,0.005349,0.005668,-0.71155,-0.700551,-0.699112,-0.698391
1970-01-07,0.012472,0.006985,0.008097,0.008136,0.008146,0.00176,0.006529,0.007321,0.007731,0.010549,...,0.008434,0.008293,0.007056,0.007663,0.007889,0.008015,-0.796558,-0.761334,-0.75661,-0.754246
1970-01-08,0.0,0.00326,0.005635,0.005886,0.006003,0.00729,0.008232,0.008651,0.008888,0.004923,...,0.006102,0.006111,0.003293,0.005333,0.005707,0.005906,-0.770217,-0.757026,-0.753999,-0.752384
1970-01-09,0.012129,0.001577,0.004176,0.00454,0.004717,0.003528,0.006101,0.006673,0.006985,0.002382,...,0.004707,0.004802,0.007854,0.007092,0.007176,0.007238,-0.823207,-0.787022,-0.781181,-0.778183


In [6]:
def df_to_windowed_df(dataframe, first_date_str, last_date_str, input_features, target, n=3, horizon=1):
    first_date = str_to_datetime(first_date_str)
    last_date = str_to_datetime(last_date_str)

    target_date = first_date
    
    dates = []
    X, Y = [], []

    last_time = False
    while True:
        # Extract window of size n + horizon
        df_subset = dataframe.loc[:target_date].tail(n + horizon)

        if len(df_subset) != n + horizon:
            print(f'Error: Window of size {n + horizon} is too large for date {target_date}')
            return

        # Multivariate input features (X)
        input_data = df_subset[input_features].to_numpy()
        x = input_data[:n]

        # Multistep output (y) using one of the features (e.g., the first one in the list)
        target_data = df_subset[target].to_numpy()
        y = target_data[n:, 0]  # You can change 0 to the index of the target feature

        dates.append(target_date)
        X.append(x.flatten())
        Y.append(y)

        # Advance to next target date
        next_week = dataframe.loc[target_date:target_date + dt.timedelta(days=7)]
        next_datetime_str = str(next_week.head(2).tail(1).index.values[0])
        next_date_str = next_datetime_str.split('T')[0]
        year_month_day = next_date_str.split('-')
        year, month, day = year_month_day
        next_date = dt.datetime(day=int(day), month=int(month), year=int(year))

        if last_time:
            break

        target_date = next_date

        if target_date == last_date:
            last_time = True

    # Create result DataFrame
    ret_df = pd.DataFrame({})
    ret_df['Target Date'] = dates

    X = np.array(X)
    for i in range(X.shape[1]):
        ret_df[f'X{i}'] = X[:, i]

    Y = np.array(Y)
    for i in range(Y.shape[1]):
        ret_df[f'Target+{i+1}'] = Y[:, i]

    return ret_df

In [7]:
def windowed_df_to_date_X_y(windowed_dataframe):
    
    dates = windowed_dataframe['Target Date']
    X_columns = [col for col in windowed_dataframe.columns if col.startswith('X')]
    y_columns = [col for col in windowed_dataframe.columns if col.startswith('Target+')]

    X = windowed_dataframe[X_columns]
    y = windowed_dataframe[y_columns]

    return dates, X, y

In [8]:
window_size = 5
pred_horizon = 1

windowed_df_5D_1D = df_to_windowed_df(df_ewm, 
                                '1970-01-12', 
                                '2017-11-10', 
                                list(df_ewm.loc[:, df_ewm.columns != 'ret_close']),
                                ['ret_close'],
                                n=window_size,
                                horizon=pred_horizon)

dates_5D_1D, X_5D, y_1D = windowed_df_to_date_X_y(windowed_df_5D_1D)

  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'Target+{i+1}'] = Y[:, i]


In [9]:
window_size = 10
pred_horizon = 5

windowed_df_10D_5D = df_to_windowed_df(df_ewm, 
                                '1970-01-23', 
                                '2017-11-10', 
                                list(df_ewm.loc[:, df_ewm.columns != 'ret_close']),
                                ['ret_close'],
                                n=window_size,
                                horizon=pred_horizon)

dates_10D_5D, X_10D, y_5D = windowed_df_to_date_X_y(windowed_df_10D_5D)

  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = 

In [10]:
window_size = 20
pred_horizon = 10

windowed_df_20D_10D = df_to_windowed_df(df_ewm, 
                                '1970-02-13', 
                                '2017-11-10', 
                                list(df_ewm.loc[:, df_ewm.columns != 'ret_close']),
                                ['ret_close'],
                                n=window_size,
                                horizon=pred_horizon)

dates_20D_10D, X_20D, y_10D = windowed_df_to_date_X_y(windowed_df_20D_10D)

  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = 

In [11]:
stock = 'HPQ'

X_5D.to_parquet(stock+'_X_5D.gzip', compression='gzip')
y_1D.to_parquet(stock+'_y_1D.gzip', compression='gzip')

X_10D.to_parquet(stock+'_X_10D.gzip', compression='gzip')
y_5D.to_parquet(stock+'_y_5D.gzip', compression='gzip')

X_20D.to_parquet(stock+'_X_20D.gzip', compression='gzip')
y_10D.to_parquet(stock+'_y_10D.gzip', compression='gzip')