In [None]:
import pandas as pd
import datetime as dt
import os
import numpy as np
from scipy.stats import zscore

In [4]:
# Read in the stock price/volume data downloaded from Kaggle

df = pd.read_csv(os.path.join('Stocks','aapl.us.txt'),delimiter=',',usecols=['Date','Open', 'High', 'Low', 'Close', 'Volume'])
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume
0,1984-09-07,0.42388,0.42902,0.41874,0.42388,23220030
1,1984-09-10,0.42388,0.42516,0.41366,0.42134,18022532
2,1984-09-11,0.42516,0.43668,0.42516,0.42902,42498199
3,1984-09-12,0.42902,0.43157,0.41618,0.41618,37125801
4,1984-09-13,0.43927,0.44052,0.43927,0.43927,57822062


In [5]:
# Normalise the prices and volumes by converting prices to log-returns and converting volumes to a z-score

df_norm = (
    df
    .assign(
        ret_open = lambda x: np.log(x['Open']) - np.log(x['Open'].shift(1)),
        ret_high = lambda x: np.log(x['High']) - np.log(x['High'].shift(1)),
        ret_low = lambda x: np.log(x['Low']) - np.log(x['Low'].shift(1)),
        ret_close = lambda x: np.log(x['Close']) - np.log(x['Close'].shift(1)),
        z_volume = lambda x: x['Volume'].transform(zscore)
    )
)

df_norm.drop(columns=['Open', 'High', 'Low', 'Close', 'Volume'], inplace=True)
df_norm = df_norm.iloc[1:]
df_norm.head()

Unnamed: 0,Date,ret_open,ret_high,ret_low,ret_close,z_volume
1,1984-09-10,0.0,-0.009038,-0.012206,-0.00601,-0.892025
2,1984-09-11,0.003015,0.026735,0.027421,0.018063,-0.645657
3,1984-09-12,0.009038,-0.011771,-0.021348,-0.030386,-0.699735
4,1984-09-13,0.023611,0.020526,0.053996,0.053996,-0.49141
5,1984-09-14,0.002842,0.034296,0.002842,0.014442,-0.380425


In [6]:
# Create the EWM versions of the log-returns and volumes using different half lives

for col in ['ret_open', 'ret_high', 'ret_low', 'ret_close', 'z_volume']:
    for halflife in [1, 5, 10, 20]:
        df_norm[f'{col}_ewm_{halflife}'] = df_norm[col].ewm(halflife=halflife).mean()

df_ewm = df_norm.drop(columns=['ret_open', 'ret_high', 'ret_low', 'z_volume'])
df_ewm.head()

Unnamed: 0,Date,ret_close,ret_open_ewm_1,ret_open_ewm_5,ret_open_ewm_10,ret_open_ewm_20,ret_high_ewm_1,ret_high_ewm_5,ret_high_ewm_10,ret_high_ewm_20,...,ret_low_ewm_10,ret_low_ewm_20,ret_close_ewm_1,ret_close_ewm_5,ret_close_ewm_10,ret_close_ewm_20,z_volume_ewm_1,z_volume_ewm_5,z_volume_ewm_10,z_volume_ewm_20
1,1984-09-10,-0.00601,0.0,0.0,0.0,0.0,-0.009038,-0.009038,-0.009038,-0.009038,...,-0.012206,-0.012206,-0.00601,-0.00601,-0.00601,-0.00601,-0.892025,-0.892025,-0.892025,-0.892025
2,1984-09-11,0.018063,0.00201,0.001612,0.00156,0.001534,0.014811,0.010086,0.009468,0.009158,...,0.008294,0.007951,0.010039,0.00686,0.006444,0.006235,-0.72778,-0.760317,-0.764574,-0.766707
3,1984-09-12,-0.030386,0.006026,0.004437,0.004227,0.004122,-0.000379,0.001771,0.001892,0.001939,...,-0.002279,-0.002156,-0.013061,-0.007311,-0.006693,-0.006397,-0.711754,-0.737268,-0.741447,-0.743605
4,1984-09-13,0.053996,0.015405,0.010268,0.009588,0.009251,0.01077,0.007475,0.007046,0.00683,...,0.013285,0.01262,0.022703,0.011334,0.010091,0.009495,-0.594237,-0.662497,-0.672296,-0.677242
5,1984-09-14,0.014442,0.00892,0.008346,0.008045,0.007878,0.022913,0.014419,0.013276,0.01271,...,0.010897,0.010527,0.018439,0.012139,0.011086,0.010554,-0.483882,-0.589469,-0.605563,-0.613694


In [7]:
# Change the date string to a datetime and apply it as the index of the dataframe

def str_to_datetime(s):
    split = s.split('-')
    year, month, day = int(split[0]), int(split[1]), int(split[2])
    return dt.datetime(year=year, month=month, day=day)

df_ewm['Date'] = df_ewm['Date'].apply(str_to_datetime)
df_ewm.index = df_ewm.pop('Date')
df_ewm.head()

Unnamed: 0_level_0,ret_close,ret_open_ewm_1,ret_open_ewm_5,ret_open_ewm_10,ret_open_ewm_20,ret_high_ewm_1,ret_high_ewm_5,ret_high_ewm_10,ret_high_ewm_20,ret_low_ewm_1,...,ret_low_ewm_10,ret_low_ewm_20,ret_close_ewm_1,ret_close_ewm_5,ret_close_ewm_10,ret_close_ewm_20,z_volume_ewm_1,z_volume_ewm_5,z_volume_ewm_10,z_volume_ewm_20
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984-09-10,-0.00601,0.0,0.0,0.0,0.0,-0.009038,-0.009038,-0.009038,-0.009038,-0.012206,...,-0.012206,-0.012206,-0.00601,-0.00601,-0.00601,-0.00601,-0.892025,-0.892025,-0.892025,-0.892025
1984-09-11,0.018063,0.00201,0.001612,0.00156,0.001534,0.014811,0.010086,0.009468,0.009158,0.014212,...,0.008294,0.007951,0.010039,0.00686,0.006444,0.006235,-0.72778,-0.760317,-0.764574,-0.766707
1984-09-12,-0.030386,0.006026,0.004437,0.004227,0.004122,-0.000379,0.001771,0.001892,0.001939,-0.006108,...,-0.002279,-0.002156,-0.013061,-0.007311,-0.006693,-0.006397,-0.711754,-0.737268,-0.741447,-0.743605
1984-09-13,0.053996,0.015405,0.010268,0.009588,0.009251,0.01077,0.007475,0.007046,0.00683,0.025948,...,0.013285,0.01262,0.022703,0.011334,0.010091,0.009495,-0.594237,-0.662497,-0.672296,-0.677242
1984-09-14,0.014442,0.00892,0.008346,0.008045,0.007878,0.022913,0.014419,0.013276,0.01271,0.014022,...,0.010897,0.010527,0.018439,0.012139,0.011086,0.010554,-0.483882,-0.589469,-0.605563,-0.613694


In [8]:
def df_to_windowed_df(dataframe, first_date_str, last_date_str, input_features, target, n=3, horizon=1):
    first_date = str_to_datetime(first_date_str)
    last_date = str_to_datetime(last_date_str)

    target_date = first_date
    
    dates = []
    X, Y = [], []

    last_time = False
    while True:
        # Extract window of size n + horizon
        df_subset = dataframe.loc[:target_date].tail(n + horizon)

        if len(df_subset) != n + horizon:
            print(f'Error: Window of size {n + horizon} is too large for date {target_date}')
            return

        # Multivariate input features (X)
        input_data = df_subset[input_features].to_numpy()
        x = input_data[:n]

        # Multistep output (y) using one of the features (e.g., the first one in the list)
        target_data = df_subset[target].to_numpy()
        y = target_data[n:, 0]  # You can change 0 to the index of the target feature

        dates.append(target_date)
        X.append(x.flatten())
        Y.append(y)

        # Advance to next target date
        next_week = dataframe.loc[target_date:target_date + dt.timedelta(days=7)]
        next_datetime_str = str(next_week.head(2).tail(1).index.values[0])
        next_date_str = next_datetime_str.split('T')[0]
        year_month_day = next_date_str.split('-')
        year, month, day = year_month_day
        next_date = dt.datetime(day=int(day), month=int(month), year=int(year))

        if last_time:
            break

        target_date = next_date

        if target_date == last_date:
            last_time = True

    # Create result DataFrame
    ret_df = pd.DataFrame({})
    ret_df['Target Date'] = dates

    X = np.array(X)
    for i in range(X.shape[1]):
        ret_df[f'X{i}'] = X[:, i]

    Y = np.array(Y)
    for i in range(Y.shape[1]):
        ret_df[f'Target+{i+1}'] = Y[:, i]

    return ret_df

In [9]:
def windowed_df_to_date_X_y(windowed_dataframe):
    
    dates = windowed_dataframe['Target Date']
    X_columns = [col for col in windowed_dataframe.columns if col.startswith('X')]
    y_columns = [col for col in windowed_dataframe.columns if col.startswith('Target+')]

    X = windowed_dataframe[X_columns]
    y = windowed_dataframe[y_columns]

    return dates, X, y

In [10]:
window_size = 5
pred_horizon = 1

windowed_df_5D_1D = df_to_windowed_df(df_ewm, 
                                '1984-09-17', 
                                '2017-11-10', 
                                list(df_ewm.loc[:, df_ewm.columns != 'ret_close']),
                                ['ret_close'],
                                n=window_size,
                                horizon=pred_horizon)

dates_5D_1D, X_5D, y_1D = windowed_df_to_date_X_y(windowed_df_5D_1D)

  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'Target+{i+1}'] = Y[:, i]


In [11]:
window_size = 10
pred_horizon = 5

windowed_df_10D_5D = df_to_windowed_df(df_ewm, 
                                '1984-09-28', 
                                '2017-11-10', 
                                list(df_ewm.loc[:, df_ewm.columns != 'ret_close']),
                                ['ret_close'],
                                n=window_size,
                                horizon=pred_horizon)

dates_10D_5D, X_10D, y_5D = windowed_df_to_date_X_y(windowed_df_10D_5D)

  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = 

In [12]:
window_size = 20
pred_horizon = 10

windowed_df_20D_10D = df_to_windowed_df(df_ewm, 
                                '1984-10-19', 
                                '2017-11-10', 
                                list(df_ewm.loc[:, df_ewm.columns != 'ret_close']),
                                ['ret_close'],
                                n=window_size,
                                horizon=pred_horizon)

dates_20D_10D, X_20D, y_10D = windowed_df_to_date_X_y(windowed_df_20D_10D)

  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = X[:, i]
  ret_df[f'X{i}'] = 

In [13]:
stock = 'AAPL'

X_5D.to_parquet(stock+'_X_5D.gzip', compression='gzip')
y_1D.to_parquet(stock+'_y_1D.gzip', compression='gzip')

X_10D.to_parquet(stock+'_X_10D.gzip', compression='gzip')
y_5D.to_parquet(stock+'_y_5D.gzip', compression='gzip')

X_20D.to_parquet(stock+'_X_20D.gzip', compression='gzip')
y_10D.to_parquet(stock+'_y_10D.gzip', compression='gzip')