In [24]:
import numpy as np
import pandas as pd
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [26]:
def read_dataframe_from_csv(file_path, **read_csv_kwargs):
    """Load a CSV file into a pandas DataFrame, returning ``None`` on failure.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file; handed to ``pd.read_csv``.
    **read_csv_kwargs
        Optional keyword arguments forwarded unchanged to ``pd.read_csv``
        (for example ``index_col=0``). Omitting them reproduces the plain
        ``pd.read_csv(file_path)`` call.

    Returns
    -------
    pandas.DataFrame or None
        The parsed frame, or ``None`` if reading raised an exception. The
        error message is printed in that case.
    """
    try:
        return pd.read_csv(file_path, **read_csv_kwargs)
    except Exception as e:
        # Best-effort loader: report the problem and signal failure with None
        # rather than propagating the exception to the notebook cell.
        print(f"Error reading CSV file: {e}")
        return None

# Load the feature table. The CSV's first column is consumed as the index and
# then discarded, leaving a fresh 0..n-1 integer index on the frame.
df = (
    pd.read_csv("../data/spx_data_with_features.csv", index_col=0)
    .reset_index(drop=True)
)

df.head()

Unnamed: 0,Close,High,Low,Open,Volume,Year,Month,Day,Weekday,Return,Log Return,Log Return Lag-1,Log Return Lag-2,Log Return Lag-5,Rolling Mean 5,Rolling Std 20,Rolling Skew 30,Rolling Kurt 30
0,1402.050049,1407.719971,1376.25,1389.939941,1092100000,2000,2,15,1,0.008713,0.008675,0.002031,-0.021192,0.012198,-0.00558,0.013786,-0.625913,0.582261
1,1387.670044,1404.550049,1385.579956,1402.050049,1018800000,2000,2,16,2,-0.010256,-0.010309,0.008675,0.002031,-0.021035,-0.003435,0.0139,-0.285637,0.249951
2,1388.26001,1399.880005,1380.069946,1387.670044,1034800000,2000,2,17,3,0.000425,0.000425,-0.010309,0.008675,0.00362,-0.004074,0.013867,-0.274113,0.251292
3,1346.089966,1388.589966,1345.319946,1388.26001,1042300000,2000,2,18,4,-0.030376,-0.030847,0.000425,-0.010309,-0.021192,-0.006005,0.015295,-0.329597,-0.034608
4,1352.170044,1358.109985,1331.880005,1346.089966,980000000,2000,2,22,1,0.004517,0.004507,-0.030847,0.000425,0.002031,-0.00551,0.014234,-0.562546,0.035657


In [27]:
def create_windowed_dataset(X, y, window_size):
    """Slice a feature table into overlapping windows with one label each.

    Window ``i`` holds rows ``i .. i + window_size - 1`` of ``X`` and is paired
    with the value of ``y`` at row ``i + window_size`` (the row immediately
    after the window). Indexing is positional, so any pandas index is ignored.

    Parameters
    ----------
    X : pandas.DataFrame, pandas.Series or array-like
        Features, one row per time step.
    y : pandas.Series or array-like
        Labels, positionally aligned with the rows of ``X``.
    window_size : int
        Number of consecutive rows per window; must be at least 1.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``Xs`` with shape ``(n_windows, window_size, *X.shape[1:])`` and ``ys``
        with shape ``(n_windows, *y.shape[1:])``, where
        ``n_windows = max(len(X) - window_size, 0)``. When there are too few
        rows the arrays are empty but keep the window/feature axes.

    Raises
    ------
    ValueError
        If ``window_size`` is smaller than 1.
    IndexError
        If ``y`` has fewer rows than a requested label position.
    """
    if window_size < 1:
        raise ValueError(f"window_size must be at least 1, got {window_size!r}")

    X_arr = np.asarray(X)
    y_arr = np.asarray(y)

    n_windows = max(len(X_arr) - window_size, 0)

    # Row-index matrix of shape (n_windows, window_size): row i is
    # [i, i + 1, ..., i + window_size - 1]. Indexing with it gathers every
    # window in one vectorised step and yields correctly shaped empty output
    # when n_windows is 0.
    window_rows = np.arange(n_windows)[:, None] + np.arange(window_size)[None, :]
    Xs = X_arr[window_rows]

    # Label for window i sits at row i + window_size. Integer-array indexing
    # raises IndexError if y is too short instead of silently truncating.
    label_rows = np.arange(window_size, window_size + n_windows)
    ys = y_arr[label_rows]

    return Xs, ys

target_col = "Log Returns Future 1D"

# Next-day log return for every row. The final row has no following day, so
# its target is NaN.
df[target_col] = df["Log Return"].shift(-1)

# Every column except the target is a model input. Excluding the target by
# name keeps this cell idempotent: on a re-run `df` already contains the
# target column, and selecting `df.columns` wholesale would leak the label
# into the features.
features = [col for col in df.columns if col != target_col]
X = df[features]
y = df[target_col]  # target variable

# create_windowed_dataset labels the window of rows [i, i + w) with the series
# value at row i + w. `y` is already shifted one day ahead, so passing it
# directly would label each window with the return two days after its last row
# and would emit the trailing NaN as the final label. Shifting back by one row
# makes the label the next-day return of the window's last row and drops that
# trailing NaN. The leading NaN introduced at row 0 is never used as a label
# because label rows start at `window_size`.
X_seq, y_seq = create_windowed_dataset(X, y.shift(1), window_size=10)

X_seq.shape, y_seq.shape

((6249, 10, 18), (6249,))

In [28]:
# Chronological hold-out: the first 80% of the windows form the training set
# and the remainder the test set. No shuffling, so order is preserved.
train_size = int(0.8 * len(X_seq))
X_train, X_test = np.split(X_seq, [train_size])
y_train, y_test = np.split(y_seq, [train_size])

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((4999, 10, 18), (1250, 10, 18), (4999,), (1250,))

In [29]:
# StandardScaler works on 2-D input, so collapse (windows, steps, features)
# into (windows * steps, features) before scaling and restore the 3-D shape
# afterwards.
X_train_flat = X_train.reshape(-1, X_train.shape[-1])
X_test_flat = X_test.reshape(-1, X_test.shape[-1])

# Statistics are learned from the training rows only and then applied,
# unchanged, to both the training and the test rows.
scaler = StandardScaler().fit(X_train_flat)
X_train_scaled = scaler.transform(X_train_flat).reshape(X_train.shape)
X_test_scaled = scaler.transform(X_test_flat).reshape(X_test.shape)

X_train_flat.shape, X_train_scaled.shape, X_test_flat.shape, X_test_scaled.shape

((49990, 18), (4999, 10, 18), (12500, 18), (1250, 10, 18))

#### Q4: Walk-forward validation splits

In [30]:
def walk_forward_split(X, y, initial_train_size, test_size, step_size, expanding=True):
    """Build walk-forward (time-ordered) train/test splits.

    Each split trains on a leading block of samples and tests on the
    ``test_size`` samples that immediately follow it. After every split the
    end of the training block advances by ``step_size``. Splits are produced
    for as long as a full test block still fits inside ``X``.

    Parameters
    ----------
    X, y : sliceable sequences (e.g. NumPy arrays or lists)
        Samples and targets, positionally aligned. Only ``len(X)`` and slice
        indexing are used.
    initial_train_size : int
        Number of samples in the first training block; must be at least 1.
    test_size : int
        Number of samples in every test block; must be at least 1.
    step_size : int
        How far the split point moves between consecutive splits; must be at
        least 1 (a non-positive step would never terminate).
    expanding : bool, default True
        ``True`` keeps the training start fixed at 0, so the training block
        grows. ``False`` also advances the start by ``step_size``, giving a
        rolling window of constant length.

    Returns
    -------
    list
        One ``((X_train, y_train), (X_test, y_test))`` tuple per split, in
        chronological order. Empty if not even one test block fits.

    Raises
    ------
    ValueError
        If ``initial_train_size``, ``test_size`` or ``step_size`` is below 1.
    """
    for name, value in (
        ("initial_train_size", initial_train_size),
        ("test_size", test_size),
        ("step_size", step_size),
    ):
        if value < 1:
            raise ValueError(f"{name} must be a positive integer, got {value!r}")

    n_samples = len(X)
    splits = []
    train_start, train_end = 0, initial_train_size

    while train_end + test_size <= n_samples:
        # The test block starts exactly where the training block ends.
        test_end = train_end + test_size

        train_fold = (X[train_start:train_end], y[train_start:train_end])
        test_fold = (X[train_end:test_end], y[train_end:test_end])
        splits.append((train_fold, test_fold))

        train_end += step_size
        if not expanding:
            train_start += step_size  # rolling window

    return splits