In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

# Prepare S&P 500 daily closes for ARIMA modelling:
#   1. load the CSV and order it chronologically,
#   2. compute daily log returns,
#   3. run an Augmented Dickey-Fuller test on the price level to choose d,
#   4. hold out the most recent 4 weeks as a test set and write both splits.
# Module-level names (df, adf_test, d, train_size, test_size, train_data,
# test_data) are kept so that later cells can continue to use them.

# Load the dataset
file_path = "S&P500_L5Y.csv"  # Update this if needed
df = pd.read_csv(file_path)

# Convert Date column to datetime format and sort in ascending order so that
# shift(1) below refers to the previous trading day.
df['Date'] = pd.to_datetime(df['Date'], format='%m/%d/%Y')
df = df.sort_values(by='Date')

# Rename 'Close/Last' for easier reference
df.rename(columns={'Close/Last': 'Close'}, inplace=True)

# Compute log returns: ln(P_t / P_{t-1})
df['Log_Returns'] = np.log(df['Close'] / df['Close'].shift(1))

# Drop only the rows whose log return is undefined (the first row, which has
# no previous close). Restricting to this column avoids silently discarding
# rows that merely have a missing value in some unrelated column.
df = df.dropna(subset=['Log_Returns'])

# Perform Augmented Dickey-Fuller test to determine d (degree of differencing).
# adfuller returns (statistic, p-value, used lags, n obs, critical values, ...).
adf_test = adfuller(df['Close'])
adf_statistic = adf_test[0]
adf_p_value = adf_test[1]
adf_critical_values = adf_test[4]

# Reject the unit-root null at the 5% level -> already stationary -> d=0;
# otherwise difference once.
d = 0 if adf_p_value < 0.05 else 1

print(f"ADF Statistic: {adf_statistic}")
print(f"p-value: {adf_p_value}")
print(f"Critical Values: {adf_critical_values}")
print(f"Is the price series stationary? {'Yes' if d == 0 else 'No (differencing required)'}")

# Split into training (up to 256 weeks) and testing (4 weeks),
# counting 5 trading days per week.
train_size = 256 * 5  # 1280 days
test_size = 4 * 5  # 20 days

# Training window: the train_size rows immediately preceding the test window.
# When the file holds fewer than train_size + test_size rows, the negative
# start index is clamped to the beginning, so every row before the test
# window is used.
train_data = df.iloc[-(train_size + test_size):-test_size]
test_data = df.iloc[-test_size:]  # Last 20 days

# Save to CSV if needed
train_data.to_csv("train_data.csv", index=False)
test_data.to_csv("test_data.csv", index=False)

print("Training data shape:", train_data.shape)
print("Testing data shape:", test_data.shape)


ADF Statistic: -1.1998631669669666
p-value: 0.6736545712780757
Critical Values: {'1%': np.float64(-3.4355754676859886), '5%': np.float64(-2.8638475772391665), '10%': np.float64(-2.5679985805677017)}
Is the price series stationary? No (differencing required)
Training data shape: (1235, 6)
Testing data shape: (20, 6)
