In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

# Load the raw S&P 500 price history.
file_path = "S&P500_L5Y.csv"  # Update this if needed
df = pd.read_csv(file_path)

# Parse the Date column (month/day/year strings) and order rows oldest-first,
# so that shift(1) below refers to the chronologically previous row.
df['Date'] = pd.to_datetime(df['Date'], format='%m/%d/%Y')
df = df.sort_values(by='Date')

# Rename 'Close/Last' for easier reference (returns a new frame; no in-place mutation)
df = df.rename(columns={'Close/Last': 'Close'})

# Log return of each close relative to the previous row's close
df['Log_Returns'] = np.log(df['Close'] / df['Close'].shift(1))

# Drop only the rows whose log return is undefined (the first row has no predecessor).
# A bare dropna() would also discard any row with a gap in an unrelated column,
# silently punching holes in the price series used for modelling.
df = df.dropna(subset=['Log_Returns'])

# Augmented Dickey-Fuller test on the closing-price level.
# Element 1 of the result is the p-value; it decides the differencing order d.
adf_test = adfuller(df['Close'])
if adf_test[1] < 0.05:
    d = 0  # p-value below 5%: treat the level series as stationary
else:
    d = 1  # otherwise difference once

# Report the statistic, p-value, critical values and the resulting verdict
for report_line in (
    f"ADF Statistic: {adf_test[0]}",
    f"p-value: {adf_test[1]}",
    f"Critical Values: {adf_test[4]}",
    f"Is the price series stationary? {'Yes' if d == 0 else 'No (differencing required)'}",
):
    print(report_line)

# Split into training (up to 256 weeks) and testing (the final 4 weeks),
# counting 5 rows per week.
train_size = 256 * 5  # 1280 days
test_size = 4 * 5  # 20 days

# Fail loudly if the history is too short to hold out a test window;
# otherwise the training slice below would silently come out empty.
if len(df) <= test_size:
    raise ValueError(
        f"Need more than {test_size} rows to hold out a test set, got {len(df)}"
    )

# Training window: at most train_size rows immediately before the test window.
# max(..., 0) keeps every available row when the history is shorter than
# train_size + test_size.
train_start = max(len(df) - test_size - train_size, 0)
train_data = df.iloc[train_start:-test_size]
test_data = df.iloc[-test_size:]  # Last 20 days

# Save to CSV if needed
train_data.to_csv("train_data.csv", index=False)
test_data.to_csv("test_data.csv", index=False)

print("Training data shape:", train_data.shape)
print("Testing data shape:", test_data.shape)


ADF Statistic: -1.1998631669669666
p-value: 0.6736545712780757
Critical Values: {'1%': np.float64(-3.4355754676859886), '5%': np.float64(-2.8638475772391665), '10%': np.float64(-2.5679985805677017)}
Is the price series stationary? No (differencing required)
Training data shape: (1235, 6)
Testing data shape: (20, 6)


In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import pmdarima
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima.model import ARIMA
from pmdarima import auto_arima
from sklearn.metrics import mean_absolute_error, mean_squared_error


# Show which NumPy / pmdarima builds this kernel picked up
numpy_version, pmdarima_version = np.__version__, pmdarima.__version__
print("NumPy version:", numpy_version)
print("pmdarima version:", pmdarima_version)

# Read the saved training split and index it by its parsed Date column
df = (
    pd.read_csv("train_data.csv")
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .set_index('Date')
)

# Plot ACF and PACF of the (possibly differenced) training closes to pick p and q.
# The differencing decision is derived here from the training series itself rather
# than from a `d` left in the kernel by another cell: that value is undefined on a
# fresh kernel and is overwritten by the auto_arima order once this cell has run.
close_train = df['Close']
needs_differencing = adfuller(close_train)[1] >= 0.05  # ADF p-value at or above 5%
acf_input = close_train.diff().dropna() if needs_differencing else close_train

fig, axes = plt.subplots(1, 2, figsize=(12, 5))
plot_acf(acf_input, ax=axes[0])  # Identify q
plot_pacf(acf_input, ax=axes[1])  # Identify p
plt.show()

# Let auto_arima's stepwise, non-seasonal search choose (p, d, q) for the closes
close_prices = df['Close']
auto_model = auto_arima(close_prices, seasonal=False, stepwise=True, trace=True)
p, d, q = auto_model.order
print(f"Optimal ARIMA Order: ({p}, {d}, {q})")

# Refit the selected order on the price levels with statsmodels' ARIMA
model = ARIMA(close_prices, order=(p, d, q))
model_fit = model.fit()

# Show the fitted model's summary
print(model_fit.summary())

# Read the held-out test split and index it by its parsed Date column
test_df = (
    pd.read_csv("test_data.csv")
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .set_index('Date')
)

# Forecast one step per test row from the fitted model
horizon = len(test_df)
forecast = model_fit.forecast(steps=horizon)

# Score the forecast against the held-out closing prices
actual_close = test_df['Close']
mae = mean_absolute_error(actual_close, forecast)
rmse = np.sqrt(mean_squared_error(actual_close, forecast))

print(f"RMSE: {rmse:.5f}")
print(f"MAE: {mae:.5f}")

# Overlay the forecast on the actual test-period closes
forecast_fig, forecast_ax = plt.subplots(figsize=(10, 5))
forecast_ax.plot(test_df.index, test_df['Close'], label="Actual Prices", marker='o')
forecast_ax.plot(test_df.index, forecast, label="Predicted Prices", linestyle='dashed', marker='x')
forecast_ax.legend()
forecast_ax.set_title(f"ARIMA({p},{d},{q}) Forecast vs Actual Prices")
plt.show()


ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject