In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima.model import ARIMA
from pmdarima import auto_arima
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Read the preprocessed training data and index it by its parsed 'Date' column.
# Built as a single chain so the frame is produced in one expression rather
# than being mutated in place step by step.
df = (
    pd.read_csv("train_data.csv")
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .set_index('Date')
)

# Auto ARIMA to find the best (p,d,q).
# This must run BEFORE the ACF/PACF plots: the plots difference the series
# `d` times, and `d` only exists once auto_arima has selected an order.
# (Previously the plots came first and raised NameError on a fresh kernel.)
auto_model = auto_arima(df['Close'], seasonal=False, stepwise=True, trace=True)
p, d, q = auto_model.order
print(f"Optimal ARIMA Order: ({p}, {d}, {q})")

# Difference the closing prices `d` times so the correlograms are drawn on the
# same series the ARMA(p, q) part is fitted to. d == 0 leaves the series
# untouched; d == 1 is a single diff; d >= 2 is differenced repeatedly instead
# of silently falling back to the raw levels.
close_differenced = df['Close']
for _ in range(d):
    close_differenced = close_differenced.diff().dropna()

# Plot ACF and PACF to sanity-check the selected p and q
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
plot_acf(close_differenced, ax=axes[0])  # Identify q
plot_pacf(close_differenced, ax=axes[1])  # Identify p
plt.show()

# Build an ARIMA model on the training 'Close' column using the (p, d, q)
# order chosen above, then estimate its parameters.
model = ARIMA(endog=df['Close'], order=(p, d, q))
model_fit = model.fit()

# Show the fitted model's summary table
print(model_fit.summary())

# Read the test dataset and index it by its parsed 'Date' column, mirroring
# how the training frame is prepared.
test_df = (
    pd.read_csv("test_data.csv")
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .set_index('Date')
)

# Forecast as many steps ahead as there are rows in the test set
forecast = model_fit.forecast(steps=test_df.shape[0])

# Score the forecast against the observed test 'Close' values:
# RMSE is the square root of the mean squared error, MAE the mean absolute error.
rmse = np.sqrt(mean_squared_error(y_true=test_df['Close'], y_pred=forecast))
mae = mean_absolute_error(y_true=test_df['Close'], y_pred=forecast)

print(f"RMSE: {rmse:.5f}")
print(f"MAE: {mae:.5f}")

# Plot actual vs. predicted prices over the test period.
# Uses the explicit Axes interface and labels both axes so the figure can be
# read on its own when the notebook is skimmed.
forecast_fig, forecast_ax = plt.subplots(figsize=(10, 5))
forecast_ax.plot(test_df.index, test_df['Close'], label="Actual Prices", marker='o')
forecast_ax.plot(test_df.index, forecast, label="Predicted Prices", linestyle='dashed', marker='x')
forecast_ax.set_xlabel("Date")
forecast_ax.set_ylabel("Close")
forecast_ax.legend()
forecast_ax.set_title(f"ARIMA({p},{d},{q}) Forecast vs Actual Prices")
plt.show()


ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject