In [None]:
# cell 1
import numpy as np, pandas as pd, matplotlib.pyplot as plt, seaborn as sns
sns.set(style="whitegrid")
%matplotlib inline

from task1_data_preprocessing import preprocess_data
from statsmodels.tsa.stattools import adfuller
from pmdarima import auto_arima
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
import math
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping


In [None]:
# cell 2: load data & split
# Pull the preprocessed bundle and isolate the TSLA closing-price series,
# dropping gaps and making sure it is in index order before slicing.
data = preprocess_data()
pivot_close = data["pivot_close"]
series = pivot_close['TSLA'].dropna().sort_index()

# Split boundaries (label-based slicing, so both endpoints are inclusive).
train_end, test_start = "2023-12-31", "2024-01-01"
train = series.loc[:train_end]
test = series.loc[test_start:]

# Report the covered span and size of each split.
for split_name, split in (("Train:", train), ("Test:", test)):
    print(split_name, split.index.min(), "->", split.index.max(), " (", len(split), "obs )")

# Quick visual sanity check of the full series.
plt.figure(figsize=(12,4))
series.plot(title="TSLA - Closing Price")
plt.show()


In [None]:
# cell 3: ARIMA (auto_arima), diagnose & forecast
# Model the log of the price; forecasts are mapped back with exp() below.
train_log = np.log(train)

# Non-seasonal stepwise order search, bounded by max_p / max_q / max_d.
arima_model = auto_arima(
    train_log,
    seasonal=False, stepwise=True, trace=True,
    error_action='ignore', suppress_warnings=True,
    max_p=5, max_q=5, max_d=3,
)
print(arima_model.summary())

# Forecast the whole test horizon in one shot (one step per test observation).
n_periods = len(test)
fc_log, confint = arima_model.predict(n_periods=n_periods, return_conf_int=True)
fc = np.exp(fc_log)

# Interval bounds come back on the log scale: column 0 = lower, column 1 = upper.
ci_lower = np.exp(confint[:, 0])
ci_upper = np.exp(confint[:, 1])

# Plot
plt.figure(figsize=(12,5))
plt.plot(train.index, np.exp(train_log), label='Train')
plt.plot(test.index, test.values, label='Actual')
plt.plot(test.index, fc, label='ARIMA Forecast')
plt.fill_between(test.index, ci_lower, ci_upper, color='pink', alpha=0.3)
plt.legend()
plt.title("ARIMA Forecast (log-model)")
plt.show()


In [None]:
# cell 4: Evaluate ARIMA
def rmse(y_true, y_pred):
    """Return the root mean squared error between `y_true` and `y_pred`.

    Computed as the square root of `mean_squared_error(y_true, y_pred)`.
    """
    mse = mean_squared_error(y_true, y_pred)
    return math.sqrt(mse)
def mape(y_true, y_pred):
    """Return the mean absolute percentage error, expressed in percent.

    Parameters
    ----------
    y_true, y_pred : array-like
        Actual and predicted values. Both are converted to flat float arrays,
        so lists, pandas Series and (n, 1) column vectors are all accepted.

    Returns
    -------
    float
        mean(|y_true - y_pred| / |y_true|) * 100. Zeros in `y_true` are
        replaced by 1e-8 in the denominator to avoid division by zero.
    """
    # Flatten both sides: without this, an (n, 1) array against an (n,) array
    # silently broadcasts to an (n, n) matrix and yields a wrong score.
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    denom = np.where(y_true == 0, 1e-8, y_true)
    return np.mean(np.abs((y_true - y_pred) / denom)) * 100

# Score the ARIMA forecast against the held-out closing prices.
y_test_arima = test.values
arima_mae = mean_absolute_error(y_test_arima, fc)
arima_rmse = rmse(y_test_arima, fc)
arima_mape = mape(y_test_arima, fc)
print("ARIMA MAE:", arima_mae, "RMSE:", arima_rmse, "MAPE:", arima_mape)


In [None]:
# cell 5: LSTM data prep
# Number of past observations fed to the LSTM for each prediction.
lookback = 60
scaler = MinMaxScaler(feature_range=(0,1))
vals = series.values.reshape(-1,1)
# Fit the min/max on the training window only, then apply it to the whole
# series. Fitting on the full series would leak the test period's price range
# into the training features (look-ahead bias). Test values may therefore fall
# outside [0, 1]; inverse_transform still maps them back correctly.
scaler.fit(train.values.reshape(-1,1))
vals_scaled = scaler.transform(vals)

# create sequences
def create_sequences(data, lookback):
    """Build sliding-window samples from the first column of `data`.

    Parameters
    ----------
    data : array-like, 2-D
        Observations in row order; only column 0 is read.
    lookback : int
        Number of consecutive rows that make up one input window.

    Returns
    -------
    X : np.ndarray, shape (n_samples, lookback)
        Row k holds data[k : k + lookback, 0].
    y : np.ndarray, shape (n_samples,)
        Entry k holds data[k + lookback, 0], the value right after window k.

    When `data` has no more than `lookback` rows there is nothing to predict;
    X is then returned with shape (0, lookback) rather than (0,) so callers
    can still read X.shape[1].
    """
    data = np.asarray(data)
    n_rows = len(data)
    if n_rows <= lookback:
        return (np.empty((0, lookback), dtype=data.dtype),
                np.empty((0,), dtype=data.dtype))
    X = np.stack([data[i - lookback:i, 0] for i in range(lookback, n_rows)])
    y = data[lookback:, 0].copy()
    return X, y

# Window the scaled series; dates_all[k] is the date of target y_all[k].
X_all, y_all = create_sequences(vals_scaled, lookback)
dates_all = series.index[lookback:]

# Assign each sample to train or test by the date of its target.
train_mask = dates_all <= train_end
test_mask = dates_all >= test_start

X_train, y_train = X_all[train_mask], y_all[train_mask]
X_test, y_test = X_all[test_mask], y_all[test_mask]

# Append a trailing feature axis of size 1: (samples, lookback) -> (samples, lookback, 1).
X_train = X_train[..., np.newaxis]
X_test = X_test[..., np.newaxis]

print("LSTM train shape:", X_train.shape, "test shape:", X_test.shape)


In [None]:
# cell 6: LSTM model training
# Seed TensorFlow's global RNG before any layer is created.
tf.random.set_seed(42)

# Two stacked LSTM layers, each followed by dropout, ending in a single-unit
# dense output.
lstm_layers = [
    LSTM(64, return_sequences=True, input_shape=(X_train.shape[1], 1)),
    Dropout(0.2),
    LSTM(32),
    Dropout(0.2),
    Dense(1),
]
model = Sequential(lstm_layers)
model.compile(loss='mse', optimizer='adam')

# Stop once val_loss has not improved for 10 epochs and roll back to the best weights.
es = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
history = model.fit(
    X_train, y_train,
    epochs=100,
    batch_size=32,
    validation_split=0.1,
    callbacks=[es],
    verbose=1,
)


In [None]:
# cell 7: LSTM forecast & evaluate
# Each test window is built from observed prices, so these are one-step-ahead
# predictions rather than a multi-step forecast.
lstm_dates = dates_all[test_mask]
y_pred_scaled = model.predict(X_test)

# Undo the scaling on predictions and targets before scoring.
y_pred = scaler.inverse_transform(y_pred_scaled)
y_test_true = scaler.inverse_transform(y_test[:, np.newaxis])

lstm_mae = mean_absolute_error(y_test_true, y_pred)
lstm_rmse = rmse(y_test_true, y_pred)
lstm_mape = mape(y_test_true.ravel(), y_pred.ravel())

print("LSTM MAE:", lstm_mae, "RMSE:", lstm_rmse, "MAPE:", lstm_mape)

# Actual test prices against the LSTM predictions on their target dates.
plt.figure(figsize=(12,5))
plt.plot(test.index, test.values, label='Actual')
plt.plot(lstm_dates, y_pred.ravel(), label='LSTM Forecast')
plt.legend()
plt.title("LSTM Predictions vs Actual")
plt.show()


In [None]:
# cell 8: Compare & conclude
import json

# Side-by-side summary of both models' error metrics.
print("ARIMA -> MAE, RMSE, MAPE:", arima_mae, arima_rmse, arima_mape)
print("LSTM  -> MAE, RMSE, MAPE:", lstm_mae, lstm_rmse, lstm_mape)

# Collect the metrics as plain Python floats so they serialise to JSON.
metrics = {
    model_name: {"mae": float(mae_v), "rmse": float(rmse_v), "mape": float(mape_v)}
    for model_name, (mae_v, rmse_v, mape_v) in (
        ("arima", (arima_mae, arima_rmse, arima_mape)),
        ("lstm", (lstm_mae, lstm_rmse, lstm_mape)),
    )
}

# Persist the metrics next to the notebook.
with open("task2_metrics.json","w") as metrics_file:
    json.dump(metrics, metrics_file, indent=2)

# Placeholder discussion text; replace with an interpretation of the results.
print("\nDiscussion:\n- Compare metrics and visual shape of forecasts.\n- ARIMA may be strong for linear, stationary dynamics after differencing.\n- LSTM may capture nonlinear patterns but needs more data/tuning and careful validation.\n")
