# In [None]:
import os
import pandas as pd
import numpy as np
import itertools
from tqdm import tqdm
from prophet import Prophet
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error, mean_absolute_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import MinMaxScaler
from datetime import datetime

import warnings
warnings.filterwarnings("ignore")

# ----------- Folder Setup -------------
# data_folder: directory scanned for per-stock CSV files.
# traditional_output / lstm_output: directories that receive the metric CSVs.
data_folder = r'C:\Users\jahna\OneDrive\Documents\TIME_SERIES_STOCK_FORECASTING\PREPROCESSING\CLEAN_DATA_FINAL'
traditional_output = r'C:\Users\jahna\OneDrive\Documents\TIME_SERIES_STOCK_FORECASTING\Time_Series_Models\time_series_outputs'
# NOTE(review): the original assignment had no value (a syntax error).
# A sibling of traditional_output is used as a placeholder default --
# TODO confirm the intended LSTM output directory.
lstm_output = os.path.join(os.path.dirname(traditional_output), 'lstm_outputs')
os.makedirs(traditional_output, exist_ok=True)
os.makedirs(lstm_output, exist_ok=True)

# ----------- Evaluation Metric -------------
def evaluate(y_true, y_pred):
    """Compute RMSE, MAE and MAPE between actual and predicted values.

    Both inputs are converted to flat float arrays first, so lists, pandas
    Series and column vectors of shape ``(n, 1)`` are all accepted and are
    compared position by position.

    Args:
        y_true: Actual values (array-like).
        y_pred: Predicted values with the same number of elements.

    Returns:
        Tuple ``(rmse, mae, mape)`` of Python floats. ``mape`` is a
        percentage averaged over the entries whose actual value is non-zero;
        it is ``nan`` when every actual value is zero.

    Raises:
        ValueError: If the inputs are empty, differ in length, or contain
            NaN/inf values.
    """
    actual = np.asarray(y_true, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()

    if actual.size == 0:
        raise ValueError("evaluate() needs at least one observation")
    if actual.shape != predicted.shape:
        raise ValueError(
            f"length mismatch: {actual.size} actual vs {predicted.size} predicted values"
        )
    if not (np.isfinite(actual).all() and np.isfinite(predicted).all()):
        raise ValueError("inputs contain NaN or infinite values")

    errors = actual - predicted
    rmse = float(np.sqrt(np.mean(errors ** 2)))
    mae = float(np.mean(np.abs(errors)))

    # Percentage error is undefined where the actual value is zero; skipping
    # those entries keeps the result finite instead of inf/nan.
    nonzero = actual != 0
    if nonzero.any():
        mape = float(np.mean(np.abs(errors[nonzero] / actual[nonzero])) * 100)
    else:
        mape = float("nan")
    return rmse, mae, mape

# ----------- ARIMA Tuning -------------
def tune_arima(train, test):
    """Grid-search ARIMA orders and return the best forecast for ``test``.

    Every order ``(p, d, q)`` with p in 0..2, d in 0..1 and q in 0..2 is
    fitted on ``train`` and used to forecast ``len(test)`` steps; the order
    with the lowest RMSE against ``test`` wins.

    NOTE(review): the order is chosen using the same ``test`` values that the
    calling loop later scores the forecast on, so the reported error is
    optimistic. Consider selecting on a separate validation slice.

    Args:
        train: Training observations passed to ``ARIMA``.
        test: Held-out observations; only their count and values are used.

    Returns:
        Tuple ``(preds, best_cfg)``: the winning forecast and its
        ``(p, d, q)`` order.

    Raises:
        RuntimeError: If no candidate order could be fitted and scored.
    """
    horizon = len(test)
    best_score, best_cfg, best_preds = float("inf"), None, None

    for order in itertools.product(range(3), range(2), range(3)):
        try:
            fitted = ARIMA(train, order=order).fit()
            forecast = fitted.forecast(steps=horizon)
            rmse = np.sqrt(mean_squared_error(test, forecast))
        except Exception:
            # Best-effort search: an order that cannot be fitted or scored is
            # skipped. Ctrl-C is no longer swallowed as it was by bare except.
            continue
        if rmse < best_score:
            # Keep the forecast itself so the winner need not be refit.
            best_score, best_cfg, best_preds = rmse, order, forecast

    if best_cfg is None:
        raise RuntimeError("no ARIMA order could be fitted to the training data")
    return best_preds, best_cfg

# ----------- SARIMA Tuning -------------
def tune_sarima(train, test, seasonal_period=12):
    """Grid-search SARIMA orders and return the best forecast for ``test``.

    All combinations of non-seasonal ``(p, d, q)`` and seasonal ``(P, D, Q)``
    with each term in 0..1 are fitted on ``train`` and used to forecast
    ``len(test)`` steps; the combination with the lowest RMSE against
    ``test`` wins.

    NOTE(review): the configuration is chosen using the same ``test`` values
    that the calling loop later scores the forecast on, so the reported error
    is optimistic.

    Args:
        train: Training observations passed to ``SARIMAX``.
        test: Held-out observations; only their count and values are used.
        seasonal_period: Length of the seasonal cycle appended to the
            seasonal order. Defaults to 12, the value previously hard-coded.

    Returns:
        Tuple ``(preds, best_cfg)`` where ``best_cfg`` is
        ``((p, d, q), (P, D, Q, seasonal_period))``.

    Raises:
        RuntimeError: If no candidate configuration could be fitted and scored.
    """
    horizon = len(test)
    term_grid = list(itertools.product(range(2), range(2), range(2)))
    best_score, best_cfg, best_preds = float("inf"), None, None

    for pdq, seasonal_pdq in itertools.product(term_grid, term_grid):
        seasonal_order = seasonal_pdq + (seasonal_period,)
        try:
            fitted = SARIMAX(train, order=pdq, seasonal_order=seasonal_order).fit(disp=False)
            forecast = fitted.forecast(horizon)
            rmse = np.sqrt(mean_squared_error(test, forecast))
        except Exception:
            # Best-effort search: a configuration that cannot be fitted or
            # scored is skipped. Ctrl-C is no longer swallowed.
            continue
        if rmse < best_score:
            # Keep the forecast itself so the winner need not be refit.
            best_score, best_cfg, best_preds = rmse, (pdq, seasonal_order), forecast

    if best_cfg is None:
        raise RuntimeError("no SARIMA configuration could be fitted to the training data")
    return best_preds, best_cfg

# ----------- Prophet Tuning -------------
def tune_prophet(df_prophet, horizon=30):
    """Grid-search Prophet settings against a hold-out of the last rows.

    The last ``horizon`` rows are held out. Each candidate model is fitted on
    the earlier rows only and asked to predict exactly the held-out ``ds``
    dates, so predictions and actuals refer to the same dates and the scored
    rows are never seen during fitting. (Previously the model was fitted on
    every row, forecast ``30`` days past the end of the data, and those
    future values were compared with the last 30 in-sample actuals.)

    Args:
        df_prophet: DataFrame with Prophet's ``ds`` and ``y`` columns.
            Assumed to be sorted by ``ds`` ascending -- the calling loop sorts
            by date before building it.
        horizon: Number of trailing rows to hold out. Defaults to 30, which
            matches the ``[-30:]`` slice the calling loop scores against.

    Returns:
        Tuple ``(best_preds, best_params)``: predictions for the held-out
        rows from the lowest-RMSE candidate and its
        ``{'cps': ..., 'mode': ...}`` settings. Both are ``None`` if every
        candidate failed.

    Raises:
        ValueError: If ``horizon`` is not positive or leaves no rows to fit on.
    """
    if horizon < 1:
        raise ValueError("horizon must be a positive number of rows")
    if len(df_prophet) <= horizon:
        raise ValueError(
            f"need more than {horizon} rows to hold out {horizon}; got {len(df_prophet)}"
        )

    history = df_prophet.iloc[:-horizon]
    holdout = df_prophet.iloc[-horizon:]
    y_true = holdout['y'].values

    best_score = float("inf")
    best_params = None
    best_preds = None
    for cps, mode in itertools.product([0.01, 0.1, 0.5], ['additive', 'multiplicative']):
        try:
            model = Prophet(changepoint_prior_scale=cps, seasonality_mode=mode)
            model.fit(history)
            # Predict on the held-out dates themselves rather than on a
            # generated daily calendar, which need not match the data's dates.
            forecast = model.predict(holdout[['ds']])
            y_pred = forecast['yhat'].values
            rmse = np.sqrt(mean_squared_error(y_true, y_pred))
        except Exception:
            # Best-effort search: a failing candidate is skipped.
            continue
        if rmse < best_score:
            best_score = rmse
            best_preds = y_pred
            best_params = {'cps': cps, 'mode': mode}
    return best_preds, best_params

# ----------- LSTM Model -------------
def create_sequences(data, seq_length):
    """Slice ``data`` into overlapping windows and their next-step targets.

    Window ``k`` is ``data[k:k + seq_length]`` and its target is
    ``data[k + seq_length]``.

    Args:
        data: Indexable, sliceable sequence of observations.
        seq_length: Number of consecutive observations per window.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays holding the windows and the targets.
        Both are empty when ``data`` has no more than ``seq_length`` items.
    """
    window_count = len(data) - seq_length
    starts = range(window_count)
    windows = [data[start:start + seq_length] for start in starts]
    targets = [data[start + seq_length] for start in starts]
    return np.array(windows), np.array(targets)

def train_lstm(series):
    """Train a one-layer LSTM on 30-step windows and predict the test windows.

    The series is scaled to [0, 1], cut into windows of 30 observations with
    the following observation as target, and split chronologically: the first
    80% of windows train the model, the rest are predicted.

    Args:
        series: 1-D array-like of observations in time order.

    Returns:
        Tuple ``(preds, actuals)`` of flat arrays for the test windows, both
        mapped back to the original scale.

    Raises:
        ValueError: If the series is too short to yield at least one training
            window and one test window.
    """
    seq_len = 30
    values = np.asarray(series, dtype=float).reshape(-1, 1)

    n_windows = len(values) - seq_len
    train_size = int(0.8 * n_windows)
    if train_size < 1 or n_windows - train_size < 1:
        raise ValueError(
            f"series of length {len(values)} is too short for {seq_len}-step windows"
        )

    # Fit the scaler only on observations that the training windows (and
    # their targets) cover, so test-period min/max do not leak into training.
    scaler = MinMaxScaler()
    scaler.fit(values[:train_size + seq_len])
    scaled_data = scaler.transform(values)

    X, y = create_sequences(scaled_data, seq_len)
    X = X.reshape((X.shape[0], X.shape[1], 1))

    X_train, y_train = X[:train_size], y[:train_size]
    X_test, y_test = X[train_size:], y[train_size:]

    model = Sequential([
        LSTM(50, activation='relu', input_shape=(seq_len, 1)),
        Dense(1)
    ])
    model.compile(optimizer='adam', loss='mse')

    # restore_best_weights: otherwise the model keeps the weights from the
    # last epoch run, which is `patience` epochs past the best val_loss.
    es = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
    model.fit(X_train, y_train, validation_split=0.2, epochs=20, batch_size=16, callbacks=[es], verbose=0)

    preds = model.predict(X_test, verbose=0)
    preds = scaler.inverse_transform(preds)
    y_test_actual = scaler.inverse_transform(y_test.reshape(-1, 1))

    return preds.flatten(), y_test_actual.flatten()

# ----------- Process Each Stock File -------------
# For every CSV in data_folder: sort by date, split the Close series 80/20,
# score ARIMA, SARIMA, Prophet and LSTM, and append one row of metrics per
# stock to traditional_records / lstm_records. A model that fails leaves
# None in its metric columns and the failure is reported, so one bad model
# does not abort the whole run.
traditional_records = []
lstm_records = []

for file in tqdm(os.listdir(data_folder)):
    if not file.endswith(".csv"):
        continue
    filepath = os.path.join(data_folder, file)
    df = pd.read_csv(filepath)

    if 'Date' not in df.columns or 'Close' not in df.columns:
        # tqdm.write prints without breaking the progress bar.
        tqdm.write(f"Skipping {file}: Missing required columns")
        continue

    df['Date'] = pd.to_datetime(df['Date'])
    df = df.sort_values('Date')
    series = df['Close'].values
    stock_name = os.path.splitext(file)[0]

    train_size = int(len(series) * 0.8)
    train, test = series[:train_size], series[train_size:]

    # ARIMA
    try:
        arima_pred, arima_params = tune_arima(train, test)
        arima_rmse, arima_mae, arima_mape = evaluate(test, arima_pred)
    except Exception as exc:
        # `except Exception` (not bare except) so Ctrl-C still stops the run.
        tqdm.write(f"{stock_name}: ARIMA failed ({type(exc).__name__}: {exc})")
        arima_rmse = arima_mae = arima_mape = None

    # SARIMA
    try:
        sarima_pred, sarima_params = tune_sarima(train, test)
        sarima_rmse, sarima_mae, sarima_mape = evaluate(test, sarima_pred)
    except Exception as exc:
        tqdm.write(f"{stock_name}: SARIMA failed ({type(exc).__name__}: {exc})")
        sarima_rmse = sarima_mae = sarima_mape = None

    # Prophet (scored against the last 30 closes)
    try:
        df_prophet = df[['Date', 'Close']].rename(columns={'Date': 'ds', 'Close': 'y'})
        prophet_pred, prophet_params = tune_prophet(df_prophet)
        y_true = df_prophet['y'].values[-30:]
        prophet_rmse, prophet_mae, prophet_mape = evaluate(y_true, prophet_pred)
    except Exception as exc:
        tqdm.write(f"{stock_name}: Prophet failed ({type(exc).__name__}: {exc})")
        prophet_rmse = prophet_mae = prophet_mape = None

    # Save traditional metrics
    traditional_records.append({
        'Stock': stock_name,
        'ARIMA_RMSE': arima_rmse, 'ARIMA_MAE': arima_mae, 'ARIMA_MAPE': arima_mape,
        'SARIMA_RMSE': sarima_rmse, 'SARIMA_MAE': sarima_mae, 'SARIMA_MAPE': sarima_mape,
        'Prophet_RMSE': prophet_rmse, 'Prophet_MAE': prophet_mae, 'Prophet_MAPE': prophet_mape,
    })

    # LSTM
    try:
        lstm_pred, lstm_actual = train_lstm(series)
        lstm_rmse, lstm_mae, lstm_mape = evaluate(lstm_actual, lstm_pred)
    except Exception as exc:
        tqdm.write(f"{stock_name}: LSTM failed ({type(exc).__name__}: {exc})")
        lstm_rmse = lstm_mae = lstm_mape = None

    lstm_records.append({
        'Stock': stock_name,
        'LSTM_RMSE': lstm_rmse,
        'LSTM_MAE': lstm_mae,
        'LSTM_MAPE': lstm_mape
    })

# ----------- Save Individual Results -------------
# Explicit column lists keep the 'Stock' key (and a CSV header) present even
# when no stock was processed, so the merge below cannot fail on a missing
# column.
traditional_columns = [
    'Stock',
    'ARIMA_RMSE', 'ARIMA_MAE', 'ARIMA_MAPE',
    'SARIMA_RMSE', 'SARIMA_MAE', 'SARIMA_MAPE',
    'Prophet_RMSE', 'Prophet_MAE', 'Prophet_MAPE',
]
lstm_columns = ['Stock', 'LSTM_RMSE', 'LSTM_MAE', 'LSTM_MAPE']

traditional_df = pd.DataFrame(traditional_records, columns=traditional_columns)
lstm_df = pd.DataFrame(lstm_records, columns=lstm_columns)

traditional_df.to_csv(f"{traditional_output}/model_metrics_summary.csv", index=False)
lstm_df.to_csv(f"{lstm_output}/lstm_model_metrics.csv", index=False)

# ----------- Final Comparison -------------
# Merge the in-memory frames directly instead of re-reading the CSVs just
# written: the old fallback to an empty DataFrame had no 'Stock' column and
# made the merge raise KeyError.
combined_df = pd.merge(traditional_df, lstm_df, on='Stock', how='outer')

def get_best_model(row):
    """Return the name of the model with the lowest RMSE in ``row``.

    Args:
        row: Mapping-like object with a ``get`` method (a DataFrame row or a
            dict) that may hold the four ``*_RMSE`` entries.

    Returns:
        ``"ARIMA"``, ``"SARIMA"``, ``"Prophet"`` or ``"LSTM"``; ``None`` when
        every RMSE is missing or NaN. Ties go to the model listed first.
    """
    candidates = (
        ("ARIMA", "ARIMA_RMSE"),
        ("SARIMA", "SARIMA_RMSE"),
        ("Prophet", "Prophet_RMSE"),
        ("LSTM", "LSTM_RMSE"),
    )
    winner, lowest = None, None
    for model_name, column in candidates:
        rmse = row.get(column)
        if not pd.notna(rmse):
            continue
        # Strict comparison keeps the earliest model on equal scores.
        if winner is None or rmse < lowest:
            winner, lowest = model_name, rmse
    return winner

# Label each stock with its lowest-RMSE model, write the combined table to
# the current working directory, and print a short summary.
best_model_per_stock = combined_df.apply(get_best_model, axis=1)
combined_df["Best_Model"] = best_model_per_stock
combined_df.to_csv("final_model_comparison.csv", index=False)

print("\n✅ Final model comparison saved to: final_model_comparison.csv")
summary_view = combined_df[["Stock", "Best_Model"]]
print(summary_view)
