**Import Cleaned Dataset**

In [None]:
from google.colab import userdata
import json, os

# Kaggle API credentials, fetched through google.colab's `userdata` helper so the
# values themselves never appear in the notebook source.
kaggle_json = {
    "username": userdata.get('KAGGLE_USERNAME'),
    "key": userdata.get('KAGGLE_KEY')
}

# Fail early with a readable message instead of writing an unusable credentials file.
missing_secrets = [field for field, value in kaggle_json.items() if not value]
if missing_secrets:
    raise ValueError(f"Missing Kaggle credential(s): {', '.join(missing_secrets)}")

kaggle_dir = "/root/.kaggle"
kaggle_config_path = os.path.join(kaggle_dir, "kaggle.json")
os.makedirs(kaggle_dir, exist_ok=True)

# Create the file owner-only (0o600) from the very first byte, rather than writing it
# with default permissions and tightening them afterwards.
fd = os.open(kaggle_config_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
with os.fdopen(fd, "w") as f:
    json.dump(kaggle_json, f)

# The mode passed to os.open is ignored when the file already exists, so enforce it
# explicitly as well.
os.chmod(kaggle_config_path, 0o600)

# 📥 Download and unzip
# `-d` selects the Kaggle dataset (owner/slug) and `--unzip` extracts the archive after download.
# NOTE(review): later cells read from the relative path data/processed/enriched/, so the
# archive is presumably extracted into the current working directory — confirm.
!kaggle datasets download -d adithyabhaskar2511/stock-market-analysis --unzip


Dataset URL: https://www.kaggle.com/datasets/adithyabhaskar2511/stock-market-analysis
License(s): MIT


**Model Ready Preprocessed Data**

In [None]:
# 📃 Notebook 4 – Fast Preprocessing & Model-Ready Storage for 30 Stocks

"""
This notebook is optimized to:
- Load each stock
- Preprocess & split data
- Store scaled and ready-to-use train/test sets in `.npz` files

Model training (LSTM, GRU, Prophet) will be handled during deployment
"""

import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
import warnings
warnings.filterwarnings('ignore')

# 📁 Folder that receives the per-ticker .npz archives
os.makedirs("preprocessed", exist_ok=True)

# ✅ Tickers to preprocess, mapped to the source sub-folder their CSV lives in:
#    "static" -> NIFTY 50 stocks, "live" -> global stocks.
#    Only one ticker of each kind is currently listed; extend this mapping to
#    cover more stocks.
ticker_info = dict(
    ITC="static",
    AAPL="live",
)

# ⏩ Helper to create sequences
def create_sequences(data, seq_length=60):
    """Turn a series into sliding look-back windows and next-step targets.

    For every position ``i`` from ``seq_length`` to ``len(data) - 1`` the window
    ``data[i - seq_length:i]`` becomes one sample and ``data[i]`` its target.

    Parameters
    ----------
    data : array-like
        Sequence of observations, indexed along its first axis.
    seq_length : int, default 60
        Number of consecutive observations in each window.

    Returns
    -------
    (X, y) : tuple of np.ndarray
        ``X`` has shape ``(n_samples, seq_length, *data.shape[1:])`` and ``y`` has
        shape ``(n_samples, *data.shape[1:])``. When ``data`` is too short to
        yield a single window, both arrays are empty but keep those trailing
        dimensions, so callers can still index ``X.shape[1]``.
    """
    data = np.asarray(data)
    positions = range(seq_length, len(data))
    windows = [data[i - seq_length:i] for i in positions]

    if not windows:
        # np.array([]) would collapse to shape (0,) and break downstream reshapes.
        trailing_shape = data.shape[1:]
        empty_X = np.empty((0, seq_length) + trailing_shape, dtype=data.dtype)
        empty_y = np.empty((0,) + trailing_shape, dtype=data.dtype)
        return empty_X, empty_y

    targets = [data[i] for i in positions]
    return np.array(windows), np.array(targets)

# 🚀 Preprocessing loop
# For every configured ticker: load its enriched CSV, keep the `close` series,
# split it chronologically, scale it, cut it into look-back windows and store
# the result in preprocessed/<ticker>_processed.npz.
SEQ_LENGTH = 60        # look-back window length handed to create_sequences
TRAIN_FRACTION = 0.8   # leading share of each series used for training

for ticker, src in ticker_info.items():
    print(f"\n🚀 Preprocessing: {ticker}")

    # Handle different filename patterns
    if src == "static":
        path = f"data/processed/enriched/{src}/{ticker}_WITH_INDICATORS__clean.csv"
    else:
        path = f"data/processed/enriched/{src}/{ticker}_live.csv"

    if not os.path.exists(path):
        print(f"❌ Skipping (file not found): {path}")
        continue

    try:
        df = pd.read_csv(path)

        # Locate the date column: either literally "Date" or a first column whose
        # name starts with "date" (case-insensitive).
        if "Date" in df.columns:
            date_col = "Date"
        elif df.columns[0].lower().startswith("date"):
            date_col = df.columns[0]
        else:
            print(f"❌ No valid 'Date' column found in {ticker}. Skipping.")
            continue

        df = df.rename(columns={date_col: "Date"})
        df["Date"] = pd.to_datetime(df["Date"], errors="coerce")
        df = df.set_index("Date")

        # Fix column name for Close
        if "close" not in df.columns and "unnamed: 4" in df.columns:
            df = df.rename(columns={"unnamed: 4": "close"})

        if "close" not in df.columns:
            print(f"❌ 'close' column missing in {ticker} even after renaming. Skipping.")
            continue

        df = df[["close"]].dropna()

        # Chronological split; both parts must be long enough for at least one window.
        train_size = int(len(df) * TRAIN_FRACTION)
        test_size = len(df) - train_size
        if train_size <= SEQ_LENGTH or test_size <= SEQ_LENGTH:
            print(f"❌ Not enough rows in {ticker} ({len(df)}) for {SEQ_LENGTH}-step sequences. Skipping.")
            continue

        train_df, test_df = df.iloc[:train_size], df.iloc[train_size:]

        # Fit the scaler on the training part only, so the held-out range does not
        # leak into training. Test values may therefore fall outside [0, 1].
        scaler = MinMaxScaler()
        train_data = scaler.fit_transform(train_df)
        test_data = scaler.transform(test_df)

        X_train, y_train = create_sequences(train_data, SEQ_LENGTH)
        X_test, y_test = create_sequences(test_data, SEQ_LENGTH)

        # Recurrent layers expect (samples, timesteps, features).
        X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
        X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))

        # Save preprocessed data; the scaler bounds allow predictions to be
        # mapped back to prices later.
        np.savez_compressed(f"preprocessed/{ticker}_processed.npz",
                            X_train=X_train, y_train=y_train,
                            X_test=X_test, y_test=y_test,
                            scaler_min=scaler.data_min_,
                            scaler_max=scaler.data_max_)

    except Exception as e:
        # Best-effort batch job: report the failure and move on to the next ticker.
        print(f"❌ Error processing {ticker}: {e}")

print("\n📆 All stocks preprocessed and saved. Models will be trained LIVE.")



🚀 Preprocessing: ITC

🚀 Preprocessing: AAPL

📆 All stocks preprocessed and saved. Models will be trained LIVE.


In [None]:
!pip install cmdstanpy
!pip install numpy pandas matplotlib scikit-learn tensorflow prophet


Collecting cmdstanpy
  Downloading cmdstanpy-1.2.5-py3-none-any.whl.metadata (4.0 kB)
Collecting stanio<2.0.0,>=0.4.0 (from cmdstanpy)
  Downloading stanio-0.5.1-py3-none-any.whl.metadata (1.6 kB)
Downloading cmdstanpy-1.2.5-py3-none-any.whl (94 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m94.5/94.5 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading stanio-0.5.1-py3-none-any.whl (8.1 kB)
Installing collected packages: stanio, cmdstanpy
Successfully installed cmdstanpy-1.2.5 stanio-0.5.1
Collecting tensorflow
  Downloading tensorflow-2.19.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.1 kB)
Collecting prophet
  Downloading prophet-1.1.6-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.5 kB)
Collecting astunparse>=1.6.0 (from tensorflow)
  Downloading astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=24.3.25 (from tensorflow)
  Downloading flatbuffers-25.2.10-py2.py3-none-any.whl.

**Model Training And Comparison**

In [None]:
# 📒 Notebook 4 – Fast Model Training & Comparison for All Stocks

"""
- Loads preprocessed data from the preprocessed/ folder
- Trains LSTM, GRU, Prophet models for each ticker
- Saves trained models to the models/ folder
- Generates comparison plots in the plots/ folder
- Uses fewer epochs for faster execution
"""

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, GRU, Dense
from prophet import Prophet
from math import sqrt
import warnings
warnings.filterwarnings("ignore")

os.makedirs("models", exist_ok=True)
os.makedirs("plots", exist_ok=True)

# Suffix of the archives this notebook loads per ticker.
PROCESSED_SUFFIX = "_processed.npz"


def ticker_from_filename(filename):
    """Return the ticker encoded in ``<ticker>_processed.npz``, else ``None``.

    Only the fixed suffix is stripped, so tickers that themselves contain
    underscores (e.g. ``RELIANCE_NS``) are preserved intact.
    """
    if not filename.endswith(PROCESSED_SUFFIX):
        return None
    return filename[:-len(PROCESSED_SUFFIX)] or None


# ✅ Auto-detect preprocessed tickers (sorted so runs are reproducible)
if os.path.isdir("preprocessed"):
    tickers = sorted(
        name
        for name in map(ticker_from_filename, os.listdir("preprocessed"))
        if name
    )
else:
    print("⚠️ Folder 'preprocessed' not found – nothing to train.")
    tickers = []

# 🚀 Model training loop
# For every preprocessed ticker: train an LSTM and a GRU on the stored windows,
# fit Prophet on the raw close series, score all three on the held-out tail
# (RMSE in price units), save the models/forecast and a comparison plot.

def train_recurrent_model(layer_cls, X_train, y_train, X_test, epochs=3, batch_size=32):
    """Build, fit and apply a small two-layer recurrent regressor.

    Parameters
    ----------
    layer_cls : Keras recurrent layer class (``LSTM`` or ``GRU``).
    X_train, y_train : training windows and targets loaded from the .npz archive.
    X_test : windows to predict after fitting.
    epochs, batch_size : passed to ``model.fit``; kept small for fast execution.

    Returns
    -------
    (model, predictions) : the fitted model and ``model.predict(X_test)``.
    """
    model = Sequential([
        layer_cls(32, return_sequences=True, input_shape=(X_train.shape[1], 1)),
        layer_cls(16),
        Dense(1)
    ])
    model.compile(optimizer="adam", loss="mean_squared_error")
    model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, verbose=0)
    return model, model.predict(X_test)


def find_source_csv(ticker):
    """Return the first existing enriched CSV for ``ticker``.

    The folder is not guessed from the ticker's spelling; every known
    folder/filename combination is probed instead.

    Raises
    ------
    FileNotFoundError
        If none of the candidate paths exists.
    """
    candidates = [
        f"data/processed/enriched/static/{ticker}_WITH_INDICATORS__clean.csv",
        f"data/processed/enriched/live/{ticker}_live.csv",
        f"data/processed/enriched/static/{ticker}_live.csv",
        f"data/processed/enriched/live/{ticker}_WITH_INDICATORS__clean.csv",
    ]
    for candidate in candidates:
        if os.path.exists(candidate):
            return candidate
    raise FileNotFoundError(f"no source CSV found (tried: {', '.join(candidates)})")


def load_prophet_frame(path):
    """Load a CSV and return a frame with Prophet's ``ds`` / ``y`` columns.

    Column detection follows the same rules as the preprocessing cell:
    ``unnamed: 4`` is treated as ``close`` when ``close`` is absent, and a first
    column whose name starts with "date" is accepted as the date column. Rows
    with an unparseable date or a missing close are dropped, and timezone
    information is removed from ``ds``.

    Raises
    ------
    ValueError
        If no date column can be identified.
    """
    df = pd.read_csv(path)
    if "close" not in df.columns and "unnamed: 4" in df.columns:
        df = df.rename(columns={"unnamed: 4": "close"})
    if "Date" not in df.columns:
        first_col = df.columns[0]
        if str(first_col).lower().startswith("date"):
            df = df.rename(columns={first_col: "Date"})
        else:
            raise ValueError(f"no 'Date' column found in {path}")

    df = df[["Date", "close"]].rename(columns={"Date": "ds", "close": "y"})
    df["ds"] = pd.to_datetime(df["ds"], errors="coerce")
    df = df.dropna(subset=["ds", "y"])
    if df["ds"].dt.tz is not None:
        df = df.assign(ds=df["ds"].dt.tz_localize(None))
    return df


for ticker in tickers:
    print(f"\n🚀 Training Models for: {ticker}")

    try:
        # Load preprocessed data
        data = np.load(f"preprocessed/{ticker}_processed.npz")
        X_train, y_train = data["X_train"], data["y_train"]
        X_test, y_test = data["X_test"], data["y_test"]
        min_, max_ = data["scaler_min"], data["scaler_max"]

        def rescale(y):
            """Undo the min-max scaling using the bounds stored in the archive."""
            return y * (max_ - min_) + min_

        y_true = rescale(y_test.reshape(-1, 1))

        # --- LSTM ---
        lstm_model, lstm_preds = train_recurrent_model(LSTM, X_train, y_train, X_test)
        lstm_preds_rescaled = rescale(lstm_preds)
        lstm_rmse = sqrt(mean_squared_error(y_true, lstm_preds_rescaled))
        lstm_model.save(f"models/lstm_{ticker}.h5")

        # --- GRU ---
        gru_model, gru_preds = train_recurrent_model(GRU, X_train, y_train, X_test)
        gru_preds_rescaled = rescale(gru_preds)
        gru_rmse = sqrt(mean_squared_error(y_true, gru_preds_rescaled))
        gru_model.save(f"models/gru_{ticker}.h5")

        # --- Prophet ---
        prophet_rmse = None
        forecast_close = None
        try:
            prophet_df = load_prophet_frame(find_source_csv(ticker))

            n_train = int(len(prophet_df) * 0.8)
            if len(prophet_df) - n_train < len(y_true):
                raise ValueError(
                    f"held-out span ({len(prophet_df) - n_train} rows) is shorter "
                    f"than the test targets ({len(y_true)})"
                )

            prophet_train = prophet_df.iloc[:n_train]
            prophet_model = Prophet()
            prophet_model.fit(prophet_train)

            # Predict on the timestamps actually present in the data. A synthetic
            # daily calendar would not line up with trading-day or intraday rows,
            # leaving yhat misaligned with y_true.
            forecast = prophet_model.predict(prophet_df[["ds"]])
            prophet_tail = forecast[["yhat"]].values[-len(y_true):]
            prophet_rmse = sqrt(mean_squared_error(y_true, prophet_tail))
            # Only expose the forecast for plotting once it has been scored.
            forecast_close = prophet_tail
            forecast.to_csv(f"models/prophet_{ticker}.csv", index=False)
        except Exception as e:
            # Prophet is optional: report and continue with LSTM/GRU results only.
            print(f"  ⚠️ Prophet failed for {ticker}: {e}")

        # Plot results
        fig, ax = plt.subplots(figsize=(10, 5))
        try:
            ax.plot(y_true, label="Actual", color="black")
            ax.plot(lstm_preds_rescaled, label="LSTM", alpha=0.6)
            ax.plot(gru_preds_rescaled, label="GRU", alpha=0.6)
            if forecast_close is not None:
                ax.plot(forecast_close, label="Prophet", alpha=0.6)
            ax.set_title(f"Comparison – {ticker}")
            ax.legend()
            ax.grid(True)
            fig.tight_layout()
            fig.savefig(f"plots/{ticker}_comparison.png")
        finally:
            # Always release the figure, even when saving fails.
            plt.close(fig)

        # `is not None` so that a legitimate RMSE of 0.0 is not reported as N/A.
        prophet_text = f"{prophet_rmse:.2f}" if prophet_rmse is not None else "N/A"
        print(f"  ✅ Done: LSTM RMSE={lstm_rmse:.2f}, GRU RMSE={gru_rmse:.2f}, Prophet RMSE={prophet_text}")

    except Exception as e:
        # Best-effort batch job: report the failure and move on to the next ticker.
        print(f"  ❌ Failed on {ticker}: {e}")

print("\n🏆 All models trained and plots saved.")



🚀 Training Models for: ITC
[1m813/813[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 8ms/step




[1m813/813[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 9ms/step


DEBUG:cmdstanpy:input tempfile: /tmp/tmpbp9blb_d/rebewhgg.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmpbp9blb_d/_x6kkore.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=18474', 'data', 'file=/tmp/tmpbp9blb_d/rebewhgg.json', 'init=/tmp/tmpbp9blb_d/_x6kkore.json', 'output', 'file=/tmp/tmpbp9blb_d/prophet_model2dp2z94q/prophet_model-20250502122857.csv', 'method=optimize', 'algorithm=lbfgs', 'iter=10000']
12:28:57 - cmdstanpy - INFO - Chain [1] start processing
INFO:cmdstanpy:Chain [1] start processing
12:34:26 - cmdstanpy - INFO - Chain [1] done processing
INFO:cmdstanpy:Chain [1] done processing


  ✅ Done: LSTM RMSE=0.52, GRU RMSE=0.40, Prophet RMSE=2465.4261586426346

🚀 Training Models for: AAPL
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 44ms/step




[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 50ms/step




  ⚠️ Prophet failed for AAPL: [Errno 2] No such file or directory: 'data/processed/enriched/static/AAPL_live.csv'
  ✅ Done: LSTM RMSE=9.32, GRU RMSE=6.10, Prophet RMSE=N/A

🏆 All models trained and plots saved.
