**Step 1: Create kaggle.json from Colab Secrets**

In [None]:
from google.colab import userdata
import json, os

# Build the Kaggle API credentials from the notebook's Colab secrets.
kaggle_json = {
    "username": userdata.get('KAGGLE_USERNAME'),
    "key": userdata.get('KAGGLE_KEY')
}

# Fail fast if either secret came back empty, rather than writing a
# credentials file that the kaggle CLI will reject later.
missing_fields = [field for field, value in kaggle_json.items() if not value]
if missing_fields:
    raise ValueError(f"Empty Kaggle credential(s): {', '.join(missing_fields)}")

kaggle_dir = "/root/.kaggle"
kaggle_path = os.path.join(kaggle_dir, "kaggle.json")
os.makedirs(kaggle_dir, exist_ok=True)

# Create the file with owner-only permissions from the start, so the API key
# is never readable by others between the write and the chmod.
fd = os.open(kaggle_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
with os.fdopen(fd, "w") as f:
    json.dump(kaggle_json, f)

# The mode passed to os.open only applies when the file is newly created;
# tighten permissions explicitly in case it already existed.
os.chmod(kaggle_path, 0o600)

**Step 2: Download Kaggle Datasets**

In [None]:
# Download all 3 datasets
!kaggle datasets download -d rohanrao/nifty50-stock-market-data
!kaggle datasets download -d debashis74017/stock-market-data-nifty-100-stocks-5-min-data
!kaggle datasets download -d s3programmer/stock-market-dataset-for-financial-analysis
# Unzip into raw folder
!mkdir -p data/static_raw
!unzip -q nifty50-stock-market-data.zip -d data/static_raw/nifty50
!unzip -q stock-market-data-nifty-100-stocks-5-min-data.zip -d data/static_raw/nifty100_5min
!unzip -q stock-market-dataset-for-financial-analysis.zip -d data/static_raw/financial_analysis

Dataset URL: https://www.kaggle.com/datasets/rohanrao/nifty50-stock-market-data
License(s): CC0-1.0
Dataset URL: https://www.kaggle.com/datasets/debashis74017/stock-market-data-nifty-100-stocks-5-min-data
License(s): CC0-1.0
Dataset URL: https://www.kaggle.com/datasets/s3programmer/stock-market-dataset-for-financial-analysis
License(s): CC0-1.0


In [None]:
# Mount Google Drive at /content/drive.
# NOTE(review): none of the cells visible here read from or write to
# /content/drive — confirm whether a later cell needs this mount.
from google.colab import drive
drive.mount('/content/drive')

**Step 3: Preprocessing Function**

In [None]:
import pandas as pd
import os

def preprocess_stock_csv(file_path):
    """Load one raw stock CSV and reduce it to a date-indexed OHLCV frame.

    The date column ('Date', or 'date' as a fallback) is parsed with
    ``pd.to_datetime`` and becomes the index (named 'Date'). Column names are
    lower-cased, only the open/high/low/close/volume columns that exist are
    kept (in that order), and rows containing any missing value are dropped.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        The cleaned ``DataFrame``, or ``None`` when the file has no date
        column, has none of the OHLCV columns, or cannot be read/parsed.
        Every skip is reported on stdout instead of being swallowed silently.
    """
    needed = ['open', 'high', 'low', 'close', 'volume']
    try:
        raw = pd.read_csv(file_path)

        # Prefer 'Date' over 'date', matching the original lookup order.
        date_col = next((c for c in ('Date', 'date') if c in raw.columns), None)
        if date_col is None:
            print(f"Skipping {file_path}: no 'Date'/'date' column")
            return None  # Skip if no date

        # Build new frames instead of mutating in place; this avoids
        # SettingWithCopyWarning on the column-sliced frame below.
        df = raw.assign(Date=pd.to_datetime(raw[date_col])).set_index('Date')
        df.columns = [col.lower() for col in df.columns]

        kept = [col for col in needed if col in df.columns]
        if not kept:
            # Without any price/volume column the result would be an empty
            # frame containing only dates, which is useless downstream.
            print(f"Skipping {file_path}: no OHLCV columns found")
            return None

        return df[kept].dropna()
    except Exception as exc:
        # Best-effort by design: one bad file must not stop a batch run. Unlike
        # a bare ``except:``, this lets KeyboardInterrupt/SystemExit propagate
        # and states why the file was skipped.
        print(f"Skipping {file_path}: {type(exc).__name__}: {exc}")
        return None

**Step 4: Batch Process All Raw CSVs**

In [None]:
from pathlib import Path

# Raw folders created by the unzip step; each is scanned (non-recursively) for CSVs.
folders = [
    "data/static_raw/nifty50",
    "data/static_raw/nifty100_5min",
    "data/static_raw/financial_analysis"
]

processed_dir = Path("data/processed/static")
processed_dir.mkdir(parents=True, exist_ok=True)

written_from = {}   # output file name -> source CSV that produced it
skipped_count = 0

for folder in folders:
    folder_path = Path(folder)
    if not folder_path.is_dir():
        # A missing dataset folder should not abort the remaining folders.
        print(f"Folder not found, skipping: {folder}")
        continue

    # sorted() makes the processing order (and any overwrite) deterministic.
    for csv_file in sorted(folder_path.iterdir()):
        if not csv_file.is_file() or csv_file.suffix.lower() != ".csv":
            continue

        # .stem removes only the final extension, unlike str.replace(".csv", "").
        stock_name = csv_file.stem.upper()
        path = str(csv_file)
        df_clean = preprocess_stock_csv(path)
        if df_clean is None:
            skipped_count += 1
            continue

        out_name = f"{stock_name}_clean.csv"
        if out_name in written_from:
            # Same file name in two folders: the later one still wins, as
            # before, but the overwrite is now reported.
            print(f"Warning: {path} overwrites output from {written_from[out_name]}")
        df_clean.to_csv(processed_dir / out_name)
        written_from[out_name] = path

print(f"Wrote {len(written_from)} cleaned file(s) to {processed_dir}; skipped {skipped_count}")