In [13]:
import os

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import (
    classification_report,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)



In [2]:
# Path to the K-means cluster output CSV.
# Set the CLUSTER_OUTPUT_CSV environment variable to point at the file on another
# machine; when it is unset, the original hard-coded local path is used.
CLUSTER_OUTPUT_CSV = os.environ.get(
    "CLUSTER_OUTPUT_CSV",
    "E://PHD/Course materials/Sem 1/Data Mining/Assignment/workspace/stock_price_trend_prediction/stock_price_pred_data_mining/Data/05_cluster_output/cluster_output_Kmeans.csv",
)
df = pd.read_csv(CLUSTER_OUTPUT_CSV)

In [8]:
# Look-ahead horizon for the label, in rows per stock (5 trading days = one week).
HORIZON_DAYS = 5

# NOTE: this cell reassigns `df`; re-running it without reloading the CSV trims
# another HORIZON_DAYS rows from the end of every stock.
# NOTE(review): shift() works on row order, so this assumes the rows are already
# in chronological order within each stock_id — confirm against the CSV.

# 1. Compute next-week (HORIZON_DAYS rows ahead) forward return per stock
df['next_week_close'] = df.groupby('stock_id')['Close'].shift(-HORIZON_DAYS)
df['next_week_return'] = (df['next_week_close'] - df['Close']) / df['Close']
# 2. Binary target: 1 = uptrend, 0 = downtrend or flat.
#    Built from the per-stock future close: an ungrouped shift would label the
#    last rows of one stock using the first rows of the next stock in the frame.
#    Rows without a future close compare as False here and are dropped in step 3.
df['target'] = (df['next_week_close'] > df['Close']).astype(int)
# 3. Drop rows where we can't compute future return (last HORIZON_DAYS rows per stock)
df = df.dropna(subset=['next_week_return'])
# The future close was only a helper for the two columns above
df = df.drop(columns=['next_week_close'])
# Identifier and label columns must not be fed to the model as features
drop_cols = ['Date', 'stock_id', 'cluster', 'target', 'next_week_return']
feature_cols = [c for c in df.columns if c not in drop_cols]

print("Number of features:", len(feature_cols))
print("Features:", feature_cols)

Number of features: 23
Features: ['Open', 'High', 'Low', 'Close', 'Volume', 'MA7', 'MA21', 'EMA20', 'EMA50', 'RSI_14', 'MACD', 'MACD_signal', 'MACD_hist', 'BB_upper', 'BB_middle', 'BB_lower', 'CCI_14', 'CMF_20', 'Stoch_K', 'Stoch_D', 'Momentum_10', 'Daily_Return', 'Log_Return']


In [None]:
# Per-cluster result registries, both keyed by the cluster label.
cluster_models = dict()    # fitted LightGBM model for each cluster
cluster_metrics = dict()   # evaluation scores for each cluster


In [15]:

# Share of each cluster's date-ordered rows used for fitting; the rest is held out.
TRAIN_FRACTION = 0.7
# Predicted probability above which a row is labelled as an uptrend (class 1).
DECISION_THRESHOLD = 0.5

# Train and evaluate one LightGBM classifier per cluster, storing the fitted
# model in `cluster_models` and its hold-out scores in `cluster_metrics`.
for cl in df['cluster'].unique():

    print(f"\n📌 Training LightGBM for Cluster {cl}")

    # Rows of this cluster ordered by date so the positional split is chronological.
    # NOTE(review): assumes 'Date' sorts chronologically as loaded (e.g. ISO-formatted
    # strings or datetimes) — confirm.
    sub = df[df['cluster'] == cl].sort_values('Date')

    X = sub[feature_cols]
    y = sub['target']

    # Time-series split: earliest TRAIN_FRACTION of rows for training, remainder for testing.
    # NOTE(review): the target looks several rows ahead, so labels of the last training
    # rows depend on prices inside the test window; consider a purge gap at the boundary.
    split_idx = int(len(sub) * TRAIN_FRACTION)
    X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
    y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]

    # A binary classifier cannot be fitted on an empty or single-class training split,
    # and there is nothing to score without hold-out rows.
    if split_idx == 0 or len(X_test) == 0 or y_train.nunique() < 2:
        print(f"Cluster {cl}: skipped (not enough rows or only one class in the training split)")
        continue

    # LightGBM Sklearn API (sklearn-style parameter names; these are the
    # equivalents of feature_fraction / bagging_fraction / bagging_freq).
    model = lgb.LGBMClassifier(
        objective='binary',
        learning_rate=0.05,
        num_leaves=31,
        colsample_bytree=0.9,
        subsample=0.8,
        subsample_freq=5,
        n_estimators=500
    )

    # eval_set is passed for monitoring only: no early-stopping callback is given,
    # so all n_estimators rounds are trained regardless of the test AUC.
    model.fit(
        X_train, y_train,
        eval_set=[(X_test, y_test)],
        eval_metric='auc'
    )

    # Store model
    cluster_models[cl] = model

    # Predictions: probability of class 1, then hard labels at the decision threshold
    y_prob = model.predict_proba(X_test)[:, 1]
    y_pred = (y_prob > DECISION_THRESHOLD).astype(int)

    # ---- Metrics ----
    # ROC-AUC is undefined when the hold-out contains a single class; record NaN
    # instead of letting roc_auc_score raise and abort the remaining clusters.
    if y_test.nunique() < 2:
        roc_auc = np.nan
    else:
        roc_auc = roc_auc_score(y_test, y_prob)
    macro_f1 = f1_score(y_test, y_pred, average='macro')
    # zero_division=0 scores a never-predicted class as 0 without emitting a warning.
    macro_precision = precision_score(y_test, y_pred, average='macro', zero_division=0)
    macro_recall = recall_score(y_test, y_pred, average='macro', zero_division=0)

    print(f"Cluster {cl} → ROC-AUC: {roc_auc:.4f}")
    print(f"Cluster {cl} → Macro F1 Score: {macro_f1:.4f}")
    print(f"Cluster {cl} → Precision: {macro_precision:.4f}")
    print(f"Cluster {cl} → Recall: {macro_recall:.4f}")

    # Save metrics
    cluster_metrics[cl] = {
        "roc_auc": roc_auc,
        "macro_f1": macro_f1,
        "precision": macro_precision,
        "recall": macro_recall
    }


📌 Training LightGBM for Cluster 1
[LightGBM] [Info] Number of positive: 20900, number of negative: 18614
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002175 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5865
[LightGBM] [Info] Number of data points in the train set: 39514, number of used features: 23
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.528926 -> initscore=0.115835
[LightGBM] [Info] Start training from score 0.115835
Cluster 1 → ROC-AUC: 0.5097
Cluster 1 → Macro F1 Score: 0.4918
Cluster 1 → Precision: 0.5040
Cluster 1 → Recall: 0.5036

📌 Training LightGBM for Cluster 0
[LightGBM] [Info] Number of positive: 5740, number of negative: 5109
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000720 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5865
[LightGBM] [Info] Number of data points