In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv("E://PHD/Course materials/Sem 1/Data Mining/Assignment/workspace/stock_price_trend_prediction/stock_price_pred_data_mining/Data/05_cluster_output/cluster_output_Agglomerative.csv")

In [3]:
df.head(3)

Unnamed: 0,Date,stock_id,Open,High,Low,Close,Volume,MA7,MA21,EMA20,...,BB_middle,BB_lower,CCI_14,CMF_20,Stoch_K,Stoch_D,Momentum_10,Daily_Return,Log_Return,agg_cluster
0,2021-12-14,ANANDRATHI,287.24,294.42,270.75,279.37,27861900.0,274.57858,284.5581,283.8015,...,283.8015,310.46637,-4213.020993,-0.169092,80.29088,84.21576,-5.51,0.002398,0.002395,3
1,2021-12-15,ANANDRATHI,280.06,286.67,276.71,280.04,2675624.0,274.57858,284.5581,283.8015,...,283.8015,310.46637,-4213.020993,-0.169092,80.29088,84.21576,-5.51,0.002398,0.002395,3
2,2021-12-16,ANANDRATHI,280.54,284.61,271.01,273.31,1329736.0,274.57858,284.5581,283.8015,...,283.8015,310.46637,-4213.020993,-0.169092,80.29088,84.21576,-5.51,-0.024032,-0.024326,3


In [4]:
df.shape

(72848, 26)

In [5]:
df.columns

Index(['Date', 'stock_id', 'Open', 'High', 'Low', 'Close', 'Volume', 'MA7',
       'MA21', 'EMA20', 'EMA50', 'RSI_14', 'MACD', 'MACD_signal', 'MACD_hist',
       'BB_upper', 'BB_middle', 'BB_lower', 'CCI_14', 'CMF_20', 'Stoch_K',
       'Stoch_D', 'Momentum_10', 'Daily_Return', 'Log_Return', 'agg_cluster'],
      dtype='object')

In [6]:
# Sort by stock & time (very important for time-based target)
df = df.sort_values(['stock_id', 'Date'])

In [7]:
# 1. Compute next-week (5 trading days) forward return per stock
df['next_week_close'] = df.groupby('stock_id')['Close'].shift(-5)
df['next_week_return'] = (df['next_week_close'] - df['Close']) / df['Close']

In [8]:
# 2. Binary target: 1 = uptrend, 0 = downtrend or flat
df['target'] = (df['next_week_return'] > 0).astype(int)

In [9]:
# 3. Drop rows where we can't compute future return (last 5 days per stock)
df = df.dropna(subset=['next_week_return'])

In [10]:
# Optional: drop helper column
df = df.drop(columns=['next_week_close'])

In [17]:
drop_cols = ['Date', 'stock_id', 'agg_cluster', 'target', 'next_week_return']

In [18]:
feature_cols = [c for c in df.columns if c not in drop_cols]

print("Number of features:", len(feature_cols))
print("Features:", feature_cols)

Number of features: 23
Features: ['Open', 'High', 'Low', 'Close', 'Volume', 'MA7', 'MA21', 'EMA20', 'EMA50', 'RSI_14', 'MACD', 'MACD_signal', 'MACD_hist', 'BB_upper', 'BB_middle', 'BB_lower', 'CCI_14', 'CMF_20', 'Stoch_K', 'Stoch_D', 'Momentum_10', 'Daily_Return', 'Log_Return']


In [19]:
cluster_models = {}          # store trained models
cluster_results = {}         # store evaluation metrics

In [20]:
for cl in sorted(df['agg_cluster'].unique()):
    print(f"\n==============================")
    print(f"Training model for cluster {cl}")
    print(f"==============================")
    
    cluster_df = df[df['agg_cluster'] == cl].copy()
    
    # 1. Time-based split: use dates, not random
    unique_dates = np.sort(cluster_df['Date'].unique())
    if len(unique_dates) < 10:
        print(f"Cluster {cl} has very few dates ({len(unique_dates)}). Be careful with evaluation.")
    
    split_idx = int(len(unique_dates) * 0.7)
    split_date = unique_dates[split_idx]
    
    print(f"Cluster {cl} split_date (70%): {split_date}")
    
    train_df = cluster_df[cluster_df['Date'] <= split_date]
    test_df  = cluster_df[cluster_df['Date'] > split_date]
    
    print(f"Train size: {len(train_df)}, Test size: {len(test_df)}")
    
    # 2. Prepare X, y
    X_train = train_df[feature_cols]
    y_train = train_df['target']
    
    X_test  = test_df[feature_cols]
    y_test  = test_df['target']

     # 3. Train Random Forest (you can tune hyperparameters later)
    rf = RandomForestClassifier(
        n_estimators=300,
        max_depth=None,
        random_state=42,
        n_jobs=-1,
        class_weight='balanced'    # helps with imbalance between up/down
    )
    
    rf.fit(X_train, y_train)
    
    # 4. Evaluate
    y_pred = rf.predict(X_test)
    
    print("\nClassification report:")
    print(classification_report(y_test, y_pred, digits=4))
    
    print("Confusion matrix:")
    print(confusion_matrix(y_test, y_pred))
    
    # 5. Store model and results
    cluster_models[cl] = rf
    cluster_results[cl] = {
        "split_date": split_date,
        "train_size": len(train_df),
        "test_size": len(test_df),
    }


Training model for cluster 0
Cluster 0 split_date (70%): 2024-01-02
Train size: 3976, Test size: 1700

Classification report:
              precision    recall  f1-score   support

           0     0.5459    0.5886    0.5665       858
           1     0.5445    0.5012    0.5220       842

    accuracy                         0.5453      1700
   macro avg     0.5452    0.5449    0.5442      1700
weighted avg     0.5452    0.5453    0.5444      1700

Confusion matrix:
[[505 353]
 [420 422]]

Training model for cluster 1
Cluster 1 split_date (70%): 2024-01-02
Train size: 10934, Test size: 4727

Classification report:
              precision    recall  f1-score   support

           0     0.4770    0.5627    0.5163      2280
           1     0.5106    0.4250    0.4639      2447

    accuracy                         0.4914      4727
   macro avg     0.4938    0.4939    0.4901      4727
weighted avg     0.4943    0.4914    0.4892      4727

Confusion matrix:
[[1283  997]
 [1407 1040]]

Trai

In [21]:
from sklearn.metrics import roc_auc_score

cluster_auc_scores = {}

for cl in df['agg_cluster'].unique():

    cluster_df = df[df['agg_cluster'] == cl].copy()
    if cluster_df.empty:
        continue

    # Time-series split
    split_idx = int(len(cluster_df) * 0.7)
    train = cluster_df.iloc[:split_idx]
    test  = cluster_df.iloc[split_idx:]

    X_train = train[feature_cols]
    y_train = train['target']

    X_test  = test[feature_cols]
    y_test  = test['target']

    # Train the model
    model = RandomForestClassifier(n_estimators=200, random_state=42)
    model.fit(X_train, y_train)

    # --- ROC-AUC Calculation ---
    y_prob = model.predict_proba(X_test)[:, 1]   # probability for class = 1

    auc_score = roc_auc_score(y_test, y_prob)

    cluster_auc_scores[cl] = auc_score

    print(f"Cluster {cl} → ROC-AUC: {auc_score:.4f}")


Cluster 3 → ROC-AUC: 0.5159
Cluster 4 → ROC-AUC: 0.4497
Cluster 2 → ROC-AUC: 0.5007
Cluster 1 → ROC-AUC: 0.5108
Cluster 0 → ROC-AUC: 0.4668
