In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
df = pd.read_csv("E://PHD/Course materials/Sem 1/Data Mining/Assignment/workspace/stock_price_trend_prediction/stock_price_pred_data_mining/Data/05_cluster_output/cluster_output_Kmeans.csv")

In [4]:
df.head(3)

Unnamed: 0,Date,stock_id,Open,High,Low,Close,Volume,MA7,MA21,EMA20,...,BB_middle,BB_lower,CCI_14,CMF_20,Stoch_K,Stoch_D,Momentum_10,Daily_Return,Log_Return,cluster
0,2021-12-14,ANANDRATHI,287.24,294.42,270.75,279.37,27861900.0,274.57858,284.5581,283.8015,...,283.8015,310.46637,-4213.020993,-0.169092,80.29088,84.21576,-5.51,0.002398,0.002395,1
1,2021-12-15,ANANDRATHI,280.06,286.67,276.71,280.04,2675624.0,274.57858,284.5581,283.8015,...,283.8015,310.46637,-4213.020993,-0.169092,80.29088,84.21576,-5.51,0.002398,0.002395,1
2,2021-12-16,ANANDRATHI,280.54,284.61,271.01,273.31,1329736.0,274.57858,284.5581,283.8015,...,283.8015,310.46637,-4213.020993,-0.169092,80.29088,84.21576,-5.51,-0.024032,-0.024326,1


In [5]:
df.shape

(72848, 26)

In [6]:
df.columns

Index(['Date', 'stock_id', 'Open', 'High', 'Low', 'Close', 'Volume', 'MA7',
       'MA21', 'EMA20', 'EMA50', 'RSI_14', 'MACD', 'MACD_signal', 'MACD_hist',
       'BB_upper', 'BB_middle', 'BB_lower', 'CCI_14', 'CMF_20', 'Stoch_K',
       'Stoch_D', 'Momentum_10', 'Daily_Return', 'Log_Return', 'cluster'],
      dtype='object')

In [None]:
# Sort by stock & time (very important for time-based target)
df = df.sort_values(['stock_id', 'Date'])

In [None]:
# 1. Compute next-week (5 trading days) forward return per stock
df['next_week_close'] = df.groupby('stock_id')['Close'].shift(-5)
df['next_week_return'] = (df['next_week_close'] - df['Close']) / df['Close']

In [9]:
# 2. Binary target: 1 = uptrend, 0 = downtrend or flat
df['target'] = (df['next_week_return'] > 0).astype(int)

In [10]:
# 3. Drop rows where we can't compute future return (last 5 days per stock)
df = df.dropna(subset=['next_week_return'])

In [11]:
# Optional: drop helper column
df = df.drop(columns=['next_week_close'])

In [12]:
drop_cols = ['Date', 'stock_id', 'cluster', 'target', 'next_week_return']

In [14]:
feature_cols = [c for c in df.columns if c not in drop_cols]

print("Number of features:", len(feature_cols))
print("Features:", feature_cols)

Number of features: 23
Features: ['Open', 'High', 'Low', 'Close', 'Volume', 'MA7', 'MA21', 'EMA20', 'EMA50', 'RSI_14', 'MACD', 'MACD_signal', 'MACD_hist', 'BB_upper', 'BB_middle', 'BB_lower', 'CCI_14', 'CMF_20', 'Stoch_K', 'Stoch_D', 'Momentum_10', 'Daily_Return', 'Log_Return']


In [15]:
cluster_models = {}          # store trained models
cluster_results = {}         # store evaluation metrics

In [18]:

for cl in sorted(df['cluster'].unique()):
    print(f"\n==============================")
    print(f"Training model for cluster {cl}")
    print(f"==============================")
    
    cluster_df = df[df['cluster'] == cl].copy()
    
    # 1. Time-based split: use dates, not random
    unique_dates = np.sort(cluster_df['Date'].unique())
    if len(unique_dates) < 10:
        print(f"Cluster {cl} has very few dates ({len(unique_dates)}). Be careful with evaluation.")
    
    split_idx = int(len(unique_dates) * 0.7)
    split_date = unique_dates[split_idx]
    
    print(f"Cluster {cl} split_date (70%): {split_date}")
    
    train_df = cluster_df[cluster_df['Date'] <= split_date]
    test_df  = cluster_df[cluster_df['Date'] > split_date]
    
    print(f"Train size: {len(train_df)}, Test size: {len(test_df)}")
    
    # 2. Prepare X, y
    X_train = train_df[feature_cols]
    y_train = train_df['target']
    
    X_test  = test_df[feature_cols]
    y_test  = test_df['target']

     # 3. Train Random Forest (you can tune hyperparameters later)
    rf = RandomForestClassifier(
        n_estimators=300,
        max_depth=None,
        random_state=42,
        n_jobs=-1,
        class_weight='balanced'    # helps with imbalance between up/down
    )
    
    rf.fit(X_train, y_train)
    
    # 4. Evaluate
    y_pred = rf.predict(X_test)
    
    print("\nClassification report:")
    print(classification_report(y_test, y_pred, digits=4))
    
    print("Confusion matrix:")
    print(confusion_matrix(y_test, y_pred))
    
    # 5. Store model and results
    cluster_models[cl] = rf
    cluster_results[cl] = {
        "split_date": split_date,
        "train_size": len(train_df),
        "test_size": len(test_df),
    }


Training model for cluster 0
Cluster 0 split_date (70%): 2024-01-02
Train size: 10934, Test size: 4675

Classification report:
              precision    recall  f1-score   support

           0     0.5173    0.5075    0.5124      2266
           1     0.5449    0.5546    0.5497      2409

    accuracy                         0.5318      4675
   macro avg     0.5311    0.5310    0.5310      4675
weighted avg     0.5315    0.5318    0.5316      4675

Confusion matrix:
[[1150 1116]
 [1073 1336]]

Training model for cluster 1
Cluster 1 split_date (70%): 2024-01-02
Train size: 37677, Test size: 19262

Classification report:
              precision    recall  f1-score   support

           0     0.5092    0.4115    0.4552      9526
           1     0.5152    0.6120    0.5594      9736

    accuracy                         0.5128     19262
   macro avg     0.5122    0.5117    0.5073     19262
weighted avg     0.5123    0.5128    0.5079     19262

Confusion matrix:
[[3920 5606]
 [3778 5958]]

In [20]:
new_df = pd.DataFrame({
    'Date': [
        '2025-02-01', 
        '2025-02-01', 
        '2025-02-01'
    ],
    'stock_id': [
        'RELIANCE', 
        'TCS', 
        'HDFCBANK'
    ],
    'cluster': [
        1,    # generated by your KMeans model
        0,
        1
    ],

    # === Sample technical features (normalized already) ===
    'Close': [0.5312, 0.4421, 0.6023],
    'Volume': [0.2123, 0.0554, 0.1321],

    'RSI_14': [0.6811, 0.4029, 0.7564],
    'MACD': [-0.0123, 0.0211, 0.0054],

    'EMA_20': [0.4551, 0.3890, 0.5210],
    'EMA_50': [0.4922, 0.4011, 0.5492],

    'BB_upper': [0.7632, 0.5821, 0.8011],
    'BB_lower': [0.2210, 0.1402, 0.3010],

    'ADX_14': [0.3321, 0.2121, 0.6312],
    'Stoch_K': [0.7230, 0.4902, 0.8120],
    'Stoch_D': [0.6810, 0.4561, 0.7510],

    # Daily return for context (optional)
    'daily_return': [0.0051, -0.0032, 0.0022]
})

new_df


Unnamed: 0,Date,stock_id,cluster,Close,Volume,RSI_14,MACD,EMA_20,EMA_50,BB_upper,BB_lower,ADX_14,Stoch_K,Stoch_D,daily_return
0,2025-02-01,RELIANCE,1,0.5312,0.2123,0.6811,-0.0123,0.4551,0.4922,0.7632,0.221,0.3321,0.723,0.681,0.0051
1,2025-02-01,TCS,0,0.4421,0.0554,0.4029,0.0211,0.389,0.4011,0.5821,0.1402,0.2121,0.4902,0.4561,-0.0032
2,2025-02-01,HDFCBANK,1,0.6023,0.1321,0.7564,0.0054,0.521,0.5492,0.8011,0.301,0.6312,0.812,0.751,0.0022


In [22]:
from sklearn.metrics import roc_auc_score

cluster_auc_scores = {}

for cl in df['cluster'].unique():

    cluster_df = df[df['cluster'] == cl].copy()
    if cluster_df.empty:
        continue

    # Time-series split
    split_idx = int(len(cluster_df) * 0.7)
    train = cluster_df.iloc[:split_idx]
    test  = cluster_df.iloc[split_idx:]

    X_train = train[feature_cols]
    y_train = train['target']

    X_test  = test[feature_cols]
    y_test  = test['target']

    # Train the model
    model = RandomForestClassifier(n_estimators=200, random_state=42)
    model.fit(X_train, y_train)

    # --- ROC-AUC Calculation ---
    y_prob = model.predict_proba(X_test)[:, 1]   # probability for class = 1

    auc_score = roc_auc_score(y_test, y_prob)

    cluster_auc_scores[cl] = auc_score

    print(f"Cluster {cl} → ROC-AUC: {auc_score:.4f}")


Cluster 1 → ROC-AUC: 0.5223
Cluster 0 → ROC-AUC: 0.4589
