In [1]:
"""
Stage 2: Classification on Text Cluster Labels
EPGD Programming for Data Science – IITM
Author: <your name>
"""

import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import joblib
import os

def load_clustered_data(input_path):
    """Load PCA features and cluster labels from a pickled results file.

    Args:
        input_path: Path to a pickle file whose payload is indexed by the
            keys ``"pca_features"`` and ``"cluster_labels"``.

    Returns:
        A ``(pca_features, cluster_labels)`` tuple read from the payload.

    Raises:
        KeyError: If one of the two required keys is absent. The message
            names the file, the missing key and the keys that were found.
        OSError: If the file cannot be opened.
    """
    # SECURITY: pickle.load can execute arbitrary code while deserialising.
    # Only point this at files produced by a trusted run of the earlier stage.
    with open(input_path, "rb") as f:
        data = pickle.load(f)

    try:
        return data["pca_features"], data["cluster_labels"]
    except KeyError as exc:
        # Same exception type as before, but with enough context to debug a
        # schema mismatch without opening the pickle by hand.
        raise KeyError(
            f"{input_path} has no {exc} entry; found keys: {list(data)}"
        ) from exc

def train_classifier(X_train, y_train, random_state=42, **rf_params):
    """Fit a RandomForestClassifier on the training split.

    Args:
        X_train: Training features, passed straight to ``fit``.
        y_train: Training targets, passed straight to ``fit``.
        random_state: Seed forwarded to the forest. Defaults to 42, the
            value that was previously hard-coded.
        **rf_params: Extra keyword arguments forwarded unchanged to
            ``RandomForestClassifier`` (for example ``n_estimators`` or
            ``class_weight``).

    Returns:
        The fitted classifier.
    """
    clf = RandomForestClassifier(random_state=random_state, **rf_params)
    clf.fit(X_train, y_train)
    return clf

def evaluate_model(clf, X_test, y_test):
    """Report how the classifier performs on the held-out split.

    Predicts labels for ``X_test`` and prints, in order, the confusion
    matrix and the classification report computed against ``y_test``.
    Nothing is returned.
    """
    predictions = clf.predict(X_test)
    sections = (
        ("📊 Confusion Matrix:", confusion_matrix),
        ("\n📈 Classification Report:", classification_report),
    )
    # Each metric is computed only after its heading has been printed.
    for heading, metric in sections:
        print(heading)
        print(metric(y_test, predictions))

def save_model(clf, model_path):
    """Persist the trained model to disk with joblib.

    Args:
        clf: The object to serialise.
        model_path: Destination file path. Its parent directory is created
            if it does not exist yet.
    """
    # Create the target directory here so the function works whether or not
    # the caller prepared it beforehand.
    parent_dir = os.path.dirname(model_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    joblib.dump(clf, model_path)
    print(f"💾 Model saved to {model_path}")

def main(
    input_path="results/text_stage1_results.pkl",
    model_path="results/text_classifier.pkl",
    test_size=0.2,
    random_state=42,
):
    """Run the pipeline: load, split, train, evaluate, save.

    Args:
        input_path: Pickle file read by ``load_clustered_data``.
        model_path: Where the fitted classifier is written by ``save_model``.
        test_size: Fraction of the data held out for evaluation.
        random_state: Seed for the train/test split only.

    Called with no arguments it behaves as before: same paths, same 80/20
    split, same seed and same progress messages.
    """
    print("🔁 Loading clustered data...")
    X, y = load_clustered_data(input_path)

    print("✂️ Splitting data...")
    # NOTE(review): the split is not stratified. If the cluster sizes are
    # imbalanced, consider stratify=y; confirm first, since that changes the
    # reported metrics.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    print("🎯 Training classifier...")
    clf = train_classifier(X_train, y_train)

    print("🔍 Evaluating model...")
    evaluate_model(clf, X_test, y_test)

    print("💾 Saving model...")
    # Create the directory model_path actually points into, instead of a
    # hard-coded "results" that silently goes stale when the path changes.
    os.makedirs(os.path.dirname(model_path) or ".", exist_ok=True)
    save_model(clf, model_path)

    print("✅ Done!")

# Entry point: run the pipeline when this code is executed directly (as a
# script or as a notebook cell, where __name__ is "__main__"), but not when
# it is imported as a module.
if __name__ == "__main__":
    main()


🔁 Loading clustered data...
✂️ Splitting data...
🎯 Training classifier...
🔍 Evaluating model...
📊 Confusion Matrix:
[[ 70   0   2   1   0]
 [  0  27   2   1   0]
 [  2   1 122   2   0]
 [  0   0   4  57   0]
 [  1   0   5   0   3]]

📈 Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.96      0.96        73
           1       0.96      0.90      0.93        30
           2       0.90      0.96      0.93       127
           3       0.93      0.93      0.93        61
           4       1.00      0.33      0.50         9

    accuracy                           0.93       300
   macro avg       0.95      0.82      0.85       300
weighted avg       0.93      0.93      0.93       300

💾 Saving model...
💾 Model saved to results/text_classifier.pkl
✅ Done!
