In [4]:
"""
Stage 1: Clustering for Text Data
EPGD Programming for Data Science – IITM
Author: <your name>
"""

import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import os

def load_text_data(input_path):
    """Read the pickled text dataset stored at ``input_path``.

    Args:
        input_path: Path of the pickle file to open in binary mode.

    Returns:
        Whatever object was pickled into the file.

    Note:
        Unpickling can execute arbitrary code, so only point this at
        files that come from a trusted source.
    """
    with open(input_path, mode="rb") as handle:
        payload = pickle.load(handle)
    return payload

def extract_features(text_data, max_features=1000):
    """Fit a TF-IDF vectorizer on ``text_data`` and return the transformed matrix.

    Args:
        text_data: Iterable of raw text documents handed to the vectorizer.
        max_features: Upper bound on vocabulary size passed to
            ``TfidfVectorizer``.

    Returns:
        The result of ``fit_transform`` on ``text_data``. The fitted
        vectorizer itself is not returned.
    """
    tfidf = TfidfVectorizer(max_features=max_features, stop_words="english")
    feature_matrix = tfidf.fit_transform(text_data)
    return feature_matrix

def reduce_dimensionality(X_tfidf, n_components=50):
    """Project the TF-IDF matrix onto its leading principal components.

    Args:
        X_tfidf: Feature matrix exposing ``.toarray()`` (presumably the
            sparse matrix produced by the TF-IDF step).
        n_components: Number of components requested from ``PCA``.

    Returns:
        The output of ``PCA.fit_transform`` on the densified matrix.

    Note:
        The input is converted with ``.toarray()`` before fitting, so the
        whole matrix is materialized in memory.
    """
    reducer = PCA(random_state=42, n_components=n_components)
    dense_features = X_tfidf.toarray()
    return reducer.fit_transform(dense_features)

def cluster_data(X_pca, n_clusters=5):
    """Cluster the reduced features with KMeans and score the partition.

    Args:
        X_pca: Feature matrix to cluster.
        n_clusters: Number of clusters requested from ``KMeans``.

    Returns:
        A ``(labels, score)`` tuple: the cluster assignment returned by
        ``fit_predict`` and the silhouette score of that assignment
        computed on ``X_pca``.
    """
    kmeans = KMeans(random_state=42, n_clusters=n_clusters)
    assignments = kmeans.fit_predict(X_pca)
    quality = silhouette_score(X_pca, assignments)
    return assignments, quality

def save_results(X_pca, labels, score, output_path):
    """Pickle the PCA features, cluster labels, and silhouette score.

    The parent directory of ``output_path`` is created when it does not
    exist yet, so callers do not have to prepare it beforehand.

    Args:
        X_pca: Reduced feature matrix, stored under ``"pca_features"``.
        labels: Cluster assignments, stored under ``"cluster_labels"``.
        score: Silhouette score, stored under ``"silhouette_score"``.
        output_path: Destination pickle file.
    """
    result = {
        "pca_features": X_pca,
        "cluster_labels": labels,
        "silhouette_score": score,
    }
    # A bare filename has an empty dirname; os.makedirs("") would raise.
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(output_path, "wb") as f:
        pickle.dump(result, f)

def main(
    input_path="../../../data/train_data_text.pkl",
    output_path="results/text_stage1_results.pkl",
):
    """Run the Stage 1 text pipeline: load, TF-IDF, PCA, KMeans, save.

    Args:
        input_path: Pickle file holding the raw text data.
        output_path: Pickle file the results are written to. Its parent
            directory is created if it does not exist.
    """
    print("🔁 Loading text data...")
    text_data = load_text_data(input_path)

    print("✍️ Extracting TF-IDF features...")
    X_tfidf = extract_features(text_data)

    print("📉 Reducing dimensions with PCA...")
    X_pca = reduce_dimensionality(X_tfidf)

    print("🔍 Clustering with KMeans...")
    labels, score = cluster_data(X_pca)

    print(f"✅ Silhouette Score: {score:.4f}")

    print("💾 Saving results...")
    # Derive the directory from output_path instead of hard-coding it, so
    # the two cannot drift apart. A bare filename has no directory to create.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    save_results(X_pca, labels, score, output_path)

    print(f"🎉 Done! Output saved to: {output_path}")

# Run the pipeline only when this file is executed as a script, so that
# importing it does not trigger the pipeline.
if __name__ == "__main__":
    main()


🔁 Loading text data...
✍️ Extracting TF-IDF features...
📉 Reducing dimensions with PCA...
🔍 Clustering with KMeans...
✅ Silhouette Score: 0.0495
💾 Saving results...
🎉 Done! Output saved to: results/text_stage1_results.pkl
