# Part 1

In [7]:
# One-off environment repair for the sklearn `validate_data` ImportError:
# upgrade scikit-learn using the interpreter that runs this kernel, then
# restart the kernel and run the notebook from the top.
import subprocess
import sys

pip_upgrade_cmd = [sys.executable, "-m", "pip", "install", "--upgrade", "scikit-learn"]
# Left as the cell's last expression so the exit status (0 on success) is displayed
subprocess.check_call(pip_upgrade_cmd)



0

In [8]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

# Read the main review dataset, report its row count, and preview the first three rows
df = pd.read_csv("main.csv")
n_reviews = len(df)
print(f"Total reviews: {n_reviews}")
df.head(3)

ImportError: cannot import name 'validate_data' from 'sklearn.utils.validation' (/opt/anaconda3/lib/python3.11/site-packages/sklearn/utils/validation.py)

## Task 1.1 - Question 1

In [None]:
# Token length = number of tokens (whitespace-split words) per review
def token_length(text):
    """Count the whitespace-separated tokens in one review.

    Parameters
    ----------
    text : object
        The review text. Missing values (anything `pd.isna` flags) count as
        zero tokens; any other value is converted with `str()` before splitting.

    Returns
    -------
    int
        Number of tokens produced by `str.split()` with no separator argument.
    """
    return 0 if pd.isna(text) else len(str(text).split())

# Per-review token counts, stored on the frame and also kept as a standalone Series
df["token_length"] = df["review_text"].map(token_length)
lengths = df["token_length"]

# Quartile cut-offs of the token-length distribution
q25, q75 = (lengths.quantile(q) for q in (0.25, 0.75))
print(f"Q25 (25th percentile): {q25:.0f} tokens")
print(f"Q75 (75th percentile): {q75:.0f} tokens")

# Keep only the two tails: Short (<= q25) or Long (>= q75); the middle is dropped
short_mask = lengths.le(q25)
long_mask = lengths.ge(q75)
df_task1 = df.loc[short_mask | long_mask].copy()
df_task1["pseudo_label"] = np.where(df_task1["token_length"].le(q25), "Short", "Long")

# Group sizes and mean token length for each pseudo-label
is_short = df_task1["pseudo_label"].eq("Short")
is_long = df_task1["pseudo_label"].eq("Long")
n_retained = len(df_task1)
n_short = is_short.sum()
n_long = is_long.sum()
avg_short = df_task1.loc[is_short, "token_length"].mean()
avg_long = df_task1.loc[is_long, "token_length"].mean()

print("\n--- Question 1 Report ---")
print(f"Number of reviews retained: {n_retained} (Short: {n_short}, Long: {n_long})")
print(f"Average token length of Short reviews: {avg_short:.2f}")
print(f"Average token length of Long reviews:  {avg_long:.2f}")

Q25 (25th percentile): 11 tokens
Q75 (75th percentile): 179 tokens

--- Question 1 Report ---
Number of reviews retained: 20497 (Short: 10463, Long: 10034)
Average token length of Short reviews: 6.39
Average token length of Long reviews:  493.00


## Task 1.2 - Question 2


In [None]:
# Fit a unigram TF-IDF model (min_df=3, English stop words) on the retained Task 1 reviews;
# missing review text is treated as an empty string
tfidf = TfidfVectorizer(ngram_range=(1, 1), min_df=3, stop_words="english")
X_tfidf = tfidf.fit_transform(df_task1["review_text"].fillna(""))

# Share of matrix cells that are not stored as non-zero entries
tfidf_rows, tfidf_cols = X_tfidf.shape
tfidf_sparsity = 1 - X_tfidf.nnz / (tfidf_rows * tfidf_cols)

print("TF-IDF matrix:")
print(f"  Shape: {X_tfidf.shape} (samples × features)")
print(f"  Sparsity: {tfidf_sparsity:.4%} zero entries")

TF-IDF matrix:
  Shape: (20497, 25085) (samples × features)
  Sparsity: 99.6763% zero entries


In [None]:
# Encode the retained Task 1 reviews with the pretrained all-MiniLM-L6-v2 sentence encoder
from sentence_transformers import SentenceTransformer

encoder = SentenceTransformer("all-MiniLM-L6-v2")
# Missing review text is encoded as an empty string
texts = list(df_task1["review_text"].fillna(""))
X_minilm = encoder.encode(texts)

# NOTE(review): the "non-zero" statement below is printed as-is, not computed from X_minilm
print(
    "MiniLM matrix:",
    f"  Shape: {X_minilm.shape} (samples × dimensions)",
    "  Dense: no sparsity (all entries are non-zero floats)",
    sep="\n",
)

MiniLM matrix:
  Shape: (20497, 384) (samples × dimensions)
  Dense: no sparsity (all entries are non-zero floats)


In [None]:
# Question 2 report: print the dimensions of both feature matrices side by side
n_tfidf_rows, n_tfidf_cols = X_tfidf.shape
n_minilm_rows, n_minilm_dims = X_minilm.shape

print("--- Question 2 Report ---")
print(f"TF-IDF: {n_tfidf_rows} × {n_tfidf_cols} (reviews × vocabulary)")
print(f"MiniLM: {n_minilm_rows} × {n_minilm_dims} (reviews × embedding dim)")


--- Question 2 Report ---
TF-IDF: 20497 × 25085 (reviews × vocabulary)
MiniLM: 20497 × 384 (reviews × embedding dim)


TF-IDF produces a sparse matrix because each review uses only a small subset of the vocabulary, so most entries are zero. MiniLM produces a dense matrix because every review is a fixed-length vector of 384 floats with (effectively) no zeros.

## Task 1.3 - Question 3

In [None]:
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import homogeneity_score, completeness_score, v_measure_score, adjusted_rand_score, adjusted_mutual_info_score

# umap-learn and hdbscan are imported if available; if either import fails, both are
# installed with the interpreter running this kernel and the imports are retried
# (fixes "No module named 'umap'").
try:
    import umap
    import hdbscan
except ModuleNotFoundError:
    import importlib
    import subprocess
    import sys

    subprocess.check_call([sys.executable, "-m", "pip", "install", "umap-learn", "hdbscan", "-q"])
    # The failed import above may have left stale finder caches; flush them so the
    # packages installed a moment ago are visible to the retry imports below.
    importlib.invalidate_caches()
    import umap
    import hdbscan

# Binary reference labels derived from the pseudo-labels: Short -> 0, Long -> 1
y_true = df_task1["pseudo_label"].eq("Long").astype(int).to_numpy()


def _fit_umap_50(features):
    """Embed `features` into 50 dimensions with cosine-metric UMAP (random_state=42)."""
    reducer = umap.UMAP(n_components=50, random_state=42, metric="cosine")
    return reducer.fit_transform(features)


# The reduced representations are computed once here so the slow steps are not repeated.
# TF-IDF: SVD to 50 components, then UMAP applied to the SVD output
svd_tfidf = TruncatedSVD(n_components=50, random_state=42)
X_tfidf_svd = svd_tfidf.fit_transform(X_tfidf)
X_tfidf_umap = _fit_umap_50(X_tfidf_svd)

# MiniLM: embeddings cast to float64; SVD and UMAP are each applied to them directly
X_minilm_dense = np.asarray(X_minilm, dtype=np.float64)
svd_minilm = TruncatedSVD(n_components=50, random_state=42)
X_minilm_svd = svd_minilm.fit_transform(X_minilm_dense)
X_minilm_umap = _fit_umap_50(X_minilm_dense)

NameError: name 'df_task1' is not defined

In [None]:
def compute_metrics(y_true, y_pred):
    """Score predicted cluster labels against reference labels.

    Parameters
    ----------
    y_true : array-like
        Reference labels, passed as the first argument to every scorer.
    y_pred : array-like
        Cluster assignments, passed as the second argument to every scorer.

    Returns
    -------
    dict
        Scores keyed by "homogeneity", "completeness", "v_measure", "ARI"
        and "AMI", in that order.
    """
    scorers = (
        ("homogeneity", homogeneity_score),
        ("completeness", completeness_score),
        ("v_measure", v_measure_score),
        ("ARI", adjusted_rand_score),
        ("AMI", adjusted_mutual_info_score),
    )
    return {name: scorer(y_true, y_pred) for name, scorer in scorers}

# Each pipeline entry is (representation, DR, X_input, run_agglo, run_hdbscan).
# Raw TF-IDF stays sparse, so only K-Means runs on it; Agglomerative and HDBSCAN are
# recorded as skipped for that entry to avoid building a huge dense matrix.
# Every other entry runs all three algorithms (the TF-IDF UMAP input is SVD -> UMAP).
pipelines = [
    ("TF-IDF", "None", X_tfidf, False, False),  # sparse: K-Means only
    ("TF-IDF", "SVD(50)", X_tfidf_svd, True, True),
    ("TF-IDF", "UMAP(50)", X_tfidf_umap, True, True),
    ("MiniLM", "None", X_minilm_dense, True, True),
    ("MiniLM", "SVD(50)", X_minilm_svd, True, True),
    ("MiniLM", "UMAP(50)", X_minilm_umap, True, True),
]

_METRIC_NAMES = ("homogeneity", "completeness", "v_measure", "ARI", "AMI")
_SKIP_REASON = "{algo} requires dense input; TF-IDF without DR is too large to convert."


def _as_dense(matrix):
    """Return `matrix` as a dense array, using `.toarray()` when the object provides it."""
    return matrix.toarray() if hasattr(matrix, "toarray") else np.asarray(matrix)


def _result_row(rep, dr_name, algo, labels=None):
    """Build one results row; `labels=None` marks the algorithm as skipped.

    Scored rows carry the metrics from `compute_metrics`; skipped rows carry
    None for every metric plus a "skip_reason" entry.
    """
    row = {"Representation": rep, "DR": dr_name, "Clustering": algo}
    if labels is None:
        row.update(dict.fromkeys(_METRIC_NAMES))
        row["skip_reason"] = _SKIP_REASON.format(algo=algo)
    else:
        row.update(compute_metrics(y_true, labels))
    return row


results = []
for rep, dr_name, X, do_agglo, do_hdb in pipelines:
    # K-Means receives the matrix exactly as stored (sparse input is accepted)
    km = KMeans(n_clusters=2, random_state=42, n_init=10)
    results.append(_result_row(rep, dr_name, "K-Means", km.fit_predict(X)))

    # Densification happens only inside the enabled branches, never for raw TF-IDF
    if do_agglo:
        ac = AgglomerativeClustering(n_clusters=2)
        agglo_labels = ac.fit_predict(_as_dense(X))
    else:
        agglo_labels = None
    results.append(_result_row(rep, dr_name, "Agglomerative", agglo_labels))

    if do_hdb:
        hdb_model = hdbscan.HDBSCAN(min_cluster_size=2)
        hdb_labels = hdb_model.fit_predict(_as_dense(X))
    else:
        hdb_labels = None
    results.append(_result_row(rep, dr_name, "HDBSCAN", hdb_labels))

In [None]:
# Collect every pipeline row, then separate scored rows from skipped ones
df_results = pd.DataFrame(results)
if "skip_reason" not in df_results:
    # No row supplied a skip reason, so add the column empty
    df_results["skip_reason"] = None

has_score = df_results["v_measure"].notna()
has_reason = df_results["skip_reason"].notna()
df_metrics = df_results.loc[has_score].copy()
df_skipped = df_results.loc[~has_score & has_reason]

print("--- Question 3 Report ---\n")
print("Skipped pipelines (TF-IDF + None):")
skipped_cols = ["Representation", "DR", "Clustering", "skip_reason"]
print(df_skipped[skipped_cols].to_string(index=False))

display_cols = ["Representation", "DR", "Clustering", "homogeneity", "completeness", "v_measure", "ARI", "AMI"]
print("\nClustering agreement metrics (with ground-truth length labels):")
metrics_table = df_metrics[display_cols].round(4)
print(metrics_table.to_string(index=False))

In [None]:
# Rounded metrics table; the bare name on the last line renders it as the cell output
summary = df_metrics.loc[:, display_cols].round(4)
summary

In [None]:
# Rank the scored pipelines by V-measure (ties broken by ARI), best first
ranked = df_metrics.sort_values(["v_measure", "ARI"], ascending=False)
best_idx = ranked.index[0]
best = df_metrics.loc[best_idx]

score_parts = [
    f"Homogeneity: {best['homogeneity']:.4f}",
    f"Completeness: {best['completeness']:.4f}",
    f"V-measure: {best['v_measure']:.4f}",
    f"ARI: {best['ARI']:.4f}",
    f"AMI: {best['AMI']:.4f}",
]

print("Best-performing pipeline:")
print(f"  Representation: {best['Representation']}, DR: {best['DR']}, Clustering: {best['Clustering']}")
# Parts are joined with two spaces and given a two-space lead, matching the report layout
print("  " + "  ".join(score_parts))