In [12]:
import argparse
import os
from pathlib import Path
import re
import string
from collections import defaultdict
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.metrics import silhouette_score, silhouette_samples
from sklearn.metrics import homogeneity_score
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag, word_tokenize
from sentence_transformers import SentenceTransformer




In [14]:
out_dir = Path("outputs")
fig_dir = out_dir / "figures"
fig_dir.mkdir(parents=True, exist_ok=True)

In [2]:
STOPWORDS = set(stopwords.words("english"))
PUNCTUATION = set(string.punctuation)

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def nltk_tag_to_wordnet(nltk_tag: str):
    if nltk_tag.startswith("J"):
        return "a"
    if nltk_tag.startswith("V"):
        return "v"
    if nltk_tag.startswith("N"):
        return "n"
    if nltk_tag.startswith("R"):
        return "r"
    return "n"


def clean_and_tokenise(text: str):
    text = text.lower()
    text = text.translate(str.maketrans("", "", string.punctuation))
    tokens = word_tokenize(text)
    tokens = [t for t in tokens if t.isalpha() and t not in STOPWORDS]
    return tokens


def stem_pipeline(texts):
    return [" ".join(stemmer.stem(t) for t in clean_and_tokenise(doc)) for doc in texts]


def lemmatise_pipeline(texts):
    processed = []
    for doc in texts:
        tokens = clean_and_tokenise(doc)
        if not tokens:
            processed.append("")
            continue
        pos_tags = pos_tag(tokens)
        lemmas = [lemmatizer.lemmatize(t, nltk_tag_to_wordnet(p)) for t, p in pos_tags]
        processed.append(" ".join(lemmas))
    return processed

In [3]:
df = pd.read_csv('musicLyrics.csv')
print(f"Loaded {len(df):,} lyrics.")

raw_texts = df["Lyric"].astype(str).tolist()

Loaded 2,999 lyrics.


In [10]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\itspa\AppData\Roaming\nltk_data...


True

In [11]:
print("Running stemming pipeline …")
stemmed_texts = stem_pipeline(raw_texts)

print("Running lemmatisation pipeline …")
lemm_texts = lemmatise_pipeline(raw_texts)

Running stemming pipeline …
Running lemmatisation pipeline …


In [13]:
model = SentenceTransformer("all-MiniLM-L6-v2")

print("Computing embeddings (stemming) …")
embeddings_stem = model.encode(stemmed_texts, batch_size=64, show_progress_bar=True)

print("Computing embeddings (lemmatisation) …")
embeddings_lem = model.encode(lemm_texts, batch_size=64, show_progress_bar=True)

Computing embeddings (stemming) …


Batches:   0%|          | 0/47 [00:00<?, ?it/s]

Computing embeddings (lemmatisation) …


Batches:   0%|          | 0/47 [00:00<?, ?it/s]

In [16]:
def elbow_and_silhouette(embeddings, title_prefix):
    distortions = []
    silhouettes = []
    K_range = range(2, 16)
    for k in tqdm(K_range, desc="K‑Means sweep"):
        kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
        labels = kmeans.fit_predict(embeddings)
        distortions.append(kmeans.inertia_)
        silhouettes.append(silhouette_score(embeddings, labels))

    fig1 = plt.figure()
    plt.plot(list(K_range), distortions, marker="o")
    plt.xlabel("K")
    plt.ylabel("Inertia (sum of squared distances)")
    plt.title(f"{title_prefix} | Elbow method")
    fig1.savefig(fig_dir / f"{title_prefix.lower().replace(' ', '_')}_elbow.png", dpi=300, bbox_inches="tight")
    plt.close(fig1)

    fig2 = plt.figure()
    plt.plot(list(K_range), silhouettes, marker="o")
    plt.xlabel("K")
    plt.ylabel("Silhouette score")
    plt.title(f"{title_prefix} | Silhouette vs. K")
    fig2.savefig(fig_dir / f"{title_prefix.lower().replace(' ', '_')}_silhouette.png", dpi=300, bbox_inches="tight")
    plt.close(fig2)

    best_k = int(np.argmax(silhouettes) + 2)
    return best_k, silhouettes[best_k - 2]


In [17]:
k_stem, sil_stem = elbow_and_silhouette(embeddings_stem, "Stemming")
k_lem, sil_lem = elbow_and_silhouette(embeddings_lem, "Lemmatisation")

print(f"Best K (stemming): {k_stem} | silhouette = {sil_stem:.3f}")
print(f"Best K (lemmatisation): {k_lem} | silhouette = {sil_lem:.3f}")

K‑Means sweep:   0%|          | 0/14 [00:00<?, ?it/s]

K‑Means sweep:   0%|          | 0/14 [00:00<?, ?it/s]

Best K (stemming): 2 | silhouette = 0.056
Best K (lemmatisation): 2 | silhouette = 0.063


In [18]:
use_embeddings = embeddings_lem if sil_lem >= sil_stem else embeddings_stem
pipeline_name = "lemmatisation" if sil_lem >= sil_stem else "stemming"

print(f"\nContinuing with the {pipeline_name} pipeline …\n")


Continuing with the lemmatisation pipeline …



In [19]:
k_best = k_lem if sil_lem >= sil_stem else k_stem
kmeans = KMeans(n_clusters=k_best, random_state=42, n_init=10)
kmeans_labels = kmeans.fit_predict(use_embeddings)
kmeans_sil = silhouette_score(use_embeddings, kmeans_labels)

In [20]:
dbscan_results = {}
for eps in np.linspace(0.5, 5.0, 10):
    db = DBSCAN(eps=eps, min_samples=5, metric="cosine").fit(use_embeddings)
    labels = db.labels_
    unique_labels = set(labels)
    if len(unique_labels) <= 1:
        continue
    sil = silhouette_score(use_embeddings, labels)
    dbscan_results[eps] = sil
if dbscan_results:
    best_eps = max(dbscan_results, key=dbscan_results.get)
    dbscan = DBSCAN(eps=best_eps, min_samples=5, metric="cosine")
    dbscan_labels = dbscan.fit_predict(use_embeddings)
    dbscan_sil = dbscan_results[best_eps]
else:
    best_eps = None
    dbscan_labels = None
    dbscan_sil = None

In [21]:
best_sil_agg = -1
best_k_agg = None
best_labels_agg = None
for k in range(2, 16):
    agg = AgglomerativeClustering(n_clusters=k, linkage="ward")
    labels = agg.fit_predict(use_embeddings)
    sil = silhouette_score(use_embeddings, labels)
    if sil > best_sil_agg:
        best_sil_agg = sil
        best_k_agg = k
        best_labels_agg = labels
agg_sil = best_sil_agg

In [22]:
pca = PCA(n_components=2, random_state=42)
embeddings_2d = pca.fit_transform(use_embeddings)

def plot_clusters(labels, title, filename):
    fig = plt.figure(figsize=(6, 5))
    scatter = plt.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], c=labels, s=8, alpha=0.7)
    plt.title(title)
    plt.xlabel("PCA‑1")
    plt.ylabel("PCA‑2")
    plt.tight_layout()
    fig.savefig(fig_dir / filename, dpi=300)
    plt.close(fig)

plot_clusters(kmeans_labels, f"K‑Means (K={k_best})", f"clusters_kmeans_k{k_best}.png")
if dbscan_labels is not None:
    plot_clusters(dbscan_labels, f"DBSCAN (eps={best_eps:.2f})", f"clusters_dbscan_eps{best_eps:.2f}.png")
plot_clusters(best_labels_agg, f"Agglomerative (K={best_k_agg})", f"clusters_agg_k{best_k_agg}.png")

In [23]:
examples_per_cluster = defaultdict(list)
for idx, (cluster_id, lyric) in enumerate(zip(kmeans_labels, raw_texts)):
    if len(examples_per_cluster[cluster_id]) < 3:
        examples_per_cluster[cluster_id].append(lyric[:400].strip().replace("\n", " ") + "…")

In [24]:
report_path = out_dir / "report.md"
with report_path.open("w", encoding="utf-8") as fp:
    fp.write("# Lyrics Clustering Report\n\n")
    fp.write(f"**Dataset size:** {len(df):,} songs\n\n")
    fp.write("## Pre‑processing Comparison\n")
    fp.write(f"* Stemming pipeline — best K‑Means silhouette = {sil_stem:.3f} at K = {k_stem}\n")
    fp.write(f"* Lemmatisation pipeline — best K‑Means silhouette = {sil_lem:.3f} at K = {k_lem}\n\n")
    fp.write(f"We proceeded with **{pipeline_name}** because it yielded the higher silhouette score.\n\n")

    fp.write("## Final Clustering Scores\n")
    fp.write("| Algorithm | #Clusters | Silhouette | Notes |\n")
    fp.write("|-----------|-----------|------------|-------|\n")
    fp.write(f"| K‑Means | {k_best} | {kmeans_sil:.3f} | elbow + silhouette selected |\n")
    if dbscan_labels is not None:
        fp.write(f"| DBSCAN | {len(set(dbscan_labels))- (1 if -1 in dbscan_labels else 0)} | {dbscan_sil:.3f} | eps = {best_eps:.2f} |\n")
    else:
        fp.write("| DBSCAN | — | — | failed to find >1 cluster |\n")
    fp.write(f"| Agglomerative | {best_k_agg} | {agg_sil:.3f} | Ward linkage |\n\n")

    fp.write("## Cluster Examples (K‑Means)\n")
    for cid, samples in examples_per_cluster.items():
        fp.write(f"### Cluster {cid}\n")
        for s in samples:
            fp.write(f"* {s}\n")
        fp.write("\n")

    fp.write("---\n")
    fp.write("**Homogeneity** was _not_ computed because the dataset does not contain ground‑truth genre labels.\n")

print(f"Saved visualisations in {fig_dir}/ and report at {report_path}")


Saved visualisations in outputs\figures/ and report at outputs\report.md
