# GVGAI Description Clustering

Normalization → dimensionality reduction → clustering are orchestrated in staged loops so the comparisons stay concise and presentation-ready.

In [None]:

from __future__ import annotations

from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, List, Sequence

import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import umap
from IPython.display import display
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans, DBSCAN
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import Normalizer, StandardScaler

# Activate the 'seaborn-v0_8' matplotlib style sheet for the figures drawn below.
plt.style.use('seaborn-v0_8')
# Seed passed as ``random_state`` to PCA, UMAP and KMeans.
RANDOM_STATE = 42
# Model name used by embed_descriptions when the caller does not supply one.
DEFAULT_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

@dataclass
class GameDescription:
    """A game identifier paired with its description text."""

    # Key under which the game appears in the descriptions mapping.
    identifier: str
    # Description text; normalize_descriptions stores it with surrounding whitespace stripped.
    description: str

def embed_descriptions(items: Sequence[GameDescription], model_name: str = DEFAULT_MODEL_NAME) -> np.ndarray:
    """Encode the description text of every item with a sentence-transformers model.

    Args:
        items: Descriptions to embed; the texts are passed to the encoder in this order.
        model_name: Name handed to ``SentenceTransformer`` to select the model.

    Returns:
        The result of ``SentenceTransformer.encode`` for the texts, requested
        as a NumPy array with normalized embeddings.
    """
    encoder = SentenceTransformer(model_name)
    return encoder.encode(
        [entry.description for entry in items],
        convert_to_numpy=True,
        normalize_embeddings=True,
        show_progress_bar=True,
    )

def normalize_descriptions(descriptions: Dict[str, str]) -> List[GameDescription]:
    """Strip each description and keep only the entries that still contain text.

    Args:
        descriptions: Mapping from game key to raw description text.

    Returns:
        One ``GameDescription`` per non-blank entry, in the mapping's
        iteration order, holding the stripped text.
    """
    stripped_pairs = ((name, body.strip()) for name, body in descriptions.items())
    return [GameDescription(name, body) for name, body in stripped_pairs if body]

def prepare_game_list(path: Path) -> List[str]:
    """Load the game list and return the lower-cased values of its second column.

    Args:
        path: JSON file whose parsed content ``pandas.DataFrame`` can tabulate.

    Returns:
        The values of the column at positional index 1, cast to ``str`` and
        lower-cased, in row order.

    Raises:
        IndexError: If the tabulated data has fewer than two columns.
    """
    # JSON text is UTF-8; reading with the platform's locale encoding can
    # corrupt or reject non-ASCII names on some systems.
    raw = json.loads(path.read_text(encoding="utf-8"))
    frame = pd.DataFrame(raw)
    return frame.iloc[:, 1].astype(str).str.lower().tolist()

def load_description_dict(path: Path) -> Dict[str, str]:
    """Parse the JSON file at ``path`` and return its content.

    Args:
        path: Location of the JSON file holding the descriptions.

    Returns:
        The parsed JSON value; callers treat it as a mapping from game key
        to description text.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default.
    return json.loads(path.read_text(encoding="utf-8"))



In [None]:

GAME_LIST_PATH = Path("game_list.json")
DESCRIPTION_PATH = Path("all_game_descs.json")

# Lower-cased game names that are allowed through the filter.
whitelist = set(prepare_game_list(GAME_LIST_PATH))
raw_descriptions = load_description_dict(DESCRIPTION_PATH)


def _keep_entry(name: str, text: str) -> bool:
    """Return True for whitelisted, non-'testgame' keys with a truthy description."""
    lowered = name.lower()
    if lowered.startswith('testgame'):
        return False
    return lowered in whitelist and bool(text)


filtered_descriptions = {
    name: text
    for name, text in raw_descriptions.items()
    if _keep_entry(name, text)
}

# Strip the texts, remember the surviving identifiers, then embed the descriptions.
description_items = normalize_descriptions(filtered_descriptions)
game_names = [entry.identifier for entry in description_items]
description_vectors = embed_descriptions(description_items)
print(f"Prepared {len(description_items)} descriptions with embedding shape {description_vectors.shape}.")


In [None]:
# Normalization stage: each entry maps a mode name to a callable that takes
# the embedding matrix and returns the matrix to hand to the reducers.
NORMALIZATION_MODES = {
    'none': lambda X: X,
    'l2': lambda X: Normalizer(norm='l2').fit_transform(X),
    'scaling': lambda X: StandardScaler().fit_transform(X),
}

# Dimensionality-reduction stage, applied to the normalized matrix.
DIMENSION_REDUCERS = {
    "none": lambda X: X,
    # PCA cannot keep more components than min(n_samples, n_features), so the
    # cap of 50 is bounded by both axes of X rather than by the feature count only.
    'pca': lambda X: PCA(n_components=min(50, *X.shape), random_state=RANDOM_STATE).fit_transform(X),
    'umap': lambda X: umap.UMAP(
        n_neighbors=15,
        min_dist=0.15,
        n_components=2,
        metric='cosine',
        random_state=RANDOM_STATE
    ).fit_transform(X),
}

# Candidate cluster counts; the inertia and silhouette sweeps each get their
# own copy so one can be edited without affecting the other.
KMEANS_K_VALUES = list(range(5, 15))
KMEANS_INERTIA_RANGE = list(KMEANS_K_VALUES)
KMEANS_SILHOUETTE_RANGE = list(KMEANS_K_VALUES)
SELECTED_CLUSTER_K = 9  # adjust manually after inspecting the comparison plots

# DBSCAN grid: seven evenly spaced eps values from 0.3 to 1.5 (rounded to two
# decimals) crossed with the min_samples candidates.
DBSCAN_EPS_VALUES = np.round(np.linspace(0.3, 1.5, 7), 2)
DBSCAN_MIN_SAMPLES = [3, 5, 10, 15]

# Switches forwarded to summarize_clusters by the analysis loop.
INCLUDE_CLUSTER_MEDOIDS = True
LIST_FULL_CLUSTER_MEMBERS = True


In [None]:
import numpy as np

def stage_label(norm_key: str, reducer_key: str) -> str:
    """Build the heading that names one normalization/reduction combination.

    Both keys are upper-cased, e.g. ``("l2", "pca")`` gives
    ``"Norm=L2 | DimRed=PCA"``.
    """
    norm_part = "Norm=" + norm_key.upper()
    reducer_part = "DimRed=" + reducer_key.upper()
    return " | ".join((norm_part, reducer_part))

def evaluate_kmeans(X: np.ndarray, label: str) -> Dict[str, Any]:
    """Sweep KMeans over the configured k values and plot elbow and silhouette curves.

    Args:
        X: Feature matrix to cluster.
        label: Text appended to both subplot titles to identify the stage.

    Returns:
        A dict with keys ``'k_values'`` (copy of ``KMEANS_SILHOUETTE_RANGE``),
        ``'inertias'`` (one value per k in ``KMEANS_INERTIA_RANGE``),
        ``'silhouettes'`` (one value per k in ``KMEANS_SILHOUETTE_RANGE``),
        ``'best_k'`` (the k with the highest silhouette) and ``'best_score'``
        (that silhouette value).
    """
    # Fit once per distinct k and reuse the model for both curves. The settings
    # and seed are the same for both sweeps, so a second fit is expected to
    # reproduce the same clustering and only doubles the run time.
    fitted: Dict[int, KMeans] = {}
    for k in (*KMEANS_INERTIA_RANGE, *KMEANS_SILHOUETTE_RANGE):
        if k not in fitted:
            fitted[k] = KMeans(n_clusters=k, random_state=RANDOM_STATE, n_init='auto').fit(X)

    inertias: List[float] = [fitted[k].inertia_ for k in KMEANS_INERTIA_RANGE]
    silhouettes: List[float] = [
        silhouette_score(X, fitted[k].labels_) for k in KMEANS_SILHOUETTE_RANGE
    ]

    best_idx = int(np.argmax(silhouettes))
    best_k = KMEANS_SILHOUETTE_RANGE[best_idx]
    best_score = silhouettes[best_idx]

    fig, (elbow_ax, sil_ax) = plt.subplots(1, 2, figsize=(12, 4))
    elbow_ax.plot(KMEANS_INERTIA_RANGE, inertias, marker='o')
    elbow_ax.set_title(f'KMeans Elbow | {label}')
    elbow_ax.set_xlabel('k')
    elbow_ax.set_ylabel('inertia')

    sil_ax.plot(KMEANS_SILHOUETTE_RANGE, silhouettes, marker='o', label='silhouette')
    # Mark the k with the highest silhouette so it stands out on the curve.
    sil_ax.axvline(best_k, color='red', linestyle='--', label=f'peak k={best_k}')
    sil_ax.set_title(f'KMeans Silhouette | {label}')
    sil_ax.set_xlabel('k')
    sil_ax.set_ylabel('silhouette')
    sil_ax.legend()
    plt.tight_layout()
    plt.show()

    print(f"Peak silhouette {best_score:.3f} at k={best_k}. Inspect manually before choosing k.")

    return {
        'k_values': list(KMEANS_SILHOUETTE_RANGE),
        'inertias': inertias,
        'silhouettes': silhouettes,
        'best_k': best_k,
        'best_score': best_score,
    }

def evaluate_dbscan(X: np.ndarray, label: str) -> pd.DataFrame:
    """Grid-search DBSCAN over the configured eps/min_samples values and plot the scores.

    Args:
        X: Feature matrix to cluster.
        label: Text appended to the heatmap title.

    Returns:
        One row per (eps, min_samples) pair with the columns ``eps``,
        ``min_samples``, ``clusters`` (number of distinct non-negative labels),
        ``noise_ratio`` (share of points labelled -1) and ``silhouette``
        (NaN unless more than one cluster was found).
    """

    def score_setting(eps_value, sample_floor):
        """Run DBSCAN for one parameter pair and collect its metrics."""
        assignments = DBSCAN(eps=float(eps_value), min_samples=int(sample_floor)).fit_predict(X)
        n_clusters = int((np.unique(assignments) >= 0).sum())
        return {
            'eps': float(eps_value),
            'min_samples': int(sample_floor),
            'clusters': n_clusters,
            'noise_ratio': float((assignments == -1).mean()),
            'silhouette': silhouette_score(X, assignments) if n_clusters > 1 else np.nan,
        }

    results = pd.DataFrame([
        score_setting(eps_value, sample_floor)
        for eps_value in DBSCAN_EPS_VALUES
        for sample_floor in DBSCAN_MIN_SAMPLES
    ])

    # Heatmap layout: one row per min_samples value, one column per eps value.
    score_grid = results.pivot(index='min_samples', columns='eps', values='silhouette')
    plt.figure(figsize=(8, 5))
    sns.heatmap(score_grid, annot=True, fmt='.2f', cmap='crest', cbar_kws={'label': 'silhouette'})
    plt.title(f'DBSCAN Silhouette | {label}')
    plt.ylabel('min_samples')
    plt.xlabel('eps')
    plt.tight_layout()
    plt.show()
    return results

def summarize_clusters(
    labels: np.ndarray,
    identifiers: Sequence[str],
    vectors: np.ndarray,
    *,
    include_medoid: bool = False,
    show_full_members: bool = False,
    top_n: int = 5,
) -> pd.DataFrame:
    """Tabulate the size and members of every non-noise cluster.

    Args:
        labels: Cluster label per item; negative labels are skipped.
        identifiers: Item names, indexed like ``labels``.
        vectors: Feature rows, indexed like ``labels``; only read when
            ``include_medoid`` is true.
        include_medoid: Add a ``medoid`` column naming the member whose vector
            lies closest (Euclidean) to the cluster's mean vector.
        show_full_members: List every member instead of only the first ``top_n``.
        top_n: Number of members listed when ``show_full_members`` is false.

    Returns:
        A frame with the columns ``cluster``, ``size``, optionally ``medoid``,
        and ``members`` (comma-separated names), ordered by cluster id. With no
        qualifying cluster the frame is empty but keeps those columns.
    """
    assignments = np.asarray(labels)
    if include_medoid:
        column_order = ['cluster', 'size', 'medoid', 'members']
    else:
        column_order = ['cluster', 'size', 'members']

    records = []
    for cluster_id in sorted(set(assignments)):
        if cluster_id < 0:
            continue
        positions = np.nonzero(assignments == cluster_id)[0]
        if positions.size == 0:
            continue

        names = [identifiers[pos] for pos in positions]
        record = {'cluster': int(cluster_id), 'size': len(names)}

        if include_medoid:
            block = vectors[positions]
            offsets = np.linalg.norm(block - block.mean(axis=0), axis=1)
            record['medoid'] = names[int(offsets.argmin())]

        shown = names if show_full_members else names[:top_n]
        record['members'] = ', '.join(shown)
        records.append(record)

    if not records:
        return pd.DataFrame(columns=column_order)
    return pd.DataFrame(records)[column_order]

def _closest_to_centroid(
    X: np.ndarray,
    label_array: np.ndarray,
    cluster_ids: Sequence[int],
) -> Dict[int, int]:
    """Map each cluster id to the row index of the member nearest its mean vector."""
    nearest: Dict[int, int] = {}
    for cid in cluster_ids:
        member_idx = np.where(label_array == cid)[0]
        if member_idx.size == 0:
            continue
        cluster_vectors = X[member_idx]
        distances = np.linalg.norm(cluster_vectors - cluster_vectors.mean(axis=0), axis=1)
        nearest[cid] = int(member_idx[int(np.argmin(distances))])
    return nearest


def _project_for_plot(X: np.ndarray) -> tuple[np.ndarray, str]:
    """Return 2-D plotting coordinates for ``X`` and a title suffix naming the projection."""
    if X.shape[1] > 2:
        projected = umap.UMAP(
            n_neighbors=15,
            min_dist=0.15,
            n_components=2,
            metric='cosine',
            random_state=RANDOM_STATE
        ).fit_transform(X)
        return projected, ' (UMAP preview)'
    return X, ''


def visualize_embedding(
    X: np.ndarray,
    labels: np.ndarray,
    title: str,
    identifiers: Sequence[str] | None = None,
    show_center_names: bool = False,
    *,
    x_right: float | None = 3,
) -> None:
    """Scatter-plot a clustering in two dimensions, one colour per cluster.

    Each non-noise cluster is drawn in its own colour, and the member closest
    to the cluster's mean vector (measured in ``X``, not in the plotted
    coordinates) is marked with a star. Points with a negative label are drawn
    in grey as noise. When ``X`` has more than two columns it is projected with
    UMAP for display; otherwise its columns are plotted directly.

    Args:
        X: Feature matrix that was clustered.
        labels: Cluster label per row of ``X``; negative values mean noise.
        title: Figure title; the projection suffix is appended to it.
        identifiers: Optional names per row, used to annotate cluster centres.
        show_center_names: Annotate each starred centre with its identifier
            (only when ``identifiers`` is given).
        x_right: Upper x-axis limit; ``None`` leaves matplotlib's automatic
            limit in place. The default keeps the limit that used to be fixed
            in the code.
    """
    label_array = np.asarray(labels)

    # Non-noise clusters in ascending order; the order also fixes the colours.
    cluster_ids = sorted({int(cid) for cid in np.unique(label_array) if cid >= 0})
    medoid_indices = _closest_to_centroid(X, label_array, cluster_ids)
    viz_data, subtitle = _project_for_plot(X)

    plt.figure(figsize=(7, 4))

    # plt.get_cmap is used because matplotlib.cm.get_cmap was removed in
    # matplotlib 3.9. tab10 only has ten distinct colours, so switch to tab20
    # for larger cluster counts, and never request a zero-entry colormap.
    palette_name = 'tab10' if len(cluster_ids) <= 10 else 'tab20'
    cmap = plt.get_cmap(palette_name, max(len(cluster_ids), 1))

    # One scatter call per cluster so the legend maps colour to cluster number.
    for color_idx, cid in enumerate(cluster_ids):
        member_mask = (label_array == cid)
        plt.scatter(
            viz_data[member_mask, 0],
            viz_data[member_mask, 1],
            s=25,
            color=cmap(color_idx),
            alpha=0.7,
            label=f'{cid}',
        )

        if cid not in medoid_indices:
            continue

        # Highlight (and optionally name) the cluster's central member.
        midx = medoid_indices[cid]
        mx, my = viz_data[midx]
        plt.scatter(
            mx,
            my,
            marker='*',
            s=200,
            edgecolors='k',
            linewidths=1.2,
            color=cmap(color_idx),
        )
        if identifiers is not None and show_center_names:
            plt.annotate(
                identifiers[midx],
                (mx, my),
                textcoords="offset points",
                xytext=(5, 5),
                ha='left',
                fontsize=10,
            )

    # Noise points (e.g. from DBSCAN) are shown in grey.
    noise_mask = (label_array < 0)
    if np.any(noise_mask):
        plt.scatter(
            viz_data[noise_mask, 0],
            viz_data[noise_mask, 1],
            s=15,
            color='lightgrey',
            alpha=0.5,
            label='Noise',
        )

    # Use the caller's title so figures from different stages can be told apart.
    plt.title(f'{title}{subtitle}')
    plt.xlabel('component 1')
    plt.ylabel('component 2')
    if x_right is not None:
        plt.xlim(right=x_right)

    # The legend sits outside the axes and maps colour to cluster number.
    plt.legend(
        title='Cluster no.',
        loc='center left',
        bbox_to_anchor=(1.05, 0.5)
    )
    plt.tight_layout()
    plt.show()



In [None]:
# Per-stage summary rows and per-stage curve data, both filled by the loop below.
analysis_records = []
kmeans_curve_records = []

for norm_name, norm_fn in NORMALIZATION_MODES.items():
    # Normalize once per mode; every reducer then starts from the same matrix.
    normalized = norm_fn(description_vectors)

    for reducer_name, reducer_fn in DIMENSION_REDUCERS.items():
        reduced = reducer_fn(normalized)
        label = stage_label(norm_name, reducer_name)
        print(f"\n=== {label} ===")

        # Sweep k and keep the curves for the combined figures drawn later.
        kmeans_metrics = evaluate_kmeans(reduced, f'{label} | KMEANS')
        kmeans_curve_records.append({
            'label': label,
            **{key: kmeans_metrics[key] for key in ('k_values', 'inertias', 'silhouettes')},
        })

        # Clamp the manually chosen k into the swept range and look up its score.
        silhouette_lookup = {
            k: score
            for k, score in zip(kmeans_metrics['k_values'], kmeans_metrics['silhouettes'])
        }
        lowest_k, highest_k = KMEANS_K_VALUES[0], KMEANS_K_VALUES[-1]
        selected_k = max(lowest_k, min(SELECTED_CLUSTER_K, highest_k))
        selected_silhouette = silhouette_lookup.get(selected_k, float('nan'))

        # Cluster with the selected k, then plot and tabulate the result.
        kmeans_model = KMeans(n_clusters=selected_k, random_state=RANDOM_STATE, n_init='auto')
        kmeans_labels = kmeans_model.fit_predict(reduced)
        visualize_embedding(
            reduced,
            kmeans_labels,
            f'KMeans k={selected_k} | {label}',
            identifiers=game_names,
            show_center_names=True,
        )

        cluster_table = summarize_clusters(
            kmeans_labels,
            game_names,
            reduced,
            include_medoid=INCLUDE_CLUSTER_MEDOIDS,
            show_full_members=LIST_FULL_CLUSTER_MEMBERS,
        )
        display(cluster_table)

        analysis_records.append(dict(
            normalization=norm_name,
            reduction=reducer_name,
            manual_k=selected_k,
            manual_k_silhouette=selected_silhouette,
            best_k_by_silhouette=kmeans_metrics['best_k'],
            peak_silhouette_score=kmeans_metrics['best_score'],
        ))


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Number of figures the per-stage curves are spread over.
N_FIGURE_PARTS = 3

if kmeans_curve_records:
    # Split record *positions* rather than the records themselves, so the dicts
    # are never coerced into a NumPy object array. np.array_split still decides
    # the chunk sizes and copes with counts that do not divide evenly.
    index_batches = np.array_split(np.arange(len(kmeans_curve_records)), N_FIGURE_PARTS)

    for batch_idx, positions in enumerate(index_batches):
        # Fewer records than parts leaves some chunks empty; nothing to draw.
        if positions.size == 0:
            continue

        batch_records = [kmeans_curve_records[pos] for pos in positions]
        n = len(batch_records)

        # One row per stage: elbow curve on the left, silhouette on the right.
        # squeeze=False keeps `axes` two-dimensional even for a single row.
        fig, axes = plt.subplots(n, 2, figsize=(14, 4 * n), sharex='col', squeeze=False)
        fig.suptitle(
            f'KMeans Analysis - Part {batch_idx + 1} of {N_FIGURE_PARTS}',
            fontsize=16,
            y=1.02,
        )

        for (elbow_ax, sil_ax), record in zip(axes, batch_records):
            k_values = record['k_values']

            elbow_ax.plot(k_values, record['inertias'], marker='o')
            elbow_ax.set_title(f"Elbow | {record['label']}")
            elbow_ax.set_ylabel('inertia')

            sil_ax.plot(k_values, record['silhouettes'], marker='o', color='C1')
            sil_ax.set_title(f"Silhouette | {record['label']}")
            sil_ax.set_ylabel('silhouette')

            # sharex='col' hides tick labels on the upper rows; show them again
            # and label the x-axis of every subplot.
            for ax in (elbow_ax, sil_ax):
                ax.tick_params(axis='both', which='both', labelbottom=True)
                ax.set_xlabel('k')

        plt.tight_layout()
        plt.show()

else:
    print('No KMeans curves recorded. Run the analysis loop first.')

# Summary over all stages, best silhouette first. With no records the frame has
# no columns to sort by, so sorting is skipped instead of raising KeyError.
summary_df = pd.DataFrame(analysis_records)
if not summary_df.empty:
    summary_df = summary_df.sort_values(
        ['manual_k_silhouette', 'peak_silhouette_score'], ascending=False
    )
summary_df