# AIML Glossary Analysis Notebook

This notebook explores the AIML glossary as a dataset:
- Load glossary JSON
- Build a graph of terms and links
- Compute basic metrics
- Run clustering analysis
- Visualize clusters
- Log results to MLflow


## Cell 1: Setup, data loading, graph clustering (Louvain)

In [None]:
# Install dependencies if needed (uncomment if running fresh)
# !pip install -r requirements.txt

import os
import json
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import mlflow
import community  # python-louvain

# Set MLflow experiment and run name.
# RUN_NAME comes from the RUN_NAME environment variable when set; otherwise it
# is built from GITHUB_SHA, falling back to "local".
mlflow.set_experiment("AIML Glossary Analysis")
RUN_NAME = os.getenv("RUN_NAME", f"notebook-{os.getenv('GITHUB_SHA', 'local')}")

# Ensure output directories exist
os.makedirs("output", exist_ok=True)
os.makedirs("visualizations", exist_ok=True)

# Load Glossary Data
with open("data/aiml_glossary.json", "r", encoding="utf-8") as f:
    glossary = json.load(f)

print(f"Loaded {len(glossary)} glossary entries.")

# Peek at the first entry. A bare `glossary[0]` expression is only rendered
# when it is the last statement of a cell, and it raises IndexError when the
# glossary is empty, so print it explicitly and guard the empty case.
if glossary:
    print("First entry:", glossary[0])

# Build a Graph of Terms
def _link_label(item):
    """Return the label of a related-term or tag item, or None if it has none.

    An item is either a plain value (used as the label itself) or a dict that
    carries the label under the "label" key. A dict without that key yields
    None instead of raising KeyError, so the caller can simply skip it.
    """
    if isinstance(item, dict):
        return item.get("label")
    return item


# Undirected graph: one node per glossary term, one edge per related term/tag.
G = nx.Graph()
for entry in glossary:
    term = entry.get("term")
    if not term:
        # Entries without a usable term cannot be graph nodes.
        continue
    G.add_node(term)
    # Related terms are linked before tags, matching their order in the entry.
    for key in ("related_terms", "tags"):
        # `or []` also covers a key that is present but set to null.
        for item in entry.get(key) or []:
            label = _link_label(item)
            if label:
                G.add_edge(term, label)

print(f"Graph has {len(G.nodes)} nodes and {len(G.edges)} edges.")

# Compute Metrics: node count, edge count and mean node degree of G.
num_terms = G.number_of_nodes()
num_links = G.number_of_edges()

if num_terms > 0:
    degree_total = sum(degree for _node, degree in G.degree())
    avg_degree = degree_total / num_terms
else:
    # An empty graph has no degrees to average.
    avg_degree = 0

print("Number of terms:", num_terms)
print("Number of links:", num_links)
print("Average degree:", avg_degree)

# Clustering Analysis (Louvain)
# Louvain is randomized; seed it like the other stochastic steps in this
# notebook (layout, KMeans, PCA) so cluster assignments and the downstream
# ARI comparison are reproducible from run to run.
partition = community.best_partition(G, random_state=42)

# Number of nodes assigned to each cluster id.
cluster_sizes = pd.Series(list(partition.values())).value_counts()
num_clusters = len(cluster_sizes)
# Cast to a plain int so the value logs and serializes as a Python number.
largest_cluster_size = int(cluster_sizes.max())
modularity = community.modularity(partition, G)

print("Clusters found:", num_clusters)
print("Largest cluster size:", largest_cluster_size)
print("Modularity:", modularity)

# Save Cluster Assignments (one row per node: term, cluster_id)
cluster_df = pd.DataFrame(list(partition.items()), columns=["term", "cluster_id"])
cluster_df.to_csv("output/cluster_assignments.csv", index=False)

# Visualize Clusters: spring layout with nodes coloured by Louvain cluster id.
pos = nx.spring_layout(G, seed=42)  # fixed seed keeps the layout stable
node_colors = [partition[node] for node in G.nodes]

plt.figure(figsize=(10, 8))
nx.draw_networkx_nodes(G, pos, node_color=node_colors, cmap=plt.cm.Set3)
nx.draw_networkx_edges(G, pos, alpha=0.3)
nx.draw_networkx_labels(G, pos, font_size=8)
plt.title("Glossary Clusters (Louvain)")
plt.tight_layout()

# Write the image before show(), then display it inline.
plt.savefig("visualizations/glossary_clusters.png")
plt.show()

# Log Results to MLflow: graph size as params, clustering results as metrics,
# and the input data plus generated files as artifacts.
graph_params = {
    "num_terms": num_terms,
    "num_links": num_links,
}
graph_metrics = {
    "avg_degree": avg_degree,
    "num_clusters": num_clusters,
    "largest_cluster_size": largest_cluster_size,
    "modularity": modularity,
}
graph_artifacts = (
    "data/aiml_glossary.json",
    "output/cluster_assignments.csv",
    "visualizations/glossary_clusters.png",
)

with mlflow.start_run(run_name=f"{RUN_NAME}-graph"):
    for param_name, param_value in graph_params.items():
        mlflow.log_param(param_name, param_value)
    for metric_name, metric_value in graph_metrics.items():
        mlflow.log_metric(metric_name, metric_value)
    for artifact_path in graph_artifacts:
        mlflow.log_artifact(artifact_path)


## Cell 2: Semantic Clustering (TF-IDF + KMeans)

In [None]:
# Semantic Clustering with TF-IDF + KMeans
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# Ensure output directories exist
for directory in ("output", "visualizations"):
    os.makedirs(directory, exist_ok=True)

# Collect definitions and terms as two parallel lists, one item per glossary
# entry; a missing or empty value becomes "".
definitions = []
terms = []
for entry in glossary:
    definitions.append(entry.get("definition") or "")
    terms.append(entry.get("term") or "")

# TF-IDF vectorization: one row of X per definition.
vectorizer = TfidfVectorizer(stop_words="english")
X = vectorizer.fit_transform(definitions)

# Choose number of clusters (default 5, override via env RUN_K).
# KMeans cannot fit more clusters than there are samples, so clamp the
# requested value to the range [1, number of definitions].
requested_k = int(os.getenv("RUN_K", 5))
k = max(1, min(requested_k, X.shape[0]))
if k != requested_k:
    print(f"Requested k={requested_k} is not usable with {X.shape[0]} definitions; using k={k}.")

kmeans = KMeans(n_clusters=k, random_state=42, n_init="auto")
clusters = kmeans.fit_predict(X)

# Build DataFrame of assignments (row i pairs terms[i] with clusters[i])
semantic_df = pd.DataFrame({"term": terms, "cluster_id": clusters})
semantic_df.to_csv("output/semantic_clusters.csv", index=False)

# A bare `semantic_df.head()` is only rendered when it is the last statement
# of a cell, so print the preview explicitly.
print(semantic_df.head())

# Inspect Cluster Contents: for each cluster id, print its size followed by
# at most the first 15 terms assigned to it.
for cluster_id in range(k):
    in_cluster = semantic_df["cluster_id"] == cluster_id
    cluster_terms = semantic_df.loc[in_cluster, "term"].tolist()
    print(f"\nCluster {cluster_id} ({len(cluster_terms)} terms):")
    print(", ".join(cluster_terms[:15]))

# Visualize Semantic Clusters (2D Projection)
# Best-effort step: any failure here is reported and the rest of the cell
# continues without the plot.
try:
    # Densify the TF-IDF matrix before handing it to PCA.
    dense_features = X.toarray()
    projector = PCA(n_components=2, random_state=42)
    X_2d = projector.fit_transform(dense_features)

    first_component = X_2d[:, 0]
    second_component = X_2d[:, 1]

    plt.figure(figsize=(8, 6))
    plt.scatter(first_component, second_component, c=clusters, cmap="Set3", alpha=0.7, s=20)
    plt.title("Semantic Clusters of Glossary Terms (TF-IDF + KMeans)")
    plt.tight_layout()
    plt.savefig("visualizations/semantic_clusters.png")
    plt.show()
except Exception as exc:
    print(f"Note: PCA visualization skipped due to: {exc}")

# Log Semantic Clustering Results to MLflow
semantic_params = {
    "algo": "tfidf-kmeans",
    "k": k,
    "num_terms": len(terms),
}
semantic_plot_path = "visualizations/semantic_clusters.png"

with mlflow.start_run(run_name=f"{RUN_NAME}-semantic"):
    for param_name, param_value in semantic_params.items():
        mlflow.log_param(param_name, param_value)
    mlflow.log_metric("inertia", float(kmeans.inertia_))
    mlflow.log_artifact("output/semantic_clusters.csv")
    # The plot is optional, so attach it only when the file is present.
    if os.path.exists(semantic_plot_path):
        mlflow.log_artifact(semantic_plot_path)


✅ Outcome
Glossary terms are grouped by semantic similarity of definitions.

A 2D scatter plot shows clusters visually.

MLflow logs the semantic cluster assignments as an artifact, alongside the graph‑based clustering runs.

This gives two complementary views:

Graph‑based clustering → connectivity of terms via tags/related terms.

Semantic clustering → similarity of definitions via TF-IDF text vectors.

Together, they provide a richer picture of how the glossary is structured.

## Cell 3: Cluster Comparison (ARI)

In [None]:
# Cluster Comparison (Graph vs Semantic Clusters)
import os
from sklearn.metrics import adjusted_rand_score

# Ensure output directories exist
os.makedirs("output", exist_ok=True)
os.makedirs("visualizations", exist_ok=True)

# Align terms from both clustering methods.
# Build a term -> semantic cluster lookup once instead of scanning the whole
# DataFrame for every term. setdefault keeps the first row seen for a term,
# so duplicated terms resolve to their first assignment.
semantic_by_term = {}
for name, cluster_id in zip(semantic_df["term"], semantic_df["cluster_id"]):
    semantic_by_term.setdefault(name, int(cluster_id))

# A term missing from either clustering gets the placeholder label -1.
graph_clusters = [partition.get(term, -1) for term in terms]
semantic_clusters = [semantic_by_term.get(term, -1) for term in terms]

# Compute Adjusted Rand Index
ari = adjusted_rand_score(graph_clusters, semantic_clusters)
print("Adjusted Rand Index (Graph vs Semantic):", ari)

# Save ARI metric to JSON as a single-key object, indented by two spaces.
import json

ari_payload = {"adjusted_rand_index": ari}
with open("output/ari_metrics.json", "w", encoding="utf-8") as metrics_file:
    metrics_file.write(json.dumps(ari_payload, indent=2))

# Log Comparison Metric to MLflow, together with both cluster assignment
# files and the ARI JSON file.
comparison_artifacts = (
    "output/cluster_assignments.csv",
    "output/semantic_clusters.csv",
    "output/ari_metrics.json",
)

with mlflow.start_run(run_name=f"{RUN_NAME}-comparison"):
    mlflow.log_metric("adjusted_rand_index", ari)
    for artifact_path in comparison_artifacts:
        mlflow.log_artifact(artifact_path)


✅ Outcome
Adjusted Rand Index (ARI) ranges from:

1.0 → perfect agreement between clustering methods.

0.0 → agreement no better than expected by chance.

Negative values → worse than random.

This shows how much overlap exists between graph‑based clusters (connectivity) and semantic clusters (definition similarity).

MLflow will log the ARI metric alongside your other experiments, to track consistency across runs.

This comparison is a quantitative way to evaluate whether glossary terms that are strongly linked also share semantic similarity.

## Cell 4: Dashboard Visualizations

In [None]:
# Dashboard Visualizations for ARI Metrics
import matplotlib.pyplot as plt

# Ensure output directories exist
os.makedirs("visualizations", exist_ok=True)

# Collect ARI Metrics Across Runs from MLflow
client = mlflow.tracking.MlflowClient()
experiment = client.get_experiment_by_name("AIML Glossary Analysis")

# If the experiment lookup returned nothing, there are no runs to inspect.
if experiment:
    runs = client.search_runs([experiment.experiment_id])
else:
    runs = []

# Keep only the runs that recorded an adjusted_rand_index metric.
ari_data = [
    {
        "run_id": run.info.run_id,
        "run_name": run.data.tags.get("mlflow.runName", ""),
        "timestamp": run.info.start_time,
        "adjusted_rand_index": run.data.metrics["adjusted_rand_index"],
    }
    for run in runs
    if "adjusted_rand_index" in run.data.metrics
]

# Build the ARI dashboard charts from the collected run records.
ari_df = pd.DataFrame(ari_data)
if not ari_df.empty:
    # The timestamp column holds each run's start_time; it is parsed as
    # epoch milliseconds and the rows are put in chronological order.
    ari_df["timestamp"] = pd.to_datetime(ari_df["timestamp"], unit="ms")
    ari_df.sort_values("timestamp", inplace=True)

    # Line chart: ARI over time
    plt.figure(figsize=(10, 6))
    plt.plot(ari_df["timestamp"], ari_df["adjusted_rand_index"], marker="o", linestyle="-")
    plt.title("Adjusted Rand Index Across Glossary Runs")
    plt.xlabel("Run Timestamp")
    plt.ylabel("Adjusted Rand Index")
    plt.grid(True)
    plt.tight_layout()
    # savefig must actually be called (a bare `plt.savefig` does nothing),
    # and before show() so the image is written from the live figure.
    plt.savefig("visualizations/ari_trend.png")
    plt.show()

    # Bar chart: ARI per individual run, in chronological order.
    bar_positions = list(range(len(ari_df)))
    # Label each bar with the run name, falling back to a short run id.
    bar_labels = [
        run_name or run_id[:8]
        for run_name, run_id in zip(ari_df["run_name"], ari_df["run_id"])
    ]
    plt.figure(figsize=(10, 6))
    plt.bar(bar_positions, ari_df["adjusted_rand_index"])
    plt.xticks(bar_positions, bar_labels, rotation=45, ha="right")
    plt.title("Adjusted Rand Index per Run")
    plt.xlabel("Run")
    plt.ylabel("Adjusted Rand Index")
    plt.tight_layout()
    plt.savefig("visualizations/ari_bar.png")
    plt.show()

✅ Outcome
Line chart → shows ARI values over time, to indicate trends in consistency between graph and semantic clustering.

Bar chart → compares ARI values across individual runs, useful for spotting which glossary versions align better.

Both charts can be saved as artifacts (ari_trend.png, ari_bar.png) and logged to MLflow for dashboard‑style tracking.

This gives a visual dashboard inside Jupyter and MLflow, so this glossary project operates as an experiment pipeline.

This completes the loop: glossary → clustering → comparison → dashboard → MLflow logging.