In [1]:
# %% Smoke test + bootstrap + MLflow

import os, sys, json
from pathlib import Path

# --- Bootstrap: set repo root for imports and paths ---
# The notebook normally starts one level below the repo root, so the first run
# uses the parent of the working directory. Because this cell also chdir()s
# into the repo root, blindly taking `.parent` again on a re-run would climb
# one directory higher each time. Treating a cwd that already contains `src/`
# as the repo root keeps the cell idempotent.
_start_dir = Path.cwd()
repo_root = _start_dir if (_start_dir / "src").is_dir() else _start_dir.parent
os.chdir(repo_root)
# Guard the insert so re-running the cell does not stack duplicate entries.
if str(repo_root) not in sys.path:
    sys.path.insert(0, str(repo_root))

print("Repo root set to:", repo_root)
print("Working directory now:", Path.cwd())
print("‚úÖ Python executable:", sys.executable)

# --- Directory checks ---
# Missing directories are created so later steps can write into them.
for d in ["data", "visualizations", "src"]:
    p = repo_root / d
    if not p.exists():
        p.mkdir(parents=True, exist_ok=True)
        print(f"üìÇ Created missing directory: {p}")
    else:
        print(f"{d} exists:", p.exists(), "->", p)

# --- Module imports ---
# Best-effort smoke test: a failure is reported but does not abort the cell.
try:
    import src.generate_outputs
    import src.cluster_analysis
    import src.semantic_clustering
    import src.evaluate_clusters
    print("‚úÖ Core modules imported successfully")
except Exception as e:
    print("‚ùå Import failure:", e)

# --- Glossary file check ---
glossary_file = repo_root / "data/aiml_glossary.json"
print("Glossary file exists:", glossary_file.exists(), "->", glossary_file)
if glossary_file.exists():
    try:
        with open(glossary_file, "r", encoding="utf-8") as f:
            glossary_dict = json.load(f)
        print(f"‚úÖ Glossary JSON parsed, {len(glossary_dict)} entries")
    except Exception as e:
        print("‚ùå Glossary JSON parse error:", e)

# --- MLflow smoke test ---
# Logs one parameter and one metric inside a throwaway run; any failure
# (including mlflow not being installed) is reported rather than raised.
try:
    import mlflow
    with mlflow.start_run(run_name="smoke_test", nested=True):
        mlflow.log_param("smoke_test", True)
        mlflow.log_metric("smoke_metric", 1.0)
    print("‚úÖ MLflow run started and logged successfully")
except Exception as e:
    print("‚ùå MLflow smoke test failed:", e)


Repo root set to: /home/ian/dev/aiml-glossary
Working directory now: /home/ian/dev/aiml-glossary
‚úÖ Python executable: /home/ian/miniforge3/envs/glossary/bin/python
data exists: True -> /home/ian/dev/aiml-glossary/data
visualizations exists: True -> /home/ian/dev/aiml-glossary/visualizations
src exists: True -> /home/ian/dev/aiml-glossary/src
‚úÖ Core modules imported successfully
Glossary file exists: True -> /home/ian/dev/aiml-glossary/data/aiml_glossary.json
‚úÖ Glossary JSON parsed, 98 entries
‚úÖ MLflow run started and logged successfully


In [2]:
# %% Setup
import sys
from pathlib import Path

REPO_ROOT = Path.cwd()  # bootstrap already set cwd to repo root
# Guard the append so re-running this cell does not stack duplicate entries.
if str(REPO_ROOT) not in sys.path:
    sys.path.append(str(REPO_ROOT))

def resolve_uri(uri: str) -> Path:
    """Resolve a ``prefix:name`` artifact URI to a path under the repo root.

    Supported prefixes are ``data``, ``output`` and ``visualizations``; each
    maps to the directory of the same name directly under ``REPO_ROOT``.

    Args:
        uri: Artifact reference such as ``"data:aiml_glossary.json"``. Only
            the first ``:`` separates prefix from name, so the name may itself
            contain colons.

    Returns:
        ``REPO_ROOT / <prefix> / <name>``.

    Raises:
        ValueError: If ``uri`` has no ``:`` separator, or if the prefix is not
            one of the supported ones.
    """
    prefix, separator, name = uri.partition(":")
    if not separator:
        # Without this check a colon-less URI would surface as an opaque
        # tuple-unpacking error instead of a message naming the bad input.
        raise ValueError(f"Malformed URI (expected 'prefix:name'): {uri!r}")
    if prefix not in ("data", "output", "visualizations"):
        raise ValueError(f"Unknown URI prefix: {prefix}")
    # Every supported prefix is also the name of its directory.
    return REPO_ROOT / prefix / name

from src.generate_outputs import generate
from src.cluster_analysis import run_clustering
from src.semantic_clustering import run_semantic_clustering
from src.evaluate_clusters import evaluate_clusters
from src.enrich_glossary import enrich_glossary
from src.link_dictionary import build_link_dictionary
from src.coverage_report import generate_report


In [3]:
# %% Step 1: Generate outputs
# Relative paths resolve against the working directory, which the bootstrap
# cell sets to the repo root. The recorded run wrote terms.csv and
# glossary_copy.json under output/.
generate("data/aiml_glossary.json", "output")

‚úÖ Outputs generated: /home/ian/dev/aiml-glossary/output/terms.csv, /home/ian/dev/aiml-glossary/output/glossary_copy.json


In [4]:
# %% Step 2: Build link dictionary
# Builds the link dictionary from the glossary JSON; the returned mapping
# (term -> list of linked terms in the recorded run) is echoed as cell output.
# NOTE(review): the recorded run reports the file at <repo>/link_dictionary.json,
# while Steps 3 and 4 read data/link_dictionary.json — confirm the output
# location so later steps are not reading a stale copy.
build_link_dictionary("data/aiml_glossary.json")

‚úÖ Link dictionary built: /home/ian/dev/aiml-glossary/link_dictionary.json
üìä Link dictionary logged to MLflow


{'Supervised Learning': [],
 'Unsupervised Learning': [],
 'Classification': ['Supervised Learning'],
 'Regression': ['Supervised Learning'],
 'Clustering': ['Supervised Learning', 'Unsupervised Learning'],
 'Data Cleaning': [],
 'Feature Scaling': [],
 'Dimensionality Reduction': [],
 'Encoding Categorical Variables': [],
 'Feature Extraction': [],
 'Cross-Validation': ['Generalization'],
 'Confusion Matrix': ['Classification'],
 'Precision': [],
 'Recall': [],
 'F1 Score': ['Classification', 'Precision', 'Recall'],
 'Hyperparameter Tuning': [],
 'Grid Search': [],
 'Random Search': [],
 'Bayesian Optimization': [],
 'Early Stopping': [],
 'Artificial Neural Network': [],
 'Activation Function': [],
 'Backpropagation': [],
 'Convolutional Neural Network': [],
 'Recurrent Neural Network': [],
 'Model Deployment': [],
 'Model Serving': [],
 'Monitoring': [],
 'A/B Testing': [],
 'Rollback': [],
 'Concept Drift': [],
 'Temporal Dependency': [],
 'Replay Methods': ['Continual Learning'],


In [5]:
# %% Step 3: Enrich glossary
# Inputs: the glossary JSON and the link dictionary file. The recorded run
# saved data/enriched_glossary.json; the returned dict (keyed by term slug in
# the recorded run) is echoed as cell output.
enrich_glossary("data/aiml_glossary.json", "data/link_dictionary.json")

‚úÖ Enriched glossary saved: /home/ian/dev/aiml-glossary/data/enriched_glossary.json
üìä Enriched glossary logged to MLflow


{'supervised-learning': {'id': 1,
  'term': 'Supervised Learning',
  'definition': 'A type of machine learning where the model is trained on labeled data to learn a mapping from inputs to outputs.',
  'tags': ['machine learning', 'labeled data', 'prediction'],
  'related_terms': ['Classification', 'Regression'],
  'examples': ['Predicting house prices using historical sales data.'],
  'source': 'Scikit-learn documentation',
  'last_updated': '2025-10-23',
  'key_slug': 'supervised-learning',
  'original_term': 'Supervised Learning',
  'linked_terms': ['Artificial Intelligence']},
 'unsupervised-learning': {'id': 2,
  'term': 'Unsupervised Learning',
  'definition': 'A machine learning approach where the model identifies patterns in data without labeled outcomes.',
  'tags': ['clustering', 'dimensionality reduction', 'pattern discovery'],
  'related_terms': ['Clustering', 'Principal Component Analysis'],
  'examples': ['Segmenting customers based on purchasing behavior.'],
  'source': '

In [6]:
# %% Step 4: Graph clustering
# Inputs: the glossary JSON and the link dictionary file. The recorded run
# saved data/cluster_assignments.csv and visualizations/glossary_clusters.png;
# the return value (a networkx Graph in the recorded run) is echoed as output.
run_clustering("data/aiml_glossary.json", "data/link_dictionary.json")

‚úÖ Graph cluster assignments saved: /home/ian/dev/aiml-glossary/data/cluster_assignments.csv
üìà Cluster visualization saved: /home/ian/dev/aiml-glossary/visualizations/glossary_clusters.png
üìä Graph clustering logged to MLflow


<networkx.classes.graph.Graph at 0x7136da9ba950>

In [7]:
# %% Step 5: Semantic clustering
# Requests 8 clusters. The recorded run saved
# data/semantic_cluster_assignments.csv and visualizations/semantic_clusters.png;
# the returned int32 array is echoed as output (presumably one cluster label
# per glossary term — confirm against src/semantic_clustering.py).
run_semantic_clustering("data/aiml_glossary.json", n_clusters=8)

‚úÖ Semantic cluster assignments saved: /home/ian/dev/aiml-glossary/data/semantic_cluster_assignments.csv
üìà Semantic cluster visualization saved: /home/ian/dev/aiml-glossary/visualizations/semantic_clusters.png
üìä Semantic clustering logged to MLflow


array([2, 1, 1, 2, 7, 6, 2, 4, 1, 2, 1, 1, 2, 5, 5, 0, 5, 1, 1, 2, 6, 2,
       0, 3, 5, 5, 1, 1, 2, 5, 2, 2, 6, 7, 6, 1, 1, 7, 1, 6, 2, 2, 2, 1,
       6, 6, 6, 1, 3, 6, 2, 6, 6, 5, 6, 5, 2, 7, 3, 2, 2, 2, 2, 1, 5, 5,
       1, 7, 1, 6, 5, 2, 4, 7, 7, 1, 2, 7, 0, 1, 1, 2, 4, 5, 2, 2, 0, 1,
       0, 1, 1, 5, 1, 1, 1, 7, 1, 1], dtype=int32)

In [8]:
# %% Step 6: Evaluate clusters
# Called with no arguments, so evaluate_clusters() must locate its inputs on
# its own (presumably the assignment CSVs from Steps 4 and 5 — confirm in
# src/evaluate_clusters.py). The recorded run saved data/graph_stats.json and
# data/ari_metrics.json.
# NOTE(review): the recorded summary shows total_terms 0, agreements 0 and an
# adjusted Rand index of 0.0 even though the glossary parsed with 98 entries.
# That looks like the two assignment sets were not loaded or matched — verify
# before trusting these metrics.
summary = evaluate_clusters()
print(summary)

‚úÖ Saved /home/ian/dev/aiml-glossary/data/graph_stats.json
‚úÖ Saved /home/ian/dev/aiml-glossary/data/ari_metrics.json
üìä Cluster evaluation logged to MLflow
{'graph_stats': {'total_terms': 0, 'agreements': 0, 'agreement_ratio': 0.0}, 'ari_metrics': {'adjusted_rand_index': 0.0}}


In [9]:
# %% Step 7: Coverage report
# Prints one line per expected artifact and, in the recorded run, wrote
# data/coverage_report.json. The returned dict maps "prefix:name" artifact
# URIs to booleans and is echoed as cell output.
generate_report()

Coverage Report:
‚úÖ data:aiml_glossary.json (True)
‚úÖ data:terms.csv (True)
‚úÖ data:glossary_copy.json (True)
‚úÖ data:link_dictionary.json (True)
‚úÖ data:enriched_glossary.json (True)
‚úÖ data:cluster_assignments.csv (True)
‚úÖ data:semantic_cluster_assignments.csv (True)
‚úÖ data:graph_stats.json (True)
‚úÖ data:ari_metrics.json (True)
‚úÖ data:coverage_report.json (True)
‚úÖ visualizations:glossary_clusters.png (True)
‚úÖ visualizations:semantic_clusters.png (True)

Coverage report written to /home/ian/dev/aiml-glossary/data/coverage_report.json


{'data:aiml_glossary.json': True,
 'data:terms.csv': True,
 'data:glossary_copy.json': True,
 'data:link_dictionary.json': True,
 'data:enriched_glossary.json': True,
 'data:cluster_assignments.csv': True,
 'data:semantic_cluster_assignments.csv': True,
 'data:graph_stats.json': True,
 'data:ari_metrics.json': True,
 'data:coverage_report.json': True,
 'visualizations:glossary_clusters.png': True,
 'visualizations:semantic_clusters.png': True}

In [10]:
# %% Status Badge
from src.coverage_report import generate_report
from IPython.display import Markdown

report = generate_report()

# The workflow counts as passed only when every artifact flag is literally True.
workflow_ok = not any(flag is not True for flag in report.values())

# Pick the console message and the shields.io badge markdown together so the
# two can never disagree.
status_line, badge = (
    (
        "Workflow ‚úÖ Passed",
        "![Workflow Status](https://img.shields.io/badge/Workflow-‚úÖ%20Passed-brightgreen)",
    )
    if workflow_ok
    else (
        "Workflow ‚ùå Issues detected",
        "![Workflow Status](https://img.shields.io/badge/Workflow-‚ùå%20Issues-red)",
    )
)
print(status_line)

# Render the badge inline as the cell's output.
Markdown(badge)


Coverage Report:
‚úÖ data:aiml_glossary.json (True)
‚úÖ data:terms.csv (True)
‚úÖ data:glossary_copy.json (True)
‚úÖ data:link_dictionary.json (True)
‚úÖ data:enriched_glossary.json (True)
‚úÖ data:cluster_assignments.csv (True)
‚úÖ data:semantic_cluster_assignments.csv (True)
‚úÖ data:graph_stats.json (True)
‚úÖ data:ari_metrics.json (True)
‚úÖ data:coverage_report.json (True)
‚úÖ visualizations:glossary_clusters.png (True)
‚úÖ visualizations:semantic_clusters.png (True)

Coverage report written to /home/ian/dev/aiml-glossary/data/coverage_report.json
Workflow ‚úÖ Passed


![Workflow Status](https://img.shields.io/badge/Workflow-‚úÖ%20Passed-brightgreen)

## ✅ Expected Artifact Checklist

After running this runbook end-to-end, the following artifacts should exist:

### 📂 Output directory (`output/`)
- [ ] `terms.csv` → glossary terms and definitions in CSV format  
- [ ] `glossary_copy.json` → copy of the glossary JSON  
- [ ] `link_dictionary.json` → generated term-to-term link dictionary  
- [ ] `enriched_glossary.json` → glossary entries enriched with metadata (length, characters, link counts)  
- [ ] `cluster_assignments.csv` → graph clustering assignments  
- [ ] `semantic_cluster_assignments.csv` → semantic clustering assignments  
- [ ] `graph_stats.json` → node/edge counts and cluster stats from graph clustering  
- [ ] `ari_metrics.json` → evaluation metric (Adjusted Rand Index)  
- [ ] `coverage_report.json` → summary of which artifacts exist ✅/❌  

### 📂 Visualizations directory (`visualizations/`)
- [ ] `glossary_clusters.png` → graph clustering visualization  
- [ ] `semantic_clusters.png` → bar chart of semantic cluster sizes  

### 📊 Diagnostics
- Console output should show:
  - Graph stats (nodes, edges, clusters)  
  - Semantic cluster sizes  
  - ARI metric summary  
  - Coverage report with ✅/❌ markers
