# Semsi Interactive Notebook

This notebook reproduces the core Semsi workflow—parsing a `contents.txt` file, building tag embeddings with TF–IDF, and exploring the resulting cosine similarity matrix. It relies exclusively on the lightweight `semsi` package shipped in this repository, so you can run it locally without the legacy Colab setup.

## How to use this notebook

1. Install the project in editable mode (once per environment):
   ```bash
   pip install -e .
   ```
2. (Optional) Install extras for nicer tables and widgets:
   ```bash
   pip install pandas ipywidgets
   ```
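3. Open this notebook with JupyterLab, VS Code, or any other front-end and execute the cells from top to bottom. The paths used below (for example `example_data/contents.txt`) are relative, so make sure the kernel's working directory is the one that contains `example_data/`.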

In [None]:
from pathlib import Path

from semsi import (
    parse_contents_file,
    TagEmbeddingModel,
    build_similarity_matrix,
    get_top_similar,
)

try:
    from IPython.display import display  # type: ignore
except ImportError:  # pragma: no cover - IPython is optional outside notebooks
    display = None

try:
    import pandas as pd  # type: ignore
except ImportError:  # pragma: no cover - pandas is optional
    pd = None

try:
    import ipywidgets as widgets  # type: ignore
except ImportError:  # pragma: no cover - widgets are optional
    widgets = None

# Point to the contents.txt file you want to analyse.
CONTENTS_PATH = Path("example_data/contents.txt")
# Fail fast with an explicit exception instead of `assert`: assertions are
# skipped entirely when Python runs with -O, which would defer the failure
# to the parsing cell with a less helpful message.
if not CONTENTS_PATH.exists():
    raise FileNotFoundError(f"Could not find {CONTENTS_PATH.resolve()}")


In [None]:
# Parse the configured contents file into a sequence of document records.
documents = parse_contents_file(CONTENTS_PATH)
print(f"Parsed {len(documents)} documents from {CONTENTS_PATH}.")

# Show the leading entries (identifier plus comma-separated tags) as a
# quick sanity check of the parse result.
print("First 3 entries:")
preview = documents[:3]
for document in preview:
    print(f"- {document.identifier}: {', '.join(document.tags)}")


In [None]:
# Build the tag embeddings for the parsed documents (TF-IDF based, per the
# notebook introduction), using the model's default configuration.
model = TagEmbeddingModel()
embeddings = model.fit_transform(documents)
# Combine documents and their embeddings into the similarity matrix object
# that the remaining cells explore; it exposes a `labels` collection.
similarity = build_similarity_matrix(documents, embeddings)
# The matrix is reported as square: one row and one column per label.
print(f"Similarity matrix shape: {len(similarity.labels)} x {len(similarity.labels)}")


In [None]:
def similarity_dataframe(matrix):
    """Wrap a similarity matrix in a labelled pandas DataFrame.

    Args:
        matrix: Object exposing ``values`` (the 2-D similarity data) and
            ``labels`` (used for both the row index and the columns).

    Returns:
        A ``pandas.DataFrame`` indexed and columned by ``matrix.labels``,
        or ``None`` when pandas could not be imported.
    """
    if pd is not None:
        labels = matrix.labels
        return pd.DataFrame(matrix.values, index=labels, columns=labels)
    return None

# Render the full matrix as a rounded DataFrame when both pandas and the
# IPython display hook are available; otherwise fall back to plain text.
if pd is not None and display is not None:
    df = similarity_dataframe(similarity)
    display(df.round(3))
else:
    # The trailing newline is written as an escape sequence: a raw line break
    # inside a single-quoted string literal is a SyntaxError.
    print("Pandas (or IPython display) is not available; showing a textual preview instead.\n")
    print(similarity.preview(limit=5))


In [None]:
def show_top_matches(target: str, top_n: int = 5):
    """Show the entries most similar to ``target``.

    Looks up the matches in the notebook-level ``similarity`` matrix via
    ``get_top_similar`` and renders them as a DataFrame with ``identifier``
    and ``score`` columns when pandas and IPython display are available,
    or as plain ``label: score`` lines otherwise.

    Args:
        target: Label of the entry to compare against.
        top_n: Number of matches requested from ``get_top_similar``.
    """
    matches = get_top_similar(similarity, target, top_n=top_n)

    # Plain-text fallback when rich output is not possible.
    if pd is None or display is None:
        for label, score in matches:
            print(f"{label}: {score:.3f}")
        return

    display(pd.DataFrame(matches, columns=["identifier", "score"]))

# Sorted so the dropdown order and the fallback example are deterministic.
available_targets = sorted(similarity.labels)

if not available_targets:
    # With no labels there is nothing to select; bail out before building a
    # slider with invalid bounds or indexing into an empty list.
    print("No documents available; nothing to compare.")
elif widgets is not None and display is not None:
    # Cap the slider at 15 and at the number of *other* labels, but never
    # below 1: a slider whose max is smaller than its min is rejected.
    max_top_n = max(1, min(15, len(available_targets) - 1))
    target_dropdown = widgets.Dropdown(options=available_targets, description="Target")
    top_slider = widgets.IntSlider(
        value=min(5, max_top_n),  # keep the initial value inside the range
        min=1,
        max=max_top_n,
        description="Top N",
    )
    ui = widgets.VBox([target_dropdown, top_slider])
    # interactive_output passes the widget values as keyword arguments whose
    # names match the dict keys, which are exactly show_top_matches' params.
    out = widgets.interactive_output(
        show_top_matches,
        {"target": target_dropdown, "top_n": top_slider},
    )
    display(ui, out)
else:
    print("ipywidgets is not available. Call show_top_matches() manually, e.g.:")
    example = available_targets[0]
    print(f"show_top_matches('{example}', top_n=5)")


In [None]:
# Optional: save the similarity matrix to disk (CSV, JSON, or pickle)
from pathlib import Path

# Make sure the export directory exists (parents included); re-running the
# cell is harmless because an existing directory is accepted.
output_dir = Path("notebooks/outputs")
output_dir.mkdir(parents=True, exist_ok=True)

# Write one copy of the matrix per format, side by side.
csv_path = output_dir / "similarity.csv"
json_path = output_dir / "similarity.json"
similarity.save_csv(csv_path)
similarity.save_json(json_path)
print(f"Saved CSV and JSON copies under {output_dir.resolve()}")
