In [1]:
import marimo as mo

# Notebook title. The emoji were mis-decoded (UTF-8 bytes read with the wrong
# charset) and rendered as garbage; they are restored here.
mo.md("# Welcome to Jargon Test! 🌊🍃")

In [2]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Demo inputs: a sentence-embedding model, the jargon vocabulary to look for,
# and one sample sentence to scan.
model = SentenceTransformer('all-MiniLM-L6-v2')
jargon_dict = [  # Your domain list
    "RNN",
    "neural network",
    "backpropagation",
    "gradient descent",
]
text = "The RNN, also called neural network, uses backpropagation for training, and use gradient desc."

# Embed: the text is chunked on whitespace (chunks of one or two characters are
# dropped; punctuation stays attached to its word), then both the chunks and
# the jargon terms are encoded.
text_tokens = list(filter(lambda chunk: len(chunk) > 2, text.split()))
text_embeds = model.encode(text_tokens)
jargon_embeds = model.encode(jargon_dict)

# Fuzzy match: one row per text chunk, one column per jargon term; keep every
# (chunk, term) pair whose cosine similarity exceeds the 0.70 threshold.
sims = cosine_similarity(text_embeds, jargon_embeds)
matches = np.nonzero(sims > 0.70)

# matches[1] holds the jargon-side (column) indices of the hits; deduplicate
# the corresponding terms and report them in sorted order.
jargon_matches = sorted(set(jargon_dict[idx] for idx in matches[1]))
print("Detected jargon:", jargon_matches)

Detected jargon: ['RNN', 'backpropagation', 'gradient descent', 'neural network']


In [3]:
from openai import OpenAI
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import os
from dotenv import load_dotenv
# Load environment variables through python-dotenv before reading the API key.
load_dotenv()

# Fail fast with an explicit message when the key is not configured.
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError("OPENAI_API_KEY environment variable is required.")

client = OpenAI(api_key=api_key)

jargon_dict = ["RNN", "neural network", "backpropagation", "gradient descent"]
text = "The RNN, also called neural network, uses backpropagation for training, and use gradient desc."

# Embed (batch all inputs together for efficiency): text chunks first, jargon
# terms after, so one request covers both sides of the comparison.
text_tokens = [w for w in text.split() if len(w) > 2]
all_inputs = text_tokens + jargon_dict

response = client.embeddings.create(
    model="text-embedding-3-large",
    input=all_inputs
)

# Each returned item carries the index of the input it belongs to. Ordering by
# that index keeps the positional split below correct regardless of the order
# in which the items come back.
ordered_items = sorted(response.data, key=lambda item: item.index)
if len(ordered_items) != len(all_inputs):
    raise RuntimeError(
        f"Expected {len(all_inputs)} embeddings, got {len(ordered_items)}."
    )

embeddings = [item.embedding for item in ordered_items]
text_embeds = np.array(embeddings[:len(text_tokens)])  # First N are text tokens
jargon_embeds = np.array(embeddings[len(text_tokens):])  # Rest are jargon

# Fuzzy match (identical to before): keep (chunk, term) pairs whose cosine
# similarity exceeds 0.7, then report the distinct matched terms in sorted order.
sims = cosine_similarity(text_embeds, jargon_embeds)
matches = np.where(sims > 0.7)
jargon_matches = sorted({jargon_dict[i] for i in matches[1]})
print("Detected jargon:", jargon_matches)


Detected jargon: ['RNN', 'backpropagation']


In [4]:
import json
import os
from openai import OpenAI

# Read the API key from the environment and stop immediately when it is
# missing or empty, so the failure is explicit rather than a later API error.
api_key = os.environ.get("OPENAI_API_KEY")
if api_key is None or api_key == "":
    raise RuntimeError("OPENAI_API_KEY environment variable is required.")

# Client shared by the rest of this cell.
client = OpenAI(api_key=api_key)

def _parse_llm_json(raw_output: str) -> dict:
    """Decode the model reply into a JSON object, tolerating common wrappers.

    The reply is parsed as-is first. If that does not yield a JSON object, the
    span from the first "{" to the last "}" is tried, which recovers replies
    wrapped in Markdown code fences or preceded by a short preamble.

    Raises:
        ValueError: if no JSON object can be decoded from the reply.
    """
    candidates = [raw_output]
    start, end = raw_output.find("{"), raw_output.rfind("}")
    if 0 <= start < end:
        candidates.append(raw_output[start:end + 1])

    last_error = None
    for candidate in candidates:
        try:
            payload = json.loads(candidate)
        except json.JSONDecodeError as exc:
            last_error = exc
            continue
        if isinstance(payload, dict):
            return payload
    raise ValueError(f"Model returned invalid JSON: {raw_output}") from last_error


def replace_jargon_with_llm(text: str, jargon_terms: list[str]) -> tuple[str, dict[str, str]]:
    """Call OpenAI to swap detected jargon with predefined replacements.

    Only terms present in the built-in catalog are sent to the model. When none
    of ``jargon_terms`` is in the catalog, no request is made and the text is
    returned unchanged. The request goes through the module-level ``client``.

    Args:
        text: The original text to rewrite.
        jargon_terms: Jargon terms detected in ``text``.

    Returns:
        A ``(rewritten_text, applied_replacements)`` tuple. A field missing
        from the model reply falls back to ``text`` or to the requested
        replacement map respectively.

    Raises:
        ValueError: if the reply is not a JSON object, or if its fields do not
            have the expected types (string / object).
    """
    replacements_catalog = {
        "RNN": "Recurrent Neural Net",
        "neural network": "artificial intelligence model",
        "backpropagation": "error correction process",
        "gradient descent": "iterative optimization step",
    }
    requested_replacements = {
        term: replacements_catalog[term]
        for term in jargon_terms
        if term in replacements_catalog
    }
    if not requested_replacements:
        # Nothing to replace: skip the API call entirely.
        return text, {}
    system_prompt = (
        "You are a meticulous copy editor. Replace only the exact jargon keys"
        " provided in the replacement_map with their mapped phrases."
        " Keep every other character, word, and formatting identical to the"
        " original text."
    )
    user_prompt = (
        "Original text:\n"
        f"{text}\n\n"
        "replacement_map (JSON):\n"
        f"{json.dumps(requested_replacements, ensure_ascii=False)}\n\n"
        "Return JSON with keys 'rewritten_text' (the text after applying the"
        " replacements) and 'applied_replacements' (the subset actually used)."
    )
    response = client.responses.create(
        model="gpt-4o-mini",
        input=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        temperature=0.2,
    )
    raw_output = response.output_text.strip()
    payload = _parse_llm_json(raw_output)

    rewritten = payload.get("rewritten_text", text)
    applied = payload.get("applied_replacements", requested_replacements)
    # Callers treat these as a string and a mapping; reject anything else here
    # instead of failing later with an unrelated error.
    if not isinstance(rewritten, str) or not isinstance(applied, dict):
        raise ValueError(f"Model returned unexpected JSON structure: {raw_output}")
    return rewritten, applied

# Rewrite the sample text using the jargon terms detected earlier.
rewritten_text, suggested_replacements = replace_jargon_with_llm(text, jargon_matches)

# Wrap each replacement phrase in a [REPLACED: ...] marker so the substitutions
# stand out; phrases are processed one after another on the running result.
highlighted_text = rewritten_text
for plain in suggested_replacements.values():
    marker = f"[REPLACED: {plain}]"
    highlighted_text = highlighted_text.replace(plain, marker)

# Report inputs and results, one labelled line each.
for label, value in (
    ("Original text:", text),
    ("jargon Dictionary:", jargon_dict),
    ("Suggested replacements:", suggested_replacements),
    ("Rewritten text:", rewritten_text),
    ("Rewritten text (highlighted):", highlighted_text),
):
    print(label, value)

Original text: The RNN, also called neural network, uses backpropagation for training, and use gradient desc.
jargon Dictionary: ['RNN', 'neural network', 'backpropagation', 'gradient descent']
Suggested replacements: {'RNN': 'Recurrent Neural Net', 'backpropagation': 'error correction process'}
Rewritten text: The Recurrent Neural Net, also called neural network, uses error correction process for training, and use gradient desc.
Rewritten text (highlighted): The [REPLACED: Recurrent Neural Net], also called neural network, uses [REPLACED: error correction process] for training, and use gradient desc.


In [5]:
# Two-column summary of the run: a label column and the matching value column.
# Non-string values are flattened to text (joined list / JSON dump).
summary_table = mo.ui.table(
    dict(
        Field=[
            "Original text",
            "Jargon dictionary",
            "Suggested replacements",
            "Rewritten text",
            "Rewritten text (highlighted)",
        ],
        Value=[
            text,
            ", ".join(jargon_dict),
            json.dumps(suggested_replacements, ensure_ascii=False),
            rewritten_text,
            highlighted_text,
        ],
    )
)
# Bare last expression so the notebook renders the table.
summary_table