## Gathering Docs

### Run the following command in the jupyter terminal to clone MDN glossary repo: git clone https://github.com/mdn/content.git

In [1]:
import os 

print(os.getcwd())

/home/jovyan/TechGlossary/TechGlossary


In [None]:
from pathlib import Path
mdn_glossary_path = Path(f"{os.getcwd()}/content/files/en-us/glossary")

# veriyfing that all the files are there
for item in mdn_glossary_path.iterdir():
    print(item)

## Step 1. Parse MDN glossary repo

In [3]:
%pip install markdown


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [4]:
%pip install python-frontmatter


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [5]:
#To investigate content of a single document
import frontmatter
import pandas as pd
import markdown
from bs4 import BeautifulSoup

term = "png"
with open(f"{os.getcwd()}/content/files/en-us/glossary/{term}/index.md", "r") as f:
    content = f.read()
    post = frontmatter.loads(content)
    
    # Convert Markdown to HTML
    html = markdown.markdown(post.content)
    
    # Extract plain text from HTML
    plain_text = BeautifulSoup(html, "html.parser").get_text()
    
    # Create a single-row DataFrame with only the content
    df = pd.DataFrame([plain_text], columns=["content"])
    
    print(df)

                                             content
0  PNG (Portable Network Graphics) is a graphics ...


In [6]:
# Creating df to build pyterrier index with html for later display 
import os
import pandas as pd
import frontmatter
import re
import markdown
from bs4 import BeautifulSoup

def load_glossary_for_pyterrier(glossary_root: str) -> pd.DataFrame:
    """
    Iterate through all subfolders in the glossary folder,
    read each Markdown file, clean text for indexing, replace
    all double-bracketed shortcodes with just the first term,
    remove Markdown formatting, and return a DataFrame with columns:
    - docno: unique document identifier
    - text: cleaned plain text for search/indexing
    """
    rows = []

    # Regex to match any {{Shortcode("term", ...)}}
    double_bracket_regex = re.compile(
        r'\{\{\s*[\w]+\(\s*["\'](.*?)["\'].*?\)\s*\}\}', re.IGNORECASE
    )

    for term_folder in os.listdir(glossary_root):
        folder_path = os.path.join(glossary_root, term_folder)
        if os.path.isdir(folder_path):
            md_file = os.path.join(folder_path, "index.md")
            if os.path.exists(md_file):
                try:
                    # Load Markdown with frontmatter
                    post = frontmatter.load(md_file)

                    # Replace any {{Shortcode(...)}} with just the first argument
                    content_no_placeholders = double_bracket_regex.sub(r'\1', post.content)

                    # Convert Markdown → HTML
                    html_content = markdown.markdown(content_no_placeholders)

                    # Convert HTML → plain text (removes **bold**, links, code formatting, etc.)
                    plain_text = BeautifulSoup(html_content, "html.parser").get_text()

                    # Clean up extra whitespace
                    clean_text = re.sub(r"\s+", " ", plain_text).strip()

                    # Assign docno (using folder name)
                    docno = term_folder

                    # Add row to DataFrame
                    rows.append({
                        "docno": docno,
                        "text": clean_text
                    })

                except Exception as e:
                    print(f"Error processing {md_file}: {e}")

    # Build DataFrame
    df = pd.DataFrame(rows, columns=["docno", "text"])
    return df

# Usage
glossary_root = f"{os.getcwd()}/content/files/en-us/glossary"
df = load_glossary_for_pyterrier(glossary_root)

pd.set_option("display.max_colwidth", None)  # do not truncate text
print(df)

                    docno  \
0                 closure   
1                   cache   
2              decryption   
3           type_coercion   
4                     ril   
..                    ...   
597              copyleft   
598          cryptography   
599                xforms   
600  flow_relative_values   
601                  host   

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           

In [6]:
%pip install -U python-terrier jpype1


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [7]:
# imports
import os, pyterrier as pt

# Point PyTerrier to your freshly installed JDK
os.environ["JAVA_HOME"] = os.path.expanduser("~/jdk/jdk-17.0.8+7")
os.environ["JVM_PATH"]  = os.path.join(os.environ["JAVA_HOME"], "lib/server/libjvm.so")
os.environ["PATH"]      = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]

# Confirm libjvm.so exists
!ls $JVM_PATH

# Initialize PyTerrier
if not pt.java.started():
     pt.java.init()

print(" PyTerrier initialized successfully with local JDK.")

/home/jovyan/jdk/jdk-17.0.8+7/lib/server/libjvm.so
 PyTerrier initialized successfully with local JDK.


Java started and loaded: pyterrier.java.colab, pyterrier.java, pyterrier.java.24, pyterrier.terrier.java [version=5.11 (build: craig.macdonald 2025-01-13 21:29), helper_version=0.0.8]


## Step 2. Pre-process 
Preprocessing choices determine what the model can “see.” We will adopt a minimal but effective pipeline:

- Lowercasing to normalize case.
- Tokenization on alphabetic words of length ≥3.
- Stopword removal to reduce high-frequency function words.
- Stemming to conflate morphological variants (e.g., “running” → “run”).

In [8]:
%pip install nltk


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


## Tokenization

In [9]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    tokens = nltk.word_tokenize(text)
    tokens = [w.lower() for w in tokens if w.isalpha() and w not in stop_words]
    tokens = [stemmer.stem(w) for w in tokens]
    return " ".join(tokens)

## Building the Pyterrier Index

In [10]:
df['clean_text'] = df['text'].apply(clean_text)

# Prepare documents for PyTerrier
docs_for_index = [{"docno": row["docno"], "text": row["clean_text"]} for _, row in df.iterrows()]

# Build index
# indexer = pt.IterDictIndexer("./glossary_index", overwrite=True)  # folder to store index
# index_ref = indexer.index(docs_for_index)

# uncomment 2 previous lines if haven't build index yet
index_ref = pt.IndexFactory.of("./glossary_index/data.properties")
print("Indexing complete! Your glossary is ready for retrieval.")

Indexing complete! Your glossary is ready for retrieval.


## Quick Index Stats

In [12]:
# index_ref = pt.IndexFactory.of(index_ref)
# uncomment previous line if loading index for the first time
stats = index_ref.getCollectionStatistics()
print(f"Documents: {stats.getNumberOfDocuments()}")
print(f"Unique terms: {stats.getNumberOfUniqueTerms()}")
print(f"Average doc length: {stats.getAverageDocumentLength():.2f}")

Documents: 602
Unique terms: 3387
Average doc length: 72.25


## Applying BM25 

In [13]:
bm25 = pt.BatchRetrieve(index_ref, wmodel="BM25") 
results = bm25.search("javascript ") 
print(results)

    qid  docid            docno  rank     score        query
0     1    402       javascript     0  3.282391  javascript 
1     1    403      sloppy_mode     1  3.126721  javascript 
2     1    449           number     2  3.087212  javascript 
3     1    560       typescript     3  3.066131  javascript 
4     1    362  privileged_code     4  3.023111  javascript 
..   ..    ...              ...   ...       ...          ...
139   1    309     shallow_copy   139  1.027380  javascript 
140   1    108       call_stack   140  0.954080  javascript 
141   1    215         function   141  0.938857  javascript 
142   1    546     void_element   142  0.872066  javascript 
143   1    387    vendor_prefix   143  0.792223  javascript 

[144 rows x 6 columns]


  bm25 = pt.BatchRetrieve(index_ref, wmodel="BM25")


## Building the UI

In [14]:
!pip install python-terrier datasets nltk flask


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [15]:
from flask import Flask, request, render_template_string

In [16]:
# Initialize a new Flask web application
# Flask is a lightweight Python web framework ideal for demos
app = Flask(__name__)

In [17]:
# HTML template with modern Bootstrap 5 styling and background gradient
HTML_TEMPLATE = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>BM25 Search</title>
    <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/css/bootstrap.min.css" rel="stylesheet">
    <style>
        body {
            background: linear-gradient(to right, #c9d6ff, #e2e2e2);
            font-family: 'Segoe UI', sans-serif;
            padding-top: 60px;
        }
        .search-container {
            max-width: 800px;
            margin: auto;
            background-color: white;
            border-radius: 12px;
            padding: 30px;
            box-shadow: 0 4px 12px rgba(0,0,0,0.1);
        }
        .form-control {
            height: 45px;
            font-size: 16px;
        }
        .result-card {
            border-left: 5px solid #0d6efd;
            padding: 15px 20px;
            margin-bottom: 15px;
            background-color: #f8f9fa;
            border-radius: 8px;
        }
        .rank-badge {
            font-weight: bold;
            color: #0d6efd;
        }
        .label-tag {
            background-color: #d1ecf1;
            color: #0c5460;
            padding: 3px 8px;
            border-radius: 4px;
            font-size: 0.85rem;
            margin-left: 10px;
        }
    </style>
</head>
<body>
    <div class="search-container">
        <h2 class="text-center mb-4"> BM25 Document Search</h2>
        <form method="GET">
            <div class="input-group mb-4">
                <input type="text" name="q" class="form-control" placeholder="Enter your search query..." value="{{ query|default('') }}" required>
                <button class="btn btn-primary" type="submit">Search</button>
            </div>
        </form>

        {% if results %}
            <h4 class="mb-3">Top {{ results|length }} Results</h4>
            {% for r in results %}
            <div class="result-card">
                <div><span class="rank-badge">Rank {{ r.rank }}</span>
                <span class="label-tag">{{ r.label }}</span></div>
                <div><strong>Score:</strong> {{ r.score }}</div>
                <div><strong>Snippet:</strong> {{ r.snippet }}</div>
            </div>
            {% endfor %}
        {% elif query %}
            <p class="text-muted">No results found for "{{ query }}"</p>
        {% endif %}
    </div>
</body>
</html>
"""

In [None]:
# Search endpoint: This route handles search requests like: /search?q=privacy
@app.route("/", methods=["GET"])
def search():
    query = request.args.get("q", "")
    results = None

    if query:
        res = bm25.search(query).head(5)
        results = []
        for _, row in res.iterrows():
            doc = df.loc[df["docno"] == row["docno"]].iloc[0]
            results.append({
                "rank": len(results) + 1,
                "docno": row["docno"],
                "snippet": doc["text"][:200],
                "score": round(row["score"], 3)
            })

    return render_template_string(HTML_TEMPLATE, results=results, query=query)

In [None]:
# Run app (on jupyter server)
print("BM25 search ui running at:")
print("https://jupyter.rttl.uw.edu/2025-autumn-info-376-a/user/laurak11/proxy/5000/")
app.run(host="0.0.0.0", port=5000, debug=False)