## Gathering Docs

### Run the following command in the jupyter terminal to clone MDN glossary repo: git clone https://github.com/mdn/content.git

In [1]:
import os 

print(os.getcwd())

/Users/sanjanapavani/TechGlossary


In [2]:
from pathlib import Path
mdn_glossary_path = Path(f"{os.getcwd()}/content/files/en-us/glossary")

# veriyfing that all the files are there
for item in mdn_glossary_path.iterdir():
    print(item)

/Users/sanjanapavani/TechGlossary/content/files/en-us/glossary/svg
/Users/sanjanapavani/TechGlossary/content/files/en-us/glossary/slug
/Users/sanjanapavani/TechGlossary/content/files/en-us/glossary/continuous_integration
/Users/sanjanapavani/TechGlossary/content/files/en-us/glossary/request_header
/Users/sanjanapavani/TechGlossary/content/files/en-us/glossary/content_header
/Users/sanjanapavani/TechGlossary/content/files/en-us/glossary/hotlink
/Users/sanjanapavani/TechGlossary/content/files/en-us/glossary/blink
/Users/sanjanapavani/TechGlossary/content/files/en-us/glossary/bitwise_flags
/Users/sanjanapavani/TechGlossary/content/files/en-us/glossary/idl
/Users/sanjanapavani/TechGlossary/content/files/en-us/glossary/gutters
/Users/sanjanapavani/TechGlossary/content/files/en-us/glossary/block-level_content
/Users/sanjanapavani/TechGlossary/content/files/en-us/glossary/ide
/Users/sanjanapavani/TechGlossary/content/files/en-us/glossary/replaced_elements
/Users/sanjanapavani/TechGlossary/con

## Step 1. Parse MDN glossary repo

In [3]:
%pip install markdown

Defaulting to user installation because normal site-packages is not writeable
Collecting markdown
  Downloading markdown-3.9-py3-none-any.whl.metadata (5.1 kB)
Downloading markdown-3.9-py3-none-any.whl (107 kB)
Installing collected packages: markdown
  NOTE: The current PATH contains path(s) starting with `~`, which may not be expanded by all applications.[0m[33m
[0mSuccessfully installed markdown-3.9

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [4]:
%pip install python-frontmatter

Defaulting to user installation because normal site-packages is not writeable
Collecting python-frontmatter
  Downloading python_frontmatter-1.1.0-py3-none-any.whl.metadata (4.1 kB)
Downloading python_frontmatter-1.1.0-py3-none-any.whl (9.8 kB)
Installing collected packages: python-frontmatter
Successfully installed python-frontmatter-1.1.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [13]:
#To investigate content of a single document
import frontmatter
import pandas as pd
import markdown
from bs4 import BeautifulSoup

with open("content/files/en-us/glossary/png/index.md", "r") as f:
    content = f.read()
post = frontmatter.loads(content)

# Convert Markdown to HTML
html = markdown.markdown(post.content)

# Extract plain text from HTML
plain_text = BeautifulSoup(html, "html.parser").get_text()

# Create a single-row DataFrame with only the content
df = pd.DataFrame([plain_text], columns=["content"])

print(df)

                                             content
0  PNG (Portable Network Graphics) is a graphics ...


In [14]:
# Creating df to build pyterrier index with html for later display 
import os
import os
import pandas as pd
import frontmatter
import markdown
from bs4 import BeautifulSoup
import re

def load_glossary_for_pyterrier(glossary_root: str) -> pd.DataFrame:
    """
    Iterate through all subfolders in the glossary folder,
    read each Markdown file, clean text for indexing, keep HTML
    for display, and return a DataFrame with columns:
    - docno: unique document identifier
    - text: plain text (for search/indexing)
    - html: HTML version (for display)
    """
    rows = []

    for term_folder in os.listdir(glossary_root):
        folder_path = os.path.join(glossary_root, term_folder)
        if os.path.isdir(folder_path):
            md_file = os.path.join(folder_path, "index.md")
            if os.path.exists(md_file):
                try:
                    # Load Markdown with frontmatter
                    post = frontmatter.load(md_file)

                    # Remove {{…}} placeholders for clean text
                    content_no_placeholders = re.sub(r"\{\{.*?\}\}", "", post.content)

                    # Convert Markdown → HTML
                    html_content = markdown.markdown(content_no_placeholders)

                    # Convert HTML → plain text
                    plain_text = BeautifulSoup(html_content, "html.parser").get_text()
                    clean_text = re.sub(r"\s+", " ", plain_text).strip()

                    # Assign docno (using folder name)
                    docno = term_folder

                    # Add row
                    rows.append({
                        "docno": docno,
                        "text": clean_text,
                        "html": html_content
                    })

                except Exception as e:
                    print(f"Error processing {md_file}: {e}")

    # Build DataFrame
    df = pd.DataFrame(rows, columns=["docno", "text", "html"])
    return df

# Usage
glossary_root = "/Users/sanjanapavani/TechGlossary/content/files/en-us/glossary"
df = load_glossary_for_pyterrier(glossary_root)

pd.set_option("display.max_colwidth", None)  # do not truncate text/HTML

print(df)

                      docno  \
0                       svg   
1                      slug   
2    continuous_integration   
3            request_header   
4            content_header   
..                      ...   
597              ciphertext   
598               flex_item   
599      fallback_alignment   
600                  router   
601                     sql   

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   

In [9]:
%pip install -U python-terrier jpype1

Defaulting to user installation because normal site-packages is not writeable
Collecting python-terrier
  Downloading python_terrier-1.0-py3-none-any.whl.metadata (987 bytes)
Collecting jpype1
  Downloading jpype1-1.6.0.tar.gz (1.1 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m[31m2.0 MB/s[0m eta [36m0:00:01[0m
  Installing build dependencies ... [?2done
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting pyterrier>=1.0 (from pyterrier[all]>=1.0->python-terrier)
  Downloading pyterrier-1.0.0-py3-none-any.whl.metadata (7.3 kB)
Downloading python_terrier-1.0-py3-none-any.whl (1.4 kB)
Downloading pyterrier-1.0.0-py3-none-any.whl (205 kB)
Building wheels for collected packages: jpype1
done
[?25h  Created wheel for jpype1: filename=jpype1-1.6.0-cp39-cp39-macosx_10_9_universal2.whl size=586697 sha256=e874cc40d660323e73

In [15]:
# imports
import os, pyterrier as pt

# Point PyTerrier to your freshly installed JDK
os.environ["JAVA_HOME"] = os.path.expanduser("~/jdk/jdk-17.0.8+7")
os.environ["JVM_PATH"]  = os.path.join(os.environ["JAVA_HOME"], "lib/server/libjvm.so")
os.environ["PATH"]      = os.environ["JAVA_HOME"] + "/bin:" + os.environ["PATH"]

# Confirm libjvm.so exists
!ls $JVM_PATH

# Initialize PyTerrier
if not pt.java.started():
     pt.java.init()

print(" PyTerrier initialized successfully with local JDK.")

ls: /Users/sanjanapavani/jdk/jdk-17.0.8+7/lib/server/libjvm.so: No such file or directory


AttributeError: module 'pyterrier' has no attribute 'java'

## Step 2. Pre-process 
Preprocessing choices determine what the model can “see.” We will adopt a minimal but effective pipeline:

- Lowercasing to normalize case.
- Tokenization on alphabetic words of length ≥3.
- Stopword removal to reduce high-frequency function words.
- Stemming to conflate morphological variants (e.g., “running” → “run”).

In [None]:
%pip install nltk

In [None]:
# --- TOKENIZATION & CLEANUP ---

import re
from collections import Counter
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

def tokenize(text):
    tokens = nltk.word_tokenize(text.lower()) 
    return [t for t in tokens if t.isalpha() and t not in stop_words]

df['tokens'] = df['text'].apply(tokenize)

flat_tokens = [t for tokens in df['tokens'] for t in tokens]
vocab_counter = Counter(flat_tokens)

In [None]:
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    '''
     Step 1: Convert the input to lowercase and extract valid words
             (e.g., alphabetic tokens of at least 3 letters).
     Hint: You can use regex (e.g., re.findall) for token extraction.

     Step 2: Remove common stop words
             (you can use a predefined list like NLTK's stopwords or your own).

     Step 3: Apply stemming or
             (use something like PorterStemmer or WordNetLemmatizer).

    Step 4: Join the cleaned tokens back into a single string.

     Return the cleaned text
     '''
    tokens = nltk.word_tokenize(text)
    tokens = [w.lower() for w in tokens if w.isalpha() and len(w) >= 3 and w not in stop_words]
    tokens = [stemmer.stem(w) for w in tokens]
    return " ".join(tokens)