# Lemma Constraint

Make sure that gaps are only generated for lemmas that appear in the page text

In [1]:
import json
from collections import Counter, defaultdict

from tqdm.auto import tqdm
import numpy as np
import pandas as pd
import spacy
from spacy.tokens import Doc, Span, Token
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer
import seaborn as sns

torch.set_float32_matmul_precision("high")

## Data

Combine the human annotations with the generated cloze exercises and source texts

In [2]:
annotations_df = (
    pd.read_csv("../data/testResults_from_2025-05-07.csv")
    # Convert dictionaries to lists
    .assign(
        answers=lambda x: x["answers"].apply(
            lambda row: list(json.loads(row).values())
        ),
        correctAnswers=lambda x: x["correctAnswers"].apply(
            lambda row: list(json.loads(row).values())
        ),
        annotation_counts=lambda x: x["annotations"].apply(
            lambda row: Counter(json.loads(row).values())
        ),
        annotations=lambda x: x["annotations"].apply(
            lambda row: list(json.loads(row).values())
        ),
    )
    # Calculate percentage correct
    .assign(
        pct_correct=lambda x: x.apply(
            lambda row: sum(
                a == c for a, c in zip(row["answers"], row["correctAnswers"])
            )
            / len(row["answers"])
            * 100,
            axis=1,
        )
    )
)

cloze_df = pd.read_json(
    "../results/cloze_exercises_kl_divergence.jsonl", lines=True
).assign(passageId=lambda x: x.index + 1)

df = pd.merge(cloze_df, annotations_df, on="passageId").query(
    'method == "contextuality_plus"'
)
df.sample(2)

Unnamed: 0,volume,page,summary,markdown,text,contextuality,contextuality_plus,keyword,passageId,id,...,method,score,timeSpent,answers,correctAnswers,annotations,holisticScore,timestamp,annotation_counts,pct_correct
26,research-methods-in-psychology,1-methods-of-knowing,The text discusses various methods of acquirin...,"<i-callout variant=""info"" title=""Learning Obje...",Learning Objectives\n\n1. Describe the 5 metho...,{'text': 'The text discusses various methods o...,{'text': 'The text discusses various methods o...,{'text': 'The text discusses various methods o...,5,fee694i0Gd4YjmNfg4oG,...,contextuality_plus,77.777778,53761.963,"[instincts, involves, prove, incorrect, based,...","[instincts, involves, shows, incorrect, based,...","[sentence, sentence, sentence, sentence, sente...",4,2025-05-08T19:05:33.627Z,"{'sentence': 7, 'source': 2}",77.777778
31,research-methods-in-psychology,1-methods-of-knowing,The text discusses various methods of acquirin...,"<i-callout variant=""info"" title=""Learning Obje...",Learning Objectives\n\n1. Describe the 5 metho...,{'text': 'The text discusses various methods o...,{'text': 'The text discusses various methods o...,{'text': 'The text discusses various methods o...,5,WzaDDDH3gMS2wthasHHv,...,contextuality_plus,77.777778,91.867,"[instincts, involves, shows, incorrect, based,...","[instincts, involves, shows, incorrect, based,...","[source, sentence, unpredictable, unpredictabl...",1,2025-06-01T20:19:48.729Z,"{'source': 1, 'sentence': 2, 'unpredictable': 6}",77.777778


# Test Restricted Generation

Only choose gaps whose lemmatized form appears in the source text

In [3]:
class ContextualityGapper:
    def __init__(self, model_name: str = "answerdotai/ModernBERT-large"):
        # Load SpaCy for sentence splitting and preprocessing
        self.nlp = spacy.load("en_core_web_sm")
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        # Load model and tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForMaskedLM.from_pretrained(model_name).to(self.device)
        self.model.eval()

        self.min_blank_distance = 7  # Minimum distance between blanks

        # Minimum log-predictability of alternatives
        self.min_predictability = np.log(0.05)

        # Part-of-Speech Blacklist (do not delete these words)
        self.blacklist = [
            "PROPN",  # Proper nouns
            "NUM",  # Numbers
            "PUNCT",  # Punctuation
            "SYM",  # Symbols
            "X",  # Other
        ]

    def _get_leading_ws_tokens(self, doc: Doc) -> list[str]:
        """The ModernBERT Tokenizer will work fine if we give it tokens with leading spaces.
        SpaCy normally handles whitespace in terms of trailing space."""
        if not len(doc):
            return [""]

        tokens = [doc[0].text]
        # For tokens after the 0th, prepend trailing whitespace from the previous token.
        tokens += [doc[i - 1].whitespace_ + doc[i].text for i in range(1, len(doc))]
        return tokens

    def get_token_mappings(self, tokens: list[str]) -> dict[int, list[int]]:
        """Get mappings between word positions and token positions"""
        # Tokenize while keeping track of word IDs
        tokenized = self.tokenizer(
            tokens, return_tensors="pt", is_split_into_words=True
        )
        word_ids = tokenized.word_ids()

        # Create mapping from word position to token positions
        word_to_tokens = defaultdict(list)

        for token_idx, word_idx in enumerate(word_ids):
            if word_idx is not None:
                word_to_tokens[word_idx].append(token_idx)

        return word_to_tokens

    def get_masked_logits(
        self, tokens: list[str], mask_idx: int
    ) -> tuple[torch.Tensor, int]:
        """Get model logits for a masked position in text"""
        # Get the word tokens and their alignment info
        word_to_tokens = self.get_token_mappings(tokens)

        # Find all token positions for the word we want to mask
        token_positions = word_to_tokens[mask_idx]

        # Create masked version of the text
        input_ids = self.tokenizer(
            tokens, is_split_into_words=True, return_tensors="pt"
        ).input_ids[0]
        masked_ids = input_ids.clone()

        # ID of the first subword token that we masked
        first_token_id = input_ids[token_positions[0]]

        # Mask all tokens corresponding to our target word
        masked_ids[token_positions] = self.tokenizer.mask_token_id

        # Get model outputs
        outputs = self.model(input_ids.unsqueeze(0).to(self.device))

        # Get logits
        logits = outputs.logits[0, token_positions, :]

        return logits, first_token_id

    def get_contextuality_score(
        self,
        page_doc: Doc,
        summary_doc: Doc,
        sent: Span,
        tok: Token,
        method: str = "kl",
    ) -> float:
        """Calculate contextuality score for a word position using full page context

        Args:
            page_doc: The full page text as a spaCy Doc
            summary_doc: The summary text as a spaCy Doc
            sent: The sentence from the summary containing the token
            tok: The token from the summary to evaluate
            method: "kl" for kl-divergence or "contextuality" for contextuality score

        Returns:
            Contextuality score
        """

        # Get logits for both full text and sentence text
        # For the full text context, we use the page + summary
        full_toks = self._get_leading_ws_tokens(page_doc) + self._get_leading_ws_tokens(
            summary_doc
        )
        full_pos = len(page_doc) + tok.i  # Position of token in full document
        full_logits, word_id = self.get_masked_logits(full_toks, full_pos)

        # For the local context, we use just the sentence from the summary
        sent_pos = tok.i - sent.start  # Position of token in the sentence
        sent_logits, _ = self.get_masked_logits([tok.text for tok in sent], sent_pos)

        # Calculate probabilities using first sub-word token
        full_probs = torch.softmax(full_logits[0], dim=0)
        sent_probs = torch.softmax(sent_logits[0], dim=0)

        p = full_probs[word_id]
        q = sent_probs[word_id]

        if method == "kl":
            # KL-divergence is p*log(p/q)
            score = float(p * torch.log2(p / q))
        elif method == "contextuality":
            # Contextuality is distance between full-text and sentence probability
            score = float(p - q)
        else:
            raise ValueError("Unknown method.")

        return score

    def choose_blank_positions(
        self, page_doc: Doc, summary_doc: Doc, num_blanks: int
    ) -> list[int]:
        """Choose positions to blank in the summary based on contextuality scores with full page"""
        scores = []
        valid_positions = []

        page_lemmas = {tok.lemma_ for tok in page_doc}

        # Calculate scores for each position in the summary
        for i, sent in enumerate(summary_doc.sents):
            if i == 0:
                continue  # Skip first sentence
            for tok in sent:
                if (
                    len(tok.text) < 3
                    or tok.pos_ in self.blacklist
                    or tok.is_stop
                    or not tok.text.isalpha()
                    or tok.lemma_ not in page_lemmas
                ):
                    scores.append(-float("inf"))
                else:
                    # Calculate contextuality using both the full page and summary
                    score = self.get_contextuality_score(
                        page_doc, summary_doc, sent, tok
                    )
                    scores.append(score)
                valid_positions.append(tok.i)

        # Convert to numpy for easier manipulation
        scores = np.array(scores)

        # Choose positions greedily while maintaining minimum distance
        positions = []
        for _ in range(num_blanks):
            if np.all(scores == -float("inf")):
                break

            # Choose highest scoring position
            idx = np.argmax(scores)
            pos = valid_positions[idx]
            positions.append(pos)

            # Zero out scores within minimum distance
            start = max(0, idx - self.min_blank_distance)
            end = min(len(scores), idx + self.min_blank_distance + 1)
            scores[start:end] = -float("inf")

        return sorted(positions)

    def get_alternates(self, tokens: list[str], topk=5) -> list[dict]:
        """Get top k predictions for the masked positions in tokens

        Returns:
            List of dictionaries, one per masked position, with candidate words and their probabilities
        """
        predictions = []

        # Find all mask positions
        mask_positions = [i for i, token in enumerate(tokens) if token == "[MASK]"]

        for mask_pos in mask_positions:
            word_candidates = {}

            # Try different mask lengths (1, 2, or 3 tokens)
            for mask_length in range(1, 4):
                # Replace the single mask with multiple if needed
                masked_tokens = (
                    tokens[:mask_pos]
                    + ["[MASK]"] * mask_length
                    + tokens[mask_pos + 1 :]
                )

                # Get initial predictions for first token
                current_candidates = []
                logits, _ = self.get_masked_logits(masked_tokens, mask_pos)
                probs = torch.softmax(logits[0], dim=0)
                top_values, top_indices = torch.topk(probs, topk)

                # Start with first token candidates
                for idx, prob in zip(top_indices.tolist(), top_values.tolist()):
                    current_candidates.append(([idx], prob))

                # Build up multi-token predictions if needed
                for token_idx in range(1, mask_length):
                    new_candidates = []
                    for token_ids, prob in current_candidates:
                        # Fill in what we've predicted so far
                        partial_filled = tokens.copy()
                        filled_text = self.tokenizer.decode(token_ids)
                        remaining_masks = mask_length - token_idx

                        partial_filled = (
                            tokens[:mask_pos]
                            + [filled_text]
                            + ["[MASK]"] * remaining_masks
                            + tokens[mask_pos + 1 :]
                        )

                        # Get prediction for next position
                        next_logits, _ = self.get_masked_logits(
                            partial_filled, mask_pos + 1
                        )
                        next_probs = torch.softmax(next_logits[0], dim=0)
                        next_values, next_indices = torch.topk(next_probs, 1)

                        # Add to candidates
                        new_token_ids = token_ids + [next_indices[0].item()]
                        new_prob = prob * next_values[0].item()
                        new_candidates.append((new_token_ids, new_prob))

                    current_candidates = new_candidates

                # Add final decoded words
                for token_ids, prob in current_candidates:
                    word = self.tokenizer.decode(token_ids).strip()
                    if " " in word:
                        # Word contains a space (is actually multiple words)
                        continue
                    if word not in word_candidates or prob > word_candidates[word]:
                        word_candidates[word] = prob

            # Sort candidates by probability
            sorted_candidates = sorted(
                word_candidates.items(), key=lambda x: x[1], reverse=True
            )
            predictions.append({word: prob for word, prob in sorted_candidates[:topk]})

        return predictions

    def generate_cloze(
        self,
        summary_text: str,
        page_text: str = "",
        num_blanks: int = 10,
    ) -> tuple[str, list[str], list[dict[str, float]]]:
        """Generate a cloze text from summary using page for context

        Args:
            page_text: The full page text
            summary_text: The summary text to create gaps in
            num_blanks: Number of blanks to create

        Returns:
            Tuple of (cloze_text, answers, alternates)
        """
        # Process both texts
        page_doc = self.nlp(page_text)
        summary_doc = self.nlp(summary_text)

        # Choose positions to blank in the summary
        masked_positions = self.choose_blank_positions(
            page_doc, summary_doc, num_blanks
        )

        # Get the answers (the original words that will be blanked)
        answers = [summary_doc[pos].text for pos in masked_positions]

        # Replace tokens with mask
        summary_tokens = np.array(self._get_leading_ws_tokens(summary_doc))
        summary_tokens[masked_positions] = "[MASK]"
        summary_tokens = summary_tokens.tolist()

        # Construct cloze token input for gap predictions
        cloze_tokens = self._get_leading_ws_tokens(page_doc) + summary_tokens

        # Collect gaps
        gaps = []
        for tok in summary_doc:
            if tok.i in masked_positions:
                gaps.append((tok.text, tok.idx, len(tok.text)))

        return gaps

In [33]:
gapper = ContextualityGapper(model_name="answerdotai/ModernBERT-large")

In [34]:
restricted = []

for row in tqdm(df.itertuples(), total=len(df)):
    restricted.append(gapper.generate_cloze(row.summary, page_text=row.text))



In [40]:
df["restricted_answers"] = [
    [answer for answer, _, _ in answer_list] for answer_list in restricted
]

## Compare Gaps

Without collecting additional annotations, the best we can do is look at which gaps would be retained with this method and which gaps would be removed.

If this method disproportionately removes gaps that were more difficult to answer, then we can infer that the lemma overlap restriction will make the cloze exercise easier (which is what we want).

First, we see that the lemma overlap constraint does not substantially decrease the number of gaps that are generated (8.65 gaps per exercise vs. 9 gaps per exercise).

In [49]:
display(df.restricted_answers.str.len().describe())
display(df.correctAnswers.str.len().describe())

count    23.000000
mean      8.652174
std       1.112274
min       7.000000
25%       7.500000
50%       9.000000
75%       9.000000
max      10.000000
Name: restricted_answers, dtype: float64

count    23.000000
mean      9.000000
std       0.738549
min       8.000000
25%       8.500000
50%       9.000000
75%       9.500000
max      10.000000
Name: correctAnswers, dtype: float64

## Results

The lemma overlap restriction retains 44% of gaps that are "source-predictable" (ideal), and removes 31% of gaps that were scored as "unpredictable" (bad gaps). This is a good indication that the lemma overlap constraint improves the cloze exercise.

In [60]:
unchanged = []
removed = []
# added = []


def normalize_counter(c: Counter):
    total = sum(c.values())
    for key in c:
        c[key] = round(c[key] / total, 2)
    return c


for row in df.itertuples():
    for answer, rating in zip(row.correctAnswers, row.annotations):
        if answer in row.restricted_answers:
            unchanged.append(rating)
        else:
            removed.append(rating)

print("Retained gaps")
print(sorted(normalize_counter(Counter(unchanged)).items()))
print("\nRemoved gaps")
print(sorted(normalize_counter(Counter(removed)).items()))

Retained gaps
[('passage', 0.18), ('sentence', 0.3), ('source', 0.44), ('unpredictable', 0.08)]

Removed gaps
[('passage', 0.17), ('sentence', 0.43), ('source', 0.09), ('unpredictable', 0.31)]
