In [1]:
import json
from pprint import pp
from collections import defaultdict

from tqdm.auto import tqdm
import numpy as np
import pandas as pd
import spacy
from spacy.tokens import Doc, Span, Token
import yake
import torch
from transformers import (
    AutoModelForTokenClassification,
    AutoModelForMaskedLM,
    AutoTokenizer,
)

torch.set_float32_matmul_precision("high")

## Data

In [2]:
df = pd.read_csv("../data/itell-pages.csv")
df

Unnamed: 0,volume,page,summary,markdown,text
0,research-methods-in-psychology-demo,4-science-and-common-sense-1,The necessity of a scientific approach in psyc...,"<i-callout variant=""info"" title=""Learning Obje...",Learning Objectives\n\n1. Explain the limitati...
1,formal-operator-training-guidelines,7-other-training,The outlined training requirements for operati...,### 7.1 New Plant Training\n\nThe following ta...,7.1 New Plant Training\n\nThe following table ...
2,research-methods-in-psychology-demo,9-generating-good-research-questions-1,"When developing a research idea, transforming ...","<i-callout variant=""info"" title=""Learning Obje...",Learning Objectives\n\n1. Describe some techni...
3,research-methods-in-psychology,2-understanding-science,"Psychology, often surprising to many, is a sci...","<i-callout variant=""info"" title=""Learning Obje...",Learning Objectives\n\n1. Define science.\n2. ...
4,research-methods-in-psychology,1-methods-of-knowing,The text discusses various methods of acquirin...,"<i-callout variant=""info"" title=""Learning Obje...",Learning Objectives\n\n1. Describe the 5 metho...
5,research-methods-in-psychology-demo,11-designing-a-research-study-1,"In psychological research, generating a hypoth...","<i-callout variant=""info"" title=""Learning Obje...",Learning Objectives\n\n1. Define the concept o...
6,research-methods-in-psychology-demo,13-drawing-conclusions-and-reporting-the-resul...,Scientific theories are continually evaluated ...,"<i-callout variant=""info"" title=""Learning Obje...",Learning Objectives\n\n1. Identify the conclus...
7,research-methods-in-psychology,4-science-and-common-sense,The necessity of a scientific approach in psyc...,"<i-callout variant=""info"" title=""Learning Obje...",Learning Objectives\n\n1. Explain the limitati...
8,research-methods-in-psychology,16-from-moral-principles-to-ethics-codes,The evolution of ethical standards in research...,"<i-callout variant=""info"" title=""Learning Obje...",Learning Objectives\n\n1. Describe the history...
9,communication-for-business-success,18-4-divergent-cultural-characteristics,The text explores the inherent inequalities an...,"<i-callout variant=""info"" title=""Learning Obje...",Learning Objectives\n\n1. Discuss divergent cu...


## Finetuned Transformer

In [3]:
class TokenClassificationGapper:
    def __init__(self, model_path=None, tokenizer_path=None):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, use_fast=True)
        self.model = AutoModelForTokenClassification.from_pretrained(model_path).to(
            self.device
        )
        self.model.eval()

    def generate_cloze(self, text, threshold=0.5):
        """Generate a cloze exercise from input text."""
        encoding = self.tokenizer(
            text,
            return_tensors="pt",
            return_offsets_mapping=True,
            max_length=512,
            truncation=True,
            padding="max_length",
        )
        encoding.to("cuda")

        offset_mapping = encoding.pop("offset_mapping")

        # Get model predictions
        outputs = self.model(**encoding)
        probabilities = torch.softmax(outputs.logits, dim=-1)
        predictions = (probabilities[0, :, 1] > threshold).cpu().numpy()

        # Convert predictions to gaps in the text
        offset_mapping = offset_mapping[0].cpu().numpy()
        text_list = list(text)

        gaps = []
        # Replace predicted tokens with underscores
        for idx, pred in enumerate(predictions):
            if pred:
                start, end = offset_mapping[idx]
                if start < len(text_list):  # Check if within text bounds
                    kw = text[start + 1 : end]
                    gaps.append((kw, start + 1, len(kw)))

        return gaps

## Keyword

In [14]:
class KeywordGapper:
    def __init__(self):
        self.extractor = yake.KeywordExtractor(n=1, top=10)
        self.nlp = spacy.load("en_core_web_sm")

    def generate_cloze(self, text, min_distance=30):
        """
        Args:
          text (str): input passage to be gapped
          min_distance (int): minimum number of characters between gap start indexes
        """
        # Get keywords with scores
        kws = self.extractor.extract_keywords(text)

        # Sort keywords by score (higher score = more important)
        sorted_kws = sorted(kws, key=lambda x: x[1], reverse=True)

        # Collect gaps
        gaps = []
        for kw, score in sorted_kws:
            # Start searching after the first sentence
            start_idx = next(self.nlp(text).sents).end_char
            while start_idx < len(text):
                idx = text.find(kw, start_idx)
                if idx == -1:
                    break

                # Check if this position is too close to any existing gap
                too_close = False
                for _, pos, length in gaps:
                    if abs(idx - pos) < min_distance:
                        too_close = True
                        break

                # If not too close, gap this occurrence
                if not too_close:
                    gaps.append((kw, idx, len(kw)))

                # Move to next potential occurrence
                start_idx = idx + 1

        return gaps

## Probability-based Method

In [15]:
class ContextualityGapper:
    def __init__(self, model_name: str = "answerdotai/ModernBERT-large"):
        # Load SpaCy for sentence splitting and preprocessing
        self.nlp = spacy.load("en_core_web_sm")
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        # Load model and tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForMaskedLM.from_pretrained(model_name).to(self.device)
        self.model.eval()

        self.min_blank_distance = 7  # Minimum distance between blanks

        # Minimum log-predictability of alternatives
        self.min_predictability = np.log(0.05)

        # Part-of-Speech Blacklist (do not delete these words)
        self.blacklist = [
            "PROPN",  # Proper nouns
            "NUM",  # Numbers
            "PUNCT",  # Punctuation
            "SYM",  # Symbols
            "X",  # Other
        ]

    def _get_leading_ws_tokens(self, doc: Doc) -> list[str]:
        """The ModernBERT Tokenizer will work fine if we give it tokens with leading spaces.
        SpaCy normally handles whitespace in terms of trailing space."""
        if not len(doc):
            return [""]

        tokens = [doc[0].text]
        # For tokens after the 0th, prepend trailing whitespace from the previous token.
        tokens += [doc[i - 1].whitespace_ + doc[i].text for i in range(1, len(doc))]
        return tokens

    def get_token_mappings(self, tokens: list[str]) -> dict[int, list[int]]:
        """Get mappings between word positions and token positions"""
        # Tokenize while keeping track of word IDs
        tokenized = self.tokenizer(
            tokens, return_tensors="pt", is_split_into_words=True
        )
        word_ids = tokenized.word_ids()

        # Create mapping from word position to token positions
        word_to_tokens = defaultdict(list)

        for token_idx, word_idx in enumerate(word_ids):
            if word_idx is not None:
                word_to_tokens[word_idx].append(token_idx)

        return word_to_tokens

    def get_masked_logits(
        self, tokens: list[str], mask_idx: int
    ) -> tuple[torch.Tensor, int]:
        """Get model logits for a masked position in text"""
        # Get the word tokens and their alignment info
        word_to_tokens = self.get_token_mappings(tokens)

        # Find all token positions for the word we want to mask
        token_positions = word_to_tokens[mask_idx]

        # Create masked version of the text
        input_ids = self.tokenizer(
            tokens, is_split_into_words=True, return_tensors="pt"
        ).input_ids[0]
        masked_ids = input_ids.clone()

        # ID of the first subword token that we masked
        first_token_id = input_ids[token_positions[0]]

        # Mask all tokens corresponding to our target word
        masked_ids[token_positions] = self.tokenizer.mask_token_id

        # Get model outputs
        outputs = self.model(input_ids.unsqueeze(0).to(self.device))

        # Get logits
        logits = outputs.logits[0, token_positions, :]

        return logits, first_token_id

    def get_contextuality_score(
        self,
        page_doc: Doc,
        summary_doc: Doc,
        sent: Span,
        tok: Token,
        method: str = "kl",
    ) -> float:
        """Calculate contextuality score for a word position using full page context

        Args:
            page_doc: The full page text as a spaCy Doc
            summary_doc: The summary text as a spaCy Doc
            sent: The sentence from the summary containing the token
            tok: The token from the summary to evaluate
            method: "kl" for kl-divergence or "contextuality" for contextuality score

        Returns:
            Contextuality score
        """

        # Get logits for both full text and sentence text
        # For the full text context, we use the page + summary
        full_toks = self._get_leading_ws_tokens(page_doc) + self._get_leading_ws_tokens(
            summary_doc
        )
        full_pos = len(page_doc) + tok.i  # Position of token in full document
        full_logits, word_id = self.get_masked_logits(full_toks, full_pos)

        # For the local context, we use just the sentence from the summary
        sent_pos = tok.i - sent.start  # Position of token in the sentence
        sent_logits, _ = self.get_masked_logits([tok.text for tok in sent], sent_pos)

        # Calculate probabilities using first sub-word token
        full_probs = torch.softmax(full_logits[0], dim=0)
        sent_probs = torch.softmax(sent_logits[0], dim=0)

        p = full_probs[word_id]
        q = sent_probs[word_id]

        if method == "kl":
            # KL-divergence is p*log(p/q)
            score = float(p * torch.log2(p / q))
        elif method == "contextuality":
            # Contextuality is distance between full-text and sentence probability
            score = float(p - q)
        else:
            raise ValueError("Unknown method.")

        return score

    def choose_blank_positions(
        self, page_doc: Doc, summary_doc: Doc, num_blanks: int
    ) -> list[int]:
        """Choose positions to blank in the summary based on contextuality scores with full page"""
        scores = []
        valid_positions = []

        # Calculate scores for each position in the summary
        for i, sent in enumerate(summary_doc.sents):
            if i == 0:
                continue  # Skip first sentence
            for tok in sent:
                if (
                    len(tok.text) < 3
                    or tok.pos_ in self.blacklist
                    or tok.is_stop
                    or not tok.text.isalpha()
                ):
                    scores.append(-float("inf"))
                else:
                    # Calculate contextuality using both the full page and summary
                    score = self.get_contextuality_score(
                        page_doc, summary_doc, sent, tok
                    )
                    scores.append(score)
                valid_positions.append(tok.i)

        # Convert to numpy for easier manipulation
        scores = np.array(scores)

        # Choose positions greedily while maintaining minimum distance
        positions = []
        for _ in range(num_blanks):
            if np.all(scores == -float("inf")):
                break

            # Choose highest scoring position
            idx = np.argmax(scores)
            pos = valid_positions[idx]
            positions.append(pos)

            # Zero out scores within minimum distance
            start = max(0, idx - self.min_blank_distance)
            end = min(len(scores), idx + self.min_blank_distance + 1)
            scores[start:end] = -float("inf")

        return sorted(positions)

    def get_alternates(self, tokens: list[str], topk=5) -> list[dict]:
        """Get top k predictions for the masked positions in tokens

        Returns:
            List of dictionaries, one per masked position, with candidate words and their probabilities
        """
        predictions = []

        # Find all mask positions
        mask_positions = [i for i, token in enumerate(tokens) if token == "[MASK]"]

        for mask_pos in mask_positions:
            word_candidates = {}

            # Try different mask lengths (1, 2, or 3 tokens)
            for mask_length in range(1, 4):
                # Replace the single mask with multiple if needed
                masked_tokens = (
                    tokens[:mask_pos]
                    + ["[MASK]"] * mask_length
                    + tokens[mask_pos + 1 :]
                )

                # Get initial predictions for first token
                current_candidates = []
                logits, _ = self.get_masked_logits(masked_tokens, mask_pos)
                probs = torch.softmax(logits[0], dim=0)
                top_values, top_indices = torch.topk(probs, topk)

                # Start with first token candidates
                for idx, prob in zip(top_indices.tolist(), top_values.tolist()):
                    current_candidates.append(([idx], prob))

                # Build up multi-token predictions if needed
                for token_idx in range(1, mask_length):
                    new_candidates = []
                    for token_ids, prob in current_candidates:
                        # Fill in what we've predicted so far
                        partial_filled = tokens.copy()
                        filled_text = self.tokenizer.decode(token_ids)
                        remaining_masks = mask_length - token_idx

                        partial_filled = (
                            tokens[:mask_pos]
                            + [filled_text]
                            + ["[MASK]"] * remaining_masks
                            + tokens[mask_pos + 1 :]
                        )

                        # Get prediction for next position
                        next_logits, _ = self.get_masked_logits(
                            partial_filled, mask_pos + 1
                        )
                        next_probs = torch.softmax(next_logits[0], dim=0)
                        next_values, next_indices = torch.topk(next_probs, 1)

                        # Add to candidates
                        new_token_ids = token_ids + [next_indices[0].item()]
                        new_prob = prob * next_values[0].item()
                        new_candidates.append((new_token_ids, new_prob))

                    current_candidates = new_candidates

                # Add final decoded words
                for token_ids, prob in current_candidates:
                    word = self.tokenizer.decode(token_ids).strip()
                    if " " in word:
                        # Word contains a space (is actually multiple words)
                        continue
                    if word not in word_candidates or prob > word_candidates[word]:
                        word_candidates[word] = prob

            # Sort candidates by probability
            sorted_candidates = sorted(
                word_candidates.items(), key=lambda x: x[1], reverse=True
            )
            predictions.append({word: prob for word, prob in sorted_candidates[:topk]})

        return predictions

    def generate_cloze(
        self,
        summary_text: str,
        page_text: str = "",
        num_blanks: int = 10,
    ) -> tuple[str, list[str], list[dict[str, float]]]:
        """Generate a cloze text from summary using page for context

        Args:
            page_text: The full page text
            summary_text: The summary text to create gaps in
            num_blanks: Number of blanks to create

        Returns:
            Tuple of (cloze_text, answers, alternates)
        """
        # Process both texts
        page_doc = self.nlp(page_text)
        summary_doc = self.nlp(summary_text)

        # Choose positions to blank in the summary
        masked_positions = self.choose_blank_positions(
            page_doc, summary_doc, num_blanks
        )

        # Get the answers (the original words that will be blanked)
        answers = [summary_doc[pos].text for pos in masked_positions]

        # Replace tokens with mask
        summary_tokens = np.array(self._get_leading_ws_tokens(summary_doc))
        summary_tokens[masked_positions] = "[MASK]"
        summary_tokens = summary_tokens.tolist()

        # Construct cloze token input for gap predictions
        cloze_tokens = self._get_leading_ws_tokens(page_doc) + summary_tokens

        # Collect gaps
        gaps = []
        for tok in summary_doc:
            if tok.i in masked_positions:
                gaps.append((tok.text, tok.idx, len(tok.text)))

        return gaps

# Generate Cloze Exercises

In [16]:
cloze_methods = {
    "contextuality": ContextualityGapper(model_name="answerdotai/ModernBERT-large"),
    "contextuality_plus": ContextualityGapper(
        model_name="answerdotai/ModernBERT-large"
    ),
    # Dropping the finetuned model from the analysis since it generates 0 gaps for many summaries
    # "finetuned": TokenClassificationGapper(
    #     model_path="../bin/cloze-model",
    #     tokenizer_path="microsoft/deberta-v3-base",
    # ),
    "keyword": KeywordGapper(),
}

In [17]:
def format_cloze(text, gaps):
    result = list(text)
    for word, st, word_len in gaps:
        for i in range(word_len):
            result[st + i] = "_"
    return "".join(result)

In [18]:
cloze_exercises = defaultdict(list)

for row in tqdm(df.itertuples(), total=len(df)):
    for key, method in cloze_methods.items():
        if key == "contextuality_plus":
            gaps = method.generate_cloze(row.summary, page_text=row.text)
        else:
            gaps = method.generate_cloze(row.summary)
        cloze_exercise_dict = {"text": format_cloze(row.summary, gaps), "gaps": gaps}
        cloze_exercises[key].append(cloze_exercise_dict)

  0%|          | 0/16 [00:00<?, ?it/s]

In [19]:
cloze_df = pd.concat([df, pd.DataFrame(cloze_exercises)], axis=1)
cloze_df.to_json(
    "../results/cloze_exercises_kl_divergence.jsonl", orient="records", lines=True
)
cloze_df

Unnamed: 0,volume,page,summary,markdown,text,contextuality,contextuality_plus,keyword
0,research-methods-in-psychology-demo,4-science-and-common-sense-1,The necessity of a scientific approach in psyc...,"<i-callout variant=""info"" title=""Learning Obje...",Learning Objectives\n\n1. Explain the limitati...,{'text': 'The necessity of a scientific approa...,{'text': 'The necessity of a scientific approa...,{'text': 'The necessity of a scientific approa...
1,formal-operator-training-guidelines,7-other-training,The outlined training requirements for operati...,### 7.1 New Plant Training\n\nThe following ta...,7.1 New Plant Training\n\nThe following table ...,{'text': 'The outlined training requirements f...,{'text': 'The outlined training requirements f...,{'text': 'The outlined training requirements f...
2,research-methods-in-psychology-demo,9-generating-good-research-questions-1,"When developing a research idea, transforming ...","<i-callout variant=""info"" title=""Learning Obje...",Learning Objectives\n\n1. Describe some techni...,"{'text': 'When developing a research idea, tra...","{'text': 'When developing a research idea, tra...","{'text': 'When developing a research idea, tra..."
3,research-methods-in-psychology,2-understanding-science,"Psychology, often surprising to many, is a sci...","<i-callout variant=""info"" title=""Learning Obje...",Learning Objectives\n\n1. Define science.\n2. ...,"{'text': 'Psychology, often surprising to many...","{'text': 'Psychology, often surprising to many...","{'text': 'Psychology, often surprising to many..."
4,research-methods-in-psychology,1-methods-of-knowing,The text discusses various methods of acquirin...,"<i-callout variant=""info"" title=""Learning Obje...",Learning Objectives\n\n1. Describe the 5 metho...,{'text': 'The text discusses various methods o...,{'text': 'The text discusses various methods o...,{'text': 'The text discusses various methods o...
5,research-methods-in-psychology-demo,11-designing-a-research-study-1,"In psychological research, generating a hypoth...","<i-callout variant=""info"" title=""Learning Obje...",Learning Objectives\n\n1. Define the concept o...,"{'text': 'In psychological research, generatin...","{'text': 'In psychological research, generatin...","{'text': 'In psychological research, generatin..."
6,research-methods-in-psychology-demo,13-drawing-conclusions-and-reporting-the-resul...,Scientific theories are continually evaluated ...,"<i-callout variant=""info"" title=""Learning Obje...",Learning Objectives\n\n1. Identify the conclus...,{'text': 'Scientific theories are continually ...,{'text': 'Scientific theories are continually ...,{'text': 'Scientific theories are continually ...
7,research-methods-in-psychology,4-science-and-common-sense,The necessity of a scientific approach in psyc...,"<i-callout variant=""info"" title=""Learning Obje...",Learning Objectives\n\n1. Explain the limitati...,{'text': 'The necessity of a scientific approa...,{'text': 'The necessity of a scientific approa...,{'text': 'The necessity of a scientific approa...
8,research-methods-in-psychology,16-from-moral-principles-to-ethics-codes,The evolution of ethical standards in research...,"<i-callout variant=""info"" title=""Learning Obje...",Learning Objectives\n\n1. Describe the history...,{'text': 'The evolution of ethical standards i...,{'text': 'The evolution of ethical standards i...,{'text': 'The evolution of ethical standards i...
9,communication-for-business-success,18-4-divergent-cultural-characteristics,The text explores the inherent inequalities an...,"<i-callout variant=""info"" title=""Learning Obje...",Learning Objectives\n\n1. Discuss divergent cu...,{'text': 'The text explores the inherent inequ...,{'text': 'The text explores the inherent inequ...,{'text': 'The text explores the inherent inequ...


# Load Data

In [23]:
cloze_df = pd.read_json("../results/cloze_exercises_kl_divergence.jsonl", lines=True)
cloze_df.iloc[1]

volume                              formal-operator-training-guidelines
page                                                   7-other-training
summary               The outlined training requirements for operati...
markdown              ### 7.1 New Plant Training\n\nThe following ta...
text                  7.1 New Plant Training\n\nThe following table ...
contextuality         {'text': 'The outlined training requirements f...
contextuality_plus    {'text': 'The outlined training requirements f...
keyword               {'text': 'The outlined training requirements f...
Name: 1, dtype: object

In [24]:
cloze_df.iloc[2].contextuality_plus["gaps"]

[['achieved', 109, 8],
 ['researchers', 181, 11],
 ['conceptualizing', 280, 15],
 ['exploring', 343, 9],
 ['involves', 435, 8],
 ['uncertain', 536, 9],
 ['implications', 589, 12],
 ['tried', 672, 5],
 ['continuity', 732, 10]]

In [26]:
for cloze_dict in cloze_df["contextuality_plus"]:
    print(cloze_dict["text"])
    print("*" * 80)

The necessity of a scientific approach in psychology is often questioned, with many relying on common sense or intuition—known as folk psychology—for understanding human behavior. However, scientific research frequently contradicts these _________ beliefs, revealing inaccuracies. For instance, the belief that expressing anger can _________ it has been debunked, as has the notion that false ___________ are rare. Common myths, such as using only 10% of our brain or the effectiveness of _______-reducing diets, persist due to heuristics and confirmation bias. Psychologists emphasize __________ and the pursuit of empirical evidence to _________ these misconceptions. Additionally, they embrace ___________, welcoming unanswered questions as opportunities for scientific ___________.
********************************************************************************
The outlined training requirements for operations personnel in a Refinery Business Unit emphasize the necessary qualifications for wo