In [27]:
import spacy
from spacy.tokens import Doc, Span, Token
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer
import numpy as np
from typing import List, Tuple

class RationalClozeGenerator:
    """Generate cloze (fill-in-the-blank) texts with a masked language model.

    Every candidate word is masked twice: once inside the full document and
    once inside its own sentence. The score of a word is the model's
    probability for it in the sentence context minus its probability in the
    full-text context; words whose full-text probability does not exceed
    ``min_predictability`` are never blanked. Blanks are then picked greedily
    by descending score while keeping at least ``min_blank_distance`` tokens
    between any two blanks.
    """

    def __init__(
        self,
        model_name: str = "answerdotai/ModernBERT-base",
        min_blank_distance: int = 7,
        min_predictability: float = 0.15,
    ):
        """Load the spaCy pipeline and the masked language model.

        Args:
            model_name: Hugging Face identifier of the masked LM to load.
            min_blank_distance: Number of tokens on each side of a chosen
                blank in which no other blank may be placed.
            min_predictability: A word is only eligible when its full-text
                probability is strictly greater than this value.
        """
        # Load SpaCy for sentence splitting and preprocessing
        self.nlp = spacy.load("en_core_web_sm")
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        # Load model and tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForMaskedLM.from_pretrained(model_name).to(self.device)
        self.model.eval()

        # Selection thresholds (see class docstring)
        self.min_blank_distance = min_blank_distance
        self.min_predictability = min_predictability

        # Part-of-speech blacklist: tokens with these tags are never blanked
        self.blacklist = [
            "PROPN",  # Proper nouns
            "NUM",  # Numbers
            "PUNCT",  # Punctuation
            "SYM",  # Symbols
            "X",  # Other
        ]

    @staticmethod
    def _group_token_positions(word_ids) -> dict[int, list[int]]:
        """Group sub-token positions by the index of the word they belong to.

        Args:
            word_ids: One entry per sub-token; ``None`` entries are skipped.

        Returns:
            Mapping ``word index -> [token positions]`` in ascending order.
        """
        word_to_tokens: dict[int, list[int]] = {}
        for token_idx, word_idx in enumerate(word_ids):
            if word_idx is not None:
                word_to_tokens.setdefault(word_idx, []).append(token_idx)
        return word_to_tokens

    def get_token_mappings(self, text: list[str]) -> dict[int, list[int]]:
        """Map each word index to the positions of its sub-tokens.

        Args:
            text: The input already split into words (it is tokenized with
                ``is_split_into_words=True``).

        Returns:
            Mapping ``word index -> [token positions]``.
        """
        tokenized = self.tokenizer(text, return_tensors="pt", is_split_into_words=True)
        return self._group_token_positions(tokenized.word_ids())

    def _masked_forward(self, span: "Doc | Span", mask_idx: int) -> tuple[torch.Tensor, str, int]:
        """Mask one word of ``span`` and run the model once.

        Returns:
            ``(logits, word, target_id)`` where ``logits`` are the vocabulary
            logits at the first masked sub-token, ``word`` is the text of the
            masked word and ``target_id`` is the id that occupied that first
            sub-token position before masking.
        """
        words = [tok.text for tok in span]

        # Tokenize once; the same encoding supplies both ids and alignment.
        encoded = self.tokenizer(words, is_split_into_words=True, return_tensors="pt")
        token_positions = self._group_token_positions(encoded.word_ids())[mask_idx]
        first_mask_pos = token_positions[0]

        input_ids = encoded.input_ids[0].clone()
        # Remember the real id before it is overwritten, so the target always
        # matches the tokenization that was actually fed to the model.
        target_id = int(input_ids[first_mask_pos])

        # Mask all tokens corresponding to the target word
        for pos in token_positions:
            input_ids[pos] = self.tokenizer.mask_token_id

        with torch.no_grad():
            outputs = self.model(input_ids.unsqueeze(0).to(self.device))

        # Only the first masked sub-token is used as a proxy for the word.
        logits = outputs.logits[0, first_mask_pos, :]
        return logits, words[mask_idx], target_id

    def get_masked_logits(self, span: "Doc | Span", mask_idx: int) -> tuple[torch.Tensor, str]:
        """Get model logits for a masked word of ``span``.

        Args:
            span: Document or sentence providing the context.
            mask_idx: Index of the word to mask, relative to ``span``.

        Returns:
            ``(logits, original_word)``: the logits at the first sub-token of
            the masked word and the text of that word.
        """
        logits, original_word, _ = self._masked_forward(span, mask_idx)
        return logits, original_word

    def get_contextuality_score(self, doc: "Doc", sent: "Span", tok: "Token") -> float:
        """Calculate the contextuality score for one token.

        The score is the probability of the word's first sub-token in the
        sentence context minus its probability in the full-text context.

        Returns:
            The score, or ``-inf`` when the full-text probability is not
            greater than ``self.min_predictability``.
        """
        # NOTE(review): a higher score means the sentence alone predicts the
        # word better than the full text does -- confirm this sign is intended.
        full_logits, _, full_id = self._masked_forward(doc, tok.i)
        full_prob = torch.softmax(full_logits, dim=0)[full_id]

        # Reject early: the sentence-level pass cannot change this outcome,
        # so skip its forward pass entirely.
        if not float(full_prob) > self.min_predictability:
            return -float('inf')

        sent_logits, _, sent_id = self._masked_forward(sent, tok.i - sent.start)
        sent_prob = torch.softmax(sent_logits, dim=0)[sent_id]

        # Contextuality is the difference (not ratio) of the two probabilities
        return float(sent_prob - full_prob)

    def _is_blankable(self, tok: "Token") -> bool:
        """Return True when ``tok`` may be considered for blanking."""
        return (
            len(tok.text) >= 3                      # Long enough
            and tok.pos_ not in self.blacklist      # Allowed POS
            and not tok.is_stop                     # Not a stop word
            and tok.text.isalpha()                  # Purely alphabetic
        )

    @staticmethod
    def _select_spaced_positions(scores, positions, num_blanks: int, min_distance: int) -> list[int]:
        """Greedily pick the best-scoring positions subject to a spacing rule.

        Args:
            scores: One score per candidate; ``-inf`` marks ineligible ones.
            positions: Value to report for each candidate (same length).
            num_blanks: Maximum number of candidates to pick.
            min_distance: After a pick at index ``i``, candidates with index
                in ``[i - min_distance, i + min_distance]`` are excluded.

        Returns:
            The chosen entries of ``positions``, sorted ascending.
        """
        remaining = np.array(scores, dtype=float)  # private copy, mutated below
        chosen = []
        for _ in range(num_blanks):
            if np.all(np.isneginf(remaining)):
                break

            idx = int(np.argmax(remaining))
            chosen.append(positions[idx])

            # Exclude the pick and its neighbourhood from later rounds
            start = max(0, idx - min_distance)
            end = min(len(remaining), idx + min_distance + 1)
            remaining[start:end] = -float('inf')

        return sorted(chosen)

    def choose_blank_positions(self, doc: "Doc", num_blanks: int) -> list[int]:
        """Choose token indices to blank based on contextuality scores.

        Args:
            doc: Parsed document; its sentences are scored token by token.
            num_blanks: Maximum number of blanks; fewer are returned when not
                enough eligible, sufficiently spaced tokens exist.

        Returns:
            Sorted document-level token indices of the chosen blanks.
        """
        scores = []
        token_indices = []
        for sent in doc.sents:
            for tok in sent:
                if self._is_blankable(tok):
                    scores.append(self.get_contextuality_score(doc, sent, tok))
                else:
                    scores.append(-float('inf'))
                token_indices.append(tok.i)

        return self._select_spaced_positions(
            scores, token_indices, num_blanks, self.min_blank_distance
        )

    def generate_cloze(self, text: str, num_blanks: int) -> tuple[str, list[str]]:
        """Generate a cloze text with blanks and return the answers.

        Args:
            text: Source passage.
            num_blanks: Maximum number of words to blank out.

        Returns:
            ``(cloze_text, answers)``: the passage with every chosen word
            replaced by underscores of the same length (whitespace kept), and
            the removed words in order of appearance.
        """
        doc = self.nlp(text)
        positions = self.choose_blank_positions(doc, num_blanks)
        answers = [doc[pos].text for pos in positions]

        blanked = set(positions)  # O(1) membership while walking the tokens
        pieces = [
            "_" * len(tok.text) + tok.whitespace_ if tok.i in blanked else tok.text_with_ws
            for tok in doc
        ]
        return "".join(pieces), answers

In [28]:
# Build one shared generator with the default model name. Construction loads
# the spaCy pipeline plus the tokenizer and masked LM, so reuse this instance
# across the cells below.
generator = RationalClozeGenerator()

In [29]:
# Sample passage. The space before each "\n" is part of the original text and
# is kept so the generated cloze preserves the passage's exact whitespace.
text = (
    "The cloze procedure, first introduced by Taylor, is a widely used method for creating reading \n"
    "comprehension tests inspired by the Gestalt principle of closure. Though many variations have been \n"
    "introduced and studied, the core concept is to mask words in prose and task the subject with providing \n"
    "the missing words."
)

# Ask for up to six blanks, then show the blanked passage and its answer key.
cloze_text, answers = generator.generate_cloze(text, num_blanks=6)
print("Cloze text:", cloze_text, sep="\n")
print("\nAnswers:", answers)

Cloze text:
The cloze procedure, first __________ by Taylor, is a widely used method for creating reading 
comprehension tests ________ by the Gestalt principle of closure. Though many variations have been 
introduced and studied, the core concept is to mask words in prose and task the subject with providing 
the missing words.

Answers: ['introduced', 'inspired']


In [30]:
import json

# Maps each page slug to its non-empty summary text.
page_summaries = {}

# Parse the whole file up front so the handle is closed before the slow,
# model-backed generation starts. JSON is read as UTF-8 explicitly rather than
# with the platform's default encoding.
with open("../data/strapi-page-summaries.json", encoding="utf-8") as f:
    pages = json.load(f)

for page in pages:
    summary = page["PageSummary"]
    if not summary:
        # Pages without a summary are skipped entirely.
        continue

    page_summaries[page["Slug"]] = summary

    # Generate and display a cloze exercise for this summary.
    print("="*80)
    cloze_text, answers = generator.generate_cloze(summary, num_blanks=6)
    print("Cloze text:")
    print(cloze_text)
    print("\nAnswers:", answers)
    print("="*80)

Cloze text:
Experimental psychologists, primarily holding doctoral and master's degrees, conduct scientific ________ in various psychology subfields, often collaborating with students at universities. While some are trained clinicians, most focus on ___-clinical areas such as cognitive or ______ psychology. Their research is crucial for understanding human behavior and developing empirical knowledge, which is vital for clinical practice. The interplay between research and practice is significant, as psychological disorders are empirically testable. The effectiveness of treatments, like psychotherapy, relies on scientific validation. The clinical psychology _________ debates the emphasis on empirically supported treatments, but there is consensus on the ____ for a scientific approach to ensure effective diagnosis and _________.

Answers: ['research', 'non', 'social', 'community', 'need', 'treatment']
Cloze text:
Descriptive statistics are tools used to organize and summarize data, inclu