In [16]:
import spacy
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer
import numpy as np

class RationalClozeGenerator:
    def __init__(self, model_name: str = "answerdotai/ModernBERT-base"):
        # Load SpaCy for sentence splitting and preprocessing
        self.nlp = spacy.load("en_core_web_sm")
        
        # Load model and tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForMaskedLM.from_pretrained(model_name)
        self.model.eval()
        
        # Set minimum distance between blanks
        self.min_blank_distance = 7
        
    def get_masked_logits(self, text: str, word_index: int) -> torch.Tensor:
        """Get model logits for a masked position in text"""
        # Create copy of text with word masked
        words = text.split()
        original_word = words[word_index]
        words[word_index] = self.tokenizer.mask_token
        masked_text = " ".join(words)
        
        # Encode and get logits
        inputs = self.tokenizer(masked_text, return_tensors="pt")
        with torch.no_grad():
            outputs = self.model(**inputs)
        
        # Find position of mask token
        mask_pos = (inputs.input_ids[0] == self.tokenizer.mask_token_id).nonzero().item()
        
        return outputs.logits[0, mask_pos, :]

    def get_contextuality_score(self, text: str, word_index: int) -> float:
        """Calculate contextuality score for a word position"""
        # Get the sentence containing the word
        doc = self.nlp(text)
        words = text.split()
        word = words[word_index]
        
        # Find which sentence contains our word
        word_char_idx = text.find(word, sum(len(w) + 1 for w in words[:word_index]))
        sent = next(sent for sent in doc.sents if sent.start_char <= word_char_idx <= sent.end_char)
        sent_text = sent.text
        
        # Get logits for both full text and sentence contexts
        full_logits = self.get_masked_logits(text, word_index)
        
        # Find word index in sentence
        sent_word_index = len(sent_text[:word_char_idx - sent.start_char].split())
        sent_logits = self.get_masked_logits(sent_text, sent_word_index)
        
        # Calculate probabilities
        full_probs = torch.softmax(full_logits, dim=0)
        sent_probs = torch.softmax(sent_logits, dim=0)
        
        # Get probability of actual word
        word_id = self.tokenizer.encode(word, add_special_tokens=False)[0]
        
        # Contextuality is ratio of sentence to full-text probability
        score = float(sent_probs[word_id].log() - full_probs[word_id].log())
        
        return score
    
    def choose_blank_positions(self, text: str, num_blanks: int) -> list[int]:
        """Choose positions to blank based on contextuality scores"""
        words = text.split()
        scores = []
        
        # Calculate scores for each position
        for i in range(len(words)):
            # Skip very short words and proper nouns
            word = words[i]
            if len(word) < 3 or word[0].isupper():
                scores.append(-float('inf'))
                continue
                
            score = self.get_contextuality_score(text, i)
            scores.append(score)
        
        # Convert to numpy for easier manipulation
        scores = np.array(scores)
        
        # Choose positions greedily while maintaining minimum distance
        positions = []
        for _ in range(num_blanks):
            if np.all(scores == -float('inf')):
                break
                
            # Choose highest scoring position
            pos = np.argmax(scores)
            positions.append(pos)
            
            # Zero out scores within minimum distance
            start = max(0, pos - self.min_blank_distance)
            end = min(len(scores), pos + self.min_blank_distance + 1)
            scores[start:end] = -float('inf')
            
        return sorted(positions)
    
    def generate_cloze(self, text: str, num_blanks: int) -> tuple[str, list[str]]:
        """Generate a cloze text with blanks and return answers"""
        positions = self.choose_blank_positions(text, num_blanks)
        words = text.split()
        answers = [words[pos] for pos in positions]
        
        # Replace words with blanks
        for pos in sorted(positions, reverse=True):
            words[pos] = "___________"
            
        return " ".join(words), answers

In [17]:
generator = RationalClozeGenerator()

In [18]:
text = """The cloze procedure, first introduced by Taylor, is a widely used method for creating reading 
comprehension tests inspired by the Gestalt principle of closure. Though many variations have been 
introduced and studied, the core concept is to mask words in prose and task the subject with providing 
the missing words."""

cloze_text, answers = generator.generate_cloze(text, num_blanks=6)
print("Cloze text:")
print(cloze_text)
print("\nAnswers:", answers)

Cloze text:
The cloze procedure, ___________ introduced by Taylor, is a widely used method for creating ___________ comprehension tests inspired by the Gestalt principle of ___________ Though many variations have been introduced and studied, ___________ core concept is to mask words in ___________ and task the subject with providing the missing ___________

Answers: ['first', 'reading', 'closure.', 'the', 'prose', 'words.']


In [19]:
import json

page_summaries = {}
with open("../data/strapi-page-summaries.json") as f:
    for page in json.load(f):
        if page["PageSummary"]:
            page_summaries[page["Slug"]] = page["PageSummary"]
            
            print("="*80)
            cloze_text, answers = generator.generate_cloze(page["PageSummary"], num_blanks=6)
            print("Cloze text:")
            print(cloze_text)
            print("\nAnswers:", answers)
            print("="*80)

Cloze text:
Experimental psychologists, primarily holding doctoral and master's degrees, conduct scientific research in various psychology subfields, often collaborating with students at universities. While ___________ are trained clinicians, most focus on non-clinical ___________ such as cognitive or social psychology. Their research is crucial for understanding human behavior and developing ___________ knowledge, which is vital for clinical practice. The interplay between research and practice is significant, as psychological ___________ are empirically testable. The effectiveness of treatments, like psychotherapy, relies on ___________ validation. The clinical psychology community debates the emphasis on ___________ supported treatments, but there is consensus on the need for a scientific approach to ensure effective diagnosis and treatment.

Answers: ['some', 'areas', 'empirical', 'disorders', 'scientific', 'empirically']
Cloze text:
Descriptive statistics are tools ___________ to 