# EM and Hangman models

This notebook uses a unigram language model (word frequencies with add-one smoothing) and a "Hangman"-style feature that predicts complete words from patterns whose missing letters are marked with "_".

In [10]:
from collections import Counter
import random
from itertools import product

In [11]:
class EMInterpolatedNgramModel:
    """Unigram language model with a Hangman-style word completer.

    Only unigram statistics are collected. The four ``lambda`` weights are
    stored on the instance but are not read by any method of this class.
    """

    def __init__(self, lambda1=0.4, lambda2=0.3, lambda3=0.2, lambda4=0.1):
        """Store the interpolation weights and create empty statistics."""
        self.lambda1 = lambda1
        self.lambda2 = lambda2
        self.lambda3 = lambda3
        self.lambda4 = lambda4

        self.unigram_counts = Counter()  # token -> number of occurrences
        self.vocab = set()               # distinct tokens seen in training
        self.total_tokens = 0            # total number of tokens seen
        self.V = 0                       # vocabulary size, len(self.vocab)

    def train(self, corpus):
        """Accumulate unigram counts from *corpus*.

        Args:
            corpus: Iterable of token sequences (each must support ``len``).

        Counts are added to the existing ones, so calling ``train`` again
        extends the model instead of resetting it.
        """
        for tokens in corpus:
            self.total_tokens += len(tokens)
            for w in tokens:
                self.unigram_counts[w] += 1

        self.vocab = set(self.unigram_counts.keys())
        self.V = len(self.vocab)

    def unigram_prob(self, w):
        """Return the add-one smoothed unigram probability of *w*.

        Raises:
            ZeroDivisionError: If the model has seen no tokens at all.
        """
        return (self.unigram_counts.get(w, 0) + 1) / (self.total_tokens + self.V)

    def hangman(self, incomplete_word, max_suggestions=5):
        """Suggest vocabulary words that fit a pattern with blanks.

        Args:
            incomplete_word: Pattern string in which every ``"_"`` stands
                for exactly one unknown character.
            max_suggestions: Maximum number of words to return.

        Returns:
            Matching vocabulary words, most probable first; words with equal
            probability are ordered alphabetically.
        """
        length = len(incomplete_word)
        # Only the known positions constrain a candidate; blanks match any
        # single character (including non-ASCII letters such as "ñ").
        known = [(i, ch) for i, ch in enumerate(incomplete_word) if ch != "_"]

        # Scan the vocabulary rather than enumerating every possible fill:
        # the cost depends on the vocabulary size, not on 26 ** blanks.
        matches = [
            word
            for word in self.vocab
            if isinstance(word, str)
            and len(word) == length
            and all(word[i] == ch for i, ch in known)
        ]
        # The word itself is the secondary key so ties are deterministic
        # even though set iteration order is not.
        matches.sort(key=lambda word: (-self.unigram_prob(word), word))
        return matches[:max_suggestions]
    

In [12]:
def split_data(corpus, seed=None):
    """Randomly split *corpus* into train / validation / test parts.

    The split is roughly 80% / 10% / 10%: the train and validation sizes are
    ``int(0.8 * n)`` and ``int(0.1 * n)``, and the test part receives
    whatever remains.

    Args:
        corpus: Sequence of items to split. It is not modified.
        seed: Optional seed for a reproducible shuffle. When ``None`` the
            module-level ``random`` generator is used.

    Returns:
        A ``(train, val, test)`` tuple of lists.
    """
    # Shuffle a copy so the caller's list keeps its original order.
    shuffled = list(corpus)
    if seed is None:
        random.shuffle(shuffled)
    else:
        random.Random(seed).shuffle(shuffled)

    train_size = int(0.8 * len(shuffled))
    val_size = int(0.1 * len(shuffled))
    val_end = train_size + val_size

    train = shuffled[:train_size]
    val = shuffled[train_size:val_end]
    test = shuffled[val_end:]
    return train, val, test

In [28]:
# Demo: train the model on a tiny corpus of one-word "sentences" and ask the
# Hangman feature to complete three patterns.
tokenized_corpus = [["adios"],["paola"], ["italia"], ["example"], ["language"], ["science"]]
# split_data shuffles before splitting, and with 6 items only
# int(0.8 * 6) == 4 of them end up in train, so the result varies per run.
train, val, test = split_data(tokenized_corpus)
model3 = EMInterpolatedNgramModel()
model3.train(train)
# hangman only suggests words from the training vocabulary: a pattern whose
# word was not placed in train prints an empty list.
examples = ["pa_l_", "_dios", "it_li_"]
for example in examples:
    print(f"Posibles palabras para {example}: {model3.hangman(example)}")

Posibles palabras para pa_l_: ['paola']
Posibles palabras para _dios: ['adios']
Posibles palabras para it_li_: ['italia']
