# Generating random words from NGrams
From 

- https://github.com/josh-freeman/HPshape/blob/main/examples/Word%20generation%20with%20N-Grams%20(Solution).ipynb
- https://github.com/josh-freeman/HPshape/blob/main/examples/ngrams.ipynb  

In [12]:
#!/usr/bin/env python3
!pip install tqdm 
# for the impatients...
import numpy as np
import itertools




[notice] A new release of pip available: 22.3.1 -> 23.0.1
[notice] To update, run: C:\Users\jfreeman\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [2]:
import nltk
words = nltk.download('words')
from nltk.corpus import words

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\jfreeman\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


# Core Model

In [13]:
class NGram ():
    
    end_token = '\x00'
    
    
    def __init__(self,n=3):
        
        alphabet  = [c for c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ-"]
        alphabet.append(self.end_token)
        all_words = np.array([x for x in words.words() if len(x) >= 2])
        np.random.shuffle(all_words)
        
        self.n = n
        self.alphabet = alphabet
        self.words = all_words
        self.n_characters = np.array([np.array(p) for p in itertools.product(alphabet, repeat = n)])
        
        self.combinations  =  np.array([np.array(p) for p in itertools.product(alphabet, repeat = n + 1)])
        self.counters      =  np.zeros((self.combinations.shape[0]), dtype=np.int64)
        self.probabilities =  None
        
    def update(self,n1,n2):
        from tqdm import tqdm
        n = self.n
        counters = self.counters
        combins  = self.combinations
        words    = self.words[n1:n2]
        end_tok  = self.end_token
        
        for w in tqdm(words):
            t = formatting(w,n,end_tok)
            for i in range(n, len(t)):
                gram = np.array(t[i-n:i+1])
                index = find_index(combins, gram)
                counters[index] += 1
                
    def compute_probabilities(self):
        
        probabilities = self.counters.reshape((len(self.n_characters), len(self.alphabet)))
        sum_probs     = probabilities.sum(axis=1, keepdims=True)
        
        #probabilities = np.nan_to_num(probabilities / , 0)
        
        self.probabilities = np.divide(probabilities,sum_probs,where=(sum_probs>0))
                
    def generate(self):
       end_token = self.end_token
       n         = self.n
       result = [end_token]*n
       while True:
           index  = find_index(self.n_characters, result[-n:])
           choice = np.random.choice(self.alphabet,p=fix_p(self.probabilities[index]))
           result.append(choice)
           if choice == np.array(end_token):
                    break
        
       return "".join(result)
   
    
    def probability (self,word):
       
       return(multi_probability(self.counters,self.combinations,word,self.n,self.end_token))
   


# Utilities

In [3]:
def formatting(w,n,end_token):
    return [end_token]*n + tokenizer(w) + [end_token]

In [4]:
def single_probability(counters,combinations,c, gram): 
    gram_plus_c = counters[find_index(combinations, np.array(list(gram) + [c]))]
    gram_total = np.sum(counters)
    
    if gram_total == 0:
        return 0
    else:
        return gram_plus_c / gram_total
    
def multi_probability(counters,combinations,w:str,n,end_token):
    
    probability = 1
    formatted = formatting(w,n,end_token)
    for i in range(n, len(formatted)-1): 
        c = formatted[i]
        gram = np.array(formatted[i-n:i])
        probability *= single_probability(counters,combinations,c,gram)
    return probability    

In [5]:
def fix_p(p):
     if p.sum() == 0:
         p = np.ones((len(p))) / len(p)
     elif p.sum() != 1.0:
         p = p*(1./p.sum())
     return p

In [6]:
def asvoid(arr):
    arr = np.ascontiguousarray(arr)
    return arr.view(np.dtype((np.void, arr.dtype.itemsize * arr.shape[-1])))

In [7]:
def find_index(arr, x):
    arr_as1d = asvoid(arr)
    x = asvoid(x)
    return np.nonzero(arr_as1d == x)[0][0]

In [14]:
def tokenizer(s:str):
    l = [c for c in s] 
    return l

N = 3 #@param {type:"slider", min:1, max:10, step:1}
ngram = NGram(N)

UPDATE_FROM = 0
UPDATE_TO = 2000 #@param {type:"slider", min:1, max:5000, step:1}
ngram.update(UPDATE_FROM,UPDATE_TO)

ngram.compute_probabilities()

WORDS_TO_GENERATE = 20 #@param {type:"slider", min:1, max:3072, step:1}
for i in range(WORDS_TO_GENERATE): print(ngram.generate())

 10%|▉         | 198/2000 [04:47<48:38,  1.62s/it] 