In [213]:
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
import random
# Download NLTK's 'punkt_tab' resource (Punkt tokenizer data); presumably
# required by word_tokenize, which NGram uses to tokenize the corpus.
nltk.download('punkt_tab')


In [210]:
class NGram:
    """
    Creates an n-gram Markov representation of a given corpus

    A state is a tuple of ``n`` consecutive tokens. For every state seen in
    the corpus the model stores how often each token followed it. Models are
    trained lazily, once per ``n``, and cached in ``trained_n_grams``.

    Methods
    -----
    generate_text(n, soft_length, starting_text=None)

    """

    # Sentence boundary markers wrapped around every tokenized sentence.
    START_TOKEN = "<s>"
    END_TOKEN = "<e>"

    def __init__(self, corpus, preprocess=True):
        """
        Parameters
        -----
        corpus: a list of sentences to be used for training
        preprocess: whether the corpus should be preprocessed
                    before being used for training (default: true).
                    When false, ``corpus`` must already be a list of token
                    lists, each wrapped in the "<s>" / "<e>" markers.
        """
        if preprocess:
            self.corpus = self._preprocess_text(corpus)
        else:
            # Copy so later mutation of the caller's lists cannot change
            # what the model is trained on.
            self.corpus = [list(tokens) for tokens in corpus]
        # Maps n -> {state tuple: {next token: count}}.
        self.trained_n_grams = {}

    def generate_text(self, n, soft_length, starting_text=None):
        """
        Generate text by walking the n-gram model.

        Parameters
        -----
        n: size of the state, in tokens (must be >= 1)
        soft_length: minimum number of words to produce; generation only
                     stops at a sentence end, so the result may be longer
        starting_text: optional seed text (split on whitespace). It is kept
                       at the beginning of the result and its last ``n``
                       words are used as the initial state. If that state
                       was never seen in the corpus, a new sentence is
                       started after the seed.

        Returns
        -----
        The generated words joined by single spaces.

        Raises
        -----
        ValueError: if ``n`` is smaller than 1, or if the corpus contains no
                    usable state beginning with "<s>" for this ``n``.
        """
        if n < 1:
            raise ValueError("n must be a positive integer")
        if n not in self.trained_n_grams:
            self._train_n_gram(n)
        model = self.trained_n_grams[n]

        words = []
        curr_state = None
        if starting_text:
            words = starting_text.split()
            # Left-pad with the start marker so a seed shorter than n words
            # can still match a sentence-opening state.
            curr_state = tuple(([self.START_TOKEN] + words)[-n:])
        if curr_state not in model:
            curr_state = self._pick_random_start(n)
            # Skip the leading "<s>" marker; the rest are real words.
            words.extend(curr_state[1:])

        while True:
            token = self._pick_next_state(curr_state, n)
            if token is not None and token != self.END_TOKEN:
                words.append(token)
                # Slide the window: drop the oldest token, add the new one.
                curr_state = curr_state[1:] + (token,)
                if curr_state in model:
                    continue

            # Reaching here means the sentence is over: either "<e>" was
            # drawn or the new state has no known successor.
            if len(words) >= soft_length:
                break
            curr_state = self._pick_random_start(n)
            # The opening words of the new sentence are part of the output.
            words.extend(curr_state[1:])

        return " ".join(words)

    def _pick_next_state(self, curr_state, n):
        """
        Sample the token following ``curr_state``, weighted by its frequency.

        Returns None when the state was never seen during training.
        """
        successors = self.trained_n_grams[n].get(tuple(curr_state))
        if not successors:
            return None
        # random.choices takes relative weights, so the raw counts suffice.
        return random.choices(
            list(successors), weights=list(successors.values())
        )[0]

    def _pick_random_start(self, n):
        """
        Return a uniformly chosen trained state that opens a sentence.

        Raises ValueError when the model for ``n`` has no such state.
        """
        start_keys = [
            key for key in self.trained_n_grams[n]
            if key[0] == self.START_TOKEN
        ]
        if not start_keys:
            raise ValueError(
                "no state starting with '<s>' is available for n=%d; the "
                "corpus is empty or its sentences are too short" % n
            )
        return random.choice(start_keys)

    def _preprocess_text(self, corpus):
        """Tokenize every sentence and wrap it in the "<s>" / "<e>" markers."""
        return [
            [self.START_TOKEN] + word_tokenize(text) + [self.END_TOKEN]
            for text in corpus
        ]

    def _train_n_gram(self, n):
        """
        Count, for every n-token state in the corpus, the tokens following it.

        The result is stored in ``self.trained_n_grams[n]``. Sentences with
        at most ``n`` tokens contribute nothing because they contain no
        state that is followed by another token.
        """
        n_gram = {}

        for tokens in self.corpus:
            for i in range(len(tokens) - n):
                curr_state = tuple(tokens[i:i + n])
                next_state = tokens[i + n]
                # Add to the existing successor counts; never replace them,
                # or earlier successors of this state would be lost.
                successors = n_gram.setdefault(curr_state, {})
                successors[next_state] = successors.get(next_state, 0) + 1

        self.trained_n_grams[n] = n_gram
                
                


In [211]:
# Build the model from `corpus`, which is not defined in the visible cells and
# must come from an earlier one; NGram documents it as a list of sentences.
n_gram_model = NGram(corpus)

In [212]:
# Generate text with 3-token states and a soft length of 30 words: generation
# stops at the first sentence end reached once at least 30 words exist.
n_gram_model.generate_text(3, 30)

'Attacks are coming thick and fast , primarily from the left , which is the largest auto plant in North America , twice as much as all other carmakers combined .'