# Setup

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import xml.etree.ElementTree as ET # Needed for corpus processing
import random

# Part Three Instructions

Use your bigram model to **generate** sentences. Starting with \<s>, generate 100 sentences
by sampling the next word according to the bigram probabilities associated with your model.  

For example, after generating the word the, if your model says that  

p(dog|the) = 0.6  
p(frog|the) = 0.3  
p(zebra|the) = 0.1    

the word dog will be generated 60% of the time, frog 30% of the time and zebra 10% of the
time.  

Train your model on a publicly available text corpus in a language and genre of your choice.

# Part Three

As before, the first step is to set up the Python class which represents our model. In this case however, I have added an extra function called **generate_n_sentences** which generates a specified amount of sentences, n.

In [2]:
class SmoothedBigramLM:
    """
    Class to represent an unsmoothed bigram LM.
    """

    def __init__(self, corpus=[]):
        """
        Corpus object is a list of the sentences in the corpus as strings.
        Sentence start and end markers appear in each string.
        The unigrams and bigrams are extracted on instantiation.
        """
        self.corpus = corpus
        self.unigram_counts = self.count_unigrams(corpus)
        self.bigram_counts = self.count_bigrams(corpus)

    def count_unigrams(self, corpus):
        """
        Creates a dict with every unigram in the corpus and 
        a count of how many times it appears in the corpus.
        """
        counts = {}
        for s in corpus:
            # Tokenise sentence
            toks = s.split()
            for tok in toks:
                #Normalise case
                tok = tok.casefold()
                if tok in counts:
                    counts[tok] += 1
                else:
                    counts[tok] = 1
        # Return dict with unigram counts
        return counts

    def count_bigrams(self, corpus):
        """
        Creates a dict with every bigram in the corpus and 
        a count of how many times it appears in the corpus.
        """
        counts = {}
        for s in corpus:
            # Tokenise sentence
            toks = s.split()
            for tok1, tok2 in zip(toks, toks[1:]):
                # Normalise case
                tok1 = tok1.casefold()
                tok2 = tok2.casefold()
                bigram = (tok1, tok2)
                if bigram in counts:
                    counts[bigram] += 1
                else:
                    counts[bigram] = 1
        #Return dict with bigram counts
        return counts
        
                 
    def p_unigram(self, word):
        """
        Returns the probability of a unigram w.
        """
        # Strip whitespace and normalise case
        w = word.strip().casefold()

        # Make sure w is only one word
        if len(w.split()) > 1:
            raise ValueError

        #  p(w) = count("w") / count(unigrams)
        return self.unigram_counts[w] / sum(self.unigram_counts.values())
    
    def p_bigram(self, word, prev_word):
        """
        Returns the probability of a bigram - "prev_word word".
        """
        # Strip whitespace and normalise case
        w1 = prev_word.strip().casefold()
        w2 = word.strip().casefold()

        # Make sure both are one word
        if (len(w1.split()) > 1) or (len(w2.split()) > 1):
            raise ValueError
            
        # If bigram has been seen before
        try:
            # p(w2 | w1) = count("w1 w2") / count("w1")
            bigram = (w1, w2)
            return self.bigram_counts[bigram] / self.unigram_counts[w1]
        
        # If bigram unseen
        except KeyError:
            return 0

    def p_word_seq(self, s):
        """
        Returns the probability of a word sequence s.
        """
        # Strip whitespace, normalise case and tokenise
        toks = s.strip().casefold().split()
        
        # Initialise list to store probabilities
        probs = []
        probs.append(self.p_unigram(toks[0]))
        for prev, curr in zip(toks, toks[1:]):
            # Add interpolated smoothing
            probs.append((0.5 * self.p_bigram(curr, prev)) + (0.5 * self.p_unigram(curr)))
        
        #Return product of probs
        return np.prod(probs), self.p_log_space(probs)
    
    def p_log_space(self, probs):
        """
        Returns the log probability of list of word probabilities.
        """
        total = 0
        for p in probs:
            total += np.log(p)
        return total
    
    def generate_n_sentences(self, n):
        sentences = []
        tok1 = "<s>"
        
        i = 1
        # Generate n sentences
        while len(sentences) != n:
            print(f"Generating Sentence {i}")
            
            prev_tok = tok1
            toks = [prev_tok]
            # Keep going until we generate a sentence end marker
            while prev_tok != "</s>":
                
                # Calculates counts and probabilities for next_tok
                counts = {k: v for k, v in self.bigram_counts.items() if k[0] == prev_tok}
                probs = {k: v / sum(counts.values()) for k, v in self.bigram_counts.items() if k[0] == prev_tok}
            
                # Make a random choice based on the probabilities
                try:
                    next_tok = random.choices(list(probs.keys()), weights = probs.values(), k = 1)[0][1]
                # Catch erroneous generations and discard sentence
                except IndexError:
                    break
                
                toks.append(next_tok)
                prev_tok = next_tok
            sentences.append(" ".join(toks))
            i += 1
        
        return sentences

Before I can generate sentences, I have to choose a publicly available text corpus in a language and genre of my choice. The corpus I have chosen contains political apologies, in English, from politicians in the UK, US and Canada. There is a plethora of metadata along with each apology but for the purpose of generating new sentences, we only require the text itself.  

More details about this corpus are given in the README file, and below I extract the sentences from an XML file and write them sentence-by-sentence to a text file for ease.

In [3]:
def extract_sentences_from_xml_to_txt(xml_path, txt_path):
    """
    Function to extract text data from XML file and write this to a TXT file.
    """
    with open(txt_path, 'a') as txt_file:
        
        # Initialise a list to store sentences
        sentences = []
    
        tree = ET.parse(xml_path)
    
        excuses = tree.getroot()[1]
        for i, excuse in enumerate(excuses):
            print(f"Processing: Excuse {i}")
            text = excuse.find("text").text
        
            # Strip whitespace and tokenise text into sentences
            toks = text.strip().split(".")[:-1]
        
            for tok in toks:
                tok = "<s> " + tok.strip() + " . </s>"
                txt_file.write(tok + "\n")

In [4]:
# Don't run again
xml_path = r"data\politician_excuses_en\Excuses.xml"
txt_path = r"data\politician_excuses_en.txt"

# extract_sentences_from_xml_to_txt(xml_path, txt_path)

Now, I read back in the data from the txt file.

In [5]:
# Read in the data
with open(txt_path, 'r') as f:
    corpus = f.read().split("\n")

# Make sure data is OK
corpus

['<s> I agree with those who said that in my first statement after I testified that it was not contrived enough . </s>',
 '<s> I don`t think there is a fancy way to say that I have sinned . </s>',
 '<s> It is important for me that everybody who has been hurt to know that the sorrow I feel is genuine . </s>',
 '<s> First, the most important - my family, also my friends, my staff, my cabinet, Monica Lewinsky and her family, and the American people . </s>',
 '<s> I have asked all for their forgiveness, but I believe that to be forgiven more than sorrow is required . </s>',
 '<s> At least two more things . </s>',
 '<s> First, genuine repentance and determination to change and repair breaches of my own making . </s>',
 '<s> Yes, I apologize . </s>',
 '<s> Not only do I apologize to secretary Clinton and I hope that we can work together on the independent investigation . </s>',
 '<s> I want to apologize to my supporters . </s>',
 '<s> It`s not the title of campaign that we`ve run and if I fi

Checking the amount of sentences...

In [6]:
len(corpus)

1261

There are some erroneous sentences in the list of the form: '\<s>  . \</s>'. I remove these below.

In [7]:
corpus[:] = [s for s in corpus if s != '<s>  . </s>']

In [8]:
len(corpus)

1220

In [9]:
corpus

['<s> I agree with those who said that in my first statement after I testified that it was not contrived enough . </s>',
 '<s> I don`t think there is a fancy way to say that I have sinned . </s>',
 '<s> It is important for me that everybody who has been hurt to know that the sorrow I feel is genuine . </s>',
 '<s> First, the most important - my family, also my friends, my staff, my cabinet, Monica Lewinsky and her family, and the American people . </s>',
 '<s> I have asked all for their forgiveness, but I believe that to be forgiven more than sorrow is required . </s>',
 '<s> At least two more things . </s>',
 '<s> First, genuine repentance and determination to change and repair breaches of my own making . </s>',
 '<s> Yes, I apologize . </s>',
 '<s> Not only do I apologize to secretary Clinton and I hope that we can work together on the independent investigation . </s>',
 '<s> I want to apologize to my supporters . </s>',
 '<s> It`s not the title of campaign that we`ve run and if I fi

In [18]:
words = 0
for s in corpus:
    toks = s.split()[1:-2]
    words += len(toks)

In [19]:
words

20521

I'm now ready to instantiate the bigram language model...

In [10]:
LM = SmoothedBigramLM(corpus)

Checking the counts...

In [11]:
LM.unigram_counts

{'<s>': 1169,
 'i': 746,
 'agree': 6,
 'with': 96,
 'those': 61,
 'who': 83,
 'said': 23,
 'that': 376,
 'in': 345,
 'my': 301,
 'first': 16,
 'statement': 6,
 'after': 4,
 'testified': 2,
 'it': 181,
 'was': 202,
 'not': 153,
 'contrived': 1,
 'enough': 4,
 '.': 1169,
 '</s>': 1169,
 'don`t': 1,
 'think': 14,
 'there': 36,
 'is': 144,
 'a': 334,
 'fancy': 1,
 'way': 28,
 'to': 794,
 'say': 26,
 'have': 202,
 'sinned': 1,
 'important': 15,
 'for': 338,
 'me': 72,
 'everybody': 2,
 'has': 78,
 'been': 76,
 'hurt': 19,
 'know': 43,
 'the': 978,
 'sorrow': 3,
 'feel': 15,
 'genuine': 3,
 'first,': 8,
 'most': 23,
 '-': 32,
 'family,': 13,
 'also': 21,
 'friends,': 12,
 'staff,': 3,
 'cabinet,': 1,
 'monica': 2,
 'lewinsky': 3,
 'and': 765,
 'her': 4,
 'american': 15,
 'people': 71,
 'asked': 7,
 'all': 85,
 'their': 63,
 'forgiveness,': 1,
 'but': 70,
 'believe': 24,
 'be': 93,
 'forgiven': 1,
 'more': 34,
 'than': 21,
 'required': 1,
 'at': 58,
 'least': 2,
 'two': 8,
 'things': 6,
 'rep

In [12]:
LM.bigram_counts

{('<s>', 'i'): 205,
 ('i', 'agree'): 1,
 ('agree', 'with'): 3,
 ('with', 'those'): 3,
 ('those', 'who'): 14,
 ('who', 'said'): 1,
 ('said', 'that'): 1,
 ('that', 'in'): 4,
 ('in', 'my'): 16,
 ('my', 'first'): 1,
 ('first', 'statement'): 1,
 ('statement', 'after'): 1,
 ('after', 'i'): 1,
 ('i', 'testified'): 2,
 ('testified', 'that'): 1,
 ('that', 'it'): 10,
 ('it', 'was'): 39,
 ('was', 'not'): 21,
 ('not', 'contrived'): 1,
 ('contrived', 'enough'): 1,
 ('enough', '.'): 1,
 ('.', '</s>'): 1169,
 ('i', 'don`t'): 1,
 ('don`t', 'think'): 1,
 ('think', 'there'): 1,
 ('there', 'is'): 11,
 ('is', 'a'): 14,
 ('a', 'fancy'): 1,
 ('fancy', 'way'): 1,
 ('way', 'to'): 4,
 ('to', 'say'): 10,
 ('say', 'that'): 5,
 ('that', 'i'): 66,
 ('i', 'have'): 66,
 ('have', 'sinned'): 1,
 ('sinned', '.'): 1,
 ('<s>', 'it'): 49,
 ('it', 'is'): 23,
 ('is', 'important'): 3,
 ('important', 'for'): 1,
 ('for', 'me'): 6,
 ('me', 'that'): 4,
 ('that', 'everybody'): 1,
 ('everybody', 'who'): 1,
 ('who', 'has'): 5,
 ('h

## Sentence Generation

The final step is to generate 100 sentences by sampling the next word based on the bigram probabilities associated with the model.

In [13]:
sentences = LM.generate_n_sentences(100)

Generating Sentence 1
Generating Sentence 2
Generating Sentence 3
Generating Sentence 4
Generating Sentence 5
Generating Sentence 6
Generating Sentence 7
Generating Sentence 8
Generating Sentence 9
Generating Sentence 10
Generating Sentence 11
Generating Sentence 12
Generating Sentence 13
Generating Sentence 14
Generating Sentence 15
Generating Sentence 16
Generating Sentence 17
Generating Sentence 18
Generating Sentence 19
Generating Sentence 20
Generating Sentence 21
Generating Sentence 22
Generating Sentence 23
Generating Sentence 24
Generating Sentence 25
Generating Sentence 26
Generating Sentence 27
Generating Sentence 28
Generating Sentence 29
Generating Sentence 30
Generating Sentence 31
Generating Sentence 32
Generating Sentence 33
Generating Sentence 34
Generating Sentence 35
Generating Sentence 36
Generating Sentence 37
Generating Sentence 38
Generating Sentence 39
Generating Sentence 40
Generating Sentence 41
Generating Sentence 42
Generating Sentence 43
Generating Sentence 

In [14]:
sentences

['<s> how do that i and that this has happened . </s>',
 '<s> i am very real security policy . </s>',
 '<s> my record of utah, lead the same . </s>',
 '<s> good job . </s>',
 '<s> our government of many of dupont, its regret anything at a small accomplishment to those of the choice of iraq in a flight at humor to meeting with the statesman has bullied, attacked, shamed and i need fewer democrats . </s>',
 '<s> i gave offence to offer our shared a number of the situation was reacting as they each other examples of a small number of the middle class that i have treated me i have been enriched . </s>',
 '<s> ” “i have awaited our god . </s>',
 '<s> you expected . </s>',
 '<s> res . </s>',
 '<s> unfortunately, i extend my father would never have caused to everyone for that, mr . </s>',
 '<s> since 1981, the very sad chapter in gaza with monica lewinsky . </s>',
 '<s> i would start a desire to all canadians on sunday . </s>',
 '<s> "what is the floor at belgium and your forgiveness . </s>',

And now, I can write these sentences to a to a txt file...

In [17]:
# Don't run again
txt_path = r"generated_sentences.txt"

with open(txt_path, 'a') as txt_file:
    for sentence in sentences:
        txt_file.write(sentence + "\n")