# Here's our N-gram model: what we have so far

In [1]:
from nltk.lm.preprocessing import pad_both_ends
from nltk import ConditionalFreqDist
from nltk.probability import ConditionalProbDist, ELEProbDist
from nltk.util import pad_sequence
from nltk.lm.preprocessing import pad_both_ends
import re

2. numbers are a problem for n-gram models becayse there are so many of them. we don't want to eliminate them, because they are meaningful, but we want to abstract away from the individual numbers. In addition, we might want to get rid of some other things like parentheticals and quotes, becayse these impossible for our model to keep track of given it's amount of memory. We can take care of these things in the preprocessing function

Here is a final version of our class with all the bells and whistles

In [2]:
from nltk.lm.preprocessing import pad_both_ends
from nltk import ConditionalFreqDist
from nltk.probability import ConditionalProbDist, ELEProbDist
from nltk.util import pad_sequence
from nltk.lm.preprocessing import pad_both_ends
from functools import reduce

class NgramModel():

    
    def __init__(self, corpus, n):
        self.n = n
        tokenized_corpus = self._tokenize(corpus)
        self._ngrams = self._build_ngrams(tokenized_corpus, n)
        self._cpd = self._build_distribution(self._ngrams, n)        

    def _tokenize(self, corpus):
        # The list of regular expressions and replacements to be applied
        # the order here matters! these replacements will happen in order
        replacements = [
             ["[-\n]",                   " "] # Hyphens to whitespace
            ,[r'[][(){}#$%"]',           ""] # Strip unwanted characters like quotes and brackets
            ,[r'\s([./-]?\d+)+[./-]?\s', " [NUMBER] "] # Standardize numbers
            ,[r'\.{3,}',                 " [ELLIPSIS] "] # remove ellipsis
            ,[r'(\w)([.,?!;:])',         r'\1 \2' ]  # separate punctuation from previous word
        ]
        
        # This is a function that applies a single replacement from the list
        resub = lambda words, repls: re.sub(repls[0], repls[1], words)
        
        # we use the resub function to applea each replacement to the entire corpus,
        normalized_corpus = reduce(resub, replacements, corpus)
        
        
        sentences = normalized_corpus.split('.')
        
        tokens = []
        for sentence in sentences:
            words = sentence.split() # split on whitespace
            words = [word.lower() for word in words]
            words = list(pad_both_ends(words, n=self.n))
            tokens += words
        
        return tokens
            
    def _build_ngrams(self, tokenized_corpus, n):
        n_grams = []
        for i in range(n-1, len(tokenized_corpus)): 
            n_grams.append(tuple(tokenized_corpus[i-(n-1):i+1]))    
        return n_grams
    
    def _build_distribution(self, corpus, n):
               
        cfd = ConditionalFreqDist()
        for ngram in self._ngrams:
            condition = tuple(ngram[0:n-1]) 
            outcome = ngram[n-1]
            
            cfd[condition][outcome] += 1
        bins = len(cfd) # we have to pass the number of bins in our freq dist in as a parameter to probability distribution, so we have a bin for every word
        cpd = ConditionalProbDist(cfd, ELEProbDist, bins)
        self.cpd = cpd
        return cpd
        
    def generate(self, num_sentences = 1, seed = []):
        """
        There are two cases to deal with here. Either we have a start string, or we don't. 
        If we are given a start string, we'll have to find the last n-1 gram and condition on that
        If we are not, we need to generate the first n-1 gram. For a trigram model, we need a bigram. But how can we use our model to generate new words when we have fewer than two words to condition on?
        We can use a bigram model! But wait. If we have a bigram model, how do we generate the first token without another token to condition on? 
        We can use a unigram model! 
        Recursion will save us here. Turns out the easiest way to do this will be to recursively construct an n-1gram model and store it in the main model.
        And how can we 
        Either way, we need a seed condition to enter into the loop with.
        """

        # place to put generated tokens
        string = []

        if seed:
            string = string + (list(pad_sequence(seed, self.n, pad_left=True, pad_right=False, left_pad_symbol='<s>') ) )
        else:
            string = string + (list(pad_sequence('', self.n, pad_left=True, pad_right=False, left_pad_symbol='<s>') ) )
        
        for i in range(num_sentences):
            next_token = tuple(string[-(self.n-1):])
            
            # keep generating tokens as long as we havent reached the stop sequence
            while next_token != '</s>':
                
                # get the last n-1 tokens to condition on next
                lessgram = tuple(string[-(self.n-1):])

    
                next_token = self.cpd[lessgram].generate()
                string.append( next_token )

        string = ' '.join(string)
        string = add_stops(string)

        return string

    
    def add_stops(string):
        """
        function to convert the stop/start sequence back into periods.
        strips all the sequences of any number of stop tokens followed by the some number of start tokens
        and replaces them with a period.

        then strips any remaining stop and start sequences (which will occur at the beginning and end of our entire generated sequence)
        """
        string = re.sub(r"</s>(?:\s</s>)*\s<s>(?:\s<s>)*", ".", string)

        string = re.sub(r"(<s>\s)+", "", string) # initial tokens
        string = re.sub(r"(</s>)", "", string) # final token

        return string

In [3]:
def add_stops(string):
    """
    function to convert the stop/start sequence back into periods.
    strips all the sequences of any number of stop tokens followed by the some number of start tokens
    and replaces them with a period.

    then strips any remaining stop and start sequences (which will occur at the beginning and end of our entire generated sequence)
    """
    string = re.sub(r"</s>(?:\s</s>)*\s<s>(?:\s<s>)*", ".", string)

    string = re.sub(r"(<s>\s)+", "", string) # initial tokens
    string = re.sub(r"(</s>)", "", string) # final token

    return string

We try generating a 4-gram model with the King James Bible

Our model expects its training corpus in the form of a single string.

# Let's do a mashup

Intro to beautiful soup for scraping web text

In [4]:
# Define the file path
file_path = "dream_journal.txt"

# Read the content of the file and store it in a variable
with open(file_path, 'r') as file:
    text_dream = file.read()

# Now, the 'text_variable' holds the content of the file

In [39]:
model4 = NgramModel(text_dream, 2)

In [44]:
model4.generate(5)

"she can get sunlight and shove them in french !** i'm trying to chase the camp . they've been placing creepy doll versions of other side of her pockets , i need to my partner knows what's going to abruptly join the ai track . big lots , speed walking through the audience their period stopped . i had to my yard , standing beside the ride . three dimensional layered with hands at a fucking bear trying to steal stuff in such a result , my friend comes from the camp [ellipsis] i'm at a woman take things we're salvaging "

In [7]:
!pip3 install beautifulsoup4

from bs4 import *

import requests

url = 'https://theanarchistlibrary.org/library/david-graeber-anarchy-in-a-manner-of-speaking'
res = requests.get(url)
html_page = res.text

# Parse the source code using BeautifulSoup
soup = BeautifulSoup(html_page, 'html.parser')

# Extract the plain text content
text = soup.get_text()

# Print the plain text





In [8]:
!pip3 install beautifulsoup4

from bs4 import *

import requests


url = 'https://theanarchistlibrary.org/library/laboria-cuboniks-xenofeminism'
res = requests.get(url)
html_page = res.text

# Parse the source code using BeautifulSoup
soup = BeautifulSoup(html_page, 'html.parser')

# Extract the plain text content
text2 = soup.get_text()

# Print the plain text





In [9]:
# Define the file path
file_path = "profane_quotes.txt"

# Read the content of the file and store it in a variable
with open(file_path, 'r') as file:
    text_profane = file.read()

# Now, the 'text_variable' holds the content of the file



In [10]:
# Define the file path
file_path = "chiang.txt"

# Read the content of the file and store it in a variable
with open(file_path, 'r') as file:
    text_chiang = file.read()

# Now, the 'text_variable' holds the content of the file


In [11]:
# Define the file path
file_path = "gex.txt"

# Read the content of the file and store it in a variable
with open(file_path, 'r') as file:
    text_gex = file.read()

# Now, the 'text_variable' holds the content of the file



In [12]:
print(len(text_gex))
print(len(text_profane))
print(len(text_profane * 6))

3656
5330
31980


In [13]:

mashup = text_gex + text_profane


In [14]:
model = NgramModel(mashup, 2)


In [15]:
model.generate(5)

"otherwise , never lost a fight , bite your boots and now on the compression i'll haunt your hit me talk yo' face swipe your mission , seven days a doge , feeling like pain if i got a thing that shit jobs that thing like oh , no way to refuse work before a way to society ; it's just to tell him some' to make love , kill policemen , look , he's a dive tie me with some whores cardi b , seven days a nigga , before a way similar . the bar i felt the blood on this ring ayy , the phone is between jobs :' since people are you let me up 'fore he can't bang you to believe in a disguise i couldn't go i'll never tell him 'fore he fuck me yeah , that's where to think that's ok doing now on his mind is only reason i wear a thing like his credit card hop on top , come take a phone just playin' you made his mind and poverty of words when it is comprehensible to bear in the bone good shit jobs :' since people are paid and you made his mic' and you yet again that shit but i could make me with some whor

In [16]:
# Define the file path
file_path = "carly.txt"

# Read the content of the file and store it in a variable
with open(file_path, 'r') as file:
    text_carly = file.read()

# Now, the 'text_variable' holds the content of the file


In [17]:
# Define the file path
file_path = "duncan.txt"

# Read the content of the file and store it in a variable
with open(file_path, 'r') as file:
    text_duncan = file.read()

# Now, the 'text_variable' holds the content of the file



In [18]:
print(len(text_carly))
print(len(text_chiang))
print(len(text_carly * 2))

15332
37635
30664


In [19]:
mashup2 = text_carly * 3 + text_chiang

In [20]:
model2 = NgramModel(mashup2, 2)

In [21]:
model2.generate(5)

"at last bit of us from ours ? hey , the memory was locked in my chest . the shadow of the limbs will assume that all life is far beyond any other , as an hour to reconstruct the point . with a thread , and replacing our universe as if they needed to discuss the empty lungs with me , i quit smoking those cigarettes but all pressure it on occasion , i realized that maybe ? it's like it's in fatal flaw . one another bad dreams get into the source of its public squares and no less force exerted as persistent currents generated within our movements depended on foil pages visible , the tumult of my relationship with this annual celebration , struggling against the edge of finding the flow of absolute equilibrium . reprinted by contrast , for they worked my initial auto dissection was made "

In [22]:
# Define the file path
file_path = "omelas.txt"

# Read the content of the file and store it in a variable
with open(file_path, 'r') as file:
    text_omelas = file.read()

# Now, the 'text_variable' holds the content of the file



In [23]:
# Define the file path
file_path = "chiang.txt"

# Read the content of the file and store it in a variable
with open(file_path, 'r') as file:
    text_chiang = file.read()

# Now, the 'text_variable' holds the content of the file


In [24]:
print(len(text_omelas))
print(len(text_gex))
print(len(text_gex * 4))

15546
3656
14624


In [25]:
mashup3 = text_omelas + text_gex * 4

In [26]:
model3 = NgramModel(mashup3, 2)

In [27]:
model3.generate(5)

"but to most of mauve and chatting as you get the place even granted trains , parades , ok doing now it lives were mature , and it . “i will be proclaimed upon a description such as you like a city , impotence , bleh . omelas are between the child were vastly excited , join the racecourse are perhaps it with frightened , of a day or genitals , baby brand new mercedes i've been at least , crying 'bout , stand up into the ways of its legs , between the unnecessary but if you can't ever to see , small , could convince you circled me singing . “please let them neigh in . for help at night falls ; in rank along the beauty of sex beyond all but what i'm dead but the bitter injustice dry when it only animal who walk away from time "

Wow!

'among the hills that are weaned from the waters saw thee polluted in thy glory above all people , from beersheba to mount up with . [number] : [number] and shaalabbin , and partly broken . report , that jehoshaphat the king is among us still believed in hope ; patient in spirit ; and half of thy power preserve thou those that served in the womb : if jacob take a lump of figs were set there upon him shall inherit all things thereon . in most militants this search for my gold and the lord separated the sons shall eat clean provender , which loveth thee and abishai , and kings have had dominion over our cattle . then he sacrificed also and to whomsoever he will prosper us ; thus have been occupied therein . yellowed figures of cherubims and palm trees : they serve not thy left side , upon their altars : but according to our hand be upon every fowl of the european union . all these did moses command joshua , this do ye look on us ; because a deep sleep fell upon it before saul : [number] wise men , let them turn their mourning . after theo’s rape , a strong wind ? [number] : [number] open thou mine affliction . to him remaining . rather comically , he took counsel how they might attain to innocency ? [number] : [number] for behold the place hormah  '

# Exercise / Homework??

Make a mashup of two texts. They can be texts you wrote (a collection of tweets, an essay), or from anywhere. You can use libgen to find books and Calibre to convert them to text. Either paste the text directly into a notebook or use a Python utility for reading files.

In [28]:
alice = (' ').join(corpiss.gutenberg.words('carroll-alice.txt'))
now = text

print(len(alice))
print(len(text))

NameError: name 'corpiss' is not defined

In [None]:
mashups = alice + text
model = NgramModel(mashups, 2)


In [None]:
model.generate(5)

# References

most used: 
* https://notebook.community/luketurner/ipython-notebooks/notebooks/n-gram%20tutorial
* https://medium.com/analytics-vidhya/a-comprehensive-guide-to-build-your-own-language-model-in-python-5141b3917d6d
* https://towardsdatascience.com/simulating-text-with-markov-chains-in-python-1a27e6d13fc6

others:
* https://eliteai-coep.medium.com/building-n-gram-language-model-from-scratch-9a5ec206b520
* https://github.com/joshualoehr/ngram-language-model/blob/master/language_model.py
* http://www.pygaze.org/2016/03/how-to-code-twitter-bot/
    - code: https://github.com/esdalmaijer/markovbot
* https://towardsdatascience.com/implementing-a-character-level-trigram-language-model-from-scratch-in-python-27ca0e1c3c3f