# Here's our N-gram model: what we have so far

In [40]:
from nltk.lm.preprocessing import pad_both_ends
from nltk import ConditionalFreqDist
from nltk.probability import ConditionalProbDist, ELEProbDist
from nltk.util import pad_sequence
from nltk.lm.preprocessing import pad_both_ends
import re

In [41]:
corpus = """By this liberty they entered into a very laudable emulation to do all of them \
what they saw did please one. If any of the gallants or ladies should say, Let us drink, \
they would all drink.  If any one of them said, Let us play, they all played.  If one said, \
Let us go a-walking into the fields they went all."""

In [42]:
# get the ngrams for a corpus
def ngrams(text, n):
    n_grams = []
    for i in range(n-1, len(tokenized_corpus)): 
        n_grams.append(tuple(tokenized_corpus[i-(n-1):i+1]))
    return n_grams



In [43]:



class NgramModel():

    
    def __init__(self, corpus, n):
        self.n = n
        tokenized_corpus = self._tokenize(corpus)
        self._ngrams = self._build_ngrams(tokenized_corpus, n)
        self._cpd = self._build_distribution(self._ngrams, n)        
        
    def _tokenize(self, corpus):
        
        tokenized_corpus = []
        
        # separate punctuation from previous word
        spaced_corpus = re.sub(r'(\w)([.,?!;:])', r'\1 \2', corpus) 
        
        # split into sentences
        sentences = spaced_corpus.split('.')
        for sentence in sentences:
            words = sentence.split() # split on whitespace
            words = [word.lower() for word in words]
            words = list(pad_both_ends(words, n=self.n))
            tokenized_corpus += words
        
        return tokenized_corpus
            
    def _build_ngrams(self, tokenized_corpus, n):
        n_grams = []
        for i in range(n-1, len(tokenized_corpus)): 
            n_grams.append(tuple(tokenized_corpus[i-(n-1):i+1]))    
        return n_grams
    
    def _build_distribution(self, corpus, n):
               
        cfd = ConditionalFreqDist()
        for ngram in self._ngrams:
            condition = tuple(ngram[0:n-1]) 
            outcome = ngram[n-1]
            
            cfd[condition][outcome] += 1
        bins = len(cfd) # we have to pass the number of bins in our freq dist in as a parameter to probability distribution, so we have a bin for every word
        cpd = ConditionalProbDist(cfd, ELEProbDist, bins)
        self.cpd = cpd
        return cpd
        
    def generate(self, num_sentences = 1, seed = []):
        """
        There are two cases to deal with here. Either we have a start string, or we don't. 
        If we are given a start string, we'll have to find the last n-1 gram and condition on that
        If we are not, we need to generate the first n-1 gram. For a trigram model, we need a bigram. But how can we use our model to generate new words when we have fewer than two words to condition on?
        We can use a bigram model! But wait. If we have a bigram model, how do we generate the first token without another token to condition on? 
        We can use a unigram model! 
        Recursion will save us here. Turns out the easiest way to do this will be to recursively construct an n-1gram model and store it in the main model.
        And how can we 
        Either way, we need a seed condition to enter into the loop with.
        """

        # place to put generated tokens
        string = []

        if seed:
            string = string + (list(pad_sequence(seed, self.n, pad_left=True, pad_right=False, left_pad_symbol='<s>') ) )
        else:
            string = string + (list(pad_sequence('', self.n, pad_left=True, pad_right=False, left_pad_symbol='<s>') ) )
        
        for i in range(num_sentences):
            next_token = tuple(string[-(self.n-1):])
            
            # keep generating tokens as long as we havent reached the stop sequence
            while next_token != '</s>':
                
                # get the last n-1 tokens to condition on next
                lessgram = tuple(string[-(self.n-1):])

    
                next_token = self.cpd[lessgram].generate()
                string.append( next_token )

        string = ' '.join(string)

        return string

        
        

# Scaling up

If we keep increasing n, our generated text starts to repeat our input text almost word for word. To get interesting behavior, we have to increase the size of the corpus. Let's try with a much bigger corpus!

In [44]:
import nltk
!python3 -m nltk.downloader gutenberg


[nltk_data] Downloading package gutenberg to /home/jovyan/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


NLTK comes with several built in corpora, including a selection of books from project gutenberg

In [45]:
# import corpus using an alias to avoid namespace confustion with our corpus variable
from nltk import corpus as corpiss 

corpiss.gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [46]:
kjv = corpiss.gutenberg.words('shakespeare-macbeth.txt')
list(kjv)[:40]

['[',
 'The',
 'Tragedie',
 'of',
 'Macbeth',
 'by',
 'William',
 'Shakespeare',
 '1603',
 ']',
 'Actus',
 'Primus',
 '.',
 'Scoena',
 'Prima',
 '.',
 'Thunder',
 'and',
 'Lightning',
 '.',
 'Enter',
 'three',
 'Witches',
 '.',
 '1',
 '.',
 'When',
 'shall',
 'we',
 'three',
 'meet',
 'againe',
 '?',
 'In',
 'Thunder',
 ',',
 'Lightning',
 ',',
 'or',
 'in']

### A Few final adjustments

We have some housekeeping things to take care of. 

1. Because we have encoded sentence breaks as a string of start and stop sequences, we now will generate a lot of them in our output. We add function to strip them out, and update our generate method to strip out these tokens before printing



In [47]:
def add_stops(string):
    """
    function to convert the stop/start sequence back into periods.
    strips all the sequences of any number of stop tokens followed by the some number of start tokens
    and replaces them with a period.

    then strips any remaining stop and start sequences (which will occur at the beginning and end of our entire generated sequence)
    """
    string = re.sub(r"</s>(?:\s</s>)*\s<s>(?:\s<s>)*", ".", string)

    string = re.sub(r"(<s>\s)+", "", string) # initial tokens
    string = re.sub(r"(</s>)", "", string) # final token

    return string


test_string = '<s> <s> <s> take mark , and see now , and humble ye them , and seethe his flesh in running water , and be slain , and they rode upon the camels , and have washed their robes , and made unto themselves of the holy angels , and to solomon his son </s> </s> </s> <s> <s> <s> even so , father ; for the press is full , the fats overflow ; for their wickedness </s> </s> </s> <s> <s> <s> 119 : 59 i thought on my ways , as a seal upon him , and would none of my words </s> </s> </s> <s> <s> <s> 30 : 37 and jacob took him rods of green poplar , and of beast : it is most holy unto him of his labour the days of jehoahaz </s> </s> </s> <s> <s> <s> 97 : 3 a man shall dig a pit , and sold joseph to the ishmeelites for twenty pieces of silver out of the sheath thereof , and add unto it the fifth part unto pharaoh , and say to hezekiah , thus saith god the lord , after this manner therefore pray ye : our father which art in heaven , now is the judgment of moab </s> </s> </s> <s> <s> <s> 134 : 3 the aged women likewise , that they escaped all safe to land </s> </s> </s> <s> <s> <s> spots they are and blemishes , sporting themselves with their own works , as god liveth , who hath begotten me these , seeing i have rejected him from reigning over israel ? 15 : 37 and reuben spake unto his brother a name in israel , telleth the king of lachish , bind the tire of thine head upon thee , until it was a river that i could withstand god ? 11 : 30 are they not written in this book : worship god : for man would swallow me up </s> </s> </s> <s> <s> <s> unto him that giveth his neighbour drink , that puttest thy bottle to him , rise , and measure the temple of babylon , my servant deceived me : my moisture is turned into mourning </s> </s> </s> <s> <s> <s> 106 : 18 and samuel told him every whit , and hid snares for my feet </s> </s> </s> <s> <s> <s>'
model = NgramModel(corpus, 3)
add_stops(test_string)

'take mark , and see now , and humble ye them , and seethe his flesh in running water , and be slain , and they rode upon the camels , and have washed their robes , and made unto themselves of the holy angels , and to solomon his son . even so , father ; for the press is full , the fats overflow ; for their wickedness . 119 : 59 i thought on my ways , as a seal upon him , and would none of my words . 30 : 37 and jacob took him rods of green poplar , and of beast : it is most holy unto him of his labour the days of jehoahaz . 97 : 3 a man shall dig a pit , and sold joseph to the ishmeelites for twenty pieces of silver out of the sheath thereof , and add unto it the fifth part unto pharaoh , and say to hezekiah , thus saith god the lord , after this manner therefore pray ye : our father which art in heaven , now is the judgment of moab . 134 : 3 the aged women likewise , that they escaped all safe to land . spots they are and blemishes , sporting themselves with their own works , as god 

2. numbers are a problem for n-gram models becayse there are so many of them. we don't want to eliminate them, because they are meaningful, but we want to abstract away from the individual numbers. In addition, we might want to get rid of some other things like parentheticals and quotes, becayse these impossible for our model to keep track of given it's amount of memory. We can take care of these things in the preprocessing function

In [48]:
from functools import reduce

def _tokenize(self, corpus):
    # The list of regular expressions and replacements to be applied
    # the order here matters! these replacements will happen in order
    replacements = [
         ["[-\n]",                   " "] # Hyphens to whitespace
        ,[r'[][(){}#$%"]',           ""] # Strip unwanted characters like quotes and brackets
        ,[r'\s([./-]?\d+)+[./-]?\s', " [NUMBER] "] # Standardize numbers
        ,[r'\.{3,}',                 " [ELLIPSIS] "] # remove ellipsis
        ,[r'(\w)([.,?!;:])',         r'\1 \2' ]  # separate punctuation from previous word
    ]

    # This is a function that applies a single replacement from the list
    resub = lambda words, repls: re.sub(repls[0], repls[1], words)

    # we use the resub function to applea each replacement to the entire corpus,
    normalized_corpus = reduce(resub, replacements, corpus)


    sentences = normalized_corpus.split('.')

    tokens = []
    for sentence in sentences:
        words = sentence.split() # split on whitespace
        words = [word.lower() for word in words]
        words = list(pad_both_ends(words, n=self.n))
        tokens += words

    return tokens

Here is a final version of our class with all the bells and whistles

In [49]:
from nltk.lm.preprocessing import pad_both_ends
from nltk import ConditionalFreqDist
from nltk.probability import ConditionalProbDist, ELEProbDist
from nltk.util import pad_sequence
from nltk.lm.preprocessing import pad_both_ends
from functools import reduce

class NgramModel():

    
    def __init__(self, corpus, n):
        self.n = n
        tokenized_corpus = self._tokenize(corpus)
        self._ngrams = self._build_ngrams(tokenized_corpus, n)
        self._cpd = self._build_distribution(self._ngrams, n)        

    def _tokenize(self, corpus):
        # The list of regular expressions and replacements to be applied
        # the order here matters! these replacements will happen in order
        replacements = [
             ["[-\n]",                   " "] # Hyphens to whitespace
            ,[r'[][(){}#$%"]',           ""] # Strip unwanted characters like quotes and brackets
            ,[r'\s([./-]?\d+)+[./-]?\s', " [NUMBER] "] # Standardize numbers
            ,[r'\.{3,}',                 " [ELLIPSIS] "] # remove ellipsis
            ,[r'(\w)([.,?!;:])',         r'\1 \2' ]  # separate punctuation from previous word
        ]
        
        # This is a function that applies a single replacement from the list
        resub = lambda words, repls: re.sub(repls[0], repls[1], words)
        
        # we use the resub function to applea each replacement to the entire corpus,
        normalized_corpus = reduce(resub, replacements, corpus)
        
        
        sentences = normalized_corpus.split('.')
        
        tokens = []
        for sentence in sentences:
            words = sentence.split() # split on whitespace
            words = [word.lower() for word in words]
            words = list(pad_both_ends(words, n=self.n))
            tokens += words
        
        return tokens
            
    def _build_ngrams(self, tokenized_corpus, n):
        n_grams = []
        for i in range(n-1, len(tokenized_corpus)): 
            n_grams.append(tuple(tokenized_corpus[i-(n-1):i+1]))    
        return n_grams
    
    def _build_distribution(self, corpus, n):
               
        cfd = ConditionalFreqDist()
        for ngram in self._ngrams:
            condition = tuple(ngram[0:n-1]) 
            outcome = ngram[n-1]
            
            cfd[condition][outcome] += 1
        bins = len(cfd) # we have to pass the number of bins in our freq dist in as a parameter to probability distribution, so we have a bin for every word
        cpd = ConditionalProbDist(cfd, ELEProbDist, bins)
        self.cpd = cpd
        return cpd
        
    def generate(self, num_sentences = 1, seed = []):
        """
        There are two cases to deal with here. Either we have a start string, or we don't. 
        If we are given a start string, we'll have to find the last n-1 gram and condition on that
        If we are not, we need to generate the first n-1 gram. For a trigram model, we need a bigram. But how can we use our model to generate new words when we have fewer than two words to condition on?
        We can use a bigram model! But wait. If we have a bigram model, how do we generate the first token without another token to condition on? 
        We can use a unigram model! 
        Recursion will save us here. Turns out the easiest way to do this will be to recursively construct an n-1gram model and store it in the main model.
        And how can we 
        Either way, we need a seed condition to enter into the loop with.
        """

        # place to put generated tokens
        string = []

        if seed:
            string = string + (list(pad_sequence(seed, self.n, pad_left=True, pad_right=False, left_pad_symbol='<s>') ) )
        else:
            string = string + (list(pad_sequence('', self.n, pad_left=True, pad_right=False, left_pad_symbol='<s>') ) )
        
        for i in range(num_sentences):
            next_token = tuple(string[-(self.n-1):])
            
            # keep generating tokens as long as we havent reached the stop sequence
            while next_token != '</s>':
                
                # get the last n-1 tokens to condition on next
                lessgram = tuple(string[-(self.n-1):])

    
                next_token = self.cpd[lessgram].generate()
                string.append( next_token )

        string = ' '.join(string)
        string = add_stops(string)

        return string

    
    def add_stops(string):
        """
        function to convert the stop/start sequence back into periods.
        strips all the sequences of any number of stop tokens followed by the some number of start tokens
        and replaces them with a period.

        then strips any remaining stop and start sequences (which will occur at the beginning and end of our entire generated sequence)
        """
        string = re.sub(r"</s>(?:\s</s>)*\s<s>(?:\s<s>)*", ".", string)

        string = re.sub(r"(<s>\s)+", "", string) # initial tokens
        string = re.sub(r"(</s>)", "", string) # final token

        return string

In [50]:
model = NgramModel(corpus, 2)


In [51]:
model.generate(10)

'by this liberty they saw did please one of the fields they entered into a walking into the gallants or ladies should say , let us play , let us drink . if one . . by this liberty they saw did please one said , let us go a very laudable emulation to do all played . if one said , let us play , they would all played . . if one . by this liberty they saw did please one . by this liberty they would all played . by this liberty they entered into a walking into a very laudable emulation to do all drink '

We try generating a 4-gram model with the King James Bible

Our model expects its training corpus in the form of a single string.

In [52]:
kjv = (' ').join(kjv)

In [53]:
model = NgramModel(kjv, 4)


In [54]:
model.generate(10)

'lay it to thy heart and farewell . go pricke thy face , if thou beest slaine , and with some sweet obliuious antidote cleanse the stufft bosome , of that perillous stuffe which weighes vpon the heart ? doct . heere abiure the taints , and blames i laide vpon my selfe . me thought i heard a voyce cry , sleep no more : i am as i haue spoken mac '

# Let's do a mashup

Intro to beautiful soup for scraping web text

In [55]:
!pip3 install beautifulsoup4

from bs4 import *

import requests

url = 'https://theanarchistlibrary.org/library/david-graeber-anarchy-in-a-manner-of-speaking'
res = requests.get(url)
html_page = res.text

# Parse the source code using BeautifulSoup
soup = BeautifulSoup(html_page, 'html.parser')

# Extract the plain text content
text = soup.get_text()

# Print the plain text
print(text[:2000])







Anarchy — In a Manner of Speaking | The Anarchist Library

















































Toggle navigation




 











Search











                  Table of Contents
                









                  Archive
                





                    Titles
                  



                      Authors
                    



                      Topics
                    




                    Latest entries
                  


Popular Texts




                    Add a new text
                  







                  More
                




About the project


Live Chat (IRC/Matrix)


Tor Onion Services


Bookshelf (wiki)


Donate


SHH! THIS IS A LIBRARY! (forums)







                  Other languages
                




Anarhistička biblioteka


Bibliothèque Anarchiste (French)


Det Anarkistiske Bibliotek (Danish)


Biblioteca anarchica (Italian)


Анархистичка библиотека (Macedonian)


Anarchistische Bibliothek (Ge

In [56]:
!pip3 install beautifulsoup4

from bs4 import *

import requests


url = 'https://theanarchistlibrary.org/library/laboria-cuboniks-xenofeminism'
res = requests.get(url)
html_page = res.text

# Parse the source code using BeautifulSoup
soup = BeautifulSoup(html_page, 'html.parser')

# Extract the plain text content
text2 = soup.get_text()

# Print the plain text
print(text2[:2000])







Xenofeminism | The Anarchist Library

















































Toggle navigation




 











Search











                  Table of Contents
                









                  Archive
                





                    Titles
                  



                      Authors
                    



                      Topics
                    




                    Latest entries
                  


Popular Texts




                    Add a new text
                  







                  More
                




About the project


Live Chat (IRC/Matrix)


Tor Onion Services


Bookshelf (wiki)


Donate


SHH! THIS IS A LIBRARY! (forums)







                  Other languages
                




Anarhistička biblioteka


Bibliothèque Anarchiste (French)


Det Anarkistiske Bibliotek (Danish)


Biblioteca anarchica (Italian)


Анархистичка библиотека (Macedonian)


Anarchistische Bibliothek (German)


Det Anarkisti

In [57]:
# Define the file path
file_path = "profane_quotes.txt"

# Read the content of the file and store it in a variable
with open(file_path, 'r') as file:
    text_profane = file.read()

# Now, the 'text_variable' holds the content of the file
print(text_profane)


1. 
" People are tired of meetings, the classics, pointless marches, theoretical discussions that split hairs in four, endless distinctions, the monotony and poverty of certain political analyses. They prefer to make love, smoke, listen to music, go for walks, sleep, laugh, play, kill policemen, lame journalists, kill judges, blow up barracks. Anathema! The struggle is only legitimate when it is comprehensible to the leaders of the revolution. Otherwise, there being a risk that the situation might go beyond their control, there must have been a provocation.

Hurry comrade, shoot the policeman, the judge, the boss. Now, before a new police prevent you.

Hurry to say No, before the new repression convinces you that saying no is pointless, mad, and that you should accept the hospitality of the mental asylum.

Hurry to attack capital before a new ideology makes it sacred to you.

Hurry to refuse work before some new sophist tells you yet again that ‘work makes you free’.

Hurry to play. Hu

In [58]:
# Define the file path
file_path = "gex.txt"

# Read the content of the file and store it in a variable
with open(file_path, 'r') as file:
    text_gex = file.read()

# Now, the 'text_variable' holds the content of the file
print(text_gex)


Sitting around, don't know what you're doing now, but that's OK
Feeling now, got a phone, doing shit but I'm on the phone
I'm smoking dope, goddamn that's a whole lot of dope
Good shit, that's all I know

[?]

Feeling now, don't know what I'm feeling now but that's OK
Doing shit on the phone, got a phone so that's OK
My phone is dead but I'm dead
But I'm talking to you so that's OK
God damn Matt find my rope

[?]

Feeding now, got a doge, and a phone
So my dog's a phone
Eating now, on the floor, feeling like "Oh, that's okay"
Feeding doge my femur bone but my teeth break on the bone
Good shit, fuck teeth, OK doggo 


I was trying to find a way to kill time
I didn't even get to tell you goodbye
I was trying to find a way to kill time
Now you're gone and I can't
Ever say goodbye

This feeling's going to my head
I'm thinking things I shouldn't say
You circled me inside my room
I couldn't go another day
This feeling's going to my head
I'm thinking things I shouldn't say
You circled me insi

In [59]:
print(len(text_gex))
print(len(text_profane))
print(len(text_profane * 6))

3656
5330
31980


In [60]:

mashup = text_gex + text_profane


In [61]:
model = NgramModel(mashup, 2)


In [62]:
model.generate(5)

"now ? what i'm going to the bar i know ? so often do you . they prefer to bear in hollywood baby baby ? feeding now from the new police prevent you got for pictures of benefit to music , catch a bucket and your weapon i ride the matter is clearly of benefit to say you write it in yo' shit but let it i'm surprised let's role play , the situation might almost be done and jobs are so that's where i'm a disguise i was on this ring ayy gobble me on this pussy bring up like pain if he cheating put him for this pussy bring a beating in it in the boss . now get the matter is odd , never make him where to offer excellent working conditions . the back of benefit to believe in never lost a charge extra large , no , catch a garden snake , pointless marches , make him on the matter is comprehensible to arm yourself alfredo bonanno [number] whores cardi b , think i didn't wanna hurt no is only reason i shouldn't say no way to think that's okay you'll think of words when it i'm on him for a beard , 

In [63]:
# Define the file path
file_path = "carly.txt"

# Read the content of the file and store it in a variable
with open(file_path, 'r') as file:
    text_carly = file.read()

# Now, the 'text_variable' holds the content of the file
print(text_carly)


I threw a wish in the well
Don't ask me, I'll never tell
I looked to you as it fell
And now you're in my way

I'd trade my soul for a wish
Pennies and dimes for a kiss
I wasn't looking for this
But now you're in my way

Your stare was holdin'
Ripped jeans, skin was showin'
Hot night, wind was blowin'
Where you think you're going, baby?

Hey, I just met you
And this is crazy
But here's my number
So call me, maybe?

It's hard to look right
At you baby
But here's my number
So call me, maybe?

Hey, I just met you
And this is crazy
But here's my number
So call me, maybe?

And all the other boys
Try to chase me
But here's my number
So call me, maybe?

You took your time with the call
I took no time with the fall
You gave me nothing at all
But still, you're in my way

I beg, and borrow and steal
At first sight and it's real
I didn't know I would feel it
But it's in my way

Your stare was holdin'
Ripped jeans, skin was showin'
Hot night, wind was blowin'
Where you think you're going, baby?

He

In [64]:
# Define the file path
file_path = "duncan.txt"

# Read the content of the file and store it in a variable
with open(file_path, 'r') as file:
    text_duncan = file.read()

# Now, the 'text_variable' holds the content of the file
print(text_duncan)


Okay, so we are back from vacation and ready to plunge into the thick chaos that was France in July 1789. Hopefully everyone downloaded the tour announcement while I was gone, and if you didn’t, please go check it out. There is a brand new American Revolutions tour and another run through England and Paris, which will include stops at the very places where most of today’s episode is going to take place. So go to revolutionspodcast.com for all the details, and then sign up, because the slots are going fast.


So we left off last time with the final disintegration of the Estates General and the King’s Order for all three Estates to come together under this self-declared thing, the National Assembly. As will often be the case over the next few years, many observers at the time thought that this moment would mark the end of the Revolution. The demands of the Third Estate for double representation and voting by head had now been achieved. It had taken more of a fight than the Third Estate d

In [65]:
print(len(text_carly))
print(len(text_duncan))
print(len(text_carly * 2))

15332
28437
30664


In [66]:
mashup2 = text_carly * 2 + text_duncan

In [67]:
model2 = NgramModel(mashup2, 2)

In [83]:
model2.generate(5)

"related to take a tour and that bad dreams get lost oh run away with news of lettres , look in the gunpowder . paris city hall . knowing it dropped open . shortly thereafter , used as this angry crowd started coming for settling of gunpowder to signal that had more than a time with me nothing today . coupled with reports like they marched him back up their hands out like the end , you're driving you all being confiscated to demand the middle of the call me just the mob seized the 14th , green flags even as various embellished memoirs were published by orders from all the clouds , troops marching in the crowd and something yeah late night all they did was involved in the delegates studiously talked their two regular army "

In [69]:
# Define the file path
file_path = "omelas.txt"

# Read the content of the file and store it in a variable
with open(file_path, 'r') as file:
    text_omelas = file.read()

# Now, the 'text_variable' holds the content of the file
print(text_omelas)


With a clamor of bells that set the swallows soaring,
the Festival of Summer came to the city Omelas, bright-
towered by the sea. The rigging of the boats in harbor
sparkled with flags. In the streets between houses with red
roofs and painted walls, between old moss-grown gardens and
under avenues of trees, past great parks and public buildings,
processions moved. Some were decorous: old people in long
stiff robes of mauve and gray, grave master workmen, quiet,
merry women carrying their babies and chatting as they
walked. In other streets the music beat faster, a shimmering of
gong and tambourine, and the people went dancing, the
procession was a dance. Children dodged in and out, their high
calls rising like the swallows' crossing flights over the music
and the singing. All the processions wound towards the north
side of the city, where on the great water-meadow called the
Green Fields boys and girls, naked in the bright air, with mud-
stained feet and ankles and long, lithe arms, ex

In [70]:
# Define the file path
file_path = "chiang.txt"

# Read the content of the file and store it in a variable
with open(file_path, 'r') as file:
    text_chiang = file.read()

# Now, the 'text_variable' holds the content of the file
print(text_chiang)



lightspeedmagazine.com
Exhalation - Lightspeed Magazine
Wendy Wagner
34–44 minutes

Published in Apr. 2014 (Issue 47) | 6552 words
© 2008 by Ted Chiang. Originally published in Eclipse 2, edited by Jonathan Strahan. Reprinted by permission of the author.

It has long been said that air (which others call argon) is the source of life. This is not in fact the case, and I engrave these words to describe how I came to understand the true source of life and, as a corollary, the means by which life will one day end.

For most of history, the proposition that we drew life from air was so obvious that there was no need to assert it. Every day we consume two lungs heavy with air; every day we remove the empty ones from our chest and replace them with full ones. If a person is careless and lets his air level run too low, he feels the heaviness of his limbs and the growing need for replenishment. It is exceedingly rare that a person is unable to get at least one replacement lung before his insta

In [71]:
print(len(text_duncan))
print(len(text_gex))
print(len(text_gex * 8))

28437
3656
29248


In [72]:
mashup3 = text_duncan + text_gex * 8

In [73]:
model3 = NgramModel(mashup3, 2)

In [81]:
model3.generate(5)

"the phone eating now actively working with a pistol shots hit the man we’ll see next few midsummer weeks were positively delighting in fact . knowing that is where it stop took too many problems . but the hundred parisians were in their guns from there were over , bread crisis defused , louis asked , oh oh whoo ! sitting down ? if that day and further to sack necker on all they found out that evening , all being used as i was the flower war had elected the bar i can't ever been stockpiled from all about how you who you gonna say that news of july [number] hopefully everyone to my god damn matt find a mountain ? we will . it i'm thinking things were the crib going rabble , oh my hand , whose sole purpose it rains all the very end of what he gathered with governor bernard rené delaunay was nearly a café inside . over the demands of one he detested necker might be reassigned from vacation and made it in , when it in from the long positioned himself as necker kicked around , that's ok fee

Wow!

'among the hills that are weaned from the waters saw thee polluted in thy glory above all people , from beersheba to mount up with . [number] : [number] and shaalabbin , and partly broken . report , that jehoshaphat the king is among us still believed in hope ; patient in spirit ; and half of thy power preserve thou those that served in the womb : if jacob take a lump of figs were set there upon him shall inherit all things thereon . in most militants this search for my gold and the lord separated the sons shall eat clean provender , which loveth thee and abishai , and kings have had dominion over our cattle . then he sacrificed also and to whomsoever he will prosper us ; thus have been occupied therein . yellowed figures of cherubims and palm trees : they serve not thy left side , upon their altars : but according to our hand be upon every fowl of the european union . all these did moses command joshua , this do ye look on us ; because a deep sleep fell upon it before saul : [number] wise men , let them turn their mourning . after theo’s rape , a strong wind ? [number] : [number] open thou mine affliction . to him remaining . rather comically , he took counsel how they might attain to innocency ? [number] : [number] for behold the place hormah  '

# Exercise / Homework??

Make a mashup of two texts. They can be texts you wrote (a collection of tweets, an essay), or from anywhere. You can use libgen to find books and Calibre to convert them to text. Either paste the text directly into a notebook or use a Python utility for reading files.

In [75]:
alice = (' ').join(corpiss.gutenberg.words('carroll-alice.txt'))
now = text

print(len(alice))
print(len(text))

150118
306881


In [76]:
mashups = alice + text
model = NgramModel(mashups, 2)


In [77]:
model.generate(5)

"both as dead , fascism leave out some group starts to it this comes out bullies tend towards anything likely to it ' look back the hedgehogs , weber suggested said anything new is endlessly suspended . it’s probably the origins of expressing yourself airs ! take horror or children ; that poststructuralists take—but that’s hurdling downwards at history those societies did you can remember being followed a procession moved off from england supposed consumer capitalism , saying . would emerge and alice considered a uniform factor is birds hurried off to hesiod , something going into alice knew she turned it were far left her : “it is precisely because somebody attacks you took even freudians , very soon came together unfinished in universal scope , child life ; bakunin said a tree a few years we can’t control themselves anti nietzschean ; without being . so interested and pretend things by heroic verse ,' continued as usual , and besides . ” hunter gatherers , beauti ful soup of generali

# References

most used: 
* https://notebook.community/luketurner/ipython-notebooks/notebooks/n-gram%20tutorial
* https://medium.com/analytics-vidhya/a-comprehensive-guide-to-build-your-own-language-model-in-python-5141b3917d6d
* https://towardsdatascience.com/simulating-text-with-markov-chains-in-python-1a27e6d13fc6

others:
* https://eliteai-coep.medium.com/building-n-gram-language-model-from-scratch-9a5ec206b520
* https://github.com/joshualoehr/ngram-language-model/blob/master/language_model.py
* http://www.pygaze.org/2016/03/how-to-code-twitter-bot/
    - code: https://github.com/esdalmaijer/markovbot
* https://towardsdatascience.com/implementing-a-character-level-trigram-language-model-from-scratch-in-python-27ca0e1c3c3f