In [None]:
import time
import numpy as np
import matplotlib.pyplot as plt
%matplotlib notebook
from nltk.corpus import words


In [2]:
# Build the lookup set of known words: one entry per line of the word list,
# with surrounding whitespace removed and letters lowercased.
with open("english_words.txt") as word_file:
    english_words = {line.strip().lower() for line in word_file}

def is_english_word(word):
    """Report whether `word` appears in the loaded word list, ignoring case.

    Parameters
    ----------
    word: str
        The candidate word.

    Returns
    -------
    bool
        True if the lowercased word is a member of `english_words`.
    """
    lowered = word.lower()
    return lowered in english_words


In [3]:
def unzip(pairs):
    """Transpose a sequence of pairs into per-position tuples.

    Example: pairs = [("a", 1), ("b", 2)] --> (("a", "b"), (1, 2))

    An empty input produces an empty tuple.
    """
    columns = zip(*pairs)
    return tuple(columns)

In [7]:
# SOLUTION

def normalize(counter):
    """ Turn raw counts into relative frequencies, most frequent first.

        Parameters
        -----------
        counter: A Counter-instance

        Returns
        -------
        A list of (letter, frequency) tuples in the order given by
        `counter.most_common()`, where each frequency is the letter's count
        divided by the sum of all counts.

        For example, if counter had the counts:

            {'a': 1, 'b': 3}

        `normalize(counter)` will return:

            [('b', 0.75), ('a', 0.25)]
    """
    denominator = sum(counter.values())
    frequencies = []
    for symbol, count in counter.most_common():
        frequencies.append((symbol, count / denominator))
    return frequencies

In [8]:
from collections import Counter, defaultdict

def train_lm(text, n):
    """ Train character-based n-gram language model.
        
        This will learn: given a sequence of n-1 characters, what the probability
        distribution is for the n-th character in the sequence.
        
        For example if we train on the text:
            text = "cacao"
        
        Using a n-gram size of n=3, then the following dict would be returned:
        
            {'ac': [('a', 1.0)],
             'ca': [('c', 0.5), ('o', 0.5)],
             '~c': [('a', 1.0)],
             '~~': [('c', 1.0)]}

        Tildes ("~") are used for padding the history when necessary, so that it's 
        possible to estimate the probability of seeing a character when there 
        aren't (n - 1) previous characters of history available.
        
        So, according to this text we trained on, if you see the sequence 'ac',
        our model predicts that the next character should be 'a' 100% of the time.
        
        For generating the padding, recall that Python allows you to generate 
        repeated sequences easily: 
           `"p" * 4` returns `"pppp"`
           
        Parameters
        -----------
        text: str 
            A string (doesn't need to be lowercased).
        n: int
            The length of n-gram to analyze.
        
        Returns
        -------
        A dict that maps histories (strings of length (n-1)) to lists of (char, prob) 
        pairs, where prob is the probability (i.e. frequency) of char appearing after 
        that specific history. For example, if `lm = train_lm("cacao", 3)` then
        `lm["ca"]` is `[('c', 0.5), ('o', 0.5)]`. An empty `text` yields an empty dict.

        Notes
        -----
        The sliding update `history[1:] + x` only keeps the history at length
        (n - 1) when n >= 2; with n = 1 the history becomes the previous
        character after the first step.
    """
    raw_lm = defaultdict(Counter)
    history = "~" * (n - 1)
    
    # count number of times characters appear following different histories
    for x in text:
        raw_lm[history][x] += 1
        # slide the window: drop the oldest character, append the newest
        history = history[1:] + x
    
    # create final dictionary by normalizing
    lm = { history : normalize(counter) for history, counter in raw_lm.items() }
    
    return lm

In [14]:
def generate_letter(lm, history):
    """ Randomly picks letter according to probability distribution associated with 
        the specified history.
    
        Note: returns dummy character "~" if history not found in model.
    
        Parameters
        ----------
        lm: Dict[str, List[Tuple[str, float]]] 
            The n-gram language model. I.e. the dictionary: 
            history -> [(char, freq), ...]
        
        history: str
            A string of length (n-1) to use as context/history for generating 
            the next character.
        
        Returns
        -------
        str
            The predicted character. '~' if history is not in language model.
    """
    if history not in lm:
        return "~"
    # split [(char, prob), ...] into parallel sequences of chars and probs
    letters, probs = zip(*lm[history])
    # np.random.choice hands back a numpy string scalar; convert it so callers
    # receive the plain built-in str the docstring promises
    return str(np.random.choice(letters, p=probs))

In [17]:
def generate_text(lm, n, nletters=100, ntrace=10):
    """ Randomly generates nletters of text with n-gram language model lm.
    
        Parameters
        ----------
        lm: Dict[str, List[Tuple[str, float]]] 
            The n-gram language model. I.e. the dictionary: 
            history -> [(char, freq), ...]
        n: int
            Order of n-gram model.
        nletters: int
            Number of letters to randomly generate.
        ntrace: int
            Number of initial steps for which the generated character and the 
            updated history are printed. The default of 10 keeps the previous 
            behavior; pass 0 to generate silently.
        
        Returns
        -------
        str
            Model-generated text.
    """
    history = "~" * (n - 1)
    text = []
    for i in range(nletters):
        tracing = i < ntrace
        
        c = generate_letter(lm, history)
        if tracing:
            print(c)
        text.append(c)
        
        # slide the window: drop the oldest character, append the new one
        history = history[1:] + c
        if tracing:
            print(history)
        
    return "".join(text)

In [19]:
# Read the whole training corpus into memory as a single string.
path_to_merged = "shakespeare_input.txt"
with open(path_to_merged, "r") as corpus_file:
    twentyk = corpus_file.read()
print(f"{len(twentyk)} character(s)")

# "~" is the padding symbol used when training, so check whether it occurs in the corpus.
chars = set(twentyk)
"~" in chars

4573338 character(s)


False

In [22]:
# Fit the 5-gram model on the full corpus and report the wall-clock time taken.
t0 = time.time()
lm5 = train_lm(twentyk, 5)
t1 = time.time()
print(f"elapsed = {t1 - t0}s")

elapsed = 4.113054275512695s


In [23]:
# Sample 500 characters from the 5-gram model.
print(generate_text(lm5, n=5, nletters=500))

F
~~~F
i
~~Fi
r
~Fir
s
Firs
t
irst
 
rst 
M
st M
u
t Mu
r
 Mur
d
Murd
First Murder of the nobless and his pales, and on most ackness' speaks winds woman of Venice
In my lord, sir?

AUFIDIUS:
Presence:
He'll have revenger?

POMPEY:
Hard-bone, and the laste! hand, my life!

HUBERTRAM:
Son:
By whom your naturers' doom falls
There's my ripe,
At that
the speak a noble a comest, envy forthy bosoms true hobby-horn
And depost--on were is queen ye curst, to flour reportender own darkly stay'd
The shall your day, sick.

PROTEUS:
At once
To seeming.

BULLCALF:
And time to ch


In [1]:
import random
import string

# The opening letter is drawn from the first 22 letters of the alphabet ("a" through "v").
first_letter = random.choice(string.ascii_lowercase[:22])

print(string.ascii_lowercase[:22])




abcdefghijklmnopqrstuv


In [None]:


# Word game: the computer and the player alternately append one letter to a
# growing word; whoever completes an English word longer than 3 letters loses.

# The computer plays with a trigram model (two characters of history).
# It is trained here so this cell does not depend on hidden kernel state.
lm3 = train_lm(twentyk, 3)

sec = input("Please input the next letter: ")

# `word` (not `string`) so the imported `string` module is not shadowed.
word = first_letter + sec

print(word)

# The model's context is always the last two characters of the word so far.
history = word[-2:]

for i in range(20):
    # Even turns belong to the computer, odd turns to the player.
    computer_turn = i % 2 == 0

    if computer_turn:
        letter = generate_letter(lm3, history)
        if letter == "~":
            # generate_letter returns "~" when the history was never seen in
            # training; there is no sensible letter to append in that case.
            print("The computer has no move for this history!")
            break
    else:
        letter = input("Please input the next letter: ")

    word += letter
    # Refresh the context after EVERY move so the computer sees the player's letter.
    history = word[-2:]

    print(word)

    if is_english_word(word) and len(word) > 3:
        print("The computer loses!" if computer_turn else "You lose!")
        break
