In [1]:
import time
import numpy as np
import matplotlib.pyplot as plt
%matplotlib notebook
from nltk.corpus import words
from nltk import word_tokenize
from collections import Counter

In [2]:
# Build the dictionary used by is_english_word: one entry per line of the
# word-list file, stripped of surrounding whitespace and lower-cased.
with open("english_words.txt") as word_file:
    english_words = {line.strip().lower() for line in word_file}

def is_english_word(word):
    """Return True if `word` (compared in lower case) is in `english_words`."""
    candidate = word.lower()
    return candidate in english_words


In [3]:
def unzip(pairs):
    """Transpose a sequence of pairs into a tuple of per-position tuples.

    Example: pairs = [("a", 1), ("b", 2)] --> (("a", "b"), (1, 2))

    An empty input produces an empty tuple.
    """
    columns = zip(*pairs)
    return tuple(columns)

In [4]:
# SOLUTION

def normalize(counter):
    """ Turn raw counts into relative frequencies, most frequent first.

        Parameters
        -----------
        counter: A Counter-instance
            Maps each letter to the number of times it was seen.

        Returns
        -------
        A list of (letter, frequency) tuples in the order produced by
        `counter.most_common()`, where frequency is the letter's count
        divided by the sum of all counts.

        For example, a counter holding

            {'a': 1, 'b': 3}

        is converted to

            [('b', 0.75), ('a', 0.25)]
    """
    denominator = sum(counter.values())
    pairs = []
    for symbol, count in counter.most_common():
        pairs.append((symbol, count / denominator))
    return pairs

In [5]:
from collections import defaultdict
import itertools

def train_lm(text, n):
    """ Train character-based n-gram language model.

        This will learn: given a sequence of n-1 characters, what the probability
        distribution is for the n-th character in the sequence.

        Before counting, `text` is tokenized with `word_tokenize`, only tokens
        with an odd number of characters are kept, and the kept tokens are
        joined with single spaces. A space still advances the history (so
        histories may contain spaces), but a space is never recorded as a
        predicted character.

        For example if we train on the text:
            text = "cacao"

        Using a n-gram size of n=3, then the following dict would be returned:

            {'ac': [('a', 1.0)],
             'ca': [('c', 0.5), ('o', 0.5)],
             '~c': [('a', 1.0)],
             '~~': [('c', 1.0)]}

        Tildes ("~") are used for padding the history when necessary, so that it's
        possible to estimate the probability of seeing a character when there
        aren't (n - 1) previous characters of history available.

        So, according to this text we trained on, if you see the sequence 'ac',
        our model predicts that the next character should be 'a' 100% of the time.

        Parameters
        -----------
        text: str
            A string (doesn't need to be lowercased).
        n: int
            The length of n-gram to analyze.

        Returns
        -------
        A dict that maps histories (strings of length (n-1)) to lists of (char, prob)
        pairs, where prob is the probability (i.e frequency) of char appearing after
        that specific history. For example, if 'ca' is followed by 'c' half of the
        time and by 'o' the other half, the dict contains
        'ca': [('c', 0.5), ('o', 0.5)]. Every list is non-empty: a history that was
        only ever followed by a space gets no entry at all.
    """
    # NOTE(review): this keeps ODD-length tokens. An earlier commented-out draft
    # of this filter removed odd-length tokens instead -- confirm which parity
    # is intended.
    tokens = word_tokenize(text)
    text = " ".join(token for token in tokens if len(token) % 2 != 0)

    raw_lm = defaultdict(Counter)
    history = "~" * (n - 1)

    # count number of times characters appear following different histories
    for char in text:
        # Spaces only separate the kept tokens. Skipping them here (instead of
        # counting and deleting them afterwards) means no history ends up with
        # an empty distribution, which generate_letter could not sample from.
        if char != " ":
            raw_lm[history][char] += 1
        history = history[1:] + char

    # create final dictionary by normalizing
    return {history: normalize(counter) for history, counter in raw_lm.items()}

In [6]:
def generate_letter(lm, history):
    """ Randomly picks letter according to probability distribution associated with
        the specified history.

        Note: returns dummy character "~" if history not found in model, or if
        the model holds an empty distribution for it.

        Parameters
        ----------
        lm: Dict[str, List[Tuple[str, float]]]
            The n-gram language model. I.e. the dictionary:
            history -> [(char, freq), ...]

        history: str
            A string of length (n-1) to use as context/history for generating
            the next character.

        Returns
        -------
        str
            The predicted character. '~' if the language model has nothing to
            sample from for this history.
    """
    distribution = lm.get(history)
    # A missing history and an empty distribution are handled alike: unpacking
    # an empty list into (letters, probs) would raise ValueError.
    if not distribution:
        return "~"
    # Transpose [(char, prob), ...] into parallel tuples of chars and probs.
    letters, probs = zip(*distribution)
    # np.random.choice returns a numpy string scalar; hand back a plain str.
    return str(np.random.choice(letters, p=probs))

In [7]:
def generate_text(lm, n, nletters=100):
    """ Randomly generates nletters of text with n-gram language model lm.

        Generation starts from a history of (n - 1) "~" padding characters.
        Each generated character is appended to the output and shifted into
        the history. Whatever generate_letter returns is appended as-is,
        including its "~" dummy character.

        Parameters
        ----------
        lm: Dict[str, List[Tuple[str, float]]]
            The n-gram language model. I.e. the dictionary:
            history -> [(char, freq), ...]
        n: int
            Order of n-gram model.
        nletters: int
            Number of letters to randomly generate.

        Returns
        -------
        str
            Model-generated text.
    """
    history = "~" * (n - 1)
    text = []
    for _ in range(nletters):
        c = generate_letter(lm, history)
        text.append(c)
        # Slide the window: drop the oldest character, append the newest.
        history = history[1:] + c
    return "".join(text)

In [8]:
# Read the whole word list as one string: this is the training corpus.
path_to_merged = "english_words.txt"
with open(path_to_merged, "r") as f:
    twentyk = f.read()
chars = set(twentyk)
print(f"{len(twentyk)} character(s)")
# train_lm pads histories with "~", so check whether the corpus contains it.
print("~" in chars)


1044754 character(s)
False


In [9]:
# Train the 3-gram model and report the wall-clock training time.
t0 = time.time()
lm3 = train_lm(twentyk, 3)
t1 = time.time()
print(type(lm3))
print(f"elapsed = {t1 - t0}s")

<class 'dict'>
elapsed = 1.2819085121154785s


In [10]:
# Train the 5-gram model and report the wall-clock training time.
t0 = time.time()
lm5 = train_lm(twentyk, 5)
t1 = time.time()
print(f"elapsed = {t1 - t0}s")

elapsed = 1.7542412281036377s


In [11]:
# Train the 7-gram model and report the wall-clock training time.
t0 = time.time()
lm7 = train_lm(twentyk, 7)
t1 = time.time()
print(f"elapsed = {t1 - t0}s")

elapsed = 3.6996219158172607s


In [None]:
import random, string

# The opening letter is drawn uniformly from the first 22 lowercase letters
# ('a' through 'v').
candidate_letters = string.ascii_lowercase[:22]
first_letter = random.choice(candidate_letters)

print(candidate_letters)



abcdefghijklmnopqrstuv


In [None]:


# Ghost against the n-gram models.
#
# The word starts as `first_letter` plus one letter from the player. The
# computer and the player then alternate appending one letter, computer first.
# Whoever's letter turns the word into an English word of at least
# MIN_LOSING_LENGTH letters loses.

MIN_LOSING_LENGTH = 4  # shorter words never end the game
MAX_TURNS = 10         # letters appended after the two opening letters


def ask_letter():
    """Prompt until the player enters exactly one letter; return it lower-cased."""
    while True:
        entry = input("Please input the next letter: ").strip().lower()
        if len(entry) == 1 and entry.isalpha():
            return entry
        print("Please enter exactly one letter.")


def computer_letter(word):
    """Pick the computer's next letter for `word`.

    Uses the highest-order model whose context fits inside `word` (lm7 needs
    6 characters, lm5 needs 4, lm3 needs 2) and backs off to a lower-order
    model when the higher one has nothing to sample for that context.

    Returns None when no model can continue the word.
    """
    for model, order in ((lm7, 7), (lm5, 5), (lm3, 3)):
        context_length = order - 1
        if len(word) < context_length:
            continue
        context = word[-context_length:]
        # Skip contexts the model never saw (or has no letters for) so the
        # "~" dummy character is never appended to the word.
        if not model.get(context):
            continue
        return str(generate_letter(model, context))
    return None


# `word` (rather than `string`) so the imported `string` module is not shadowed.
word = first_letter + ask_letter()
print(word)

for turn in range(MAX_TURNS):
    computer_to_move = turn % 2 == 0

    if computer_to_move:
        letter = computer_letter(word)
        if letter is None:
            print("The computer cannot continue this word and gives up!")
            break
    else:
        letter = ask_letter()

    word += letter
    print(word)

    if len(word) >= MIN_LOSING_LENGTH and is_english_word(word):
        print("The computer loses!" if computer_to_move else "You lose!")
        break
else:
    print("Nobody completed a word - it's a draw.")
        
    
