# Module 3: Infer Language Models

* DS 6001
* Raf Alvarado

We now create a series of langage models and evaluate them.

# Set Up

## Configure

In [1]:
OHCO = ['book_id', 'chap_num', 'para_num', 'sent_num', 'token_num']
text_file1 = '../2020-01-23/austen-persuasion.csv'
text_file2 = '../2020-01-23/austen-sense.csv'

## Import libraries

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
%matplotlib inline

# Import and combine texts

In [6]:
text1 = pd.read_csv(text_file1)
text2 = pd.read_csv(text_file2)

In [8]:
text1.head(10)

Unnamed: 0,chap_num,para_num,sent_num,token_num,token_str
0,1,1,0,0,Sir
1,1,1,0,1,Walter
2,1,1,0,2,Elliot
3,1,1,0,3,of
4,1,1,0,4,Kellynch
5,1,1,0,5,Hall
6,1,1,0,6,in
7,1,1,0,7,Somersetshire
8,1,1,0,8,was
9,1,1,0,9,a


In [10]:
text1['book_id'] = 1
text2['book_id'] = 2

In [11]:
text1.head()

Unnamed: 0,chap_num,para_num,sent_num,token_num,token_str,book_id
0,1,1,0,0,Sir,1
1,1,1,0,1,Walter,1
2,1,1,0,2,Elliot,1
3,1,1,0,3,of,1
4,1,1,0,4,Kellynch,1


In [12]:
tokens = pd.concat([text1, text2]).dropna()

In [13]:
tokens = tokens.set_index(OHCO)

In [14]:
tokens.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,token_str
book_id,chap_num,para_num,sent_num,token_num,Unnamed: 5_level_1
1,1,1,0,0,Sir
1,1,1,0,1,Walter
1,1,1,0,2,Elliot
1,1,1,0,3,of
1,1,1,0,4,Kellynch


# Create a vocabulary

In [15]:
tokens['term_str'] = tokens['token_str'].str.lower().str.replace(r'[\W_]', '')

In [16]:
tokens.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,token_str,term_str
book_id,chap_num,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1,1,0,0,Sir,sir
1,1,1,0,1,Walter,walter
1,1,1,0,2,Elliot,elliot
1,1,1,0,3,of,of
1,1,1,0,4,Kellynch,kellynch


In [17]:
vocab = tokens['term_str'].value_counts()\
    .to_frame()\
    .reset_index()\
    .rename(columns={'term_str':'n', 'index':'term_str'})\
    .sort_values('term_str')
vocab.index.name = 'term_id'

In [18]:
vocab.head()

Unnamed: 0_level_0,term_str,n
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1
766,,29
3456,1.0,3
7639,15.0,1
6602,16.0,1
6938,1760.0,1


In [19]:
vocab.sample(5)

Unnamed: 0_level_0,term_str,n
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1
6120,littlenesses,1
5179,improvident,1
7261,hotel,1
3474,accent,3
3371,linen,3


# Simple Unigram Model

In [20]:
n_tokens = vocab.n.sum()
vocab['p'] = vocab['n'] / n_tokens
vocab['log_p'] = np.log2(vocab['p'])

In [22]:
n_tokens

204833

In [23]:
vocab.sort_values('p', ascending=False).head(10)

Unnamed: 0_level_0,term_str,n,p,log_p
term_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,the,7436,0.036303,-4.783778
1,to,6924,0.033803,-4.886699
2,and,6290,0.030708,-5.025244
3,of,6145,0.03,-5.058891
4,her,3747,0.018293,-5.772568
5,a,3687,0.018,-5.795857
6,in,3368,0.016443,-5.926412
7,was,3198,0.015613,-6.001134
8,i,3128,0.015271,-6.033064
9,it,2795,0.013645,-6.195456


In [24]:
smooth = vocab['p'].min()
def predict_sentence(sent_str):
    
    # Parse sentence into tokens and normalize string
    tokens = pd.DataFrame(sent_str.lower().split(), columns=['term_str'])
    
    # Link the tokens with model vocabulary
    tokens = tokens.merge(vocab, on='term_str', how='left') # Left join is key
    
    # Add minimum values where token is not in our vocabulary
    tokens.loc[tokens['p'].isna(), 'p'] = [smooth]
    
    # Compute probability of sentence by getting product of token probabilities
    p = tokens['p'].product()
        
    # Print results
    print("p('{}') = {}".format(sent_str, p))

In [25]:
predict_sentence('I love you')
predict_sentence('I love cars')
predict_sentence("I want to")
predict_sentence("anne said to")
predict_sentence("said to her")
predict_sentence('said to him')

p('I love you') = 7.878556023336425e-08
p('I love cars') = 4.3312567472987495e-11
p('I want to') = 1.8649008463478524e-07
p('anne said to') = 2.3099369325723746e-07
p('said to her') = 1.7207422835683278e-06
p('said to him') = 5.092882819528357e-07


# N-Gram models

This function generates models up to the length specified.

In [26]:
def get_ngrams(tokens, n=2):
    
    global OHCO
    
    # Create list to store copies of tokens table
    X = []
    
    # Convert the index to cols in order to change the value of token_num
    X.append(tokens['term_str'].reset_index())
        
    # Create copies of token table for each level of ngram, offset by 1, and 
    # merge with previous 
    for i in range(1, n):
        X.append(X[0].copy())
        X[i]['token_num'] = X[i]['token_num'] + i
        X[i] = X[i].merge(X[i-1], on=OHCO, how='left', sort=True).fillna('<s>')
        
    # Compress tables to unique ngrams with counts
    for i in range(0, n):
        X[i] = X[i].drop(OHCO, 1)
        cols = X[i].columns.tolist()
        X[i]['n'] = 0
        X[i] = X[i].groupby(cols).n.apply(lambda x: x.count()).to_frame()
        X[i].index.names = ['w{}'.format(j) for j in range(i+1)]
            
    # Return just the ngram tables
    return X

## Generate three models

Unigram, bigram, and trigram

In [27]:
m1, m2, m3 = get_ngrams(tokens, n=3)

In [35]:
# m3.sort_values('n', ascending=False).head(10)

## Compute joint probabilities

In [36]:
m1['p'] = m1['n'] / m1['n'].sum()
m2['p'] = m2['n'] / m2['n'].sum()
m3['p'] = m3['n'] / m3['n'].sum()

In [37]:
m1.sort_values('p', ascending=False).head()

Unnamed: 0_level_0,n,p
w0,Unnamed: 1_level_1,Unnamed: 2_level_1
the,7436,0.036303
to,6924,0.033803
and,6290,0.030708
of,6145,0.03
her,3747,0.018293


In [38]:
m2.sort_values('p', ascending=False).head()

Unnamed: 0_level_0,Unnamed: 1_level_0,n,p
w0,w1,Unnamed: 2_level_1,Unnamed: 3_level_1
of,the,857,0.004184
to,be,814,0.003974
in,the,683,0.003334
mrs,<s>,530,0.002587
it,was,498,0.002431


In [39]:
m3.sort_values('p', ascending=False).head(15)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,n,p
w0,w1,w2,Unnamed: 3_level_1,Unnamed: 4_level_1
mrs,<s>,<s>,530,0.002587
it,<s>,<s>,369,0.001801
her,<s>,<s>,244,0.001191
him,<s>,<s>,227,0.001108
mr,<s>,<s>,179,0.000874
you,<s>,<s>,172,0.00084
them,<s>,<s>,161,0.000786
me,<s>,<s>,160,0.000781
elinor,<s>,<s>,119,0.000581
i,am,sure,107,0.000522


## Compute conditional probabilities

$p(w_1|w_0) = p(w_0, w_1) / p(w_0)$

$p(w_2|w_0,w_1) = p(w_0, w_1, w_2) / p(w_0, w_1)$

In [40]:
m2m = m2.n.unstack().fillna(0).apply(lambda x: x / x.sum(), 1)

In [43]:
# m2m

In [41]:
m3m = m3.n.unstack().fillna(0).apply(lambda x: x / x.sum(), 1)

# Predict Sentences

In [45]:
def predict_sentence2(sent_str, n=2):
    
    # Pick appropriate model
    global m1, m2, m3
    if n == 1:
        M = m1
    elif n == 2:
        M = m2
    elif n == 3:
        M = m3
    else:
        return False
    
    # Get smoothing 
    smooth = M.p.min()
    
    # Add sentence padding (Hacky)
    padded_sent_str = sent_str + (' <s>' * (n-1))
    
    # Parse sentence into tokens and normalize string
    tokens = pd.DataFrame(padded_sent_str.lower().split(), columns=['term_str'])
    
    # Generate ngram keys 
    ngrams = []
    offset = n - 1
    for i in range(offset, tokens.shape[0]):
        ngram = []
        w = tokens.iloc[i].term_str
        for j in range(n):
            ngram.append(tokens.iloc[i-j].term_str)
        ngram.reverse()
        ngrams.append(ngram)
        
    # Compute the probability of the sentence
    L = 0
    for ngram in ngrams:
        try:
            p_ngram = M.loc[tuple(ngram)].p
        except KeyError:
            p_ngram = smooth
        L += np.log2(p_ngram)
    P = np.exp(L)
    
    print(sent_str, P)

In [46]:
predict_sentence2('I love you', 1)
predict_sentence2('I love cars', 1)
predict_sentence2("I want to", 1)
predict_sentence2("anne said to", 1)
predict_sentence2("said to her", 1)
predict_sentence2('said to him', 1)

I love you 5.645972739472476e-11
I love cars 1.118907816687782e-15
I want to 1.9570792682414204e-10
anne said to 2.6650097828995353e-10
said to her 4.829429322644128e-09
said to him 8.338111808245719e-10


In [47]:
predict_sentence2('I love you', 2)
predict_sentence2('I love cars', 2)
predict_sentence2("I want to", 2)
predict_sentence2("anne said to", 2)
predict_sentence2("said to her", 2)
predict_sentence2('said to him', 2)

I love you 1.6912924832811006e-18
I love cars 2.0639180372517065e-22
I want to 2.0994247126049545e-19
anne said to 7.112019880991409e-20
said to her 7.131778675619001e-15
said to him 1.2820268067010739e-15


In [48]:
predict_sentence2('I love you', 3)
predict_sentence2('I love cars', 3)
predict_sentence2("I want to", 2)
predict_sentence2("anne said to", 3)
predict_sentence2("said to her", 3)
predict_sentence2('said to him', 3)

I love you 1.725817247418853e-20
I love cars 1.0275642842631827e-23
I want to 2.0994247126049545e-19
anne said to 1.1935219350244338e-21
said to her 6.065124721977218e-18
said to him 9.465985002423803e-18


# Explore

In [49]:
m2m.loc[['he','she','it','anne','wentworth'], 
        ['is','had','was','felt','thought','looked','said','saw']]\
    .style.background_gradient(cmap='Greens')

w1,is,had,was,felt,thought,looked,said,saw
w0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
he,0.0578035,0.14499,0.120906,0.00529865,0.00481696,0.00867052,0.0163776,0.00529865
she,0.0242842,0.148967,0.135194,0.0181225,0.00978615,0.0050743,0.0105111,0.0159478
it,0.0969589,0.0225403,0.178175,0.000357782,0.000357782,0.000357782,0.000357782,0.0
anne,0.00397614,0.0755467,0.0894632,0.0178926,0.00397614,0.00198807,0.00198807,0.00596421
wentworth,0.0137615,0.0366972,0.0825688,0.0,0.0,0.00458716,0.00917431,0.00458716


In [50]:
m2m.loc[['he','she'],['felt','said']].style.background_gradient(cmap='Greens')

w1,felt,said
w0,Unnamed: 1_level_1,Unnamed: 2_level_1
he,0.00529865,0.0163776
she,0.0181225,0.0105111


# Generate Text

We use back-off to account for missing ngrams.

In [51]:
def generate_text(start_word='she', n=250):
    words = [start_word]
    for i in range(n):
        if len(words) == 1:
            w = m2m.loc[start_word]
            next_word = m2m.loc[start_word].sample(weights=w).index.values[0]
        elif len(words) > 1:
            bg = tuple(words[-2:])
            try:
                w = m3m.loc[bg]
                next_word = m3m.loc[bg].sample(weights=w).index.values[0]
            except KeyError:
                ug = bg[1]
                if ug == '<s>':
                    next_word = m1.sample(weights=m1.p).index[0]
                else:
                    w = m2m.loc[ug]
                    next_word = m2m.loc[ug].sample(weights=w).index.values[0]
        words.append(next_word)
    text = ' '.join(words)
    text = text.replace(' <s> <s>', '.') + '.'
    text = text.upper() # To give that telegraph message look :-)
    print(text)

In [52]:
generate_text('the')

THE DOOR WAS OPENED BEFORE SHE KNEW NOT WHAT THEY COULD BE AUTHORISED BY NOTHING ELSE TO BE UNKIND HOWEVER AND AS FOR LADY RUSSELL WOULD LIKE HIM. AND I AM NOT DECEIVED HER. TIME MAY COME WHEN HARRY WILL REGRET THAT THEY WOULD BURST OUT AND BROKEN UP. FEW MOMENTS REFLECTION HOWEVER PRODUCED A GREAT DEAL OF MOST CHARACTERISTIC PROCEEDING. I HAVE HEARD IT YESTERDAY BY CHANCE THAN ANY OTHER WOMAN I NEVER SHALL. HER SPIRITS TO BE FETTERED TO LUCY SHE IS VERY ASTONISHING. PRESENT INSTANCE THIS LAST WEEK AND RATHER VULGAR. TALKED OF TO EVERYBODY. EXCELLENT YOUNG MAN THOUGH HERE IT PLAINLY APPEARED THAT THOUGH THERE COULD HAVE BEEN NOTHING TO WHAT THE GREATEST SIMPLETON IN THE MIDST OF THE TWO FAIR RIVALS WERE THUS DIVIDED FORMING THREE DISTINCT PARTIES. DONE WHEN THEY ARE ALL GONE TOGETHER BLESSED HER MEMORY. SET HIM DOWN AS SHE REJOICED IN THERE BEING NOTHING TO FORFEIT HER ESTEEM SHE THOUGHT ABOUT ME. HAD BEEN PREVIOUSLY INFORMED. MOTHER TO STAY A MINUTE NOT A MOTHER TO PART WITH HER HANDK

In [None]:
generate_text('she')