In [1]:
import numpy as np
import torch

# 2. Applications of Smoothing

## setup

In [2]:
lm={} #dictionary of language models, keyed by model name

#set document d (whitespace-tokenized)
d='the sun rises in the east and sets in the west'.split()
d_size=len(d)

#calculate c(w, d): number of occurrences of each word w in d
c={}
for w in d:
    c[w]=c.get(w, 0)+1

#unique words of d in first-occurrence order (dicts keep insertion order).
#A set is deliberately avoided here: string hashing is randomized per process,
#so iterating a set of words yields a different row order after every kernel
#restart and the printed results would not be reproducible.
words=list(c)

#compute probabilities for base (maximum-likelihood) language model: c(w,d)/|d|
lm['base']={f'p({w}|d)':c[w]/d_size for w in words}

#create dictionary for p(w) and p_ref(w)
#p['REF'] is the reference (background) distribution; it also contains words
#that do not occur in d ('a', 'from', 'retrieval', 'BM25')
p={}
p['REF']={
    'a':.18,
    'the':.17,
    'from':.13,
    'retrieval':.02,
    'sun':.05,
    'rises':.04,
    'in':.16,
    'BM25':.01,
    'east':.02,
    'sets':.04,
    'west':.02,
    'and':.16,
}

#compute probabilities for background language model
#NOTE: the values are p(w|REF); the 'p(w|d)' key label is reused so that the
#comparison table can look every model up by the same keys
lm['bg']={f'p({w}|d)':p['REF'][w] for w in words}



## 2a. compute Dirichlet prior smoothing model with $\mu=4$

In [3]:
#Dirichlet prior smoothing: p(w|d) = (c(w,d) + mu*p(w|REF)) / (|d| + mu)
mu=4
model='dp_4'

#build the smoothed model one word at a time
smoothed={}
for w in words:
    smoothed[f'p({w}|d)']=(c[w]+mu*p["REF"][w])/(d_size+mu)
lm[model]=smoothed

#report each probability to three decimals
for key, prob in lm[model].items():
    print(f'{key}={prob:.3f}')

p(and|d)=0.109
p(rises|d)=0.077
p(in|d)=0.176
p(east|d)=0.072
p(sets|d)=0.077
p(west|d)=0.072
p(the|d)=0.245
p(sun|d)=0.080


## 2b. repeat 2a assuming $\mu=0.01$ and $\mu=100$

In [4]:
#Dirichlet prior smoothing for several values of mu
mus=[0.01, 100]
models=[f'dp_{mu}' for mu in mus]

#first pass: build and store one smoothed model per mu
for mu, model in zip(mus, models):
    smoothed={}
    for w in words:
        smoothed[f'p({w}|d)']=(c[w]+mu*p["REF"][w])/(d_size+mu)
    lm[model]=smoothed

#second pass: print every model, separated by a blank line
for mu, model in zip(mus, models):
    print(f'dirichlet prior mu={mu}')
    for key, prob in lm[model].items():
        print(f'{key}={prob:.3f}')
    print()
    
    

dirichlet prior mu=0.01
p(and|d)=0.091
p(rises|d)=0.091
p(in|d)=0.182
p(east|d)=0.091
p(sets|d)=0.091
p(west|d)=0.091
p(the|d)=0.273
p(sun|d)=0.091

dirichlet prior mu=100
p(and|d)=0.153
p(rises|d)=0.045
p(in|d)=0.162
p(east|d)=0.027
p(sets|d)=0.045
p(west|d)=0.027
p(the|d)=0.180
p(sun|d)=0.054



## 2c. compute Jelinek-Mercer model for $\lambda\in\{0.01, 0.5, 0.9\}$

In [5]:
#Jelinek-Mercer smoothing: p(w|d) = (1-lambda)*c(w,d)/|d| + lambda*p(w|REF)
lambdas=[0.01, 0.5, 0.9]
models=[f'jm_{lmda}' for lmda in lambdas]

#first pass: build and store one interpolated model per lambda
for lmda, model in zip(lambdas, models):
    mixture={}
    for w in words:
        p_doc=c[w]/d_size #unsmoothed document estimate
        mixture[f'p({w}|d)']=(1-lmda)*p_doc+lmda*p['REF'][w]
    lm[model]=mixture

#second pass: print every model, separated by a blank line
for lmda, model in zip(lambdas, models):
    print(f'jelinek-mercer lambda={lmda}')
    for key, prob in lm[model].items():
        print(f'{key}={prob:.3f}')
    print()
    

jelinek-mercer lambda=0.01
p(and|d)=0.092
p(rises|d)=0.090
p(in|d)=0.182
p(east|d)=0.090
p(sets|d)=0.090
p(west|d)=0.090
p(the|d)=0.272
p(sun|d)=0.090

jelinek-mercer lambda=0.5
p(and|d)=0.125
p(rises|d)=0.065
p(in|d)=0.171
p(east|d)=0.055
p(sets|d)=0.065
p(west|d)=0.055
p(the|d)=0.221
p(sun|d)=0.070

jelinek-mercer lambda=0.9
p(and|d)=0.153
p(rises|d)=0.045
p(in|d)=0.162
p(east|d)=0.027
p(sets|d)=0.045
p(west|d)=0.027
p(the|d)=0.180
p(sun|d)=0.054



## compare the probabilities of each model

In [7]:
#table layout: one column per model in lm, one row per probability label
#(row labels are taken from the first model stored in lm)
keys=list(lm)
prob_names=list(lm[keys[0]])
pad_len=max(len(name) for name in prob_names) #width of the label column
num_stars=100
rule="*"*num_stars #horizontal separator line

#header row: model names separated by tabs
headers='p()\t\t'+'\t'.join(keys)
print(headers)

#body: space-padded label, then each model's value followed by a tab
print(rule)
for i in prob_names:
    cells=[f'{lm[key][i]:.3f}\t' for key in keys]
    print(i.ljust(pad_len)+'\t'+''.join(cells))
print(rule)

p()		base	bg	dp_4	dp_0.01	dp_100	jm_0.01	jm_0.5	jm_0.9
****************************************************************************************************
p(and|d)  	0.091	0.160	0.109	0.091	0.153	0.092	0.125	0.153	
p(rises|d)	0.091	0.040	0.077	0.091	0.045	0.090	0.065	0.045	
p(in|d)   	0.182	0.160	0.176	0.182	0.162	0.182	0.171	0.162	
p(east|d) 	0.091	0.020	0.072	0.091	0.027	0.090	0.055	0.027	
p(sets|d) 	0.091	0.040	0.077	0.091	0.045	0.090	0.065	0.045	
p(west|d) 	0.091	0.020	0.072	0.091	0.027	0.090	0.055	0.027	
p(the|d)  	0.273	0.170	0.245	0.273	0.180	0.272	0.221	0.180	
p(sun|d)  	0.091	0.050	0.080	0.091	0.054	0.090	0.070	0.054	
****************************************************************************************************
