In [40]:
import string
import time
from nltk.util import ngrams
from nltk.corpus import stopwords
import nltk
from collections import OrderedDict
from collections import Counter
from collections import defaultdict


## Preprocessing the tokens

In [4]:
def preprocess(tokenList):
    i=0
    for word1 in tokenList:
    #conversion into lowercase
        word1=word1.lower()
    #Takes Care of Multiple Punctuation Marks
        word1=word1.replace('.','').replace(',','').replace(':','').replace(';','').replace('!','').replace('?','').replace('(','').replace(')','').replace('-','').replace('_','').replace('\\',' ').replace('\"',' ').replace('\'',' ')      
    
        tokenList[i]=word1        
        i=i+1

## Creating unigram dictionary

In [5]:
def unigramize(unigrams):
    
    od=Counter()
    
    for item in unigrams:
        od[item]+=1
        
    return od

## Creating bigram dictionary

In [6]:
def bigramize(bigrams):
    od2=Counter()


    for item in bigrams:
        od2[item]+=1
        
    return od2

## Creating trigram dictionary

In [7]:
def trigramize(trigrams):
    od3=Counter()


    for item in trigrams:
        od3[item]+=1
        
    return od3

## Creating quadgram dictionary

In [8]:
def quadgramize(quadgrams):
    od4=Counter()

    for item in quadgrams:
        od4[item]+=1
        
    return od4

In [9]:
def uni_prob(token,unigrams):
    
    uni_freq=Counter(unigrams)
    n=len(token)
    
    for item in uni_freq:
        uni_freq[item]=uni_freq[item]/n
        
    return uni_freq
    
    

In [10]:
def bi_prob(unigrams,bigrams):
    
    uni_freq=Counter(unigrams)
    bi_freq=Counter(bigrams)
    
    for item in bi_freq:
        uni=item[0]
        bi_freq[item]=bi_freq[item]/uni_freq[uni]
        
    return bi_freq

In [11]:
def tri_prob(bigrams,trigrams):
    
    bi_freq=Counter(bigrams)
    tri_freq=Counter(trigrams)
    
    for item in tri_freq:
        bi=item[0:2]
        tri_freq[item]=bi_freq[item]/bi_freq[bi]
        
    return tri_freq

## Creating quadgram probability table

In [12]:
def quad_prob(trigrams,quadgrams):
    
    tri_freq=Counter(trigrams)
    quad_freq=Counter(quadgrams)
    #print (tri_freq.items())
    
    for item in quad_freq:
        tri=item[0:3]
        quad_freq[item]=(quad_freq[item]/tri_freq[tri])
        
    return quad_freq

## Predicting the word

In [13]:
def pred_table(quad_prob_table):
    quad_pred_table=defaultdict(dict)
    
    for quad in quad_prob_table:
        prob=quad_prob_table[quad]
        tri=quad[0:3]
        token=quad[3]
        quad_pred_table[tri][token]=prob
        
    for tri in quad_pred_table:
        quad_pred_table[tri]=sorted(quad_pred_table[tri].items(), key=lambda x: x[1], reverse=True)
        
        return quad_pred_table
        
    


## Interpolation

In [14]:
def interpolation_table(od,od2,od3,od4):
    
    pole=defaultdict(dict)
    
    i=0
    s=0.0
    lambda1=0.25
    lambda2=0.25
    lambda3=0.25
    lambda4=0.25
    
    for item,value in od4.items():
        #if i>100:
         #   break    
        p=lambda1*float((value)/od3[item[0:3]]) + lambda2*float((od3[item[1:4]])/(od2[item[1:3]])) + lambda3*float((od2[item[2:4]])/(od[item[2]]+1)) +lambda4*(float((od[item[3]]+1)))
        pole[item[0:3]][item[3]]=p    
        
    for tri in pole:
        pole[tri]=sorted(pole[tri].items(), key=lambda x: x[1], reverse=True)
        
    return pole

## Calculating score of the model

In [15]:
def scoreCalc(quad,tri,tokenList2):
    score=0
    scorepred=OrderedDict()
    scorepred=OrderedDict.fromkeys(tokenList2,0)

    for item in quad:
        if item[0:3] in tri:
            scorepred[item[3]]+=1
        v=list(scorepred.values())
        k=list(scorepred.keys())
        if (k[v.index(max(v))]==item[3]):
            score+=1

    return score

## Add 1 Smoothing

In [25]:
def trismooth1(tokens,tri,trigramSet2,smooth_tri):
    
    #Add 1 Smoothing for trigram model
    
    i=len(trigramSet2)
    
    for item in trigramSet2:
        #smooth_bi.append(bi.count(item) + 1/float((token.count(item[0])+ len(bigramSet2))))
        smooth_tri[item]=tri.count(item) + 1/float((tokens.count(item[0:2])+ i))
        #print ("%s ->  %f" %(item,counting[i]/float((tokens.count(item[0])+ len(bigramSet2)))))


In [26]:
def quadsmooth1(tokens,quad,quadgramSet2,smooth_quad):
    
    #Add 1 Smoothing for quadgram model
    
    i=len(quadgramSet2)
    
    for item in quadgramSet2:
        smooth_quad[item]=quad.count(item) + 1/float((tokens.count(item[0:3])+ i))
          

## Add k smoothing

In [34]:
def trismoothk(tokens,tri,trigramSet2,smooth_tri,k):
    
    #Add k Smoothing for trigram model
    
    i=len(trigramSet2)
    
    for item in trigramSet2:
        smooth_tri[item]=tri.count(item) + k/float((tokens.count(item[0:2])+ i))
        

In [35]:
def quadsmoothk(tokens,quad,quadgramSet2,smooth_quad,k):
    
    #Add k Smoothing for quadgram model
    
    i=len(quadgramSet2)
    
    for item in quadgramSet2:
        smooth_quad[item]=quad.count(item) + k/float((tokens.count(item[0:3])+ i))
          
    


## Perplexity

In [22]:
def quadperp(tokenList2,smooth_quad,smooth_tri,quadgramSet2):
    
    #computing quadgram perplexity
    n= len (tokenList2)
    perplexity4=1.0
    
    for item in quadgramSet2:
        perplexity4=perplexity4*(((1/float(smooth_quad[item]))*smooth_tri[item[0:3]])**(1./n))
        
    print ("Quadgram Perplexity = %f" %(perplexity4))


## Main function calling all modules

In [43]:
def main():
    f=open('Data/LanguageModels/training_corpus.txt','r',encoding='latin1')
    content=f.read()
    token=content.split()

    #splitting into tokens
    tokenList=list(token)
    tokenSet=set(tokenList)

    #storing stop words separately
    stop = set(stopwords.words('english'))
    #print (len(stop))
    #tokenSet=tokenSet-stop


    #listing the tokens into n-grams
    unigrams=list(ngrams(token,1))
    bigrams=list(ngrams(token,2))
    trigrams=list(ngrams(token,3))
    quadgrams=list(ngrams(token,4))
    
    #Preprocessing the training set
    preprocess(tokenList)
    
    od=Counter()
    od=unigramize(unigrams)
    
    od2=Counter()
    od2=bigramize(bigrams)
    
    od3=Counter()
    od3=trigramize(trigrams)
    
    
    od4=Counter()
    od4=quadgramize(quadgrams)
    
    
    
    #Taking our input string
    sent=input("Enter your test string: ")
    list2=sent.split()
    #test=' '.join(list2[(len(list2)-3):len(list2)])
    sent_tri=list(ngrams(list2,3))
    x=len(sent_tri)
    test_tri=sent_tri[x-1]

    #Normal prediction of the most probable word
    quad_prob_table=Counter(od4)
    quad_prob_table=quad_prob(od3,od4)
    
    quad_pred_table=defaultdict(dict)
    quad_pred_table=pred_table(quad_prob_table)
    
    word=quad_pred_table[test_tri]
    word=sorted(word.items(), key=lambda x: x[1], reverse=True)
    print ("The next word could be: ")    
    print (word[0][0])
    
    #Prediction of the word after interpolation
    quad_pole_table=defaultdict(dict)
    quad_pole_table=interpolation_table(od,od2,od3,od4)
    word=quad_pole_table[test_tri]
    word=sorted(word, key=lambda x: x[1], reverse=True)
    print ("After interpolation, the most probable word could be: ")
    print (word[0][0])
    
    #computing score of the language model
    score=0
    with open('Data/LanguageModels/testing_corpus.txt','r',encoding='latin1') as f:
            contents=f.read()
            tokens=contents.split()
            tokenList2=list(tokens)
     
    #Preprocessing the test set
    preprocess(tokens)
    
    bi=list(ngrams(tokenList2,2))
    tri=list(ngrams(tokenList2,3))
    quad=list(ngrams(tokenList2,4))
    
    bigramSet2=set(bi)
    trigramSet2=set(tri)
    quadgramSet2=set(quad)
    
    #print ("Score of the language model is: ")
    #print (scoreCalc(quad,tri,tokenList2))
    
    smooth_tri=OrderedDict()
    smooth_tri=OrderedDict.fromkeys(trigramSet2,0)
    #trismooth1(tokens,tri,trigramSet2,smooth_tri)
    smooth_quad=OrderedDict()
    smooth_quad=OrderedDict.fromkeys(quadgramSet2,0)
    #quadsmooth1(tokens,quad,quadgramSet2,smooth_quad)
    k=int(input("We will perform Add k smoothing now. Enter value of k: "))
    time1=time.time()
    
    trismoothk(tokens,tri,trigramSet2,smooth_tri,k)
    quadsmoothk(tokens,quad,quadgramSet2,smooth_quad,k)
    
    
    quadperp(tokenList2,smooth_quad,smooth_tri,quadgramSet2)
    print ("Time taken: ", time.time()-time1)
    
    

In [44]:
if __name__ == "__main__":
    main()

Enter your test string: I will not
The next word could be: 
plague
After interpolation, the most probable word could be: 
say
We will perform Add k smoothing now. Enter value of k: 4
Quadgram Perplexity = 1.052211
Time taken:  34.63732933998108
