# Language Modelling Lab (week 4)
This notebook provides the "starter" code in the week 4 lab


1.
We need to get the names of files in the training directory and split them into training and testing 50:50.

In [25]:
import os,random,math
TRAINING_DIR="sentence-completion/Holmes_Training_Data"  #this needs to be the parent directory for the training corpus

def get_training_testing(training_dir=TRAINING_DIR,split=0.5):

    filenames=os.listdir(training_dir)
    n=len(filenames)
    print("There are {} files in the training directory: {}".format(n,training_dir))
    random.seed(53)  #if you want the same random split every time
    random.shuffle(filenames)
    index=int(n*split)
    return(filenames[:index],filenames[index:])

trainingfiles,heldoutfiles=get_training_testing()


There are 522 files in the training directory: sentence-completion/Holmes_Training_Data


In [6]:
len(trainingfiles)

261

In [10]:
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/finpearson/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

## 1  Building a unigram model

In [26]:
from nltk import word_tokenize as tokenize
import operator

class language_model():
    
    def __init__(self,trainingdir=TRAINING_DIR,files=[]):
        #store the names of the files containing training data and run the training method
        self.training_dir=trainingdir
        self.files=files
        
        self.train()
        
    def train(self):
        #initialise an empty dictionary which will be the unigram model {w:P(w)} when training is complete
        self.unigram={}
        #process all of the training data, accumulating counts of events
        self._processfiles()
        #convert the accumulated counts to probabilities
        self._convert_to_probs()
        
    def _processline(self,line):
        #process each line of a file
        #each line is tokenized and has a special start and end token added
        #counts of tokens are added to the self.unigram count model
        tokens=["__START"]+tokenize(line)+["__END"]
        for token in tokens:
            self.unigram[token]=self.unigram.get(token,0)+1
        
    
    def _processfiles(self):
        #process each file in turn
        for afile in self.files:
            print("Processing {}".format(afile))
            try:
                with open(os.path.join(self.training_dir,afile)) as instream:
                    for line in instream:
                        line=line.rstrip()
                        if len(line)>0:
                            self._processline(line)
            except UnicodeDecodeError:
                print("UnicodeDecodeError processing {}: ignoring file".format(afile))
      
            
    def _convert_to_probs(self):
        #self.unigram initially counts counts for each token {token:freq(token)}
        #sum all of the frequencies and divide each frequency by that sum to get probabilities
        
        self.unigram={k:v/sum(self.unigram.values()) for (k,v) in self.unigram.items()}
       
    def get_prob(self,token,method="unigram"):
        #simple look up method
        if method=="unigram":
            return self.unigram.get(token,0)
        else:
            print("Not implemented: {}".format(method))
            return 0
    

    
        
       

In [27]:
MAX_FILES=5
mylm=language_model(files=trainingfiles[:MAX_FILES])

Processing 19TOM10.TXT
Processing SNOWI10.TXT
Processing FBRLS10.TXT
Processing WTSLW10.TXT
UnicodeDecodeError processing WTSLW10.TXT: ignoring file
Processing MOHIC10.TXT
UnicodeDecodeError processing MOHIC10.TXT: ignoring file


Make sure you look up some probabilities of words in your model.  Pick some words which you would expect to have high probabilities and some words which you would expect to have low probabilities.

As an extension, see how these change if you use a bigger portion of the training data to train your model.


In [33]:
sorted_mylm = dict(sorted(mylm.unigram.items(), key=lambda x:x[1], reverse=True))
#print(sorted_mylm)


In [72]:
import itertools
n = 1000
top_n_words = dict(itertools.islice(sorted_mylm.items(), n))
#print(top_n_words)
outputString = ""
for i in range(0,50):
    nextwordpair = random.choices(list(top_n_words.items()), list(top_n_words.values()))
    #print(nextwordpair)
    nextword = nextwordpair[0][0]
    #print(nextword)
    if nextword == "__END" or nextword == "__START":
        pass
    else:
        outputString = outputString + " " + nextword
        if nextword == '.':
            break

print(outputString)

 should letter this in ' She it fact 'How his with ' a always she ; sometimes .


In [64]:
random_word = random.choices(list(top_n_words.items()), list(top_n_words.values()))
print(random_word)

[('he', 0.009378406632416326)]


In [28]:
from nltk import word_tokenize as tokenize
import operator

class language_model2():
    
    def __init__(self,trainingdir=TRAINING_DIR,files=[]):
        #store the names of the files containing training data and run the training method
        self.training_dir=trainingdir
        self.files=files
        
        self.train()
        
    def train(self):
        #initialise an empty dictionary which will be the unigram model {w:P(w)} when training is complete
        self.unigram={}
        self.bigram={}
        #process all of the training data, accumulating counts of events
        self._processfiles()
        #convert the accumulated counts to probabilities
        self._convert_to_probs()
        
    def _processline(self,line):
        #process each line of a file
        #each line is tokenized and has a special start and end token added
        #counts of tokens are added to the self.unigram count model
        tokens=["__START"]+tokenize(line)+["__END"]
        for token in tokens:
            self.unigram[token]=self.unigram.get(token,0)+1

        pastToken = "__END"
        for token in tokens:
            tempDict = self.bigram.get(pastToken, {})
            #print(tempDict)
            #self.bigram[pastToken][token] = self.bigram.get([pastToken],0)+1
            tempDict[token] = tempDict.get(token, 0)+1
            self.bigram[pastToken] = tempDict
            pastToken = token
        
    
    def _processfiles(self):
        #process each file in turn
        for afile in self.files:
            print("Processing {}".format(afile))
            try:
                with open(os.path.join(self.training_dir,afile)) as instream:
                    for line in instream:
                        line=line.rstrip()
                        if len(line)>0:
                            self._processline(line)
            except UnicodeDecodeError:
                print("UnicodeDecodeError processing {}: ignoring file".format(afile))
      
            
    def _convert_to_probs(self):
        #self.unigram initially counts counts for each token {token:freq(token)}
        #sum all of the frequencies and divide each frequency by that sum to get probabilities
        
        self.unigram={k:v/sum(self.unigram.values()) for (k,v) in self.unigram.items()}
        for key in self.bigram:
            self.bigram[key] = {k:v/sum(self.bigram[key].values()) for (k,v) in self.bigram[key].items()}
       
    def get_prob(self,token,method="unigram"):
        #simple look up method
        if method=="unigram":
            return self.unigram.get(token,0)
        else:
            print("Not implemented: {}".format(method))
            return 0
    

    
        
       

In [29]:
MAX_FILES = 100
mylm_bigram = language_model2(files=trainingfiles[:MAX_FILES])
#print(mylm_bigram.bigram)


Processing 19TOM10.TXT
Processing SNOWI10.TXT
Processing FBRLS10.TXT
Processing WTSLW10.TXT
UnicodeDecodeError processing WTSLW10.TXT: ignoring file
Processing MOHIC10.TXT
UnicodeDecodeError processing MOHIC10.TXT: ignoring file
Processing CEVEN10.TXT
Processing WNLAW10.TXT
Processing PRESC10.TXT
Processing MPOOL10.TXT
Processing AHERO10.TXT
Processing TBSCC10.TXT
Processing JUNGL10.TXT
Processing WWHIT10.TXT
Processing LPRIN10.TXT
Processing CLOTL11.TXT
Processing WARW11.TXT
Processing PCRCS10.TXT
Processing LVGRO10.TXT
Processing PROTT10.TXT
Processing TWILS10.TXT
Processing ALAMO10.TXT
Processing TETHR10.TXT
Processing NATIV10.TXT
Processing MRMEN10.TXT
Processing TBTIC10.TXT
Processing GHROS10.TXT
UnicodeDecodeError processing GHROS10.TXT: ignoring file
Processing 08WOZ10.TXT
Processing 10WOZ10.TXT
Processing MAROG10.TXT
Processing 2TALE10.TXT
Processing 2DFRE10.TXT
Processing GRTTR10.TXT
Processing OWLCR10.TXT
Processing OTORN10.TXT
Processing ZENDA10.TXT
Processing SILAP10.TXT
Pr

In [32]:
print(mylm_bigram.bigram['__START'])



In [33]:
#print(mylm_bigram.bigram.keys())
#nextwordpair = random.choices(list(top_n_words.items()), list(top_n_words.values()))
token = "__START"
for i in range(0,100):
    outputString = ""
    for i in range (0,50):
        unsorted_word_dict = mylm_bigram.bigram[token]
        word_dict = dict(sorted(unsorted_word_dict.items(), key=lambda x:x[1], reverse=True))
        #print(word_dict)
        nextwordpair = random.choices(list(word_dict.items()), list(word_dict.values()))
        nextword = nextwordpair[0][0]
        #print(nextword)
        if nextword == '.' or nextword == ',' or nextword == '!':
            outputString = outputString + nextword
            break
        else:
            outputString = outputString + " " + nextword
        token = nextword
    print(outputString + '\n')


 stood there,

 's fortune,

 and had __END __START indeed you see or set down ? '' was lying white is the __END __START to search ? '' said,

 she passed over the little nearer freedom of every grain by his warlike and hind legs.

 would therefore,

,

 stretched at quite commendable.

 : __END __START could __END __START mechanics too unknown to go back over the darkness was so affected her senses,

 and though not see the key hanging to me!

,

!

,

 ? '' __END __START of the scullery.

,

 had the 'green tree.

 the sled stopped smoking,

 __END __START `` I want it 's the Barracks,

 when she said my own life,

,

 unthinkingly upon the more about him,

 in the playground.

 to watch my feet.

 again,

 -- a longer.

 stay with thee can never got for which he could not served when she did n't __END __START them is so __END __START of __END __START `` So far away.

 since from the first reverse.

 of daemons and so ; __END __START time.

,

 by her nephew,

 ; then putting on that