# Language Modelling Lab (week 4)
This notebook provides the "starter" code for the week 4 lab.


1.
We need to get the names of the files in the training directory and split them 50:50 into a training set and a held-out (testing) set.

In [1]:
import os,random,math
TRAINING_DIR="lab2resources/sentence-completion/Holmes_Training_Data"  #this needs to be the parent directory for the training corpus

def get_training_testing(training_dir=TRAINING_DIR,split=0.5):
    """Split the file names found in training_dir into two lists.

    training_dir -- directory whose entries are listed
    split        -- fraction of the entries placed in the first returned list

    Returns a tuple (first_part, second_part) of file-name lists. The names are
    shuffled with a fixed seed, so the same directory contents always give the
    same split. Note that this reseeds the module-level random generator.
    """
    #os.listdir returns names in arbitrary order, so sort them first;
    #otherwise the fixed seed below would not give the same split every time
    filenames=sorted(os.listdir(training_dir))
    n=len(filenames)
    print("There are {} files in the training directory: {}".format(n,training_dir))
    random.seed(53)  #if you want the same random split every time
    random.shuffle(filenames)
    index=int(n*split)
    return(filenames[:index],filenames[index:])

#split the corpus file names into a training portion and a held-out portion (using the default directory and split)
trainingfiles,heldoutfiles=get_training_testing()


There are 522 files in the training directory: lab2resources/sentence-completion/Holmes_Training_Data


In [2]:
#number of file names in the training portion of the split
len(trainingfiles)

261

## 1  Building a unigram model

In [3]:
from nltk import word_tokenize as tokenize
import operator

class language_model():
    """Unigram language model estimated from a set of text files.

    Constructing an instance trains it straight away: every non-empty line of
    every file is tokenized, wrapped in "__START"/"__END" tokens and counted,
    and the counts are then normalised into probabilities in self.unigram.
    """
    
    def __init__(self,trainingdir=TRAINING_DIR,files=None):
        """Store the location of the training data and run the training method.

        trainingdir -- directory containing the training files
        files       -- names of the files (inside trainingdir) to train on;
                       None (the default) means no files
        """
        self.training_dir=trainingdir
        #use a fresh list when no files are given: a mutable default ([]) would be
        #one shared object across every instance created without files
        self.files=files if files is not None else []
        
        self.train()
        
    def train(self):
        """Build the unigram model {w:P(w)} from self.files."""
        #initialise an empty dictionary which will be the unigram model {w:P(w)} when training is complete
        self.unigram={}
        #process all of the training data, accumulating counts of events
        self._processfiles()
        #convert the accumulated counts to probabilities
        self._convert_to_probs()
        
    def _processline(self,line):
        """Add the token counts of one line to self.unigram.

        The line is tokenized and has a special start and end token added, so
        "__START" and "__END" are each counted once per processed line.
        """
        tokens=["__START"]+tokenize(line)+["__END"]
        for token in tokens:
            self.unigram[token]=self.unigram.get(token,0)+1
    
    
    def _processfiles(self):
        """Process each file in turn, skipping files that cannot be decoded."""
        for afile in self.files:
            print("Processing {}".format(afile))
            try:
                #decode the whole file before counting anything, so that a decoding
                #error part-way through cannot leave half a file in the counts
                with open(os.path.join(self.training_dir,afile)) as instream:
                    lines=instream.readlines()
            except UnicodeDecodeError:
                print("UnicodeDecodeError processing {}: ignoring file".format(afile))
                continue
            for line in lines:
                line=line.rstrip()
                if len(line)>0:
                    self._processline(line)
      
            
    def _convert_to_probs(self):
        """Replace the counts in self.unigram by relative frequencies.

        self.unigram initially holds counts for each token {token:freq(token)};
        each frequency is divided by the sum of all frequencies.
        """
        #compute the total once rather than once per token
        total=sum(self.unigram.values())
        self.unigram={k:v/total for (k,v) in self.unigram.items()}
       
    def get_prob(self,token,method="unigram"):
        """Look up the probability of token; unseen tokens get 0.

        Only method="unigram" is supported; any other method prints a
        "Not implemented" message and returns 0.
        """
        if method=="unigram":
            return self.unigram.get(token,0)
        else:
            print("Not implemented: {}".format(method))
            return 0
    

    
        
       

In [4]:
#train on only the first MAX_FILES training files; increase MAX_FILES to use more of the corpus
MAX_FILES=5
#constructing the model runs training immediately
mylm=language_model(files=trainingfiles[:MAX_FILES])

Processing 19TOM10.TXT
Processing SNOWI10.TXT
Processing FBRLS10.TXT
Processing WTSLW10.TXT
UnicodeDecodeError processing WTSLW10.TXT: ignoring file
Processing MOHIC10.TXT
UnicodeDecodeError processing MOHIC10.TXT: ignoring file


Make sure you look up some probabilities of words in your model.  Pick some words which you would expect to have high probabilities and some words which you would expect to have low probabilities.

As an extension, see how these change if you use a bigger portion of the training data to train your model.
