# N-Gram Language Models Implementation

In [None]:
import os
import pickle
import nltk
import nltk.data
import random
import math
from nltk import word_tokenize, sent_tokenize
from nltk.util import ngrams
from nltk.corpus import stopwords
import pandas as pd

stop_words = set(stopwords.words('english'))

## Reading 20N dataset

In [52]:
def read_document_20N():
    """
    This method reads and writes the 20N Dataset
    :return:
    """
    new_file = os.path.join(os.getcwd(), 'datasets/20news_file')
    if os.path.exists(new_file):
        os.remove(new_file)

    folder_path = os.path.join(os.getcwd(), 'datasets/20news-18828')
    inner_dirs = os.listdir(folder_path)
    for dir in inner_dirs:
        if not dir.startswith('.'):
            #print(dir)
            dir_path = os.path.join(folder_path,dir)
            filenames = os.listdir(dir_path)
            for file in filenames:
                cur_path = os.path.join(dir_path,file)
                #print("Copying "+file)
                with open(cur_path,'r', errors="ignore") as firstfile, open(new_file,'a') as secondfile:
                    for line in firstfile:
                        secondfile.write(line)
    print("Archivo terminado")


read_document_20N()


Archivo terminado


## Reading BAC dataset

In [53]:
def read_document_BAC():
    """
    This method reads and writes the BAC Dataset
    :return:
    """
    new_file = os.path.join(os.getcwd(), 'datasets/bac_file')
    if os.path.exists(new_file):
        os.remove(new_file)

    folder_path = os.path.join(os.getcwd(), 'datasets/blogs')
    files = os.listdir(folder_path)
    #files.sort()
    for file in files:
        #print(file)
        cur_path = os.path.join(folder_path,file)
        with open(cur_path,'r', errors="ignore") as firstfile, open(new_file,'a') as secondfile:
            for line in firstfile:
                if len(line)>8 and not len(line)==28:
                    #print(len(line))
                    #print(line.strip())
                    secondfile.write(line.strip())
    print("Archivo terminado")


read_document_BAC()


Archivo terminado


## Tokenize by sentence

In [54]:
def sentences(path):
    """
    Divides the document in a List of sentences.
    :return:
    """
    news_file = os.path.join(os.getcwd(), path)
    text = open(news_file).read()
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    sentences = sent_detector.tokenize(text.strip())
    return sentences

sentencesf1 = sentences("datasets/20news_file")
sentencesf2 = sentences("datasets/bac_file")


In [55]:
def tokenization(sentences):
    """
    Tokenization of the sentences array.
    :return:
    """
    tokens = []
    i = 0
    for sentence in sentences:
        #print(sentence)
        # Normalize, but DO NOT eliminate stop words.
        lower_text = sentence.lower()
        sentence_tok = [token for token in nltk.word_tokenize(lower_text) if (token.isalnum())]
        if len(sentence_tok)== 0:
            continue
        # Replace numbers with a token named NUM.
        sentence_tok = ["NUM" if token.isnumeric() else token for token in sentence_tok]
        # Add sentence start and end tags <s></s>.
        sentence_tok.insert(0,"<s>")
        sentence_tok.append("</s>")
        #print(sentence_tok)
        tokens.extend(sentence_tok)
        #print(tokens)
    return tokens


tokensf1 = tokenization(sentencesf1)
tokensf2 = tokenization(sentencesf2)

#print(tokensf1)


In [56]:
def unit_frequency(tokens):
    """

    :param tokens:
    :return:
    """
    tokens_final = ["<UKN>" if tokens.count(token) == 1 else token for token in tokens]
    return tokens_final


tokensf1 = unit_frequency(tokensf1)
tokensf2 = unit_frequency(tokensf2)


KeyboardInterrupt: 

## Training and Testing Division

In [57]:
def sentence_selection(tokens):
    sentences = []
    i = 0
    for token in tokens:
        if "<s>" in token:
            new_sentence = [token]
        elif "</s>" in token:
            new_sentence.append(token)
            sentences.append(new_sentence)
        else:
            new_sentence.append(token)
    percentage = math.floor(len(sentences)*0.8)
    training = random.sample(sentences, percentage)
    test = [ sentence for sentence in sentences if sentence not in training ]

    return training, test


(training1, test1) = sentence_selection(tokensf1)
(training2, test2) = sentence_selection(tokensf2)

KeyboardInterrupt: 

In [None]:
#lista = [['hola','amigo'],['hola1','amigo1'],['hola2','amigo2'],['hola3','amigo3']]
#porcentaje = math.floor(len(lista)*0.8)
#training = random.sample(lista, porcentaje)
#test = [ sentence for sentence in lista if sentence not in training ]

#print(training)
#print(test)

In [None]:
def writing_files(file, training, testing):
    file_name = "20N" if file == 1 else "BAC"
    path_training = os.path.join(os.getcwd(), 'datasets/'+file_name+"_2_training")
    path_testing = os.path.join(os.getcwd(), 'datasets/'+file_name+"_2_testing")
    outfile = open(path_training,'wb')
    pickle.dump(training,outfile)
    outfile.close()

    outfile = open(path_testing,'wb')
    pickle.dump(testing,outfile)
    outfile.close()

writing_files(1, training1, test1)
writing_files(2, training2, test2)


In [3]:
def reading_files(file):
    file_name = "20N" if file == 1 else "BAC"
    path_training = os.path.join(os.getcwd(), 'datasets/'+file_name+"_2_training")
    path_testing = os.path.join(os.getcwd(), 'datasets/'+file_name+"_2_testing")
    with open(path_training, 'rb') as training:
        training = pickle.load(training)
    with open(path_testing, 'rb') as testing:
        testing = pickle.load(testing)
    return training, testing


training1, test1 = reading_files(1)
training2, test2 = reading_files(2)


In [4]:
def crop_data(training, test):
    percentage = 0.005

    training_small = training[:round(len(training)*percentage)]
    test_small = test[:round(len(test)*percentage)]
    return training_small,test_small

training1_small, test1_small =  crop_data(training1, test1)
training2_small, test2_small =  crop_data(training2, test2)

## N-grams Modelling with Laplace Smoothing

In [5]:

def ngrams_generation(training):
    tokenized_text = training
    ngrams_all = {1:[], 2:[], 3:[]}
    for i in range(3):
        for each in tokenized_text:
            for j in ngrams(each, i+1):
                ngrams_all[i+1].append(j);

    ngrams_voc = {1:set([]), 2:set([]), 3:set([])}

    for i in range(3):
        for gram in ngrams_all[i+1]:
            if gram not in ngrams_voc[i+1]:
                ngrams_voc[i+1].add(gram)
    total_ngrams = {1:-1, 2:-1, 3:-1}
    total_voc = {1:-1, 2:-1, 3:-1}

    for i in range(3):
        total_ngrams[i+1] = len(ngrams_all[i+1])
        total_voc[i+1] = len(ngrams_voc[i+1])

    ngrams_prob = {1:[], 2:[], 3:[]}
    for i in range(3):
        for ngram in ngrams_voc[i+1]:
            tlist = [ngram]
            tlist.append(ngrams_all[i+1].count(ngram))
            ngrams_prob[i+1].append(tlist)

    for i in range(3):
        for ngram in ngrams_prob[i+1]:
            ngram[-1] = (ngram[-1]+1)/(total_ngrams[i+1]+total_voc[i+1])
    return ngrams_prob

ngrams_prob_1 = ngrams_generation(training1_small)
ngrams_prob_2 = ngrams_generation(training2_small)


In [6]:
#Prints top 10 unigram, bigram, trigram after smoothing
print("Most common n-grams without stopword removal and with add-1 smoothing: \n")
for i in range(3):
    ngrams_prob_1[i+1] = sorted(ngrams_prob_1[i+1], key = lambda x:x[1], reverse = True)

print ("Most common unigrams: ", str(ngrams_prob_1[1][:10]))
print ("\nMost common bigrams: ", str(ngrams_prob_1[2][:10]))
print ("\nMost common trigrams: ", str(ngrams_prob_1[3][:10]))

Most common n-grams without stopword removal and with add-1 smoothing: 

Most common unigrams:  [[('the',), 0.03246727410177854], [('to',), 0.018740301595511877], [('of',), 0.016512155333625114], [('a',), 0.016512155333625114], [('and',), 0.013806549158476903], [('NUM',), 0.012811841005848885], [('in',), 0.012772052679743764], [('is',), 0.010225599809016035], [('i',), 0.009668563243544344], [('that',), 0.009390044960808499]]

Most common bigrams:  [[('of', 'the'), 0.0025376830647873545], [('in', 'the'), 0.00250710857003088], [('NUM', 'NUM'), 0.0017427462011190265], [('subject', 're'), 0.0016815972116060783], [('on', 'the'), 0.0010701073164765952], [('is', 'a'), 0.0010395328217201211], [('to', 'the'), 0.001008958326963647], [('to', 'be'), 0.0008866603479377503], [('for', 'the'), 0.0008866603479377503], [('with', 'the'), 0.0008560858531812762]]

Most common trigrams:  [[('NUM', 'NUM', 'NUM'), 0.0007150518412584912], [('writes', 'in', 'article'), 0.00023835061375283043], [('NUM', 'NUM', '

## File Generation

In [9]:
def writing_models(file, ngrams_prob):
    file_name = "20N" if file == 1 else "BAC"
    path_unigram = os.path.join(os.getcwd(), 'datasets/'+file_name+"_2_unigram")
    path_bigram = os.path.join(os.getcwd(), 'datasets/'+file_name+"_2_bigram")
    path_trigram = os.path.join(os.getcwd(), 'datasets/'+file_name+"_2_trigram")
    outfile = open(path_unigram,'wb')
    pickle.dump(ngrams_prob[1],outfile)
    outfile.close()

    outfile = open(path_bigram,'wb')
    pickle.dump(ngrams_prob[2],outfile)
    outfile.close()

    outfile = open(path_trigram,'wb')
    pickle.dump(ngrams_prob[3],outfile)
    outfile.close()

writing_models(1, ngrams_prob_1)
writing_models(2, ngrams_prob_2)


In [24]:
def reading_models(file):
    file_name = "20N" if file == 1 else "BAC"
    path_unigram = os.path.join(os.getcwd(), 'datasets/'+file_name+"_2_unigram")
    path_bigram = os.path.join(os.getcwd(), 'datasets/'+file_name+"_2_bigram")
    path_trigram = os.path.join(os.getcwd(), 'datasets/'+file_name+"_2_trigram")
    with open(path_unigram, 'rb') as unigram:
        unigram = pickle.load(unigram)
    with open(path_bigram, 'rb') as bigram:
        bigram = pickle.load(bigram)
    with open(path_bigram, 'rb') as trigram:
        trigram = pickle.load(trigram)
    return unigram, bigram, trigram


unigram_1, bigram_1, trigram_1  = reading_models(1)
unigram_2, bigram_2, trigram_2  = reading_models(2)


## Perplexity

In [23]:
def perplexity(test, n, ngram):
    perp = 1;
    N = 0
    for sentence in test:
        for j in range(len(sentence)):
            for gram in ngram:
                for i in range(n):
                    encontrado = True
                    if j-i >= 0 and sentence[j-i] == gram[0][n-i-1]:
                       encontrado = encontrado and True
                    else:
                       encontrado = encontrado and False
                       break
                    if i == n-1 and encontrado:
                        perp = perp*(1/gram[1])
                        #print(perp)
                        N = N+1
    #print(perp)
    print(N)
    return perp**(1/N)

uni_perp_1 = perplexity(test1_small,1,unigram_1)
bi_perp_1 = perplexity(test1_small,2,bigram_1)
tri_perp_1 = perplexity(test1_small,3,trigram_1)
print("Unigram Perplexity 20N", uni_perp_1)
print("Bigram Perplexity 20N", bi_perp_1)
print("Trigram Perplexity 20N", tri_perp_1)

uni_perp_2 = perplexity(test2_small,1,unigram_2)
bi_perp_2 = perplexity(test2_small,2,bigram_2)
tri_perp_2 = perplexity(test2_small,3,trigram_2)
print("Unigram Perplexity 20N", uni_perp_2)
print("Bigram Perplexity 20N", bi_perp_2)
print("Trigram Perplexity 20N", tri_perp_2)


Unigram Perplexity inf
Bigram Perplexity inf
Trigram Perplexity inf


## Linear Interpolation

In [26]:
def linear_interpolation(training):
    tokenized_text = training
    ngrams_all = {1:[], 2:[], 3:[]}
    for i in range(3):
        for each in tokenized_text:
            for j in ngrams(each, i+1):
                ngrams_all[i+1].append(j);

    ngrams_voc = {1:set([]), 2:set([]), 3:set([])}

    for i in range(3):
        for gram in ngrams_all[i+1]:
            if gram not in ngrams_voc[i+1]:
                ngrams_voc[i+1].add(gram)
    total_ngrams = {1:-1, 2:-1, 3:-1}
    total_voc = {1:-1, 2:-1, 3:-1}

    for i in range(3):
        total_ngrams[i+1] = len(ngrams_all[i+1])
        total_voc[i+1] = len(ngrams_voc[i+1])

    ngrams_prob = {1:[], 2:[], 3:[]}
    for i in range(3):
        for ngram in ngrams_voc[i+1]:
            tlist = [ngram]
            tlist.append(ngrams_all[i+1].count(ngram)/(total_ngrams[i+1]))
            ngrams_prob[i+1].append(tlist)

    linear = []
    lambda1 = 1/3
    lambda2 = 1/3
    lambda3 = 1/3

    for trigram in ngrams_prob[3]:
        for bigram in ngrams_prob[2]:
            for unigram in ngrams_prob[1]:
                if trigram[0][-1] == bigram[0][-1] and trigram[0][-2] == trigram[0][-2] and trigram[0][-1] == unigram[0][0]:
                    prob = lambda1*trigram[1] + lambda2*bigram[1] + lambda3*unigram[1]
                    linear.append([trigram[0],prob])

    return linear

linear_1 = linear_interpolation(training1_small)
linear_2 = linear_interpolation(training1_small)


KeyboardInterrupt: 

In [None]:
#Prints top 10 trigrams after smothing
print("Most common n-grams with linear interpolation: \n")
linear_1 = sorted(linear_1, key = lambda x:x[1], reverse = True)

print ("\nMost common trigrams: ", str(linear_1[:10]))

## Perplexity Comparrison

In [None]:
linear_perp_1 = perplexity(test1_small,3,linear_1)
linear_perp_2 = perplexity(test2_small,3,linear_1)
print("Linear Interpolation Perplexity 20N", linear_perp_1)
print("Linear Interpolation Perplexity BAC", linear_perp_2)


## Sentence Generator

In [None]:
def sentence_generator(word, ngram, sentence):
    if word == "</s>":
        return sentence
    else:
        for gram in ngram:
            if word == gram[0][-1]:
                sentence = sentence + " " + gram[0][-2]
                word = gram[0][-2]
                break
        sentence_generator(word, ngram, sentence)

linear_1
linear_1 = sorted(linear_1[i+1], key = lambda x:x[1], reverse = True)

test_sentence = sentence_generator("friend", linear_1, "")
print("Test Sentence 1", test_sentence)

test_sentence = sentence_generator("politics", linear_1, "")
print("Test Sentence 2", test_sentence)

test_sentence = sentence_generator("i", linear_1, "")
print("Test Sentence 3", test_sentence)

test_sentence = sentence_generator("Once", linear_1, "")
print("Test Sentence 4", test_sentence)



