# N-Gram Language Models Implementation

In [37]:
import os
import pickle
import nltk
import nltk.data
import random
import math

## Reading 20N dataset

In [38]:
def read_document_20N():
    """
    Merge the 20N dataset into a single text file.

    Every entry of ``datasets/20news-18828`` (relative to the current working
    directory) whose name does not start with ``.`` is treated as a directory,
    and each file inside it is copied line by line into
    ``datasets/20news_file``. Entries are processed in ``os.listdir`` order.
    Any previous output file is overwritten. Bytes that cannot be decoded
    while reading an input file are ignored.

    :return: None. Prints a completion message when finished.
    """
    base_dir = os.getcwd()
    new_file = os.path.join(base_dir, 'datasets/20news_file')
    folder_path = os.path.join(base_dir, 'datasets/20news-18828')

    # Open the output once in 'w' mode: this discards stale content from a
    # previous run and avoids re-opening the file for every single article.
    with open(new_file, 'w') as merged:
        for group in os.listdir(folder_path):
            if group.startswith('.'):
                # Skip hidden entries such as .DS_Store.
                continue
            group_path = os.path.join(folder_path, group)
            for article in os.listdir(group_path):
                article_path = os.path.join(group_path, article)
                with open(article_path, 'r', errors="ignore") as source:
                    merged.writelines(source)
    print("Archivo terminado")


# Build datasets/20news_file from the raw 20N articles.
read_document_20N()


Archivo terminado


## Reading BAC dataset

In [39]:
def read_document_BAC():
    """
    Merge the BAC dataset into a single text file.

    Reads every file in ``datasets/blogs`` (relative to the current working
    directory), in ``os.listdir`` order, and writes the kept lines to
    ``datasets/bac_file``, one per output line and stripped of surrounding
    whitespace. Any previous output file is overwritten. Bytes that cannot be
    decoded while reading an input file are ignored.

    A line is kept when its raw length (padding and newline included) is
    greater than 8 and different from 28.

    :return: None. Prints a completion message when finished.
    """
    base_dir = os.getcwd()
    new_file = os.path.join(base_dir, 'datasets/bac_file')
    folder_path = os.path.join(base_dir, 'datasets/blogs')

    # Open the output once in 'w' mode: this discards stale content from a
    # previous run and avoids re-opening the file for every blog file.
    with open(new_file, 'w') as merged:
        for blog_name in os.listdir(folder_path):
            blog_path = os.path.join(folder_path, blog_name)
            with open(blog_path, 'r', errors="ignore") as blog:
                for line in blog:
                    # NOTE(review): the 8 / 28 thresholds presumably filter
                    # out markup and date lines of the blog files -- confirm
                    # against the dataset format.
                    if len(line) > 8 and len(line) != 28:
                        # Terminate each kept line; writing the stripped text
                        # back to back glued the last word of one line to the
                        # first word of the next.
                        merged.write(line.strip() + "\n")
    print("Archivo terminado")


# Build datasets/bac_file from the raw blog files.
read_document_BAC()


KeyboardInterrupt: 

## Tokenize by sentence

In [None]:
def sentences(path):
    """
    Split a text file into a list of sentences.

    :param path: location of the file, joined onto the current working
        directory.
    :return: list of sentence strings produced by the NLTK tokenizer loaded
        from ``tokenizers/punkt/english.pickle``.
    """
    news_file = os.path.join(os.getcwd(), path)
    # The context manager closes the handle as soon as the text is read,
    # instead of leaving it open until garbage collection.
    with open(news_file) as corpus:
        text = corpus.read()
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    return sent_detector.tokenize(text.strip())

# Sentence-split both merged corpora: f1 is the 20N file, f2 the BAC file.
sentencesf1 = sentences("datasets/20news_file")
sentencesf2 = sentences("datasets/bac_file")


In [None]:
def tokenization(sentences):
    """
    Turn a list of sentences into one flat list of normalised tokens.

    Each sentence is lower-cased and split with ``nltk.word_tokenize``; only
    alphanumeric tokens are kept (stop words are NOT removed) and numeric
    tokens become ``NUM``. The sentence start tag ``<s>`` is glued to the
    front of the first token and the end tag ``</s>`` to the back of the last
    one. Sentences left with no tokens are skipped.

    :param sentences: iterable of sentence strings.
    :return: list of token strings for all sentences, in order.
    """
    corpus_tokens = []
    for raw_sentence in sentences:
        words = []
        for word in nltk.word_tokenize(raw_sentence.lower()):
            # Drop punctuation and anything else that is not alphanumeric.
            if not word.isalnum():
                continue
            # Collapse every number into the single placeholder NUM.
            words.append("NUM" if word.isnumeric() else word)
        if not words:
            continue
        # Mark the sentence boundaries on the first and last token.
        words[0] = "<s>" + words[0]
        words[-1] += "</s>"
        corpus_tokens += words
    return corpus_tokens


# Tokenize both corpora; the "_pre" lists are the tokens before <UKN> replacement.
tokensf1_pre = tokenization(sentencesf1)
tokensf2_pre = tokenization(sentencesf2)

#print(tokensf1)


In [None]:
def unit_frequency(tokens):
    """
    Replace every token that occurs exactly once with ``<UKN>``.

    NOTE(review): tokens are compared as whole strings, so a token that
    carries a fused ``<s>`` / ``</s>`` marker and occurs only once is replaced
    as well, which removes that sentence boundary from the stream.

    :param tokens: list of token strings.
    :return: new list of the same length with singletons replaced; the input
        list is not modified.
    """
    from collections import Counter

    # Count all tokens in one pass; calling tokens.count() per token was
    # quadratic in the corpus size.
    frequencies = Counter(tokens)
    return ["<UKN>" if frequencies[token] == 1 else token for token in tokens]


# Final token streams: tokens seen only once are replaced by <UKN>.
tokensf1 = unit_frequency(tokensf1_pre)
tokensf2 = unit_frequency(tokensf2_pre)


## Training and Testing Division

In [None]:
def sentence_selection(tokens):
    """
    Regroup a flat token stream into sentences and split them 80/20.

    A token containing ``<s>`` opens a sentence and a token containing
    ``</s>`` closes it; a single token may do both. Tokens are never
    discarded: if a boundary marker is missing, the tokens collected so far
    are kept as a sentence of their own when the next ``<s>`` token (or the
    end of the stream) is reached.

    :param tokens: flat list of token strings.
    :return: tuple ``(training, test)`` of lists of sentences (each sentence a
        list of tokens). ``training`` holds ``floor(0.8 * n)`` randomly chosen
        sentences and ``test`` holds the remaining ones in their original
        order.
    """
    sentences = []
    current = []
    for token in tokens:
        if "<s>" in token and current:
            # The previous sentence never saw its end marker; keep it rather
            # than silently dropping its tokens.
            sentences.append(current)
            current = []
        current.append(token)
        if "</s>" in token:
            # Checked independently of "<s>" so one-token sentences
            # ("<s>word</s>") are closed too.
            sentences.append(current)
            current = []
    if current:
        sentences.append(current)

    train_size = math.floor(len(sentences) * 0.8)
    # Sample positions, not sentences: filtering the test set by value was
    # quadratic and also removed every duplicate of a training sentence.
    train_positions = random.sample(range(len(sentences)), train_size)
    chosen = set(train_positions)
    training = [sentences[position] for position in train_positions]
    test = [sentence for position, sentence in enumerate(sentences)
            if position not in chosen]

    return training, test


# Split the 20N tokens into training/test sentences; the BAC split is left disabled.
(training1, test1) = sentence_selection(tokensf1)
#(training2, test2) = sentence_selection(tokensf2)

In [None]:
# Scratch check of the 80/20 split logic on a tiny hand-made list of sentences.
lista = [['hola','amigo'],['hola1','amigo1'],['hola2','amigo2'],['hola3','amigo3']]
# floor(4 * 0.8) == 3 sentences go to training.
porcentaje = math.floor(len(lista)*0.8)
training = random.sample(lista, porcentaje)
# Everything not sampled for training becomes the test set.
test = [ sentence for sentence in lista if sentence not in training ]

print(training)
print(test)

In [None]:
def writing_files(file, training, testing):
    """
    Pickle the training and testing splits of one dataset to disk.

    The files are written under ``datasets/`` in the current working
    directory as ``<name>_2_training`` and ``<name>_2_testing``, where
    ``<name>`` is ``20N`` when ``file == 1`` and ``BAC`` otherwise.

    :param file: dataset selector (1 for 20N, anything else for BAC).
    :param training: object pickled into the training file.
    :param testing: object pickled into the testing file.
    :return: None
    """
    # "BAC" has to be a string literal; the bare name BAC is undefined and
    # raised NameError for every dataset other than 20N.
    file_name = "20N" if file == 1 else "BAC"
    outputs = (("_2_training", training), ("_2_testing", testing))
    for suffix, payload in outputs:
        target = os.path.join(os.getcwd(), 'datasets/' + file_name + suffix)
        # The context manager closes the file even if pickling fails.
        with open(target, 'wb') as outfile:
            pickle.dump(payload, outfile)

# Persist the 20N split.
writing_files(1, training1, test1)
# The BAC split is never computed (its sentence_selection call is commented
# out), so training2/test2 do not exist and this call raised NameError.
# Re-enable it together with the BAC sentence_selection call.
#writing_files(2, training2, test2)
