In [3]:
import glob
import re
import math
from collections import Counter
from nltk.util import ngrams
from nltk.tokenize import sent_tokenize, word_tokenize

In [4]:
def load_data(input_directory):
    """Read every file directly under ``input_directory`` into one string.

    Each file's text has runs of spaces collapsed to a single space and
    newline characters replaced by spaces. The cleaned texts are then
    concatenated with a single space between them, in the order returned by
    ``glob.glob``.

    Args:
        input_directory: Path of the directory whose entries are read.

    Returns:
        The concatenated, cleaned text of all files ("" if nothing matched).
    """
    # NOTE(review): glob.glob does not promise a sorted order, so the
    # concatenation order (and any positional split done on the result) may
    # differ between machines -- confirm whether that matters to callers.
    file_contents = ""
    for file_name in glob.glob(input_directory + "/*"):
        print("Prepocessing: {}".format(file_name))

        # The context manager closes the handle even if read() raises.
        with open(file_name, "r") as file_pointer:
            file_content = file_pointer.read()

        # Remove duplicate spaces
        file_content = re.sub(' +', ' ', file_content)

        # Remove new line characters
        file_content = file_content.replace("\n", " ")

        if file_contents == "":
            file_contents = file_content
        else:
            file_contents = file_contents + ' ' + file_content
    return file_contents

In [5]:
def get_word_tokens(data):
    """Tokenize ``data`` with ``word_tokenize`` and lowercase each token.

    Args:
        data: Text to tokenize.

    Returns:
        A list of lowercased tokens, in the order produced by the tokenizer.
    """
    return [token.lower() for token in word_tokenize(data)]

In [6]:
def replace_low_count_words(word_tokens, cut_off_count):
    """Replace rare tokens with ``'UNK'`` directly inside ``word_tokens``.

    A token is rare when it occurs ``cut_off_count`` times or fewer in
    ``word_tokens``. The list passed in is modified in place and the same
    list object is returned.

    Args:
        word_tokens: Mutable list of tokens.
        cut_off_count: Largest occurrence count that is still treated as rare.

    Returns:
        ``word_tokens`` itself, after replacement.
    """
    frequencies = Counter(word_tokens)
    # Decide the rare set up front so replacements cannot affect the counts.
    rare_words = {
        word for word, occurrences in frequencies.items()
        if occurrences <= cut_off_count
    }

    for position, token in enumerate(word_tokens):
        if token in rare_words:
            word_tokens[position] = 'UNK'

    return word_tokens

In [7]:
def generate_ngrams(word_tokens, n):
    """Count the n-grams of ``word_tokens`` without using an n-gram library.

    Each n-gram is the space-joined text of ``n`` consecutive tokens.

    Args:
        word_tokens: Sequence of string tokens (must support slicing).
        n: Size of the n-gram window.

    Returns:
        A ``Counter`` mapping each n-gram string to its number of occurrences.
        It is empty when ``n`` is not positive or exceeds the token count.
    """
    if n <= 0:
        return Counter()

    # Number of start positions for a full window of n tokens.
    window_count = len(word_tokens) - n + 1
    return Counter(
        " ".join(word_tokens[start:start + n])
        for start in range(window_count)
    )

In [8]:
def nltk_generate_ngrams(word_tokens, n):
    """Count n-grams using the imported ``ngrams`` helper.

    The notebook uses this as a cross-check against ``generate_ngrams``.

    Args:
        word_tokens: Sequence of tokens.
        n: Size of the n-gram window.

    Returns:
        A ``Counter`` keyed by the items yielded by ``ngrams(word_tokens, n)``.
    """
    return Counter(ngrams(word_tokens, n))

3.1
Across all files in the directory (counted together), report the unigram, bigram, and trigram word-level counts. Submit these counts in a file named ngramCounts.txt.

Note: You can use any word tokenizer to tokenize the dataset (e.g. NLTK's word_tokenize); however,
do not use any libraries for creating the n-grams.

In [9]:
def run_1():
    """Part 3.1: count distinct uni/bi/trigrams of the training corpus.

    Loads ``../input/train``, tokenizes it, replaces tokens occurring three
    times or fewer with ``'UNK'``, builds 1-, 2- and 3-gram counters, prints
    the number of distinct n-grams next to the NLTK-based figure, and writes
    the three totals to ``../output/ngramCounts.txt``.
    """
    print("##### Loading train date")
    corpus_text = load_data('../input/train')
    # Alternative input: load_data('../input/custom_train')

    print("##### Tokenize train data")
    tokens = get_word_tokens(corpus_text)
    print("Number of original word tokens: {}".format(len(tokens)))

    print("### Replace word tokens <= 3 with 'UNK")
    tokens = replace_low_count_words(tokens, 3)
    print("Number of tokens after replacement: {}".format(len(tokens)))

    print("##### Get Unigrams")
    unigram_counts = generate_ngrams(tokens, 1)
    nltk_unigram_total = len(nltk_generate_ngrams(tokens, 1))
    print("Unigrams counts: {}, nltk count: {}".format(len(unigram_counts), nltk_unigram_total))

    print("##### Get Bigrams")
    bigram_counts = generate_ngrams(tokens, 2)
    nltk_bigram_total = len(nltk_generate_ngrams(tokens, 2))
    print("Bigrams counts: {}, nltk count: {}".format(len(bigram_counts), nltk_bigram_total))

    print("##### Get Trigrams")
    trigram_counts = generate_ngrams(tokens, 3)
    nltk_trigram_total = len(nltk_generate_ngrams(tokens, 3))
    print("Trigrams counts: {}, nltk count: {}".format(len(trigram_counts), nltk_trigram_total))

    # Persist the three totals, one per line.
    print("Writing to output file ngramCounts.txt")
    summary_lines = [
        'Unigrams: {}\n'.format(len(unigram_counts)),
        'Bigrams: {}\n'.format(len(bigram_counts)),
        'Trigrams: {}\n'.format(len(trigram_counts)),
    ]
    with open('../output/ngramCounts.txt', 'w') as output_file:
        for summary_line in summary_lines:
            output_file.write(summary_line)

In [10]:
# Run part 3.1: writes the distinct n-gram totals to ../output/ngramCounts.txt.
run_1()

##### Loading train date
Prepocessing: ../input/train/a894e49e-a0a6-4851-be23-4da89a52bb8e.txt
Prepocessing: ../input/train/14d44997-7510-4777-adf0-5e4dd387e0bf.txt
Prepocessing: ../input/train/521ce35b-288c-4b12-a7a8-70b836290f90.txt
Prepocessing: ../input/train/9ae9dae7-69a0-4116-9622-448f154bc269.txt
Prepocessing: ../input/train/0493e223-a0e2-4c6f-ade9-7172f35c18b1.txt
Prepocessing: ../input/train/da059d4f-19ac-4130-858a-f6241b56fe39.txt
Prepocessing: ../input/train/30381986-3d6b-4227-9733-9483ead7343d.txt
Prepocessing: ../input/train/e0a13927-26e0-4ca8-b1a0-864b7604e566.txt
Prepocessing: ../input/train/5d384641-37e5-4b1c-b4d6-0ee935141ecb.txt
Prepocessing: ../input/train/904393ad-fbc1-4512-8705-ce1c005c4915.txt
##### Tokenize train data
Number of original word tokens: 2495360
### Replace word tokens <= 3 with 'UNK
Number of tokens after replacement: 2495360
##### Get Unigrams
Unigrams counts: 12936, nltk count: 12936
##### Get Bigrams
Bigrams counts: 386929, nltk count: 386929
####

3.2

For the given test dataset: https://www.dropbox.com/s/ik98szmqbsq2wtd/test.zip?dl=0
Calculate the perplexity for each file in the test set using the linear interpolation smoothing method.
To determine the λs for linear interpolation, divide the training data into a new training
set (80%) and a held-out set (20%), then use grid search.

1. First, report all the candidate lambdas used for grid search and the corresponding perplexities you got on the held-out set

2. Report the best λs chosen from the grid search, and explain why they were chosen (i.e. based on the perplexities achieved on the held-out set).

3. Report the perplexity for each file in the test set (use the best λs obtained from grid search to calculate perplexity on test set).

4. Based on the perplexities you obtained for the test files, write a brief observation comparing the test files. Submit these perplexities and your report in a file named perplexitiesInterpolation.txt.

In [62]:
def get_candidate_lamdas(lambda_pool):
    """Build the grid of interpolation weight triples from ``lambda_pool``.

    For every ordered pair ``(i, j)`` drawn from ``lambda_pool`` whose sum
    leaves a strictly positive remainder, emit ``[i, j, 1 - (i + j)]`` so the
    three weights sum to 1.

    Args:
        lambda_pool: Iterable of candidate weights (floats in (0, 1)).

    Returns:
        A list of ``[lambda_1, lambda_2, lambda_3]`` lists.
    """
    candidate_lambdas = []
    for i in lambda_pool:
        for j in lambda_pool:
            # Round to 10 decimals: enough to erase binary floating-point
            # noise (e.g. 0.1 + 0.2) without truncating pools finer than
            # tenths, which rounding to a single decimal would corrupt.
            remainder = round(1 - (i + j), 10)
            if remainder > 0:
                candidate_lambdas.append([i, j, remainder])
    return candidate_lambdas

In [63]:
def replace_eval_data_with_UNK(eval_word_tokens, train_unigrams):
    """Overwrite tokens missing from ``train_unigrams`` with ``'UNK'``.

    The replacement happens in place; the same list object is returned.

    Args:
        eval_word_tokens: Mutable list of evaluation tokens.
        train_unigrams: Container supporting ``in`` for known training words.

    Returns:
        ``eval_word_tokens`` itself, after replacement.
    """
    for position, token in enumerate(eval_word_tokens):
        if token not in train_unigrams:
            eval_word_tokens[position] = 'UNK'
    return eval_word_tokens

In [64]:
def get_unigram_probability(eval_word, corpus_length, train_unigrams):
    """Return count(last word of ``eval_word``) / ``corpus_length``.

    Args:
        eval_word: Space-separated n-gram string; only its last token is used.
        corpus_length: Divisor, passed in by the caller.
        train_unigrams: Mapping from word to its count.

    Returns:
        The quotient as computed by ``/``.
    """
    # Everything after the final space is the word being predicted.
    last_token = eval_word.rsplit(" ", 1)[-1]
    return train_unigrams[last_token] / corpus_length

In [77]:
def get_bigram_probabiltiy(eval_word, train_unigrams, train_bigrams):
    """Return count(w2 w3) / count(w2) for the last two words of ``eval_word``.

    ``eval_word`` may be a trigram ("w1 w2 w3") or a bigram ("w2 w3"); only
    its final two tokens are used. The lookup key is rebuilt from those two
    tokens rather than using ``eval_word`` verbatim, because a trigram string
    is never a key of the bigram table and would always yield 0.

    Args:
        eval_word: Space-separated n-gram string.
        train_unigrams: Mapping from word to its count.
        train_bigrams: Mapping from "w1 w2" string to its count.

    Returns:
        The bigram probability, or 0 when the history word has no count.
    """
    tokens = eval_word.split(" ")
    history_word = " ".join(tokens[-2:-1])
    bigram = " ".join(tokens[-2:])

    history_count = train_unigrams.get(history_word, 0)
    if history_count == 0:
        # Unseen history: no estimate available, and avoids dividing by zero.
        return 0
    return train_bigrams.get(bigram, 0) / history_count

In [81]:
def get_trigram_probabiltiy(eval_word, train_bigrams, train_trigrams):
    """Return count(``eval_word``) / count(its two-word history).

    The history is the two tokens preceding the last token of ``eval_word``.

    Args:
        eval_word: Space-separated trigram string.
        train_bigrams: Mapping from "w1 w2" string to its count.
        train_trigrams: Mapping from "w1 w2 w3" string to its count.

    Returns:
        The quotient, or 0 when the history is not a key of ``train_bigrams``.
    """
    history = " ".join(eval_word.split(" ")[-3:-1])
    if history not in train_bigrams:
        return 0
    return train_trigrams[eval_word] / train_bigrams[history]

In [79]:
def linear_interpolation(lambdas, train_unigrams, train_bigrams, train_trigrams, eval_trigrams):
    """Sum the log-probabilities of ``eval_trigrams`` under an interpolated model.

    Each trigram's probability is
    ``lambda_1 * P_trigram + lambda_2 * P_bigram + lambda_3 * P_unigram``.

    Args:
        lambdas: Three weights ``[lambda_1, lambda_2, lambda_3]`` applied to
            the trigram, bigram and unigram estimates respectively.
        train_unigrams: Mapping from word to count.
        train_bigrams: Mapping from "w1 w2" to count.
        train_trigrams: Mapping from "w1 w2 w3" to count.
        eval_trigrams: Either a mapping from trigram string to its number of
            occurrences (e.g. a ``Counter``), in which case each trigram's
            log-probability is weighted by that count, or a plain iterable of
            trigram strings, each counted once.

    Returns:
        The total natural-log probability, or ``float('-inf')`` if any
        trigram is assigned zero probability.
    """
    # Total number of training tokens: the unigram denominator.
    corpus_length = sum(train_unigrams.values())

    lambda_1, lambda_2, lambda_3 = lambdas

    # Iterating a Counter directly yields each distinct trigram only once,
    # which would under-weight repeated trigrams relative to the token count
    # the caller normalises by; use the stored occurrence counts instead.
    if hasattr(eval_trigrams, "items"):
        weighted_trigrams = eval_trigrams.items()
    else:
        weighted_trigrams = ((trigram, 1) for trigram in eval_trigrams)

    result = 0
    for eval_trigram, occurrences in weighted_trigrams:
        unigram_probability = get_unigram_probability(eval_trigram, corpus_length, train_unigrams)
        bigram_probabiltiy = get_bigram_probabiltiy(eval_trigram, train_unigrams, train_bigrams)
        trigram_probabiltiy = get_trigram_probabiltiy(eval_trigram, train_bigrams, train_trigrams)
        probability = (
            (lambda_1 * trigram_probabiltiy)
            + (lambda_2 * bigram_probabiltiy)
            + (lambda_3 * unigram_probability)
        )
        if probability <= 0:
            # math.log(0) raises ValueError; a zero-probability event makes
            # the whole sequence impossible, i.e. log-probability -inf.
            return float("-inf")
        result = result + occurrences * math.log(probability)
    return result

In [83]:
def calculate_perplexity(log_probability_score, corpus_length):
    """Convert a summed natural-log probability into a perplexity.

    Computes ``e ** (-1 / corpus_length * log_probability_score)``.

    Args:
        log_probability_score: Sum of natural-log probabilities.
        corpus_length: Normalising count; must be non-zero.

    Returns:
        The perplexity as a float.
    """
    exponent = -1/corpus_length * log_probability_score
    return math.e ** exponent

In [69]:
def get_best_lambdas(candidate_lambdas, train_unigrams, train_bigrams, train_trigrams, eval_trigrams, eval_corpus_length):
    """Grid-search ``candidate_lambdas`` and return the best-scoring triple.

    Every candidate is scored with ``linear_interpolation`` on
    ``eval_trigrams``; the candidate with the highest log-probability wins
    (the first one is kept on ties). Each candidate and its perplexity is
    written to ``../output/perplexitiesInterpolation.txt``, which is
    truncated first.

    Args:
        candidate_lambdas: Sequence of ``[lambda_1, lambda_2, lambda_3]``.
        train_unigrams: Mapping from word to count.
        train_bigrams: Mapping from "w1 w2" to count.
        train_trigrams: Mapping from "w1 w2 w3" to count.
        eval_trigrams: Held-out trigrams passed to ``linear_interpolation``.
        eval_corpus_length: Normaliser passed to ``calculate_perplexity``.

    Returns:
        The winning lambda triple, or ``None`` if there were no candidates.
    """
    best_parameters = None
    best_score = None
    # ``with`` closes the report file even if scoring a candidate raises.
    with open('../output/perplexitiesInterpolation.txt', 'w') as output_file:
        for lambdas in candidate_lambdas:
            result = linear_interpolation(lambdas, train_unigrams, train_bigrams, train_trigrams, eval_trigrams)
            # A higher log-probability means a lower perplexity.
            if best_score is None or result > best_score:
                best_score = result
                best_parameters = lambdas
            # Calculate Perplexity
            perplexity = calculate_perplexity(result, eval_corpus_length)
            # Write to output file
            output_file.write('Lambdas: {}, Perplexity Score: {} \n'.format(lambdas, perplexity))
    return best_parameters

In [87]:
def calculate_linear_interpolation_perplexity_for_test_data(input_directory, lambdas, train_unigrams, train_bigrams, train_trigrams):
    """Append the interpolated-model perplexity of each test file to the report.

    For every entry under ``input_directory`` the file is read, cleaned
    (duplicate spaces collapsed, newlines replaced by spaces), tokenized,
    mapped onto the training vocabulary with ``'UNK'``, and scored with
    ``linear_interpolation``. One line per file is appended to
    ``../output/perplexitiesInterpolation.txt``.

    Args:
        input_directory: Directory containing the test files.
        lambdas: ``[lambda_1, lambda_2, lambda_3]`` interpolation weights.
        train_unigrams: Mapping from word to count.
        train_bigrams: Mapping from "w1 w2" to count.
        train_trigrams: Mapping from "w1 w2 w3" to count.
    """
    # Context managers close both the report and each test file even when
    # tokenizing or scoring raises part-way through.
    with open('../output/perplexitiesInterpolation.txt', 'a') as perplexities_interpolation_output_file:
        perplexities_interpolation_output_file.write('\n')

        for file_name in glob.glob(input_directory + "/*"):
            print("Prepocessing for test file : {}".format(file_name))
            with open(file_name, "r") as file_pointer:
                # Read file contents
                file_content = file_pointer.read()

            # Remove duplicate spaces
            file_content = re.sub(' +', ' ', file_content)

            # Remove new line characters
            file_content = file_content.replace("\n", " ")

            print("##### Tokenize test data")
            test_word_tokens = get_word_tokens(file_content)
            print("Number of original test word tokens: {}".format(len(test_word_tokens)))

            print("Replacing words in test data with UNK that is not present in train data")
            test_word_tokens = replace_eval_data_with_UNK(test_word_tokens, train_unigrams)

            print("Trigram for test data")
            test_trigrams = generate_ngrams(test_word_tokens, 3)
            test_corpus_length = len(test_word_tokens)

            log_probability = linear_interpolation(lambdas, train_unigrams, train_bigrams, train_trigrams, test_trigrams)
            perplexity = calculate_perplexity(log_probability, test_corpus_length)
            perplexities_interpolation_output_file.write('Filename: {}, Perplexity: {}\n'.format(file_name, perplexity))

In [85]:
def run_2():
    """Part 3.2: tune interpolation weights on a held-out split, then score the test set.

    Loads and tokenizes ``../input/train``, uses the first 80% of tokens for
    training and the rest as held-out data, replaces training tokens seen
    three times or fewer with ``'UNK'``, grid-searches lambda triples on the
    held-out trigrams, records the chosen triple in
    ``../output/perplexitiesInterpolation.txt`` and finally scores every file
    in ``../input/test``.
    """
    print("##### Loading input data")
    corpus_text = load_data('../input/train')
    # Alternative input: load_data('../input/custom_train')

    print("##### Tokenize input data")
    all_tokens = get_word_tokens(corpus_text)
    total_tokens = len(all_tokens)
    print("Number of original word tokens: {}".format(total_tokens))

    # Positional 80/20 split of the token stream.
    split_index = round(total_tokens * 0.8)
    train_tokens = all_tokens[:split_index]
    heldout_tokens = all_tokens[split_index:]
    print("Trian: {}, Dev: {}, Total: {}".format(len(train_tokens), len(heldout_tokens), total_tokens))
    print("Percentage: Trian: {}, Dev: {}".format(len(train_tokens)/total_tokens, len(heldout_tokens)/total_tokens))

    print("### Replace train word tokens <= 3 with 'UNK")
    train_tokens = replace_low_count_words(train_tokens, 3)
    print("Number of train tokens after replacement: {}".format(len(train_tokens)))

    unigram_counts = generate_ngrams(train_tokens, 1)
    bigram_counts = generate_ngrams(train_tokens, 2)
    trigram_counts = generate_ngrams(train_tokens, 3)
    print("Training data Counts of Unigrams: {}, Bigrams: {}, Trigrams: {}".format(len(unigram_counts), len(bigram_counts), len(trigram_counts)))

    grid = get_candidate_lamdas([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])

    print("Replacing words in eval data with UNK that is not present in train data")
    heldout_tokens = replace_eval_data_with_UNK(heldout_tokens, unigram_counts)
    heldout_length = len(heldout_tokens)
    print("Trigram for held out data")
    heldout_trigrams = generate_ngrams(heldout_tokens, 3)

    chosen_lambdas = get_best_lambdas(grid, unigram_counts, bigram_counts, trigram_counts, heldout_trigrams, heldout_length)
    print("Best Lambdas: {}".format(chosen_lambdas))
    with open('../output/perplexitiesInterpolation.txt', 'a') as report_file:
        report_file.write('\n Choosen lambdas: {}\n'.format(chosen_lambdas))

    # Alternative test directory: '../input/custom_test'
    calculate_linear_interpolation_perplexity_for_test_data('../input/test', chosen_lambdas, unigram_counts, bigram_counts, trigram_counts)

In [86]:
# Run part 3.2: grid-search the interpolation lambdas on the held-out split, then score ../input/test.
run_2()

##### Loading input data
Prepocessing: ../input/train/a894e49e-a0a6-4851-be23-4da89a52bb8e.txt
Prepocessing: ../input/train/14d44997-7510-4777-adf0-5e4dd387e0bf.txt
Prepocessing: ../input/train/521ce35b-288c-4b12-a7a8-70b836290f90.txt
Prepocessing: ../input/train/9ae9dae7-69a0-4116-9622-448f154bc269.txt
Prepocessing: ../input/train/0493e223-a0e2-4c6f-ade9-7172f35c18b1.txt
Prepocessing: ../input/train/da059d4f-19ac-4130-858a-f6241b56fe39.txt
Prepocessing: ../input/train/30381986-3d6b-4227-9733-9483ead7343d.txt
Prepocessing: ../input/train/e0a13927-26e0-4ca8-b1a0-864b7604e566.txt
Prepocessing: ../input/train/5d384641-37e5-4b1c-b4d6-0ee935141ecb.txt
Prepocessing: ../input/train/904393ad-fbc1-4512-8705-ce1c005c4915.txt
##### Tokenize input data
Number of original word tokens: 2495360
Trian: 1996288, Dev: 499072, Total: 2495360
Percentage: Trian: 0.8, Dev: 0.2
### Replace train word tokens <= 3 with 'UNK
Number of train tokens after replacement: 1996288
Training data Counts of Unigrams: 116

3.3
Build another language model with add-λ smoothing. Use λ = 0.1 and λ = 0.3.
1. Report the perplexity for each file in the test set (for both the λ values).
2. Based on the perplexities you obtained for the test files, write a brief observation comparing the test files.
Submit these perplexities and your report in a file named perplexitiesAddLambda.txt.

In [132]:
def add_k(k, vocab_size, train_bigrams, train_trigrams, eval_trigrams):
    """Sum the log-probabilities of ``eval_trigrams`` under add-k smoothing.

    Each trigram "w1 w2 w3" is scored as
    ``(count(w1 w2 w3) + k) / (count(w1 w2) + k * vocab_size)``.
    The ``k * vocab_size`` term is what makes the smoothed estimates over all
    possible next words sum to one; adding ``vocab_size`` alone is only
    correct for ``k == 1``.

    Args:
        k: Smoothing constant; expected to be positive.
        vocab_size: Number of distinct words in the training vocabulary.
        train_bigrams: Mapping from "w1 w2" to count.
        train_trigrams: Mapping from "w1 w2 w3" to count.
        eval_trigrams: Either a mapping from trigram string to its number of
            occurrences (e.g. a ``Counter``), in which case each trigram's
            log-probability is weighted by that count, or a plain iterable of
            trigram strings, each counted once.

    Returns:
        The total natural-log probability (0 for empty input).
    """
    # Iterating a Counter directly yields distinct trigrams only; honour the
    # stored occurrence counts so repeated trigrams contribute repeatedly.
    if hasattr(eval_trigrams, "items"):
        weighted_trigrams = eval_trigrams.items()
    else:
        weighted_trigrams = ((trigram, 1) for trigram in eval_trigrams)

    result = 0
    for eval_trigram, occurrences in weighted_trigrams:
        history = " ".join(eval_trigram.split(" ")[-3:-1])
        numerator = train_trigrams.get(eval_trigram, 0) + k
        denominator = train_bigrams.get(history, 0) + k * vocab_size
        result = result + occurrences * math.log(numerator / denominator)
    return result

In [133]:
def calculate_add_k_perplexity_for_test_data(input_directory, k_list, vocab_size, train_unigrams, train_bigrams, train_trigrams):
    """Write the add-k perplexity of each test file, for each ``k``, to the report.

    ``../output/perplexitiesAddLambda.txt`` is truncated, then for every
    value in ``k_list`` a ``Lambda: k`` header is written followed by one
    line per entry under ``input_directory``. Each file is read, cleaned
    (duplicate spaces collapsed, newlines replaced by spaces), tokenized,
    mapped onto the training vocabulary with ``'UNK'`` and scored with
    ``add_k``.

    Args:
        input_directory: Directory containing the test files.
        k_list: Iterable of smoothing constants to evaluate.
        vocab_size: Vocabulary size passed to ``add_k``.
        train_unigrams: Mapping from word to count (defines the vocabulary).
        train_bigrams: Mapping from "w1 w2" to count.
        train_trigrams: Mapping from "w1 w2 w3" to count.
    """
    # Context managers close both the report and each test file even when
    # tokenizing or scoring raises part-way through.
    with open('../output/perplexitiesAddLambda.txt', 'w') as perplexities_add_k_output_file:
        for k in k_list:
            perplexities_add_k_output_file.write('Lambda: {}\n'.format(k))
            for file_name in glob.glob(input_directory + "/*"):
                print("Prepocessing for test file : {}".format(file_name))
                with open(file_name, "r") as file_pointer:
                    # Read file contents
                    file_content = file_pointer.read()

                # Remove duplicate spaces
                file_content = re.sub(' +', ' ', file_content)

                # Remove new line characters
                file_content = file_content.replace("\n", " ")

                print("##### Tokenize test data")
                test_word_tokens = get_word_tokens(file_content)
                print("Number of original test word tokens: {}".format(len(test_word_tokens)))

                print("Replacing words in test data with UNK that is not present in train data")
                test_word_tokens = replace_eval_data_with_UNK(test_word_tokens, train_unigrams)

                print("Trigram for test data")
                test_trigrams = generate_ngrams(test_word_tokens, 3)
                test_corpus_length = len(test_word_tokens)

                log_probability = add_k(k, vocab_size, train_bigrams, train_trigrams, test_trigrams)
                perplexity = calculate_perplexity(log_probability, test_corpus_length)
                perplexities_add_k_output_file.write('Filename: {}, Perplexity: {}\n'.format(file_name, perplexity))

In [134]:
def run_3():
    """Part 3.3: score the test set with add-k smoothing for k = 0.1 and 0.3.

    Loads and tokenizes ``../input/train``, keeps the first 80% of tokens as
    training data, replaces training tokens seen three times or fewer with
    ``'UNK'``, builds 1-, 2- and 3-gram counters, and hands them to
    ``calculate_add_k_perplexity_for_test_data`` for ``../input/test``.
    """
    print("##### Loading input data")
    corpus_text = load_data('../input/train')
    # Alternative input: load_data('../input/custom_train')

    print("##### Tokenize input data")
    all_tokens = get_word_tokens(corpus_text)
    total_tokens = len(all_tokens)
    print("Number of original word tokens: {}".format(total_tokens))

    # Same positional 80/20 split as part 3.2; the held-out part is only
    # used for the size report here.
    split_index = round(total_tokens * 0.8)
    train_tokens = all_tokens[:split_index]
    heldout_tokens = all_tokens[split_index:]
    print("Trian: {}, Dev: {}, Total: {}".format(len(train_tokens), len(heldout_tokens), total_tokens))
    print("Percentage: Trian: {}, Dev: {}".format(len(train_tokens)/total_tokens, len(heldout_tokens)/total_tokens))

    print("### Replace train word tokens <= 3 with 'UNK")
    train_tokens = replace_low_count_words(train_tokens, 3)
    print("Number of train tokens after replacement: {}".format(len(train_tokens)))

    unigram_counts = generate_ngrams(train_tokens, 1)
    # Vocabulary size = number of distinct training unigrams.
    vocabulary_size = len(unigram_counts)
    bigram_counts = generate_ngrams(train_tokens, 2)
    trigram_counts = generate_ngrams(train_tokens, 3)
    print("Training data Counts of Unigrams: {}, Bigrams: {}, Trigrams: {}".format(len(unigram_counts), len(bigram_counts), len(trigram_counts)))

    # Alternative test directory: '../input/custom_test'
    calculate_add_k_perplexity_for_test_data('../input/test', [0.1, 0.3], vocabulary_size, unigram_counts, bigram_counts, trigram_counts)
    
    

In [135]:
# Run part 3.3: writes add-k perplexities (k = 0.1 and 0.3) to ../output/perplexitiesAddLambda.txt.
run_3()

##### Loading input data
Prepocessing: ../input/train/a894e49e-a0a6-4851-be23-4da89a52bb8e.txt
Prepocessing: ../input/train/14d44997-7510-4777-adf0-5e4dd387e0bf.txt
Prepocessing: ../input/train/521ce35b-288c-4b12-a7a8-70b836290f90.txt
Prepocessing: ../input/train/9ae9dae7-69a0-4116-9622-448f154bc269.txt
Prepocessing: ../input/train/0493e223-a0e2-4c6f-ade9-7172f35c18b1.txt
Prepocessing: ../input/train/da059d4f-19ac-4130-858a-f6241b56fe39.txt
Prepocessing: ../input/train/30381986-3d6b-4227-9733-9483ead7343d.txt
Prepocessing: ../input/train/e0a13927-26e0-4ca8-b1a0-864b7604e566.txt
Prepocessing: ../input/train/5d384641-37e5-4b1c-b4d6-0ee935141ecb.txt
Prepocessing: ../input/train/904393ad-fbc1-4512-8705-ce1c005c4915.txt
##### Tokenize input data
Number of original word tokens: 2495360
Trian: 1996288, Dev: 499072, Total: 2495360
Percentage: Trian: 0.8, Dev: 0.2
### Replace train word tokens <= 3 with 'UNK
Number of train tokens after replacement: 1996288
Training data Counts of Unigrams: 116

3.4
Based on your observation from above questions, compare linear interpolation and add-lambda
smoothing by listing out their pros and cons.

Perplexity is lower with linear interpolation than with add-lambda smoothing, indicating that the linear-interpolation trigram model fits the test data better.

However, linear interpolation requires extra computation (a grid search over a held-out set) to identify the best parameters.