# Imports

In [1]:
import codecs
import re
from collections import defaultdict, Counter, OrderedDict

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Util Functions For LM

In [2]:
def create_model_output_file(list_of_models, model_file):
    """Write every n-gram model to ``model_file`` as log2 probabilities.

    Each model gets a ``<k>-grams:`` header line, where k is its 1-based
    position in ``list_of_models``, followed by the entries written by
    ``calc_and_print_probs`` and one blank separator line.

    Args:
        list_of_models: Models ordered by n-gram order; each maps a prefix
            string to a counter of the keys that follow it.
        model_file: Path of the UTF-8 text file to create or overwrite.
    """
    # The context manager closes the file even when writing a model fails.
    with codecs.open(model_file, "w+", "utf-8") as out:
        for order, model in enumerate(list_of_models, start=1):
            out.write(f'{order}-grams:\n')
            calc_and_print_probs(model, out)
            out.write("\n")

def calc_and_print_probs(model, file):
    """Write the log2 conditional probability of every n-gram in ``model``.

    For each prefix, every count is divided by the total count of that
    prefix and the base-2 logarithm of the ratio is written on its own line
    as the n-gram (prefix followed by key), a tab, and the log probability.

    Args:
        model: Mapping of prefix string -> mapping of following key -> count.
        file: Writable text stream; only its ``write`` method is used.
    """
    final_model = {}
    for prefix, counter in model.items():
        # Total once per prefix; recomputing it for every key was quadratic.
        total = sum(counter.values())
        for key, count in counter.items():
            final_model[prefix + key] = np.log2(count / total)

    for n_gram_text, log_prob in final_model.items():
        file.write(f'{n_gram_text}\t{log_prob}\n')

# Util Functions For Eval

In [3]:
def create_model(list_models):
    """Merge several n-gram tables into one lookup dictionary.

    Args:
        list_models: Iterable of dictionaries to combine.

    Returns:
        A new dict holding every entry; when a key occurs in more than one
        table, the value from the later table wins.
    """
    combined = {}
    for table in list_models:
        for n_gram_text, score in dict(table).items():
            combined[n_gram_text] = score
    return combined

def n_grams_to_dict(n_grams):
    """Parse one section of a model file into ``(n_gram, score)`` pairs.

    Args:
        n_grams: Text with one entry per line, the n-gram and its score
            separated by a tab. Empty lines are ignored.

    Returns:
        List of ``(n_gram, float_score)`` tuples in input order; pass it to
        ``dict`` to obtain a lookup table.

    Raises:
        IndexError: If a non-empty line contains no tab.
        ValueError: If the text after the last tab is not a number.
    """
    pairs = []
    for line in n_grams.split("\n"):
        if line == '':
            continue
        # Split on the LAST tab only: the n-gram is raw corpus text and may
        # itself contain tab characters, while the score never does.
        fields = line.rsplit("\t", 1)
        pairs.append((fields[0], float(fields[1])))
    return pairs

def model_file_to_dict(model_file):
    """Load the uni-, bi- and tri-gram tables stored in ``model_file``.

    The file is split on its ``<k>-grams:`` header lines and each of the
    first three sections is parsed with ``n_grams_to_dict``.

    Args:
        model_file: Path of a UTF-8 model file.

    Returns:
        Tuple ``(uni_grams, bi_grams, three_grams)`` of dicts mapping an
        n-gram string to its float score.

    Raises:
        IndexError: If the file holds fewer than three sections.
    """
    # Read everything up front so the handle is closed before parsing.
    with codecs.open(model_file, "r", "utf-8") as file:
        file_string = file.read()

    # Raw string: "\d" in a normal literal is an invalid escape sequence.
    file_split = re.split(r"\d+-grams:\n", file_string)
    # file_split[0] is whatever precedes the first header and is ignored.
    uni_grams = n_grams_to_dict(file_split[1])
    bi_grams = n_grams_to_dict(file_split[2])
    three_grams = n_grams_to_dict(file_split[3])
    return dict(uni_grams), dict(bi_grams), dict(three_grams)

# Part 1

In [4]:
def lm(corpus_file, model_file):
    """Count character n-grams in a corpus and write them as a model file.

    Args:
        corpus_file: The corpus text itself (a string, despite the name).
            Every newline is replaced by the literal token ``<ENTER>``.
        model_file: Path of the model file handed to
            ``create_model_output_file``.

    The window length comes from the module-level ``n_gram``.
    """
    text = corpus_file.replace("\n", "<ENTER>")

    unigram_counts = defaultdict(Counter)
    bigram_counts = defaultdict(Counter)
    trigram_counts = defaultdict(Counter)

    # NOTE: all orders are counted from the same window start, so the last
    # n_gram - 1 positions of the text add no unigram or bigram counts.
    for start in range(len(text) - n_gram + 1):
        window = text[start:start + n_gram]
        pair = text[start:start + n_gram - 1]
        char = text[start]
        before = text[start - 1] if start > 0 else None

        trigram_counts[window[0:2]][window[2]] += 1
        bigram_counts[pair[0:1]][pair[1]] += 1

        # An "e" or "s" directly after "<" is left out of the unigram
        # counts (presumably the inside of the <s>/<e> markers).
        follows_marker_open = before == "<" and char in ("e", "s")
        if not follows_marker_open:
            unigram_counts[''][char] += 1

    create_model_output_file(
        [unigram_counts, bigram_counts, trigram_counts], model_file
    )

# Part 2

In [5]:
def eval_perplexity(input_file, model_file, weights):
    """Print and return the perplexity of a text under a stored model.

    Args:
        input_file: The text to score (a string, despite the name).
        model_file: Path of the model file; also shown in the printed line.
        weights: Weights forwarded unchanged to ``interpolate``.

    Returns:
        ``2 ** cross_entropy``, where the cross entropy is the negated sum
        of the non-zero ``interpolate`` scores scaled by
        ``1 / len(input_file)``.
    """
    tables = model_file_to_dict(model_file)
    model = create_model(tables)

    score_total = 0
    last_start = len(input_file) - n_gram
    for start in range(last_start + 1):
        score = interpolate(input_file[start:start + n_gram], model, weights)
        # interpolate() answers 0 for a trigram the model has never seen;
        # those windows are left out of the running total.
        if score != 0:
            score_total += score

    cross_entropy = (-1) / len(input_file) * score_total
    perplexity = np.power(2, cross_entropy)

    print(f'{model_file} perplexity = {perplexity}')
    return perplexity


def interpolate(curr_three_gram, model, weights):
    """Return the log2 of the linearly interpolated trigram probability.

    Args:
        curr_three_gram: String whose full text, last two characters and
            last character are looked up as tri-, bi- and unigram.
        model: Mapping of n-gram string -> log2 probability holding all
            three orders together.
        weights: ``(trigram, bigram, unigram)`` interpolation weights.

    Returns:
        ``log2(w3 * P3 + w2 * P2 + w1 * P1)`` as a float, or ``0`` when the
        trigram is not in ``model`` (or the mixture has no probability
        mass). ``eval_perplexity`` treats ``0`` as "skip this window".
    """
    try:
        log_p3 = model[curr_three_gram]
    except KeyError:
        return 0

    # A lower-order n-gram missing from the model adds no probability mass
    # (exp2(-inf) == 0.0) instead of raising an uncaught KeyError.
    absent = float("-inf")
    log_p2 = model.get(curr_three_gram[1:], absent)
    log_p1 = model.get(curr_three_gram[-1], absent)

    # The model stores log2 probabilities. Linear interpolation mixes the
    # probabilities themselves, so leave log space before weighting;
    # weighting the logs directly is not the interpolated probability.
    mixed = (
        weights[0] * np.exp2(log_p3)
        + weights[1] * np.exp2(log_p2)
        + weights[2] * np.exp2(log_p1)
    )
    if mixed <= 0:
        return 0
    return float(np.log2(mixed))

# Part 3

In [6]:
# Language codes to model; the training cell reads "./data/<code>.csv" for
# each of them.
languages = [
    'en', 'es', 'fr', 'in',
    'it', 'nl', 'pt', 'tl',
]
# Window length, in characters, of the longest n-gram (3 = trigram).
n_gram = 3

# Train

In [None]:
# Hold out 10% of every language's tweets for testing and train a model on
# the remaining 90%; both halves are kept for the evaluation cells.
train_test_dict = {}
for lang in languages:
    df = pd.read_csv("./data/" + lang + ".csv")
    # A fixed random_state keeps the split identical between runs.
    train, test = train_test_split(df, test_size=0.1, random_state=12)
    train_test_dict[lang] = (train, test)

    # Wrap each tweet in <s> ... <e> markers and concatenate them all.
    train_corpus = "".join(
        ["<s>", "<e><s>".join(train["tweet_text"].values), "<e>"]
    )
    lm(train_corpus, lang + ".txt")

# Test

In [None]:
# Perplexities stored as results_dict[test_lang][model_lang]; defaultdict
# creates the inner dict the first time a test language is written to.
results_dict = defaultdict(dict)

In [None]:
# Score every language's held-out tweets against every trained model, with
# weights 0.4 / 0.3 / 0.3 for the trigram / bigram / unigram terms.
for test_lang in languages:
    print(f'testing {test_lang}')
    test = train_test_dict[test_lang][1]
    # Same <s> ... <e> wrapping as the training corpus.
    test_corpus = "".join(
        ["<s>", "<e><s>".join(test["tweet_text"].values), "<e>"]
    )

    for model_lang in languages:
        model_perplexity = eval_perplexity(
            test_corpus, model_lang + ".txt", (0.4, 0.3, 0.3)
        )
        results_dict[test_lang][model_lang] = model_perplexity

In [None]:
# Tabulate the perplexities: the outer keys (test languages) become the
# columns and the inner keys (model languages) become the row index.
df = pd.DataFrame(results_dict)
df.to_csv("results.csv")
print(df)

# Test No Unigram

In [None]:
# Repeat the evaluation with the unigram weight set to zero
# (0.6 trigram / 0.4 bigram / 0.0 unigram).
results_dict_no_unigram = defaultdict(dict)
for test_lang in languages:
    print(f'testing {test_lang}')
    test = train_test_dict[test_lang][1]
    # Same <s> ... <e> wrapping as the training corpus.
    test_corpus = "".join(
        ["<s>", "<e><s>".join(test["tweet_text"].values), "<e>"]
    )

    for model_lang in languages:
        model_perplexity = eval_perplexity(
            test_corpus, model_lang + ".txt", (0.6, 0.4, 0.0)
        )
        results_dict_no_unigram[test_lang][model_lang] = model_perplexity

In [None]:
# Tabulate the no-unigram perplexities: test languages become the columns
# and model languages become the row index.
df = pd.DataFrame(results_dict_no_unigram)
df.to_csv("results_no_unigram.csv")
print(df)