# Imports

In [1]:
import codecs
import re
from collections import defaultdict, Counter, OrderedDict

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Util Functions For LM

In [2]:
def create_model_output_file(list_of_models, model_file):
    file = codecs.open(model_file, "w+", "utf-8")
    for index, model in enumerate(list_of_models):
        file.write(f'{index + 1}-grams:\n')
        if index==0:
            calc_and_print_probs(model, file, smoothing = True)
        calc_and_print_probs(model, file, smoothing = False)
        file.write("\n")
    file.close()

def calc_and_print_probs(model, file, smoothing):
    final_model = {}
    for items in model.items():
        prefix = items[0]
        counter = items[1]

        keys = list(counter.keys())
        values = list(counter.values())
        probs = [v / sum(values) for v in values]
        if smoothing:
            unknown_prob = 1 / (sum(values))
            probs = [prob + unknown_prob for prob in probs]
        for index, key in enumerate(keys):
            final_model[prefix + key] = np.log2(probs[index])

    if smoothing:
        final_model["UNKNOWN"] = unknown_prob
    for item in final_model.items():
        file.write(f'{item[0]}\t{item[1]}\n')


# Utils Functions For Eval

In [3]:
def create_model(list_models):
    result = {}
    for dictionary in list_models:
        result.update(dictionary)
    return result

def n_grams_to_dict(n_grams):
    n_grams = filter(lambda x: x != '', n_grams.split("\n"))
    n_grams = [gram for gram in n_grams]
    n_grams = map(lambda x: x.split("\t"), n_grams)
    n_grams = [gram for gram in n_grams]
    n_grams = map(lambda x: (x[0], float(x[1])), n_grams)
    n_grams = [gram for gram in n_grams]
    return n_grams

def model_file_to_dict(model_file):
    file = codecs.open(model_file, "r", "utf-8")
    file_string = file.read()
    file_split = re.split("\d+-grams:\n", file_string)
    uni_grams = n_grams_to_dict(file_split[1])
    bi_grams = n_grams_to_dict(file_split[2])
    three_grams = n_grams_to_dict(file_split[3])
    return dict(uni_grams), dict(bi_grams), dict(three_grams)

# Part 1

In [7]:
def lm(corpus_file, model_file):
    corpus_file = corpus_file.replace("\n", "<ENTER>")

    three_grams = defaultdict(Counter)
    bi_grams = defaultdict(Counter)
    uni_grams = defaultdict(Counter)
    previous_unigram = None
    for i in range(0, len(corpus_file) - n_gram + 1):
        if i > 0:
            previous_unigram = corpus_file[i - 1]

        curr_three_gram = corpus_file[i:i + n_gram]
        curr_bi_gram = corpus_file[i:i + n_gram - 1]
        curr_uni_gram = corpus_file[i]

        three_grams[curr_three_gram[0:2]][curr_three_gram[2]] += 1
        bi_grams[curr_bi_gram[0:1]][curr_bi_gram[1]] += 1

        if previous_unigram == "<":
            if curr_uni_gram != "e" and curr_uni_gram != "s":
                uni_grams[''][curr_uni_gram] += 1
        else:
            uni_grams[''][curr_uni_gram] += 1
    
    create_model_output_file([uni_grams, bi_grams, three_grams], model_file)

# Part 2

In [8]:
def eval_perplexity(input_file, model_file, weights):
    model = create_model(model_file_to_dict(model_file))

    cumulative_interpolate = 0
    for i in range(0, len(input_file) - n_gram + 1):
        curr_three_gram = input_file[i:i + n_gram]
        p_interpolate = interpolate(curr_three_gram, model, weights)
        if not p_interpolate == 0:
            cumulative_interpolate += p_interpolate

    cross_entropy = (-1) / len(input_file) *  cumulative_interpolate
    perplexity = np.power(2, cross_entropy)

    print(f'{model_file} perplexity = {perplexity}')
    return perplexity


def interpolate(curr_three_gram, model, weights):
    try:
        p_xy3 = model[curr_three_gram]
    except KeyError:
        p_xy3 = model["UNKNOWN"]

    try:
        p_xy2 = model[curr_three_gram[1:]]
    except KeyError:
        p_xy2 = model["UNKNOWN"]

    try:
        p_xy1 = model[curr_three_gram[-1]]
    except KeyError:
        p_xy1 = model["UNKNOWN"]

    return weights[0] * p_xy3 + weights[1] * p_xy2 + weights[2] * p_xy1


# Part 3

In [9]:
languages = ['en', 'es', 'fr', 'in', 'it', 'nl', 'pt', 'tl']
n_gram = 3

# Train

In [10]:
train_test_dict = {}
for lang in languages[0:]:
    df = pd.read_csv("./data/" + lang + ".csv")
    train, test = train_test_split(df, test_size=0.1, random_state=12)
    train_test_dict[lang] = (train, test)
    train_corpus = "<s>" + "<e><s>".join(train["tweet_text"].values) + "<e>"

    lm(train_corpus, lang+".txt")

# Test

In [11]:
results_dict = defaultdict(dict)

In [12]:
for test_lang in languages[0:]:
    print(f'testing {test_lang}')
    test = train_test_dict[test_lang][1]
    test_corpus = "<s>" + "<e><s>".join(test["tweet_text"].values) + "<e>"

    for model_lang in languages[0:]:
        model_perplexity = eval_perplexity(test_corpus, model_lang + ".txt", (0.4, 0.3, 0.3))
        results_dict[test_lang][model_lang] = model_perplexity

testing en
en.txt perplexity = 14.954497192618483
es.txt perplexity = 19.804420884686973
fr.txt perplexity = 19.02182104573655
in.txt perplexity = 19.470163930975538
it.txt perplexity = 19.68963104407588
nl.txt perplexity = 18.423846618094498
pt.txt perplexity = 19.5427409014225
tl.txt perplexity = 18.147270752302468
testing es
en.txt perplexity = 17.557307827163154
es.txt perplexity = 14.045118821751924
fr.txt perplexity = 16.74321793655993
in.txt perplexity = 18.06237047489614
it.txt perplexity = 16.391051860823453
nl.txt perplexity = 18.854738321532977
pt.txt perplexity = 15.665498565904697
tl.txt perplexity = 18.21279103572767
testing fr
en.txt perplexity = 16.691777170820075
es.txt perplexity = 16.981291175845612
fr.txt perplexity = 14.487609506106248
in.txt perplexity = 18.510414427937956
it.txt perplexity = 17.429267995911037
nl.txt perplexity = 18.269516274129263
pt.txt perplexity = 17.270368732436058
tl.txt perplexity = 18.025105577162282
testing in
en.txt perplexity = 20.4475

In [13]:
df = pd.DataFrame(results_dict)
df.to_csv("results.csv")
print(df)

           en         es         fr         in         it         nl  \
en  14.954497  17.557308  16.691777  20.447547  18.741254  18.384435   
es  19.804421  14.045119  16.981291  20.313890  17.047745  19.606258   
fr  19.021821  16.743218  14.487610  21.217396  18.436676  19.183053   
in  19.470164  18.062370  18.510414  15.392363  19.037878  19.719234   
it  19.689631  16.391052  17.429268  20.274900  14.413139  20.147427   
nl  18.423847  18.854738  18.269516  20.441303  20.207722  14.826939   
pt  19.542741  15.665499  17.270369  20.216527  17.018654  20.011669   
tl  18.147271  18.212791  18.025106  17.994469  18.496597  19.608786   

           pt         tl  
en  17.608718  19.652317  
es  15.613762  20.510822  
fr  17.471084  21.316079  
in  18.120524  17.656488  
it  16.208600  19.772552  
nl  19.124873  20.213700  
pt  13.756324  19.719762  
tl  17.921315  14.694539  
