# Imports

In [1]:
import codecs
import re
from collections import defaultdict, Counter, OrderedDict

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Util Functions For LM

In [2]:
def create_model_output_file(list_of_models, model_file):
    file = codecs.open(model_file, "w+", "utf-8")
    for index, model in enumerate(list_of_models):
        file.write(f'{index + 1}-grams:\n')
        calc_and_print_probs(model, file)
        file.write("\n")
    file.close()

def calc_and_print_probs(model, file):
    final_model = {}
    for items in model.items():
        prefix = items[0]
        counter = items[1]

        keys = list(counter.keys())
        values = list(counter.values())
        probs = [v / sum(values) for v in values]
        for index, key in enumerate(keys):
            final_model[prefix + key] = np.log2(probs[index])

    for item in final_model.items():
        file.write(f'{item[0]}\t{item[1]}\n')

# Utils Functions For Eval

In [3]:
def create_model(list_models):
    result = {}
    for dictionary in list_models:
        result.update(dictionary)
    return result

def n_grams_to_dict(n_grams):
    n_grams = filter(lambda x: x != '', n_grams.split("\n"))
    n_grams = [gram for gram in n_grams]
    n_grams = map(lambda x: x.split("\t"), n_grams)
    n_grams = [gram for gram in n_grams]
    n_grams = map(lambda x: (x[0], float(x[1])), n_grams)
    n_grams = [gram for gram in n_grams]
    return n_grams

def model_file_to_dict(model_file):
    file = codecs.open(model_file, "r", "utf-8")
    file_string = file.read()
    file_split = re.split("\d+-grams:\n", file_string)
    uni_grams = n_grams_to_dict(file_split[1])
    bi_grams = n_grams_to_dict(file_split[2])
    three_grams = n_grams_to_dict(file_split[3])
    return dict(uni_grams), dict(bi_grams), dict(three_grams)

# Part 1

In [4]:
def lm(corpus_file, model_file):
    corpus_file = corpus_file.replace("\n", "<ENTER>")

    three_grams = defaultdict(Counter)
    bi_grams = defaultdict(Counter)
    uni_grams = defaultdict(Counter)
    previous_unigram = None
    for i in range(0, len(corpus_file) - n_gram + 1):
        if i > 0:
            previous_unigram = corpus_file[i - 1]

        curr_three_gram = corpus_file[i:i + n_gram]
        curr_bi_gram = corpus_file[i:i + n_gram - 1]
        curr_uni_gram = corpus_file[i]

        three_grams[curr_three_gram[0:2]][curr_three_gram[2]] += 1
        bi_grams[curr_bi_gram[0:1]][curr_bi_gram[1]] += 1

        if previous_unigram == "<":
            if curr_uni_gram != "e" and curr_uni_gram != "s":
                uni_grams[''][curr_uni_gram] += 1
        else:
            uni_grams[''][curr_uni_gram] += 1

    create_model_output_file([uni_grams, bi_grams, three_grams], model_file)

# Part 2

In [5]:
def eval_perplexity(input_file, model_file, weights):
    model = create_model(model_file_to_dict(model_file))

    cumulative_interpolate = 0
    for i in range(0, len(input_file) - n_gram + 1):
        curr_three_gram = input_file[i:i + n_gram]
        p_interpolate = interpolate(curr_three_gram, model, weights)
        if not p_interpolate == 0:
            cumulative_interpolate += p_interpolate

    cross_entropy = (-1) / len(input_file) *  cumulative_interpolate
    perplexity = np.power(2, cross_entropy)

    print(f'{model_file} perplexity = {perplexity}')
    return perplexity


def interpolate(curr_three_gram, model, weights):
    try:
        p_xy3 = model[curr_three_gram]
    except KeyError:
        return 0

    p_xy2 = model[curr_three_gram[1:]]
    p_xy1 = model[curr_three_gram[-1]]

    return weights[0] * p_xy3 + weights[1] * p_xy2 + weights[2] * p_xy1

# Part 3

In [6]:
languages = ['en', 'es', 'fr', 'in', 'it', 'nl', 'pt', 'tl']
n_gram = 3

# Train

In [7]:
train_test_dict = {}
for lang in languages[0:]:
    df = pd.read_csv("./data/" + lang + ".csv")
    train, test = train_test_split(df, test_size=0.1, random_state=12)
    train_test_dict[lang] = (train, test)
    train_corpus = "<s>" + "<e><s>".join(train["tweet_text"].values) + "<e>"

    lm(train_corpus, lang+".txt")

# Test

In [8]:
results_dict = defaultdict(dict)

In [9]:
for test_lang in languages[0:]:
    print(f'testing {test_lang}')
    test = train_test_dict[test_lang][1]
    test_corpus = "<s>" + "<e><s>".join(test["tweet_text"].values) + "<e>"

    for model_lang in languages[0:]:
        model_perplexity = eval_perplexity(test_corpus, model_lang + ".txt", (0.4, 0.3, 0.3))
        results_dict[test_lang][model_lang] = model_perplexity

testing en
en.txt perplexity = 12.579148233431042
es.txt perplexity = 14.547181515257451
fr.txt perplexity = 14.637219063967972
in.txt perplexity = 14.994689146168547
it.txt perplexity = 14.677917899475593
nl.txt perplexity = 14.516354653594203
pt.txt perplexity = 13.852343368246306
tl.txt perplexity = 14.09422941109137
testing es
en.txt perplexity = 13.412594114804705
es.txt perplexity = 11.97551320732334
fr.txt perplexity = 12.768733416797108
in.txt perplexity = 13.922002158147787
it.txt perplexity = 12.40107344912731
nl.txt perplexity = 14.030403392148576
pt.txt perplexity = 11.810494740090974
tl.txt perplexity = 13.765339993049025
testing fr
en.txt perplexity = 11.902169739629631
es.txt perplexity = 12.042588314359762
fr.txt perplexity = 12.350566691284433
in.txt perplexity = 12.722596400274153
it.txt perplexity = 12.478263420519045
nl.txt perplexity = 12.860741295168705
pt.txt perplexity = 11.807977918407149
tl.txt perplexity = 12.188070455804684
testing in
en.txt perplexity = 15.

In [10]:
df = pd.DataFrame(results_dict)
df.to_csv("results.csv")
print(df)

           en         es         fr         in         it         nl  \
en  12.579148  13.412594  11.902170  15.191110  14.643972  13.330103   
es  14.547182  11.975513  12.042588  13.878569  13.307541  13.166704   
fr  14.637219  12.768733  12.350567  15.343643  14.315356  13.530849   
in  14.994689  13.922002  12.722596  13.062450  14.444088  14.340804   
it  14.677918  12.401073  12.478263  13.796956  12.288053  13.616496   
nl  14.516355  14.030403  12.860741  15.291659  15.344734  12.273105   
pt  13.852343  11.810495  11.807978  13.368077  12.844423  13.111424   
tl  14.094229  13.765340  12.188070  13.714973  14.130902  13.645973   

           pt         tl  
en  13.186476  14.829559  
es  12.246643  14.608287  
fr  13.270118  15.499609  
in  13.695018  13.515339  
it  12.379418  14.216598  
nl  14.155552  15.140698  
pt  11.898184  13.688038  
tl  13.356914  12.572145  


# Test No Unigram

In [11]:
results_dict_no_unigram = defaultdict(dict)
for test_lang in languages[0:]:
    print(f'testing {test_lang}')
    test = train_test_dict[test_lang][1]
    test_corpus = "<s>" + "<e><s>".join(test["tweet_text"].values) + "<e>"

    for model_lang in languages[0:]:
        model_perplexity = eval_perplexity(test_corpus, model_lang + ".txt", (0.6, 0.4, 0.0))
        results_dict_no_unigram[test_lang][model_lang] = model_perplexity

testing en
en.txt perplexity = 8.734948496186254
es.txt perplexity = 11.567575700954963
fr.txt perplexity = 11.373073781066015
in.txt perplexity = 11.563783254661384
it.txt perplexity = 11.627493233505858
nl.txt perplexity = 11.152567614890474
pt.txt perplexity = 11.01931709578962
tl.txt perplexity = 10.538736049888037
testing es
en.txt perplexity = 10.601739676461282
es.txt perplexity = 8.31001929380021
fr.txt perplexity = 9.911115153572338
in.txt perplexity = 11.005726082119736
it.txt perplexity = 9.430560174196232
nl.txt perplexity = 11.348057922117972
pt.txt perplexity = 8.866903699021753
tl.txt perplexity = 10.588308390501053
testing fr
en.txt perplexity = 9.266709772777487
es.txt perplexity = 9.449055970541984
fr.txt perplexity = 8.554343517524382
in.txt perplexity = 10.128192929092725
it.txt perplexity = 9.813096670630404
nl.txt perplexity = 10.182978266232286
pt.txt perplexity = 9.365039673127827
tl.txt perplexity = 9.532679723140928
testing in
en.txt perplexity = 12.2842481507

In [12]:
df = pd.DataFrame(results_dict_no_unigram)
df.to_csv("results_no_unigram.csv")
print(df)

           en         es         fr         in         it         nl  \
en   8.734948  10.601740   9.266710  12.284248  11.575471  10.504277   
es  11.567576   8.310019   9.449056  11.233399  10.109781  10.704325   
fr  11.373074   9.911115   8.554344  12.395548  11.158184  10.826088   
in  11.563783  11.005726  10.128193   9.263651  11.261693  11.379948   
it  11.627493   9.430560   9.813097  11.149374   8.444241  11.198089   
nl  11.152568  11.348058  10.182978  12.369893  12.455353   8.632502   
pt  11.019317   8.866904   9.365040  10.948768   9.845702  10.837933   
tl  10.538736  10.588308   9.532680  10.619811  10.872046  10.766957   

           pt         tl  
en  10.513064  11.500987  
es   9.207739  11.466085  
fr  10.483608  12.089394  
in  10.897499  10.113640  
it   9.451630  10.969712  
nl  11.552711  11.759245  
pt   8.073184  10.659179  
tl  10.407956   8.430425  
