In [151]:
import nltk
from math import inf, log
from nltk.collocations import TrigramCollocationFinder
from nltk.collocations import BigramCollocationFinder
import time

In [152]:
def preprocessing(text_lines,l=None):
    preprocessed = []
    for index, line in enumerate(text_lines):
        num, sentence = line.split('\t')
        new_line = ''.join([c.lower() for c in sentence if not c.isdigit()])
        new_line = ' '.join(new_line.split())
        preprocessed.append(new_line)
    return preprocessed

In [153]:
langs = ['eng', 'deu', 'fra', 'ita', 'nld', 'spa']
dataset = {}
for l in langs:
    dataset[l] = {}
    with open('langId/' + l + '_trn.txt', 'r') as file:
        dataset[l]['trn'] = file.readlines()
    with open('langId/' + l + '_tst.txt', 'r') as file:
        dataset[l]['tst'] = file.readlines()
    dataset[l]['trn'] = '  ' + '  '.join(preprocessing(dataset[l]['trn'],l)) + '  '
    dataset[l]['tst'] = ['  ' + sentence + '  ' for sentence in preprocessing(dataset[l]['tst'],l)]

In [154]:
class LanguageModel:
    def __init__(self, clean_corpus, lang):
        self.trigrams = TrigramCollocationFinder.from_words(clean_corpus)
        self.trigrams.apply_freq_filter(5)
        self.bigrams = self.trigrams.bigram_finder()
        self.lambd = 1
        self.V = len(self.trigrams.ngram_fd)
        self.lang = lang
            
    def infer(self, trigram):
        # N: bigram_freq, V: train trigrams vocabulary size, ci: trigram frequency
        ci = self.trigrams.ngram_fd[trigram]
        (t1, t2, t3) = trigram
        N = self.bigrams.ngram_fd[(t1, t2)]
        return (ci + self.lambd) / (N + self.lambd * self.V)
        
        
    def eval(self, sentence):
        # log probablity, not perplexity
        p = 1
        trigrams = TrigramCollocationFinder.from_words(sentence)
        for trigram in trigrams.ngram_fd:
            p += log(self.infer(trigram))
        return p

In [155]:
lt0 = time.time()
print('Training English ...')
engModel = LanguageModel(dataset['eng']['trn'], 'eng')
print('Training Deutsch ...')
deuModel = LanguageModel(dataset['deu']['trn'], 'deu')
print('Training Français ...')
fraModel = LanguageModel(dataset['fra']['trn'], 'fra')
print('Training Italiano ...')
itaModel = LanguageModel(dataset['ita']['trn'], 'ita')
print('Training Nederlands ...')
nldModel = LanguageModel(dataset['nld']['trn'], 'nld')
print('Training Español ...')
spaModel = LanguageModel(dataset['spa']['trn'], 'spa')
lt1 = time.time()
print('Training time:', lt1 - lt0)

Training English ...
Training Deutsch ...
Training Français ...
Training Italiano ...
Training Nederlands ...
Training Español ...
Training time: 70.53643941879272


In [156]:
def metamodel(models, sentence):
    max_p = inf
    res = None
    for model in models:
        p = model.eval(sentence)
        if abs(p) < max_p:
            # (log probablity)
            max_p = abs(p)
            res = model.lang
    return res

models = [engModel, deuModel, fraModel, itaModel, nldModel, spaModel]

In [157]:
tt0 = time.time()
confusion_matrix = {}
for l in langs:
    print('Evaluating on ...', l)
    confusion_matrix[l] = {}
    for sentence in dataset[l]['tst']:
        res = metamodel(models, sentence)
        if res in confusion_matrix[l]:
            confusion_matrix[l][res] += 1
        else:
            confusion_matrix[l][res] = 1
tt1 = time.time()

print('Inference time:', tt1-tt0)
        

Evaluating on ... eng
Evaluating on ... deu
Evaluating on ... fra
Evaluating on ... ita
Evaluating on ... nld
Evaluating on ... spa
Inference time: 209.0808641910553


In [159]:
def prettify(conf_mat):
    print('***', end = '')
    for l in langs:
        print('|*' + l, end = '')
    print('|')
    for l in langs:
        print(l, end = '|')
        for l2 in langs:
            if l2 in conf_mat[l]:
                print(str(conf_mat[l][l2]).rjust(4), end = '|')
            else:
                print(str('   .'), end = '|')
        print()
    print('-'*34)

def get_accuracy(conf_mat):
    total = 0
    right = 0
    for l in langs:
        right += conf_mat[l][l]
        total += sum(conf_mat[l].values())
    return right/total
    
prettify(confusion_matrix)
print('Accuracy', get_accuracy(confusion_matrix))
print('Training time:', lt1 - lt0)
print('Inference time:', tt1-tt0)
        

***|*eng|*deu|*fra|*ita|*nld|*spa|
eng|9985|   .|   1|   .|   1|   .|
deu|  11|9971|   .|   1|   6|   1|
fra|  11|   .|9980|   5|   3|   1|
ita|   8|   .|   1|9987|   .|   4|
nld|  28|   7|   2|   4|9957|   2|
spa|   4|   .|   1|   3|   .|9992|
----------------------------------
Accuracy 0.9982493289094153
Training time: 70.53643941879272
Inference time: 209.0808641910553
