## Optional exercise

**First, we crete a function to do the preprocessing of the text.**

In [1]:
import nltk
import re
from itertools import islice
from nltk.collocations import TrigramCollocationFinder
import math
from nltk.metrics.scores import accuracy
from nltk.metrics import ConfusionMatrix

def preprocess(line):
    #Remove all the digits and punctuation from data
    line = re.sub(r'(\d|[^\w ])', ' ', line)
    #Remove spaces at the beginning of the line
    line = re.sub(r'^[ ]+', '', line)
    #Remove spaces at the end of the line
    line = line.rstrip()
    #Convert all the texts to lower case
    line = line.lower()
    #Replace continuous white spaces by a single one
    line = re.sub(r'[ ]+', ' ', line)
    return line

**Then, we read the training texts and preprocess them. We also make sure to remove all the trigrams that occur less then 5 times in the training corpus.**

In [2]:
languages = ['spa', 'nld', 'ita', 'fra', 'eng', 'deu']

languages_dict = {}
for lang in languages:
    preprocessed_text = ''
    with open('./langId/'+ lang +'_trn.txt') as f:
        for line in f:
            preprocessed_text = '  '.join([preprocessed_text, preprocess(line)])

    finder = TrigramCollocationFinder.from_words(preprocessed_text)
    finder.apply_freq_filter(5)
    languages_dict[lang] = finder.ngram_fd

**Finally, we read the test texts and for each sentence of each language we do the preprocessing and then calculate the probabilites for each sentence to belong to each of the possible language, and we assign to the sentence the language with the higher probability.**

In [3]:
real_languages = []
predicted_languages = []
for language in languages:
    with open('./langId/'+language+'_tst.txt') as f:
        for line in f:
            test_text = preprocess(line)
            finder = TrigramCollocationFinder.from_words(test_text)

            probabilites = {}
            for lang in languages:
                p = sum(math.log((languages_dict[lang][tr[0]] + 1)*tr[1]) for tr in finder.ngram_fd.items())
                probabilites[lang]= p - math.log(languages_dict[lang].N()+1)*finder.ngram_fd.N()
            sorted_probabilites = sorted(probabilites.items(), key=lambda x: x[1], reverse=True)
            predicted = sorted_probabilites[0][0]
            real_languages.append(language)
            predicted_languages.append(predicted)

**Once our model is tested, we calculate the accuracy of the model.**

In [4]:
print('Accuracy: ', accuracy(real_languages,predicted_languages))

Accuracy:  0.9983993864314654


**Finally, we print the confusion matrix.**

In [5]:
cm = ConfusionMatrix(real_languages,predicted_languages)
print('Confusion Matrix:')
print(cm.pretty_format())

Confusion Matrix:
    |    d    e    f    i    n    s |
    |    e    n    r    t    l    p |
    |    u    g    a    a    d    a |
----+-------------------------------+
deu |<9979>   5    .    .    5    1 |
eng |    .<9982>   1    1    3    . |
fra |    .   10<9980>   3    3    4 |
ita |    .   13    4<9977>   1    5 |
nld |    6   11    .    3<9978>   2 |
spa |    .    5    1    9    .<9985>|
----+-------------------------------+
(row = reference; col = test)

