### Training probabilistic language models to distinguish between words in different languages - English, French, Spanish & Italian

In [182]:
# -----  Importing Libraries  ------


from nltk import ngrams, FreqDist
from nltk.corpus import udhr
from nltk.tokenize import RegexpTokenizer

In [183]:
# -----  Retriving UDHR of different languages  ------


english = udhr.raw('English-Latin1')
french = udhr.raw('French_Francais-Latin1')
italian = udhr.raw('Italian_Italiano-Latin1')
spanish = udhr.raw('Spanish_Espanol-Latin1')

In [184]:
# -----  Splitting into training & Dev Data  ------


english_train, english_dev = english[0:1000], english[1000:1100]
french_train, french_dev = french[0:1000], french[1000:1100]
italian_train, italian_dev = italian[0:1000], italian[1000:1100]
spanish_train, spanish_dev = spanish[0:1000], spanish[1000:1100]

In [185]:
# -----  Getting test data  ------


english_test = udhr.words('English-Latin1')[0:1000]
french_test = udhr.words('French_Francais-Latin1')[0:1000]
italian_test = udhr.words('Italian_Italiano-Latin1')[0:1000]
spanish_test = udhr.words('Spanish_Espanol-Latin1')[0:1000]

In [186]:
# -----  PREPROCESSING  ------

# -----  Lowercasing training & dev data  ------


e_train = english_train.lower()
e_dev = english_dev.lower()
f_train = french_train.lower()
f_dev = french_dev.lower()
i_train = italian_train.lower()
i_dev = italian_dev.lower()
s_train = spanish_train.lower()
s_dev = spanish_dev.lower()

In [187]:
# -----  Converting Dev data to words for easy handling by the model  ----


e_dev = e_dev.split()
f_dev = f_dev.split()
i_dev = i_dev.split()
s_dev = s_dev.split()

In [188]:
# -----  Tokenizing training data  ------


tokenizer = RegexpTokenizer("[a-zA-Z'`éèî]+")

english_train_tokenized = tokenizer.tokenize(e_train)
french_train_tokenized = tokenizer.tokenize(f_train)
italian_train_tokenized = tokenizer.tokenize(i_train)
spanish_train_tokenized = tokenizer.tokenize(s_train)

english_train_tokenized

['universal',
 'declaration',
 'of',
 'human',
 'rights',
 'preamble',
 'whereas',
 'recognition',
 'of',
 'the',
 'inherent',
 'dignity',
 'and',
 'of',
 'the',
 'equal',
 'and',
 'inalienable',
 'rights',
 'of',
 'all',
 'members',
 'of',
 'the',
 'human',
 'family',
 'is',
 'the',
 'foundation',
 'of',
 'freedom',
 'justice',
 'and',
 'peace',
 'in',
 'the',
 'world',
 'whereas',
 'disregard',
 'and',
 'contempt',
 'for',
 'human',
 'rights',
 'have',
 'resulted',
 'in',
 'barbarous',
 'acts',
 'which',
 'have',
 'outraged',
 'the',
 'conscience',
 'of',
 'mankind',
 'and',
 'the',
 'advent',
 'of',
 'a',
 'world',
 'in',
 'which',
 'human',
 'beings',
 'shall',
 'enjoy',
 'freedom',
 'of',
 'speech',
 'and',
 'belief',
 'and',
 'freedom',
 'from',
 'fear',
 'and',
 'want',
 'has',
 'been',
 'proclaimed',
 'as',
 'the',
 'highest',
 'aspiration',
 'of',
 'the',
 'common',
 'people',
 'whereas',
 'it',
 'is',
 'essential',
 'if',
 'man',
 'is',
 'not',
 'to',
 'be',
 'compelled',
 't

In [189]:
# -----  N-grams creation  -----


# n gram creator function
def ngramer(data, num):
    temp = []
    for word in data:
        temp.append(list(ngrams(word, num, pad_left=True, pad_right=True, left_pad_symbol="_", right_pad_symbol="_")))
    temp = [word for sublist in temp for word in sublist]
    n_grams = temp
    for i, v in enumerate(temp):
        n_grams[i] = ''.join(v)
    return temp



eng_train_unigrams = ngramer(english_train_tokenized, 1)
eng_train_bigrams = ngramer(english_train_tokenized, 2)
eng_train_trigrams = ngramer(english_train_tokenized, 3)

fre_train_unigrams = ngramer(french_train_tokenized, 1)
fre_train_bigrams = ngramer(french_train_tokenized, 2)
fre_train_trigrams = ngramer(french_train_tokenized, 3)

ita_train_unigrams = ngramer(italian_train_tokenized, 1)
ita_train_bigrams = ngramer(italian_train_tokenized, 2)
ita_train_trigrams = ngramer(italian_train_tokenized, 3)

spa_train_unigrams = ngramer(spanish_train_tokenized, 1)
spa_train_bigrams = ngramer(spanish_train_tokenized, 2)
spa_train_trigrams = ngramer(spanish_train_tokenized, 3)

eng_train_trigrams

['__u',
 '_un',
 'uni',
 'niv',
 'ive',
 'ver',
 'ers',
 'rsa',
 'sal',
 'al_',
 'l__',
 '__d',
 '_de',
 'dec',
 'ecl',
 'cla',
 'lar',
 'ara',
 'rat',
 'ati',
 'tio',
 'ion',
 'on_',
 'n__',
 '__o',
 '_of',
 'of_',
 'f__',
 '__h',
 '_hu',
 'hum',
 'uma',
 'man',
 'an_',
 'n__',
 '__r',
 '_ri',
 'rig',
 'igh',
 'ght',
 'hts',
 'ts_',
 's__',
 '__p',
 '_pr',
 'pre',
 'rea',
 'eam',
 'amb',
 'mbl',
 'ble',
 'le_',
 'e__',
 '__w',
 '_wh',
 'whe',
 'her',
 'ere',
 'rea',
 'eas',
 'as_',
 's__',
 '__r',
 '_re',
 'rec',
 'eco',
 'cog',
 'ogn',
 'gni',
 'nit',
 'iti',
 'tio',
 'ion',
 'on_',
 'n__',
 '__o',
 '_of',
 'of_',
 'f__',
 '__t',
 '_th',
 'the',
 'he_',
 'e__',
 '__i',
 '_in',
 'inh',
 'nhe',
 'her',
 'ere',
 'ren',
 'ent',
 'nt_',
 't__',
 '__d',
 '_di',
 'dig',
 'ign',
 'gni',
 'nit',
 'ity',
 'ty_',
 'y__',
 '__a',
 '_an',
 'and',
 'nd_',
 'd__',
 '__o',
 '_of',
 'of_',
 'f__',
 '__t',
 '_th',
 'the',
 'he_',
 'e__',
 '__e',
 '_eq',
 'equ',
 'qua',
 'ual',
 'al_',
 'l__',
 '__a',


In [190]:
# -----  Freq. Distribution of N-grams  -----


eng_unigrams_freqdist = FreqDist(eng_train_unigrams)
eng_bigrams_freqdist = FreqDist(eng_train_bigrams)
eng_trigrams_freqdist = FreqDist(eng_train_trigrams)

fre_unigrams_freqdist = FreqDist(fre_train_unigrams)
fre_bigrams_freqdist = FreqDist(fre_train_bigrams)
fre_trigrams_freqdist = FreqDist(fre_train_trigrams)

ita_unigrams_freqdist = FreqDist(ita_train_unigrams)
ita_bigrams_freqdist = FreqDist(ita_train_bigrams)
ita_trigrams_freqdist = FreqDist(ita_train_trigrams)

spa_unigrams_freqdist = FreqDist(spa_train_unigrams)
spa_bigrams_freqdist = FreqDist(spa_train_bigrams)
spa_trigrams_freqdist = FreqDist(spa_train_trigrams)

eng_trigrams_freqdist

FreqDist({'e__': 31, 'n__': 25, 's__': 25, '__t': 23, 'd__': 23, '__a': 20, '_th': 18, 'the': 17, '__o': 16, 'f__': 16, ...})

<br>

Test data goes into a function called "uni_english_french" as an argument. Pre-processing is done by lowering the text and nothing else is done as the text is already seperate words. Test data goes in with frequency distribution of english training words along with the length of the english training unigrams data. Same happens with the again but now with the frequency distribution of french training words and the length of the french training unigrams data. Inside the uni_lang_prob function the probability of the all the test data words of returned in as a list. The english word probability and french word probability lists are compared for every word and whichever language probability is the highest is given a score of 1. Thus keeping track of the score will give us the most probablity of the language the test data belongs to.


<br>

In [196]:
# English v. French Unigram Model with English test data  ----- 


def uni_lang_prob(test_data, freqdist, total):
    lang_prob = []
    for word in test_data:
        word_prob = 1
        for char in word:
            word_prob *= freqdist[char]/total
        lang_prob.append(word_prob)
    return lang_prob


def uni_english_french(test_data):
    len_test_data = len(test_data)
    test_data = [w.lower() for w in test_data]
    temp_eng_accuracy = temp_fre_accuracy = 0
    eng_prob = uni_lang_prob(test_data, eng_unigrams_freqdist, len(eng_train_unigrams))
    fre_prob = uni_lang_prob(test_data, fre_unigrams_freqdist, len(fre_train_unigrams))
    for i in range(len_test_data):
        if eng_prob[i] >= fre_prob[i]:
            temp_eng_accuracy += 1
        else:
            temp_fre_accuracy += 1
    uni_eng_accuracy = (temp_eng_accuracy/len_test_data)*100
    uni_fre_accuracy = (temp_fre_accuracy/len_test_data)*100
    print(f"Unigram Model accuracy of English is {uni_eng_accuracy}%.")
    print(f"Unigram Model accuracy of French is {uni_fre_accuracy}%.")

    
# ----- DEV DATA -----

uni_english_french(e_dev)

Unigram Model accuracy of English is 70.58823529411765%.
Unigram Model accuracy of French is 29.411764705882355%.


In [197]:
# ----- TEST DATA -----


uni_english_french(english_test)

Unigram Model accuracy of English is 78.4%.
Unigram Model accuracy of French is 21.6%.


<br>


Almost the same procedure given for the unigram model is followed for bigram model as well except for inside the bi_lang_prob function the word probability is calculated by doing count(bigram)+1/count(unigram)+count(vocabulary), the formula for which was taken from the 3rd chapter in the Speech and Patter Recognition - https://web.stanford.edu/~jurafsky/slp3/3.pdf.


<br>

In [198]:
# English v. French Bigram model with English test data  -----


def bi_lang_prob(test_data, unigrams_freqdist, bigrams_freqdist, v):
    lang_prob = []
    for word in test_data:
        word_prob = 1
        word = "_"+word+"_"
        for char in range(len(word)-1):
            word_prob *= ((bigrams_freqdist[word[char:char+2]])+1)/(unigrams_freqdist[word[char]]+v)
        lang_prob.append(word_prob)
    return lang_prob
    
    
def bi_english_french(test_data):
    len_test_data = len(test_data)
    test_data = [w.lower() for w in test_data]
    temp_eng_accuracy = temp_fre_accuracy = 0
    eng_prob = bi_lang_prob(test_data, eng_unigrams_freqdist, eng_bigrams_freqdist, len(set(eng_train_unigrams)))
    fre_prob = bi_lang_prob(test_data, fre_unigrams_freqdist, fre_bigrams_freqdist, len(set(fre_train_unigrams)))
    for i in range(len(eng_prob)):
        if eng_prob[i] >= fre_prob[i]:
            temp_eng_accuracy += 1
        else:
            temp_fre_accuracy += 1
    bi_eng_accuracy = (temp_eng_accuracy/len(eng_prob))*100
    bi_fre_accuracy = (temp_fre_accuracy/len(eng_prob))*100
    print(f"Model accuracy of English Bigrams is {bi_eng_accuracy}%.")
    print(f"Model accuracy of French Bigrams is {bi_fre_accuracy}%.")


# ----- DEV DATA -----

bi_english_french(e_dev)

Model accuracy of English Bigrams is 88.23529411764706%.
Model accuracy of French Bigrams is 11.76470588235294%.


In [199]:
# ----- TEST DATA -----


bi_english_french(english_test)

Model accuracy of English Bigrams is 87.4%.
Model accuracy of French Bigrams is 12.6%.


<br>

Almost the same procedure given for the unigram model is followed for trigram as well except for inside the tri_lang_prob function the word probability is calculated by doing count(trigram)+1/count(bigram)+count(vocabulary), the formula for which was taken from the 3rd chapter in the Speech and Patter Recognition - https://web.stanford.edu/~jurafsky/slp3/3.pdf.

<br>

In [200]:
# ----- English v. French Trigram model with English test data  -----


def tri_lang_prob(test_data, bigrams_freqdist, trigrams_freqdist, v):
    lang_prob = []
    for word in test_data:
        word_prob = 1
        word = "__"+word+"__"
        for char in range(len(word)-2):
            word_prob *= ((trigrams_freqdist[word[char:char+3]]+1)/(bigrams_freqdist[word[char:char+2]]+v))
        lang_prob.append(word_prob)
    return lang_prob
    
    
def tri_english_french(test_data):
    len_test_data = len(test_data)
    test_data = [w.lower() for w in test_data]
    temp_eng_accuracy = temp_fre_accuracy = 0
    eng_prob = tri_lang_prob(test_data, eng_bigrams_freqdist, eng_trigrams_freqdist, len(set(eng_train_unigrams)))
    fre_prob = tri_lang_prob(test_data, fre_bigrams_freqdist, fre_trigrams_freqdist, len(set(fre_train_unigrams)))
    for i in range(len(eng_prob)):
        if eng_prob[i] >= fre_prob[i]:
            temp_eng_accuracy += 1
        else:
            temp_fre_accuracy += 1
    tri_eng_accuracy = (temp_eng_accuracy/len(eng_prob))*100
    tri_fre_accuracy = (temp_fre_accuracy/len(eng_prob))*100
    print(f"Model accuracy of English Trigrams is {tri_eng_accuracy}%.")
    print(f"Model accuracy of French Trigrams is {tri_fre_accuracy}%.")

    
# ----- DEV DATA -----

tri_english_french(e_dev)    

Model accuracy of English Trigrams is 100.0%.
Model accuracy of French Trigrams is 0.0%.


In [201]:
# ----- TEST DATA -----

tri_english_french(english_test)

Model accuracy of English Trigrams is 90.10000000000001%.
Model accuracy of French Trigrams is 9.9%.


<br>

The uni_lang_prob, bi_lang_prob, tri_lang_prob functions were reused for spanish v italian.

<br>

In [175]:
# -----  Spanish v. English Unigram Model with Italian test data  -----



def uni_spanish_italian(test_data):
    len_test_data = len(test_data)
    test_data = [w.lower() for w in test_data]
    temp_spa_accuracy = temp_ita_accuracy = 0
    spa_prob = uni_lang_prob(test_data, spa_unigrams_freqdist, len(spa_train_unigrams))
    ita_prob = uni_lang_prob(test_data, ita_unigrams_freqdist, len(spa_train_unigrams))
    for i in range(len_test_data):
        if spa_prob[i] > ita_prob[i]:
            temp_spa_accuracy += 1
        elif spa_prob[i] < ita_prob[i]:
            temp_ita_accuracy += 1
        else:
            continue
    uni_spa_accuracy = (temp_spa_accuracy/len_test_data)*100
    uni_ita_accuracy = (temp_ita_accuracy/len_test_data)*100
    print(f"Model accuracy of Spanish is {uni_spa_accuracy}%.")
    print(f"Model accuracy of Italian is {uni_ita_accuracy}%.")
    

uni_spanish_italian(italian_test)

Model accuracy of Spanish is 27.900000000000002%.
Model accuracy of Italian is 59.199999999999996%.


In [168]:
# -----  Spanish v. Italian Bigram model with Spanish test data  -----



def bi_spanish_italian(test_data):
    len_test_data = len(test_data)
    test_data = [w.lower() for w in test_data]
    temp_spa_accuracy = temp_ita_accuracy = 0
    spa_prob = bi_lang_prob(test_data, spa_unigrams_freqdist, spa_bigrams_freqdist, len(set(spa_train_unigrams)))
    ita_prob = bi_lang_prob(test_data, ita_unigrams_freqdist, ita_bigrams_freqdist, len(set(spa_train_unigrams)))
    for i in range(len(spa_prob)):
        if spa_prob[i] > ita_prob[i]:
            temp_spa_accuracy += 1
        elif spa_prob[i] < ita_prob[i]:
            temp_ita_accuracy += 1
        else:
            continue
    bi_spa_accuracy = (temp_spa_accuracy/len_test_data)*100
    bi_ita_accuracy = (temp_ita_accuracy/len_test_data)*100
    print(f"Model accuracy of Spanish Bigrams is {bi_spa_accuracy}%.")
    print(f"Model accuracy of Italian Bigrams is {bi_ita_accuracy}%.")

    
bi_spanish_italian(spanish_test)

Model accuracy of Spanish Bigrams is 76.0%.
Model accuracy of Italian Bigrams is 13.5%.


In [169]:
# -----  Spanish v. Italian Trigram model with Italian test data  -----



def tri_spanish_italian(test_data):
    len_test_data = len(test_data)
    test_data = [w.lower() for w in test_data]
    temp_spa_accuracy = temp_ita_accuracy = 0
    spa_prob = tri_lang_prob(test_data, spa_bigrams_freqdist, spa_trigrams_freqdist, len(set(spa_train_unigrams)))
    ita_prob = tri_lang_prob(test_data, ita_bigrams_freqdist, ita_trigrams_freqdist, len(set(spa_train_unigrams)))
    for i in range(len_test_data):
        if spa_prob[i] > ita_prob[i]:
            temp_spa_accuracy += 1
        elif spa_prob[i] < ita_prob[i]:
            temp_ita_accuracy += 1
        else:
            continue
    tri_spa_accuracy = (temp_spa_accuracy/len_test_data)*100
    tri_ita_accuracy = (temp_ita_accuracy/len_test_data)*100
    print(f"Model accuracy of Spanish Trigrams is {tri_spa_accuracy}%.")
    print(f"Model accuracy of Italian Trigrams is {tri_ita_accuracy}%.")

    
tri_spanish_italian(italian_test)

Model accuracy of Spanish Trigrams is 22.1%.
Model accuracy of Italian Trigrams is 66.60000000000001%.
