In [1]:
import nltk
from nltk.corpus import udhr
from nltk.util import ngrams

In [2]:
english = udhr.raw('English-Latin1')
french = udhr.raw('French_Francais-Latin1')
italian = udhr.raw('Italian_Italiano-Latin1')
spanish = udhr.raw('Spanish_Espanol-Latin1')

In [3]:
english_train, english_dev = english[0:1000], english[1000:1100]
french_train, french_dev = french[0:1000], french[1000:1100]
italian_train, italian_dev = italian[0:1000], italian[1000:1100]
spanish_train, spanish_dev = spanish[0:1000], spanish[1000:1100]  
english_test = udhr.words('English-Latin1')[0:1000]
french_test = udhr.words('French_Francais-Latin1')[0:1000]
italian_test = udhr.words('Italian_Italiano-Latin1')[0:1000]
spanish_test = udhr.words('Spanish_Espanol-Latin1')[0:1000]

### Pre-processing text

In [4]:
def clean_text(text):
    punctuation_tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    tokens = punctuation_tokenizer.tokenize(text)
    filtered_text = [word.lower() for word in tokens]
    return filtered_text

In [5]:
english_train_temp = ''.join(english_train)
english_train_clean = clean_text(english_train_temp)
french_train_temp = ''.join(french_train)
french_train_clean = clean_text(french_train_temp)
italian_train_temp = ''.join(italian_train)
italian_train_clean = clean_text(italian_train_temp)
spanish_train_temp = ''.join(spanish_train)
spanish_train_clean = clean_text(spanish_train_temp)

english_dev_temp = ''.join(english_dev)
english_dev_clean = clean_text(english_dev_temp)
french_dev_temp = ''.join(french_dev)
french_dev_clean = clean_text(french_dev_temp)
italian_dev_temp = ''.join(italian_dev)
italian_dev_clean = clean_text(italian_dev_temp)
spanish_dev_temp = ''.join(spanish_dev)
spanish_dev_clean = clean_text(spanish_dev_temp)

english_test_temp = ' '.join(english_test)
english_test_clean = clean_text(english_test_temp)
french_test_temp = ' '.join(french_test)
french_test_clean = clean_text(french_test_temp)
italian_test_temp = ' '.join(italian_test)
italian_test_clean = clean_text(italian_test_temp)
spanish_test_temp = ' '.join(spanish_test)
spanish_test_clean = clean_text(spanish_test_temp)

### Extracting character-level unigrams, bigrams and trigrams from corpora

In [6]:
english_train_unigram = []
english_train_bigram = []
english_train_trigram = []
for word in english_train_clean:
    n = len(word)
    for i in range(n):
        english_train_unigram.append(word[i])
        if i + 1 < n:
            english_train_bigram.append((word[i], word[i + 1]))
            if i + 2 < n:
                english_train_trigram.append((word[i], word[i + 1], word[i + 2]))

In [7]:
french_train_unigram = []
french_train_bigram = []
french_train_trigram = []
for word in french_train_clean:
    n = len(word)
    for i in range(n):
        french_train_unigram.append(word[i])
        if i + 1 < n:
            french_train_bigram.append((word[i], word[i + 1]))
            if i + 2 < n:
                french_train_trigram.append((word[i], word[i + 1], word[i + 2]))

In [8]:
italian_train_unigram = []
italian_train_bigram = []
italian_train_trigram = []
for word in italian_train_clean:
    n = len(word)
    for i in range(n):
        italian_train_unigram.append(word[i])
        if i + 1 < n:
            italian_train_bigram.append((word[i], word[i + 1]))
            if i + 2 < n:
                italian_train_trigram.append((word[i], word[i + 1], word[i + 2]))

In [9]:
spanish_train_unigram = []
spanish_train_bigram = []
spanish_train_trigram = []
for word in spanish_train_clean:
    n = len(word)
    for i in range(n):
        spanish_train_unigram.append(word[i])
        if i + 1 < n:
            spanish_train_bigram.append((word[i], word[i + 1]))
            if i + 2 < n:
                spanish_train_trigram.append((word[i], word[i + 1], word[i + 2]))

### Defining probability for language prediction according to ngrams

In [10]:
epsilon_for_unknown = 1e-5

In [11]:
def unigram_probability(word, fdist1, fdist2):
    prob_1 = 1
    prob_2 = 1
    len_1 = sum(fdist1.values())
    len_2 = sum(fdist2.values())
    for letter in word:
        if letter not in fdist1.keys():
            prob_1 *= epsilon_for_unknown
        else:
            prob_1 *= fdist1[letter]/len_1
        if letter not in fdist2.keys():
            prob_2 *= epsilon_for_unknown
        else:
            prob_2 *= fdist2[letter]/len_2
    if (prob_1 == prob_2):
        return 'Undeterminable'
    elif (prob_1 > prob_2):
        return '1'
    else:
        return '2'

In [12]:
def bigram_probability(word, fdist1, fdist2):
    prob_1 = 1
    prob_2 = 1
    len_1 = sum(fdist1.values())
    len_2 = sum(fdist2.values())
    n = len(word)
    for i in range(n):
        if i + 1 < n:
            if (word[i], word[i + 1]) not in fdist1.keys():
                prob_1 *= epsilon_for_unknown
            else:
                prob_1 *= fdist1[(word[i], word[i + 1])]/len_1
            if (word[i], word[i + 1]) not in fdist2.keys():
                prob_2 *= epsilon_for_unknown
            else:
                prob_2 *= fdist2[(word[i], word[i + 1])]/len_2
    if (prob_1 == prob_2):
        return 'Undeterminable'
    elif (prob_1 > prob_2):
        return '1'
    else:
        return '2'

In [13]:
def trigram_probability(word, fdist1, fdist2):
    prob_1 = 1
    prob_2 = 1
    len_1 = sum(fdist1.values())
    len_2 = sum(fdist2.values())
    n = len(word)
    for i in range(n):
        if i + 1 < n and i + 2 < n:
            if (word[i], word[i + 1], word[i + 2]) not in fdist1.keys():
                prob_1 *= epsilon_for_unknown
            else:
                prob_1 *= fdist1[(word[i], word[i + 1], word[i + 2])]/len_1
            if (word[i], word[i + 1], word[i + 2]) not in fdist2.keys():
                prob_2 *= epsilon_for_unknown
            else:
                prob_2 *= fdist2[(word[i], word[i + 1], word[i + 2])]/len_2
    if (prob_1 == prob_2):
        return 'Undeterminable'
    elif (prob_1 > prob_2):
        return '1'
    else:
        return '2'

### Getting unigram, bigram, and trigram models for each language

In [14]:
fdist_english_train_unigram = dict(nltk.FreqDist(english_train_unigram))
fdist_french_train_unigram = dict(nltk.FreqDist(french_train_unigram))
fdist_italian_train_unigram = dict(nltk.FreqDist(italian_train_unigram))
fdist_spanish_train_unigram = dict(nltk.FreqDist(spanish_train_unigram))

In [15]:
fdist_english_train_bigram = dict(nltk.FreqDist(english_train_bigram))
fdist_french_train_bigram = dict(nltk.FreqDist(french_train_bigram))
fdist_italian_train_bigram = dict(nltk.FreqDist(italian_train_bigram))
fdist_spanish_train_bigram = dict(nltk.FreqDist(spanish_train_bigram))

In [16]:
fdist_english_train_trigram = dict(nltk.FreqDist(english_train_trigram))
fdist_french_train_trigram = dict(nltk.FreqDist(french_train_trigram))
fdist_italian_train_trigram = dict(nltk.FreqDist(italian_train_trigram))
fdist_spanish_train_trigram = dict(nltk.FreqDist(spanish_train_trigram))

### Detecting language based on characters

In [17]:
def return_unigram_results(data, fdist1, fdist2):
    final = []
    for word in data:
        final.append(unigram_probability(word, fdist1, fdist2))
    prob_1 = final.count('1')
    prob_2 = final.count('2')
    undeterministic = final.count('Undeterminable')
    return {'lang_1': prob_1, 'lang_2': prob_2, 'lang_undeterminable': undeterministic}

In [18]:
def return_bigram_results(data, fdist1, fdist2):
    final = []
    for word in data:
        final.append(bigram_probability(word, fdist1, fdist2))
    prob_1 = final.count('1')
    prob_2 = final.count('2')
    undeterministic = final.count('Undeterminable')
    return {'lang_1': prob_1, 'lang_2': prob_2, 'lang_undeterminable': undeterministic}

In [19]:
def return_trigram_results(data, fdist1, fdist2):
    final = []
    for word in data:
        final.append(trigram_probability(word, fdist1, fdist2))
    prob_1 = final.count('1')
    prob_2 = final.count('2')
    undeterministic = final.count('Undeterminable')
    return {'lang_1': prob_1, 'lang_2': prob_2, 'lang_undeterminable': undeterministic}

In [20]:
def compute_accuracy(results, lang):
    total = 0
    for key, value in results.items():
        total += value
    return (results[lang]/total)

### Validating evaluation

In [21]:
results_unigram_english = return_unigram_results(english_dev_clean, fdist_english_train_unigram, fdist_french_train_unigram)
print('Unigram accuracy:\t', compute_accuracy(results_unigram_english, 'lang_1'))

results_bigram_english = return_bigram_results(english_dev_clean, fdist_english_train_bigram, fdist_french_train_bigram)
print('Bigram accuracy:\t', compute_accuracy(results_bigram_english, 'lang_1'))

results_trigram_english = return_trigram_results(english_dev_clean, fdist_english_train_trigram, fdist_french_train_trigram)
print('Trigram accuracy:\t', compute_accuracy(results_trigram_english, 'lang_1'))

print('Unigram undeterministic ratio:\t', compute_accuracy(results_unigram_english, 'lang_undeterminable'))
print('Bigram undeterministic ratio:\t', compute_accuracy(results_bigram_english, 'lang_undeterminable'))
print('Trigram undeterministic ratio:\t', compute_accuracy(results_trigram_english, 'lang_undeterminable'))

Unigram accuracy:	 0.7058823529411765
Bigram accuracy:	 0.9411764705882353
Trigram accuracy:	 0.6470588235294118
Unigram undeterministic ratio:	 0.0
Bigram undeterministic ratio:	 0.0
Trigram undeterministic ratio:	 0.11764705882352941


In [22]:
results_unigram_french = return_unigram_results(french_dev_clean, fdist_english_train_unigram, fdist_french_train_unigram)
print('Unigram accuracy:\t', compute_accuracy(results_unigram_french, 'lang_2'))

results_bigram_french = return_bigram_results(french_dev_clean, fdist_english_train_bigram, fdist_french_train_bigram)
print('Bigram accuracy:\t', compute_accuracy(results_bigram_english, 'lang_2'))

results_trigram_french = return_trigram_results(french_dev_clean, fdist_english_train_trigram, fdist_french_train_trigram)
print('Trigram accuracy:\t', compute_accuracy(results_trigram_french, 'lang_2'))

print('Unigram undeterministic ratio:\t', compute_accuracy(results_unigram_french, 'lang_undeterminable'))
print('Bigram undeterministic ratio:\t', compute_accuracy(results_bigram_french, 'lang_undeterminable'))
print('Trigram undeterministic ratio:\t', compute_accuracy(results_trigram_french, 'lang_undeterminable'))

Unigram accuracy:	 0.8947368421052632
Bigram accuracy:	 0.058823529411764705
Trigram accuracy:	 0.42105263157894735
Unigram undeterministic ratio:	 0.0
Bigram undeterministic ratio:	 0.05263157894736842
Trigram undeterministic ratio:	 0.47368421052631576


In [23]:
results_unigram_spanish = return_unigram_results(spanish_dev_clean, fdist_spanish_train_unigram, fdist_italian_train_unigram)
print('Unigram accuracy:\t', compute_accuracy(results_unigram_spanish, 'lang_1'))

results_bigram_spanish = return_bigram_results(spanish_dev_clean, fdist_spanish_train_bigram, fdist_italian_train_bigram)
print('Bigram accuracy:\t', compute_accuracy(results_bigram_spanish, 'lang_1'))

results_trigram_spanish = return_trigram_results(spanish_dev_clean, fdist_spanish_train_trigram, fdist_italian_train_trigram)
print('Trigram accuracy:\t', compute_accuracy(results_trigram_spanish, 'lang_1'))

print('Unigram undeterministic ratio:\t', compute_accuracy(results_unigram_spanish, 'lang_undeterminable'))
print('Bigram undeterministic ratio:\t', compute_accuracy(results_bigram_spanish, 'lang_undeterminable'))
print('Trigram undeterministic ratio:\t', compute_accuracy(results_trigram_spanish, 'lang_undeterminable'))

Unigram accuracy:	 0.8666666666666667
Bigram accuracy:	 0.9333333333333333
Trigram accuracy:	 0.3333333333333333
Unigram undeterministic ratio:	 0.0
Bigram undeterministic ratio:	 0.0
Trigram undeterministic ratio:	 0.3333333333333333


In [24]:
results_unigram_italian = return_unigram_results(italian_dev_clean, fdist_spanish_train_unigram, fdist_italian_train_unigram)
print('Unigram accuracy:\t', compute_accuracy(results_unigram_italian, 'lang_2'))

results_bigram_italian = return_bigram_results(italian_dev_clean, fdist_spanish_train_bigram, fdist_italian_train_bigram)
print('Bigram accuracy:\t', compute_accuracy(results_bigram_italian, 'lang_2'))

results_trigram_italian = return_trigram_results(italian_dev_clean, fdist_spanish_train_trigram, fdist_italian_train_trigram)
print('Trigram accuracy:\t', compute_accuracy(results_trigram_italian, 'lang_2'))

print('Unigram undeterministic ratio:\t', compute_accuracy(results_unigram_italian, 'lang_undeterminable'))
print('Bigram undeterministic ratio:\t', compute_accuracy(results_bigram_italian, 'lang_undeterminable'))
print('Trigram undeterministic ratio:\t', compute_accuracy(results_trigram_italian, 'lang_undeterminable'))

Unigram accuracy:	 0.47058823529411764
Bigram accuracy:	 0.6470588235294118
Trigram accuracy:	 0.5882352941176471
Unigram undeterministic ratio:	 0.0
Bigram undeterministic ratio:	 0.11764705882352941
Trigram undeterministic ratio:	 0.29411764705882354


# Part 1

### Accuracy for english set between english and french

In [30]:
results_unigram_english = return_unigram_results(english_test_clean, fdist_english_train_unigram, fdist_french_train_unigram)
print('Unigram english accuracy:\t', compute_accuracy(results_unigram_english, 'lang_1'))
print('Unigram french accuracy:\t', compute_accuracy(results_unigram_english, 'lang_2'))

results_bigram_english = return_bigram_results(english_test_clean, fdist_english_train_bigram, fdist_french_train_bigram)
print('Bigram english accuracy:\t', compute_accuracy(results_bigram_english, 'lang_1'))
print('Bigram french accuracy:\t', compute_accuracy(results_bigram_english, 'lang_2'))

results_trigram_english = return_trigram_results(english_test_clean, fdist_english_train_trigram, fdist_french_train_trigram)
print('Trigram english accuracy:\t', compute_accuracy(results_trigram_english, 'lang_1'))
print('Trigram french accuracy:\t', compute_accuracy(results_trigram_english, 'lang_2'))

print('Unigram undeterministic ratio:\t', compute_accuracy(results_unigram_english, 'lang_undeterminable'))
print('Bigram undeterministic ratio:\t', compute_accuracy(results_bigram_english, 'lang_undeterminable'))
print('Trigram undeterministic ratio:\t', compute_accuracy(results_trigram_english, 'lang_undeterminable'))

Unigram english accuracy:	 0.7450765864332604
Unigram french accuracy:	 0.237417943107221
Bigram english accuracy:	 0.8096280087527352
Bigram french accuracy:	 0.15645514223194748
Trigram english accuracy:	 0.5207877461706784
Trigram french accuracy:	 0.14660831509846828
Unigram undeterministic ratio:	 0.0175054704595186
Bigram undeterministic ratio:	 0.03391684901531729
Trigram undeterministic ratio:	 0.33260393873085337


### Accuracy for french set between english and french

In [33]:
results_unigram_french = return_unigram_results(french_test_clean, fdist_english_train_unigram, fdist_french_train_unigram)
print('Unigram english accuracy:\t', compute_accuracy(results_unigram_french, 'lang_1'))
print('Unigram french accuracy:\t', compute_accuracy(results_unigram_french, 'lang_2'))

results_bigram_french = return_bigram_results(french_test_clean, fdist_english_train_bigram, fdist_french_train_bigram)
print('Bigram english accuracy:\t', compute_accuracy(results_bigram_french, 'lang_1'))
print('Bigram french accuracy:\t', compute_accuracy(results_bigram_french, 'lang_2'))

results_trigram_french = return_trigram_results(french_test_clean, fdist_english_train_trigram, fdist_french_train_trigram)
print('Trigram english accuracy:\t', compute_accuracy(results_trigram_french, 'lang_1'))
print('Trigram french accuracy:\t', compute_accuracy(results_trigram_french, 'lang_2'))

print('Unigram undeterministic ratio:\t', compute_accuracy(results_unigram_french, 'lang_undeterminable'))
print('Bigram undeterministic ratio:\t', compute_accuracy(results_bigram_french, 'lang_undeterminable'))
print('Trigram undeterministic ratio:\t', compute_accuracy(results_trigram_french, 'lang_undeterminable'))

Unigram english accuracy:	 0.16271186440677965
Unigram french accuracy:	 0.823728813559322
Bigram english accuracy:	 0.14915254237288136
Bigram french accuracy:	 0.7604519774011299
Trigram english accuracy:	 0.09152542372881356
Trigram french accuracy:	 0.5028248587570622
Unigram undeterministic ratio:	 0.013559322033898305
Bigram undeterministic ratio:	 0.0903954802259887
Trigram undeterministic ratio:	 0.4056497175141243


# Part 2

### Accuracy for spanish set between italian and spanish

In [31]:
results_unigram_spanish = return_unigram_results(spanish_test_clean, fdist_spanish_train_unigram, fdist_italian_train_unigram)
print('Unigram spanish accuracy:\t', compute_accuracy(results_unigram_spanish, 'lang_1'))
print('Unigram italian accuracy:\t', compute_accuracy(results_unigram_spanish, 'lang_2'))

results_bigram_spanish = return_bigram_results(spanish_test_clean, fdist_spanish_train_bigram, fdist_italian_train_bigram)
print('Bigram spanish accuracy:\t', compute_accuracy(results_bigram_spanish, 'lang_1'))
print('Bigram italian accuracy:\t', compute_accuracy(results_bigram_spanish, 'lang_2'))

results_trigram_spanish = return_trigram_results(spanish_test_clean, fdist_spanish_train_trigram, fdist_italian_train_trigram)
print('Trigram spanish accuracy:\t', compute_accuracy(results_trigram_spanish, 'lang_1'))
print('Trigram italian accuracy:\t', compute_accuracy(results_trigram_spanish, 'lang_2'))

print('Unigram undeterministic ratio:\t', compute_accuracy(results_unigram_spanish, 'lang_undeterminable'))
print('Bigram undeterministic ratio:\t', compute_accuracy(results_bigram_spanish, 'lang_undeterminable'))
print('Trigram undeterministic ratio:\t', compute_accuracy(results_trigram_spanish, 'lang_undeterminable'))

Unigram spanish accuracy:	 0.7149122807017544
Unigram italian accuracy:	 0.2817982456140351
Bigram spanish accuracy:	 0.6853070175438597
Bigram italian accuracy:	 0.19846491228070176
Trigram spanish accuracy:	 0.4967105263157895
Trigram italian accuracy:	 0.10197368421052631
Unigram undeterministic ratio:	 0.003289473684210526
Bigram undeterministic ratio:	 0.1162280701754386
Trigram undeterministic ratio:	 0.40131578947368424


### Accuracy for italian set between italian and spanish

In [34]:
results_unigram_italian = return_unigram_results(italian_test_clean, fdist_spanish_train_unigram, fdist_italian_train_unigram)
print('Unigram spanish accuracy:\t', compute_accuracy(results_unigram_italian, 'lang_1'))
print('Unigram italian accuracy:\t', compute_accuracy(results_unigram_italian, 'lang_2'))

results_bigram_italian = return_bigram_results(italian_test_clean, fdist_spanish_train_bigram, fdist_italian_train_bigram)
print('Bigram spanish accuracy:\t', compute_accuracy(results_bigram_italian, 'lang_1'))
print('Bigram italian accuracy:\t', compute_accuracy(results_bigram_italian, 'lang_2'))

results_trigram_italian = return_trigram_results(italian_test_clean, fdist_spanish_train_trigram, fdist_italian_train_trigram)
print('Trigram spanish accuracy:\t', compute_accuracy(results_trigram_italian, 'lang_1'))
print('Trigram italian accuracy:\t', compute_accuracy(results_trigram_italian, 'lang_2'))

print('Unigram undeterministic ratio:\t', compute_accuracy(results_unigram_italian, 'lang_undeterminable'))
print('Bigram undeterministic ratio:\t', compute_accuracy(results_bigram_italian, 'lang_undeterminable'))
print('Trigram undeterministic ratio:\t', compute_accuracy(results_trigram_italian, 'lang_undeterminable'))

Unigram spanish accuracy:	 0.37028824833702884
Unigram italian accuracy:	 0.6263858093126385
Bigram spanish accuracy:	 0.20288248337028825
Bigram italian accuracy:	 0.6829268292682927
Trigram spanish accuracy:	 0.10975609756097561
Trigram italian accuracy:	 0.5620842572062085
Unigram undeterministic ratio:	 0.0033259423503325942
Bigram undeterministic ratio:	 0.11419068736141907
Trigram undeterministic ratio:	 0.328159645232816
