# Language Identification

Given a short training corpus--the preamble of the UN Universal Declaration of Human Rights in six languages--we wish to construct a language model sufficient to identify the language of future documents.


Joe Comer
9/22/2018

In [17]:
import pandas as pd
import numpy as np
import os
import re
import unicodedata
import operator

I want to get rid of the easy identifiers like diacritical marks.

In [18]:
def remove_accents(input_str):
    """Remove diacritical marks from text."""
    nfkd_form = unicodedata.normalize('NFKD', input_str)
    only_ascii = nfkd_form.encode('ASCII', 'ignore')
    return str(only_ascii)[2:-3].lower()

Now I'm curious how different the sets of distinct characters are after the above decoding.

In [19]:
charsets = dict()
for root, dirs, files in os.walk("./Language_Identification/train/"):
    for filename in files:
        if filename[:4] != "filt":
            charsets[filename] = set()
            with open("./Language_Identification/train/" + filename, encoding='UTF-8') as iFile:
                for line in iFile:
                    string = remove_accents(line)
                    charsets[filename] = charsets[filename].union(set(string))

In [20]:
for set1 in charsets:
    for set2 in charsets:
        if set1 != set2:
            print(set1, ",", set2, charsets[set1]-charsets[set2])

dut.txt , eng.txt {';', ':'}
dut.txt , esper.txt {'y', ';', 'w', ':'}
dut.txt , frn.txt {':', 'w', ';', 'z', 'k'}
dut.txt , ger.txt {':', ';', 'x', '\\'}
dut.txt , spn.txt {'k', 'w', ':'}
eng.txt , dut.txt {'.', 'q'}
eng.txt , esper.txt {'y', 'w', 'q'}
eng.txt , frn.txt {'k', 'w', 'z'}
eng.txt , ger.txt {'q', 'x', '\\'}
eng.txt , spn.txt {'k', 'w'}
esper.txt , dut.txt {'7', '1', '-', '8', ')', '9', '4', '(', '.', '2'}
esper.txt , eng.txt {'7', '1', '-', '8', ')', '9', '4', '(', '2'}
esper.txt , frn.txt {'7', '1', '8', ')', '9', '4', '(', '2', 'z', 'k'}
esper.txt , ger.txt {'x', '\\'}
esper.txt , spn.txt {'k', '-'}
frn.txt , dut.txt {'-', "'", '.', 'q'}
frn.txt , eng.txt {'-', "'"}
frn.txt , esper.txt {'y', "'", 'q'}
frn.txt , ger.txt {'q', "'", 'x', '\\'}
frn.txt , spn.txt {'-', "'"}
ger.txt , dut.txt {'7', '1', '-', ')', '4', '9', '8', '(', '.', '2'}
ger.txt , eng.txt {'7', '1', '-', ')', '4', '9', '8', '(', '2'}
ger.txt , esper.txt {'w', 'y'}
ger.txt , frn.txt {'7', '1', 'w', ')', '4

Ok it looks like the sets of unique characters associated to each language are pretty similar with the exception of a few consonants that appear in some languages and not others. We will want to deal with irrelevant characters like numbers and punctuation, but first, let's be sure that the decoding didn't result in any strange, unexpected characters.

In [21]:
for chars in charsets:
    print(charsets[chars])

{'m', 'd', 'n', '\\', 's', ':', 'f', 'w', 'e', 'a', 'g', 'j', 'l', 'u', 'y', 'v', 'b', 'c', ',', 'h', 'x', '0', ' ', 'p', 'i', ';', 'z', 'r', 'k', 't', 'o'}
{'n', '\\', 's', 'q', 'e', 'j', 'v', 'b', 'x', '0', 'p', 'z', 'k', 'm', 'd', 'f', 'w', '.', 'g', 'a', 'u', 'y', 'c', ',', 'h', ' ', 'i', 'l', 'r', 't', 'o'}
{'n', '\\', 's', ')', '9', 'e', '2', 'j', 'v', '1', 'b', 'x', '0', '8', '4', 'p', 'z', 'k', 'm', 'd', '7', '-', 'f', '(', '.', 'a', 'g', 'u', 'c', ',', 'h', ' ', 'i', 'l', 'r', 't', 'o'}
{'m', 'd', 'n', '\\', 's', '-', 'f', 'q', '.', 'e', 'a', 'g', 'j', 'u', 'y', "'", 'v', 'b', 'c', ',', 'h', 'x', '0', ' ', 'p', 'i', 'l', 'r', 't', 'o'}
{'n', 's', ')', '9', 'e', '2', 'j', '1', 'v', 'b', '0', '4', '8', 'p', 'z', 'k', '7', 'd', 'm', '-', 'f', 'w', '(', '.', 'g', 'a', 'u', 'y', 'c', ',', 'h', ' ', 'i', 'l', 'r', 't', 'o'}
{'n', '\\', 's', ')', 'q', '9', 'e', '2', 'j', 'v', '1', 'b', 'x', '0', '4', '8', 'p', 'z', 'm', 'd', '7', 'f', '(', '.', 'a', 'g', 'u', 'y', 'c', ',', 'h', ' ',

No crazy wildcard characters anywhere. That's good. Looks like the accent removal went somewhat smoothly.

I'm curious whether one can distinguish among these languages by their patterns in vowel, nasal, fricative, and plosive consonant use. Because English spelling conventions are highly variable, I'll only be able to approximately encode the difference. For example, is 'g' a plosive or a fricative \[or silent\]? The cases and exceptions are too numerous to encode exhaustively. If the following naive encoding doesn't give good classifications, then I may need to try to cover some of them, but first, let's see how this simple encoding performs.

In [22]:
plosives = ['p','t','k','b','d','g','c','q']
fricatives = ['f','g','h','j','l','r','s','v','w','y','z']
nasals = ['n','m']
vowels = ['a','e','i','o','u']

filter_dict = dict()
for letter in plosives:
    filter_dict[letter] = '1'
for letter in fricatives:
    filter_dict[letter] = '2'
for letter in nasals:
    filter_dict[letter] = '3'
for letter in vowels:
    filter_dict[letter] = '4'

# Special case for x, ng, ch, ph, sh, th
filter_dict['x'] = '12'
filter_dict['ng'] = '3'
filter_dict['ch'] = '2'
filter_dict['ph'] = '2'
filter_dict['sh'] = '2'
filter_dict['th'] = '2'

# We don't care about numbers as distinct from one another, because all the training corpus documents
# should include the same numbers, however,
# it may be useful to preserve the information that a character was *some* number.
for i in range(10):
    filter_dict[str(i)] = '0'

Note that because of rules involving certain consonant pairs that produce a single sound, output strings from this encoding may be shorter than the input strings. Each number is meant to represent a member of a class of sound rather than a class of letter.

In [23]:
def filter_string(string):
    out_string = ""
    i = 0
    while i < len(string):
        if string[i] not in filter_dict:
            if string[i] == " ":
                filtered = " "
            else:
                # Non-alphanumeric characters will receive the '?' wildcard, along
                # with any characters not seen in the training corpus.
                # While punctuation conventions do vary from language to language,
                # it is not the focus of this experiment.
                filtered = "?"
        elif (i < len(string) - 1) and (string[i:i+2] in filter_dict):
            sound = string[i:i+2]
            filtered = filter_dict[sound]
            i += 1
        else:
            sound = string[i]
            filtered = filter_dict[sound]
        out_string += filtered
        i += 1
    return out_string

In [24]:
test = "testing this 124 ]"
filter_string(test)

'142143 242 000 ?'

In [25]:
for root, dirs, files in os.walk("./Language_Identification/train/"):
    for filename in files:
        if filename[:4] != "filt":
            with open("./Language_Identification/train/" + filename, 'r+', encoding='UTF-8') as iFile:
                with open("./Language_Identification/train/Filtered/filtered_" + filename, 'w+') as oFile:
                    for line in iFile:
                        oFile.write(filter_string(remove_accents(line)))

Ok now we have the filtered versions of the text. They look like this:

In [26]:
with open("./Language_Identification/train/Filtered/filtered_eng.txt") as testFile:
    print(next(testFile))

434242242 14124241443 42 24343 242212124431242242442 24142341443 42 24 43242431 1423412 431 42 24 41442 431 43424434124242212 42 422 3431422 42 24 24343 243422 42 24 2443141443 42 2244143? 2421414431 14414 43 24 24221?2242442 142242421 431 14314311 242 24343 242212 2424 24242141 43 1421424424112 2242 2424 44124241 24 1432144314 42 3431431? 431 24 412431 42 4 2422143 2242 24343 14432 2422 43242 2244143 42 21442 431 142442 431 22441432243 2442 431 2431 242 1443 1241244341 42 24 2422421 4214241443 42 24 143343144124?2242442 41 42 422431442? 42 343 42 341 14 14 143142241 14 2424 24144224? 42 4 2421242421? 14 241422443 4244321 1224332 431 4112422443? 241 24343 242212 24421 14124141141 12 24 2424 42 242?2242442 41 42 422431442 14 1243414 24 14242413431 42 22443122 242414432 14124433414432?2242442 24 1441242 42 24 434141 3414432 2424 43 24 242142 2442242341 24422442 43 24314343142 24343 242212? 43 24 1423412 431 2422 42 24 24343 142243431 43 24 41442 242212 42 343 431 24343 431 2424 141423434

In [27]:
def count_bigram(bigram, string):
    """This will actually also count unigrams."""
    count = start = 0
    while True:
        start = string.find(bigram, start) + 1
        if start > 0:
            count+=1
        else:
            return count

In [28]:
def train_model(filename):
    """Train a bigram model on a text file.
    filename: name of the text file
    returns: probs, dataframe of conditional probabilities
    The i,jth entry of probs is P(j|i)"""
    def prob(i, j, string):
        """Calculate the probability of j given i"""
        first_char = str(i)
        second_char = str(j)
        conditional_prob = (bigram_counts[first_char + second_char] + (unigram_counts[second_char]/len(string)))/(unigram_counts[first_char] + 1)
        return conditional_prob
    with open("./Language_Identification/train/Filtered/" + filename) as iFile:
        corpus = next(iFile) # These files are all a single line after the filtering above.
        
        # Add a start character and end character to ensure a single probability distribution
        # over documents of any length.
        corpus = "." + corpus + "!" 
        charset = [str(x) for x in set(corpus)]
        bigrams = [x+y for x in charset for y in charset]
        bigram_counts = dict([(big, count_bigram(big, corpus)) for big in bigrams])
        unigram_counts = dict([(uni, count_bigram(uni, corpus)) for uni in charset])
        probs = pd.DataFrame(columns = [[char for char in charset]])
        for char in charset:
            probs.loc[char] = [prob(char,j, corpus) for j in probs.columns]
    return probs

Now we create a dictionary of probability tables for each language.

In [29]:
models = dict()
for root, dirs, files in os.walk("./Language_Identification/train/"):
    for filename in files:
        if filename[:4] == "filt":
            models[filename[9:]] = train_model(filename)

Quick test: What is the probability of seeing a nasal consonant following a plosive consonant in English?

In [30]:
models['eng.txt']['1']['3']

0.32732938722241062

In [31]:
def language_id(filename, models_dict=models):
    print(filename)
    with open("./Language_Identification/test/" + filename,"r+", encoding='UTF-8') as iFile:
        corpus = ""
        for line in iFile:
            corpus += filter_string(remove_accents(line))
        corpus = "." + corpus + "!"
        likelihoods = dict([(model, 0) for model in models_dict])
        for i in range(len(corpus)-1):
            bigram = corpus[i:i+2]
            for model in models_dict:
                likelihoods[model] += np.log(models_dict[model][bigram[1]][bigram[0]]) # Pandas indexing is backwards from standard matrix notation
        most_likely = max(likelihoods.items(), key=operator.itemgetter(1))[0]
    return most_likely

In [32]:
for root, dirs, files in os.walk("./Language_Identification/test/"):
    for filename in files:
        print(filename, language_id(filename))

dut.txt
dut.txt ger.txt
eng.txt
eng.txt eng.txt
esper.txt
esper.txt esper.txt
frn.txt
frn.txt frn.txt
ger.txt
ger.txt ger.txt
spn.txt
spn.txt spn.txt


The naive approach performs pretty well. Dutch and German are qualitatively similar in their use of phonemes, so it is unsurprising that this is where the model breaks down. It is possible that we could get better results by further investigating whether our class assignments for each letter are consistent across these two languages, but I suspect that a more granular model may be required to distinguish these cases.