# IdiomCheck - Language Detection



In [3]:
# -*- coding: utf-8 -*-

In [114]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
import re
import os
import glob
import numpy as np
import pandas as pd
import string
from tqdm import tqdm_notebook as tqdm
from pandas_ml import ConfusionMatrix

In [61]:
def corpus(file_list):
    '''
    Inputs:
    file_list - the list of paths of the text files to create the corpus
    
    Returns:
    A list of strings containing the corpus
    '''
    corpus = []

    for file_path in file_list:
        with open(file_path) as f_input:
            sample = re.sub(r'\([^)]*\)', ' ', re.sub('<[^>]+>', '', f_input.read()).replace('\n', ' '))[:20000]
            sample = re.sub(r'/[^\w\s]/gi', '', sample)
            sample = re.sub(r'[^\u0000-\u0800]', '', sample) # select only the first 2048 UTF-8 characters
#             sample = sample.replace(chr(8221),"")
            if len(sample) > 0:
                corpus.append(sample)

    return corpus

In [74]:
def rank(corpus, ngram_range=(1,3), rank_length=200):
    '''
    Inputs:
    corpus - a list of strings
    ngram_range - the range of ngram lengths to consider (defaults to 1-3)
    rank_length - the length of the rank list to return (defaults to 200)
    
    Retuns:
    rank - the array of ngrams ordered by frequency of occurrence in the corpus
    '''
    vectorizer = TfidfVectorizer(input='content',ngram_range=(1,3), analyzer='char_wb')
    transformed = vectorizer.fit_transform(corpus)
    features = vectorizer.get_feature_names()
    sums = transformed.sum(axis=0)
    
    return np.array(features)[np.array(np.argsort(sums[0,:]))[0]][-rank_length:]

In [75]:
def rank_sim(a, b):
    '''
    Takes two ranked arrays of the same shape and returns the average distance between the indices of matching elements
    Elements which don't match are given a distance of the length of the array
    '''
    c = a[np.where(np.in1d(a,b))[0]]
    d = c.reshape((len(c),1))
    e = np.abs(np.where(np.in1d(a,b))[0]-np.where(d==b)[1])
    no_match_penalty = (len(a)-len(e))*len(a) # TODO: make this condition more rigourous
    
    return (e.sum() + no_match_penalty)/len(a)

## Data Cleaning

In [4]:
file_list = glob.glob(os.path.join(os.getcwd(), "txt", "bg","*.txt"))
the_corpus = corpus(file_list[:50])
the_corpus[12]

' Избор на квестори на Европейския парламент  : вж. протокола   . '

Have noticed a few issues with the samples, more cleaning needs to be done. This one above for example contains German (normally in brackets at the end).

In [5]:
file_list = glob.glob(os.path.join(os.getcwd(), "txt", "bg","*.txt"))
the_corpus = corpus(file_list[:50])
the_corpus[12]

' Избор на квестори на Европейския парламент  : вж. протокола   . '

Now it has been removed, great. It has also removed all other text in brackets but this will be in a minority of cases.

In [6]:
file_list = glob.glob(os.path.join(os.getcwd(), "txt", "cs","*.txt"))
the_corpus = corpus(file_list[:50])

In [7]:
print(the_corpus[0])
print(the_corpus[1])
print(the_corpus[2])

 Schválení zápisu z předchozího zasedání: viz zápis 
 Členství ve výborech a delegacích: viz zápis 
 Předložení dokumentů: viz zápis 


Many of the early files contain the same ending which we'll need to strip as these will effect the frequency counts. Luckily each language has the same phrase after the colon in the same corresponding file. My guess is that this doesn't affect the overall statistics, given that it'll be lost amongst large text files. We can always come back and alter this.

## Initial Test

Let's try the functions on some example data.

In [125]:
file_list_sl = glob.glob(os.path.join(os.getcwd(), "txt", "sl","*.txt"))
file_list_lv = glob.glob(os.path.join(os.getcwd(), "txt", "lv","*.txt"))

In [126]:
corpus_sl_1 = corpus(file_list_sl[:50])
corpus_sl_2 = corpus(file_list_sl[50:100])
corpus_lv_1 = corpus(file_list_lv[:50])
corpus_lv_2 = corpus(file_list_lv[50:100])

rank_sl_1 = rank(corpus_sl_1)
rank_sl_2 = rank(corpus_sl_2)
rank_lv_1 = rank(corpus_lv_1)
rank_lv_2 = rank(corpus_lv_2)

print('Slovakian/Latvian similarity score:', rank_sim(rank_sl_1, rank_lv_1))
print('Slovakian/Slovakian similarity score:', rank_sim(rank_sl_1, rank_sl_2))
print('Latvian/Latvian similarity score:', rank_sim(rank_lv_1, rank_lv_2))

Slovakian/Latvian similarity score: 127.535
Slovakian/Slovakian similarity score: 46.435
Latvian/Latvian. similarity score: 55.595


We can see that both languages show similarity to themselves (a low score means a stronger agreement between their ranks), and less similarity between each other. In this simple binary classification, the two languages would have been told apart.

## Extracting Ranks

All we will need to classify a given sample of text are the ranked ngrams for each language after analysing the whole dataset.

In [19]:
%pprint

Pretty printing has been turned OFF


In [20]:
languages = next(os.walk('./txt'))[1]
languages

['bg', 'cs', 'da', 'de', 'el', 'en', 'es', 'et', 'fi', 'fr', 'hu', 'it', 'lt', 'lv', 'nl', 'pl', 'pt', 'ro', 'sk', 'sl', 'sv']

In [77]:
ranks = {}

for language in tqdm(languages):
    file_list = glob.glob(os.path.join(os.getcwd(), "txt", language,"*.txt"))
    lang_corpus = corpus(file_list[:500])
    lang_rank = rank(lang_corpus)
    ranks[language] = lang_rank

HBox(children=(IntProgress(value=0, max=21), HTML(value='')))




## Test Data

The test data comes as a .tsv file with the first column representing the language and the second column containing the string of text.

In [36]:
test = pd.read_csv('europarl-test.txt', sep='\t', header=None, names=['language', 'string'])

In [13]:
test.head()

Unnamed: 0,language,string
0,bg,Европа 2020 не трябва да стартира нов конкурен...
1,bg,(CS) Най-голямата несправедливост на сегашната...
2,bg,"(DE) Г-жо председател, г-н член на Комисията, ..."
3,bg,"(DE) Г-н председател, бих искал да започна с к..."
4,bg,"(DE) Г-н председател, въпросът за правата на ч..."


Next we need to strip the '(CS)','(DE)' etc. As well as all the non-alphanumeric characters.

In [37]:
test['string'] = test['string'].str.replace(r"\(.*\)","")
test['string'] = test['string'].str.replace("'","")
test['string'] = test['string'].apply(lambda x: re.sub(r'/[^\w\s]/gi', '', x))
test['string'] = test['string'].apply(lambda x: re.sub(r'[^\u0000-\u0800]', '', x))
test = test[test['string'].apply(len) != 0] # remove empty strings
test.head()

Unnamed: 0,language,string
0,bg,Европа 2020 не трябва да стартира нов конкурен...
1,bg,Най-голямата несправедливост на сегашната общ...
2,bg,"Г-жо председател, г-н член на Комисията, по п..."
3,bg,"Г-н председател, бих искал да започна с комен..."
4,bg,"Г-н председател, въпросът за правата на човек..."


We also need to remove the rows with empty strings.

In [38]:
test.shape

(20762, 2)

So we have a test dataset of 20828 samples. 

In [17]:
test.language.unique()

array(['bg', 'cs', 'da', 'de', 'el', 'en', 'es', 'et', 'fi', 'fr', 'hu',
       'it', 'lt', 'lv', 'nl', 'pl', 'pt', 'ro', 'sk', 'sl', 'sv'],
      dtype=object)

In [21]:
set(test.language.unique()).issubset(languages)

True

Therefore all of the languages in the test dataset are in the training dataset, this is good.

## Classifier

In [78]:
def idiom_check(lang_sample, ranks, print_scores=False):
    '''
    Inputs:
    lang_sample - the string to be classified
    ranks - the dict() containing the language keys and their corresponding ranks
    print_scores - bool - if True, the scores for each separate language is printed
    
    Returns:
    The predicted language of lang_sample
    '''
    scores = {}
    
    sample_rank = rank([lang_sample])
    
    for key, rk in ranks.items():
        scores[key] = rank_sim(sample_rank, rk)
        
    if print_scores:
        print(scores)

    return min(scores, key=scores.get)

In [79]:
results = test.copy()[:10000]
results['pred_lang'] = results['string'].apply(lambda x: idiom_check(x, ranks))

In [80]:
print('Accuracy:',(results['language'] == results['pred_lang']).sum()/len(results))

Accuracy: 0.834


This isn't bad, but we could do better. Could also try a markov model at the character level and compare the likelihood for each sentence to be from a particular language.

## Markov Chain MLE

I aim to treat each string as a first-order Markov chain and then extract the character-level transition matrix for each language. Then I can calculate the $\log(probability)$ for a particular string to belong to each language and choose the maximally likely option. Something along the lines of [this](https://pdfs.semanticscholar.org/2bf0/8addb83f51befa8b4bc7ed16b54ed34018d0.pdf) I suppose.

There don't seem to be any Python packages for markov chains(?!) I will have to code this myself.

In [22]:
# From - stackoverflow.com/a/43413801

def strided_axis0(a, L):
    # Store the shape and strides info
    shp = a.shape
    s  = a.strides

    # Compute length of output array along the first axis
    nd0 = shp[0]-L+1

    # Setup shape and strides for use with np.lib.stride_tricks.as_strided
    # and get (n+1) dim output array
    shp_in = (nd0,L)+shp[1:]
    strd_in = (s[0],) + s
    return np.lib.stride_tricks.as_strided(a, shape=shp_in, strides=strd_in)

This gives us the transitions as required.

In [91]:
trial = "This is a test"
test_arr = np.array(list(trial))
pairs = strided_axis0(test_arr, 2)
pairs[:10]

array([['T', 'h'],
       ['h', 'i'],
       ['i', 's'],
       ['s', ' '],
       [' ', 'i'],
       ['i', 's'],
       ['s', ' '],
       [' ', 'a'],
       ['a', ' '],
       [' ', 't']], dtype='<U1')

In [93]:
len(strided_axis0(test_arr, 2))

13

In [94]:
len(np.unique(strided_axis0(test_arr, 2), axis=0))

11

We can see that there are only 11 unique transitions out of 13 measured transitions - this is how we will build our transition matrix.

Our transition matrix could end up being quite large. We already have over 16,000 entries for the 128 characters in US-ASCII, if we extend this to the Greek characters then this could be around 4,000,000 entries.

It's probably best to store these as a (vectorised) numpy array.

In [23]:
def int_encode(groups, encoder):
    '''
    Input:
    groups - array - groups of characters to be encoded
    encoder - a sklearn.preprocessing.LabelEncoder object which has been prefitted to a vocabulary
    
    Returns:
    int_encoded - array - the integer encoded groups
    vocab - array - an array where the index of each item corresponds to the integers in int_encoded: a lookup
    
    TODO: predefined vocab as input
    '''
    flat_groups = groups.flatten()
    try:
        int_encoded = encoder.transform(flat_groups).reshape(groups.shape)
    except ValueError:
        print(''.join(list(groups[:,1])))
    
    return int_encoded

We need to work out a way of dealing with the zero-valued probabilities - these are events that aren't in the training data, but could be seen in the test data. For this it seems we should use some kind of [smoothing](https://pdfs.semanticscholar.org/5b2b/78087e51641a02966d6dcf20b51a5c43ccca.pdf). I should use _absolute discounting_ as it is easy to implement and apparently quite effective (would ideally use _Kneser-Ney smoothing_ but this would be very involved). [Here](http://u.cs.biu.ac.il/~yogo/courses/mt2013/papers/chen-goodman-99.pdf) is a good guide.

However, this still seems too time-consuming. It may suffice to add a very small amount of probability to the zero values, we'll see when we test the model.

In [86]:
def smooth(count_matrix):
    '''
    Smooths zero values with a small number << 1 (an altered form of Laplace smoothing)
    
    TODO: turn this into Kneser-Ney smoothing
    
    '''
    
    count_matrix[count_matrix == 0] = 1e-10
    
    return count_matrix

In [99]:
def smooth(trans_mat):
    '''
    Smooths zero values with a small number << 1 (an altered form of Laplace smoothing)
    
    TODO: turn this into Kneser-Ney smoothing
    
    '''
    smooth_prob = trans_mat[(trans_mat != 0) & ~np.isnan(trans_mat)].min()/10 # take the minimum probability and choose something smaller than it
    trans_mat[trans_mat == 0] = smooth_prob
    trans_mat[np.isnan(trans_mat)] = smooth_prob
    
    return trans_mat

In [25]:
def transition_matrix(text_list, vocab, encoder):
    '''
    Input:
    text_list - list of strings - the texts to analyse
    
    Returns:
    trans_mat - array - the Markovian transition matrix for the text given
    vocab - array - an array where the index of each item corresponds to the position in trans_mat: a lookup
    '''
    count_mat_master = np.zeros((vocab.shape[0],vocab.shape[0])) # intialise the master count matrix with zero counts
    
    for text in tqdm(text_list):
        if len(text) < 2:
            continue
        text = np.array(list(text)) # prepare text as array of separate characters
        pairs = strided_axis0(text, 2) # window characters into consecutive pairs
        int_pairs = int_encode(pairs, encoder) # integer encode the characters
        unique, counts = np.unique(int_pairs, return_counts=True, axis=0) # count the separate instances of the transitions
        count_mat = np.zeros((vocab.shape[0],vocab.shape[0]))
        count_mat[unique[:,0],unique[:,1]] = counts # populate the count matrix
        count_mat_master += count_mat # add the counts to the master count matrix
        
    count_mat_master = smooth(count_mat_master)
    trans_mat = count_mat_master/count_mat_master.sum(axis=1).reshape(count_mat_master.shape[0],1) # normalise the transition matrix (row stochastic)
    
    return trans_mat, vocab

In [29]:
def log_like(text, TM_list, encoder):
    '''
    Inputs:
    text - string - the sample to be analysed
    TM_list - array - the list of transition matrices containing the transition probabilities for the calculation for each langauge
    vocab - array - the characters corresponding to the transition matrix entries
    
    Returns:
    log_likelihood - float - the log-likelihood for the string
    
    '''
    text = np.array(list(text))
    pairs = strided_axis0(text, 2)
    int_pairs = int_encode(pairs, encoder)
    prob_lists = np.zeros((TM_list.shape[0], int_pairs.shape[0]))
    
    for i, TM in enumerate(TM_list):
        prob_lists[i] = TM[int_pairs[:,0],int_pairs[:,1]]
    
    return np.log(prob_lists).sum(axis=1)

Now I need to extract my vocab from the training data, or I could create a sample of every character that's possible to create with two bytes of UTF-8? Let's try using the first $128 + 1920 = 2,048$ characters. We have filtered to these characters in our text preprocessing anyway. This will be a very sparse matrix, but not too large for our means.

In [30]:
vocab = np.array([chr(i) for i in range(2048)])
encoder = LabelEncoder().fit(vocab)

## Preliminary Test

Let's test the routines on our previous example.

In [81]:
file_list_sl = glob.glob(os.path.join(os.getcwd(), "txt", "sl","*.txt"))
file_list_lv = glob.glob(os.path.join(os.getcwd(), "txt", "lv","*.txt"))

corpus_sl_1 = corpus(file_list_sl[:50])
corpus_sl_2 = corpus(file_list_sl[50:100])
corpus_lv_1 = corpus(file_list_lv[:50])
corpus_lv_2 = corpus(file_list_lv[50:100])

In [82]:
TM_list = np.array([transition_matrix(corpus, vocab, encoder)[0] for corpus in [corpus_sl_1, corpus_lv_1]])

HBox(children=(IntProgress(value=0, max=50), HTML(value='')))




HBox(children=(IntProgress(value=0, max=50), HTML(value='')))




In [83]:
log_likes = np.zeros((len(corpus_sl_2),len(TM_list)))
for i, sample in enumerate(corpus_sl_2):
    log_likes[i,:] = log_like(sample, TM_list, encoder)

In [84]:
percentage_accuracy = 100*(np.where(log_likes[:,0] > log_likes[:,1])[0].shape[0] / log_likes.shape[0])
print("Binary detection accuracy: %d%%" % percentage_accuracy)

Binary detection accuracy: 96%


So the model does indeed produce a larger log likelihood for the correct language, this is great. Now I need to create transition matrices for all of the languages and evaluate the model on the whole dataset.

## Evaluating Markov Chain MLE model

In [31]:
languages

['bg', 'cs', 'da', 'de', 'el', 'en', 'es', 'et', 'fi', 'fr', 'hu', 'it', 'lt', 'lv', 'nl', 'pl', 'pt', 'ro', 'sk', 'sl', 'sv']

In [100]:
TM_list = np.zeros((len(languages), len(vocab), len(vocab)))

for i, language in enumerate(tqdm(languages)):
    file_list = glob.glob(os.path.join(os.getcwd(), "txt", language,"*.txt"))
    a_corpus = corpus(file_list[:500])
    TM_list[i] = transition_matrix(a_corpus, vocab, encoder)[0]

HBox(children=(IntProgress(value=0, max=21), HTML(value='')))

HBox(children=(IntProgress(value=0, max=498), HTML(value='')))

HBox(children=(IntProgress(value=0, max=500), HTML(value='')))

HBox(children=(IntProgress(value=0, max=500), HTML(value='')))

HBox(children=(IntProgress(value=0, max=500), HTML(value='')))

HBox(children=(IntProgress(value=0, max=500), HTML(value='')))

HBox(children=(IntProgress(value=0, max=500), HTML(value='')))

HBox(children=(IntProgress(value=0, max=500), HTML(value='')))

HBox(children=(IntProgress(value=0, max=500), HTML(value='')))

HBox(children=(IntProgress(value=0, max=500), HTML(value='')))

HBox(children=(IntProgress(value=0, max=500), HTML(value='')))

HBox(children=(IntProgress(value=0, max=500), HTML(value='')))

HBox(children=(IntProgress(value=0, max=500), HTML(value='')))

HBox(children=(IntProgress(value=0, max=500), HTML(value='')))

HBox(children=(IntProgress(value=0, max=500), HTML(value='')))

HBox(children=(IntProgress(value=0, max=500), HTML(value='')))

HBox(children=(IntProgress(value=0, max=500), HTML(value='')))

HBox(children=(IntProgress(value=0, max=500), HTML(value='')))

HBox(children=(IntProgress(value=0, max=500), HTML(value='')))

HBox(children=(IntProgress(value=0, max=500), HTML(value='')))

HBox(children=(IntProgress(value=0, max=500), HTML(value='')))

HBox(children=(IntProgress(value=0, max=500), HTML(value='')))




Now let's evaluate the performance on our test dataset.

In [33]:
def idiom_check_mkv(lang_sample, TM_list, languages, encoder):
    '''
    Inputs:
    lang_sample - the string to be classified
    TM_list - array containing the transition matrices for each language
    languages - list - list of the two character strings representing each language
    encoder - sklearn.preprocessing.LabelEncoder obj - prefitted encoder
    
    Returns:
    The predicted language of lang_sample - string
    '''

    return languages[np.argmax(log_like(lang_sample, TM_list, encoder))]

In [101]:
results = test.copy()
results['pred_lang'] = results['string'].apply(lambda x: idiom_check_mkv(x, TM_list, languages, encoder))

In [102]:
print('Accuracy:',(results['language'] == results['pred_lang']).sum()/len(results))

Accuracy: 0.9707638955784607


97% is quite good. I think at this point improvements would have to come either by using a larger sample for the transition matrices, or by improving on the smoothing technique (which is very rudimentary as it stands). Let's look to see where it failed.

In [103]:
results[(results['language'] != results['pred_lang'])]['pred_lang'].describe()

count     607
unique     18
top        fi
freq      345
Name: pred_lang, dtype: object

In [105]:
results[(results['language'] != results['pred_lang']) & (results['pred_lang'] == 'fi')].describe()

Unnamed: 0,language,string,pred_lang
count,345,345,345
unique,4,345,1
top,et,Sellise lähenemisega jäämegi ebakindlasse oluk...,fi
freq,336,1,345


It seems almost all of the samples which were mistaken for Finnish were actually Estonian. These languages must be quite similar in terms of their character transition probabilities.

In [110]:
results[results['language'] == 'et'][:5]

Unnamed: 0,language,string,pred_lang
6993,et,Austatud juhataja! Tahaksin kõigepealt avalda...,et
6994,et,"Euroopa Ülemkogu on väljendanud lootust, et v...",et
6995,et,"Härra juhataja, me oleme rääkinud üleilmastum...",et
6996,et,"Härra president, volinik, daamid ja härrad, h...",et
6997,et,"Proua juhataja, Ria Oomen-Ruijten on esitanud...",fi


In [109]:
results[results['language'] == 'fi'][:5]

Unnamed: 0,language,string,pred_lang
7992,fi,"Arvoisa puhemies, haluaisin äänestää Hannes S...",fi
7993,fi,"Arvoisa puhemies, minulla on kaksi lisäkysymy...",fi
7994,fi,"Arvoisa puhemies, poikkean käsikirjoituksestani.",fi
7995,fi,"Arvoisa puhemies, ennen kuin arvostelemme Ven...",fi
7996,fi,"Arvoisa puhemies, haluan kiittää näiden kahde...",fi


Let's produce the confusion matrix for the results.

In [116]:
confusion_matrix = ConfusionMatrix(results['language'], results['pred_lang'])
print("Confusion matrix:\n%s" % confusion_matrix)

Confusion matrix:
Predicted   bg    cs    da   de   el    en    es   et    fi    fr   ...     \
Actual                                                              ...      
bg         997     0     0    0    0     0     0    0     0     0   ...      
cs           0   975     1    0    1     1     2    0     1     0   ...      
da           0     0   991    0    0     0     0    0     0     0   ...      
de           0     0     1  992    1     0     0    0     0     0   ...      
el           0     0     0    0  988     0     0    0     0     0   ...      
en           0     0     0    0    0   998     0    0     0     0   ...      
es           0     0     0    0    0     0   994    0     0     0   ...      
et           0     0     0    3    0     4     0  617   336     0   ...      
fi           0     0     0    0    0     0     0    0   995     0   ...      
fr           0     0     0    0    0     0     1    0     0   996   ...      
hu           1     0     5    0    0     0    

The results are generally good apart from a very poor performance for Estonian. Interestingly, Estonian is mistaken for Finnish, but not vice versa.

In [119]:
et_results = results[results['language'] == 'et']
print('Estonian Accuracy:',((et_results['language'] == et_results['pred_lang'])).sum()/len(et_results))

Estonian Accuracy: 0.6207243460764588


### TODO

- Absolute Discounting