In [1]:
import numpy as np
import seaborn as sns
import pandas as pd
import sklearn
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics import accuracy_score,classification_report
import scipy
import warnings
warnings.filterwarnings('ignore')

<h2>Function replace some symbol</h2>
Follow table below we replace some character to 1 symbol, because all of them resentation the popular word, such as english token [a-zA-Z] 
<img src="img/table1.png"/>

In [2]:
def removeNoise(data):
    data['text'] = data['text'].str.replace('[=<>"[]{}/:-;.,\'\(\)%]', ' ')
    data['text'] = data['text'].str.replace('[0-9]', ' ')
    return(data)

<h2>KullBack-Leibler Divergence</h2>

- Make sure 2 params are array and same length
- Normalize them 

<img src="img/kl.png" />

In [3]:
def KLdivergence(p,q):
    if len(p) != len(q):
        return False
    p = p/np.sum(p)
    q = q/np.sum(q)
    return np.sum(p*np.log(p/q))

<h2>Get data train</h2> 

In [4]:
data = pd.read_csv('formatted_data.csv', sep=';', error_bad_lines=False)
data = removeNoise(data)
data.head()

Unnamed: 0,language,text,length_text
0,bg,Състав на Парламента: вж. протоколиОдобряване ...,327263
1,cs,Schválení zápisu z předchozího zasedání: viz z...,317927
2,da,Genoptagelse af sessionenJeg erklærer Europa-P...,678400
3,de,Wiederaufnahme der SitzungsperiodeIch erkläre ...,747690
4,el,Επαvάληψη της συvσδoυΚηρύσσω την επανάληψη της...,523277


<h2>Get data test from external test set</h2>
With data test:

- Using other source but keep format and language support
- Select random line for test

In [5]:
test = pd.read_csv('europarl.csv',sep=';', error_bad_lines=False)
test = removeNoise(test)
test= test.reindex(np.random.permutation(test.index))
test.head()

Unnamed: 0,language,text
16130,pt,Mas não basta que os países tenham assinado es...
4590,el,Ο συνάδελφός μου Βουλευτής κ. Öger ανέφερε μόλ...
10555,hu,"Nem helyes, hogy a projekt elkészítése a part ..."
17240,ro,"Prin urmare, avem nevoie de un angajament clar..."
17516,ro,"În primul rând, probabil deoarece, cu nivelul ..."


<h2>Prepair data</h2>

Transform data and test set to numpy array with : 
- n: length of data train
- m: length of data test

List of all the languages whose detection is supported:

- 'bg': Bulgarian
- 'cs': Czech
- 'da': Danish
- 'de': German
- 'el': Greek, Modern
- 'en': English
- 'es': Spanish
- 'et': Estonian
- 'fi': Finnish
- 'fr': French
- 'hu': Hungarian
- 'it': Italian
- 'lt': Lithuanian
- 'lv': Latvian
- 'nl': Dutch
- 'pl': Polish
- 'pt': Portuguese
- 'ro': Romanian
- 'sk': Slovak
- 'sl': Slovenian
- 'sv': Swedish

In [6]:
data = data.values
test = test.values

n = data.shape[0]
m = test.shape[0]

labels = data[:,0]
labels_test = test[:,0]

Prepair data train:
- using CountVectorizer to get all token from raw text
- Seperate each token language 

In [7]:
language_train = []
for i,v in enumerate(data):
    vector = CountVectorizer(analyzer='char',encoding='latin-1',ngram_range=(2,2))
    y = vector.fit_transform([v[1].replace(' ','')])
    language_train.append(np.array([vector,y]))

Predict task:

- Get each text from test data
- Get token don't exit from _p with _q assign to _set(p is set token of test, and q is set token of train)
<img src="img/t_cup.png"/>
- If _set empty => _p inside _q then calculate KL-divergence D(_p||_q) will > 0
<img src="img/t_cap.png"/>
- Else _set have some character => _p overlap _q the D(_p||_q) will Infinity
- Store D(_p||_q) into t array
- After run all language support, predict = argmin(t) 

In [8]:
predict_label = []
true_position = 0
for i,(_,data_test) in enumerate(test):
    vector_test = TfidfVectorizer(analyzer='char',encoding='latin-1',ngram_range=(2,2))
    transform = vector_test.fit_transform([data_test.replace(' ','')])
    t = np.array([float('Inf')]*n)
    for j,(vector_train,transform_y) in enumerate(language_train):
        _tmp = set(vector_train.vocabulary_)^set(vector_test.vocabulary_)
        _set = list((set(_tmp)|set(vector_test.vocabulary_))^ set(vector_train.vocabulary_))
        if not bool(_set):
            k = len(vector_train.vocabulary_)
            _q = np.zeros(k)
            _q[transform_y.indices] = transform_y.data
            _p = np.zeros(k)
            _p[transform.indices] = transform.data
            t[j] = scipy.stats.entropy(_p,_q)
    predict_label.append(labels[np.argmin(t)])

In [9]:
print(classification_report(labels_test, predict_label))            
print (accuracy_score(labels_test, predict_label))

precision    recall  f1-score   support

          bg       0.21      1.00      0.35       986
          cs       0.94      0.75      0.84       991
          da       0.89      0.73      0.80       991
          de       0.95      0.58      0.72       989
          el       1.00      0.75      0.86       963
          en       0.95      0.08      0.14       972
          es       0.99      0.34      0.51       979
          et       0.83      0.72      0.77       985
          fi       0.98      0.66      0.79       989
          fr       0.60      0.09      0.16       979
          hu       0.69      0.84      0.75       978
          it       0.96      0.16      0.28       965
          lt       0.47      0.85      0.61       982
          lv       0.86      0.85      0.85       961
          nl       0.73      0.21      0.33       984
          pl       0.85      0.82      0.83       983
          pt       0.74      0.75      0.75       983
          ro       0.90      0.86      0.