We will build a language detector for the 11 South African languages using a naive Bayes classifier on character n-grams. The training and test data were downloaded from https://github.com/praekelt/feersum-lid-shared-task/tree/master/lid_task_2017a

In [1]:
import sys
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Perceptron
from sklearn.pipeline import Pipeline
from sklearn.datasets import load_files
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB
import re
import numpy as np
import pandas as pd

In [2]:
# Language codes found in the data, mapped to the integer class ids used as labels.
language_code = {
    'afr': 1,
    'nbl': 2,
    'nso': 3,
    'sot': 4,
    'ssw': 5,
    'tso': 6,
    'tsn': 7,
    'ven': 8,
    'xho': 9,
    'zul': 10,
    'eng': 11,
}

# Reverse lookup: integer class id -> language code.
code = {label_id: lang for lang, label_id in language_code.items()}

Read in the training and test data and map each language code to an integer.

In [3]:
def _load_labelled_csv(path):
    """Read one CSV split and replace its ``lang_id`` codes with integer ids.

    The codes are translated through ``language_code`` and the resulting
    column is cast to ``int``.
    """
    frame = pd.read_csv(path)
    frame["lang_id"] = frame["lang_id"].map(language_code).astype(int)
    return frame


train_data = _load_labelled_csv("./data/train_full_3k.csv")
test_data = _load_labelled_csv("./data/test_full_1k.csv")

Do some text preprocessing

In [4]:
# Compiled pattern matching every run of digits.
replace_numbers = re.compile(r'\d+', re.IGNORECASE)


def clean_text(input_text):
    """Return a normalised copy of ``input_text`` for n-gram extraction.

    Steps, in order:

    1. lowercase the text;
    2. delete every run of digits;
    3. replace three specific character sequences (they look like
       mis-encoded characters as they appear after lowercasing -- verify
       against the raw data);
    4. strip trailing double quotes, then leading spaces and double quotes.

    No other characters are removed.
    """
    # (sequence to find, replacement) pairs, applied in this order.
    substitutions = (
        ('ã…â¡', 'š'),
        ('ï¿½', ''),
        ('ª', ''),
    )

    cleaned = replace_numbers.sub('', input_text.lower())
    for target, replacement in substitutions:
        cleaned = cleaned.replace(target, replacement)

    # Only '"' is stripped on the right; spaces are stripped on the left only.
    return cleaned.rstrip('"').lstrip(' "')

In [5]:
# Sentences of each split (note that the column key starts with a space).
docs_train = train_data[" text"]
docs_test = test_data[" text"]

# Integer language ids of each split.
y_train = train_data["lang_id"]
y_test = test_data["lang_id"]

In [6]:
# Run every sentence of both splits through clean_text, keeping plain lists.
docs_train = list(map(clean_text, docs_train))
docs_test = list(map(clean_text, docs_test))

Use TF-IDF weights over all character n-grams of length 1 to 6 as input to the MultinomialNB classifier.

In [7]:
# TF-IDF features over character n-grams of length 1 through 6.
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(1, 6))

# Two-stage pipeline: vectorise the text, then classify with multinomial naive Bayes.
pipe = Pipeline(steps=[
    ('vectorizer', vectorizer),
    ('clf', MultinomialNB()),
])

# Fit both stages on the cleaned training sentences and their language ids.
pipe.fit(docs_train, y_train)

Pipeline(memory=None,
     steps=[('vectorizer', TfidfVectorizer(analyzer='char', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 6), norm='l2', preprocessor=None, smooth_idf=Tr...rue,
        vocabulary=None)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [13]:
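# Save the fitted pipeline for later use (uncomment to run). Recent scikit-learn
# releases no longer ship sklearn.externals.joblib; use `import joblib` instead.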
# from sklearn.externals import joblib
# joblib.dump(pipe, '../language_detect.joblib') 

['../language_detect.joblib']

In [14]:
# Predict a language id for every test document with the fitted pipeline.
y_predicted = pipe.predict(docs_test)

In [8]:
# Accuracy of the fitted pipeline on the test documents.
pipe.score(docs_test, y_test)

0.9990909090909091

## Classification report

In [135]:
# Pin the label order explicitly so that every row of the report is named after
# the language whose integer id it summarises. Passing language_code.keys()
# alone only lines up when the dict's iteration order happens to match the
# sorted ids and every id occurs in the data.
label_ids = sorted(code)
print(metrics.classification_report(
    y_test,
    y_predicted,
    labels=label_ids,
    target_names=[code[label_id] for label_id in label_ids],
    digits=5,
))

             precision    recall  f1-score   support

        afr    1.00000   1.00000   1.00000      1000
        nbl    0.99900   0.99700   0.99800      1000
        nso    1.00000   1.00000   1.00000      1000
        sot    1.00000   1.00000   1.00000      1000
        ssw    1.00000   0.99800   0.99900      1000
        tso    1.00000   0.99900   0.99950      1000
        tsn    1.00000   1.00000   1.00000      1000
        ven    1.00000   1.00000   1.00000      1000
        xho    1.00000   0.99700   0.99850      1000
        zul    0.99502   0.99900   0.99701      1000
        eng    0.99602   1.00000   0.99800      1000

avg / total    0.99909   0.99909   0.99909     11000



## Confusion matrix

In [136]:
# Confusion matrix of the true test labels against the pipeline's predictions.
cm = metrics.confusion_matrix(y_test, y_predicted)

In [137]:
# Bare expression so the notebook shows the matrix as the cell output.
cm

array([[1000,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0],
       [   0,  997,    0,    0,    0,    0,    0,    0,    0,    2,    1],
       [   0,    0, 1000,    0,    0,    0,    0,    0,    0,    0,    0],
       [   0,    0,    0, 1000,    0,    0,    0,    0,    0,    0,    0],
       [   0,    0,    0,    0,  998,    0,    0,    0,    0,    0,    2],
       [   0,    0,    0,    0,    0,  999,    0,    0,    0,    0,    1],
       [   0,    0,    0,    0,    0,    0, 1000,    0,    0,    0,    0],
       [   0,    0,    0,    0,    0,    0,    0, 1000,    0,    0,    0],
       [   0,    0,    0,    0,    0,    0,    0,    0,  997,    3,    0],
       [   0,    1,    0,    0,    0,    0,    0,    0,    0,  999,    0],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0, 1000]])