# Build a language detector model

The goal of this exercise is to train a linear classifier on text features
that represent sequences of up to 3 consecutive characters so as to be
recognize natural languages by using the frequencies of short character
sequences as 'fingerprints'.


In [1]:
# Import Dependencies
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.linear_model import Perceptron, SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.datasets import load_files
from sklearn.model_selection import train_test_split
from sklearn import metrics
import pandas as pd
import numpy as np

In [2]:
# Load in data scraped from the landing page for Wikipedia in 11 different languages
# Full paragraph data
para = load_files("data/exercise01Data/paragraphs", encoding='utf-8')
# Portions of full paragraphs (no Japanese data) data
short_para = load_files("data/exercise01Data/short_paragraphs", encoding='utf-8')


In [3]:
# Lets create a dataframe for our own reference
# Define the index as the target encoded for given languages in the data and match these
# indices as the target_names & full_name for the language then rotate table
targets_df = pd.DataFrame({ 
    "target_name": para.target_names,
    "full_name" : ['Arabic','German','English','Spanish','French','Italian','Japanese','Dutch','Polish','Portuguese','Russian']
    }, index=np.unique(para.target)).T

display(targets_df)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
target_name,ar,de,en,es,fr,it,ja,nl,pl,pt,ru
full_name,Arabic,German,English,Spanish,French,Italian,Japanese,Dutch,Polish,Portuguese,Russian


# Train Language Classifier on Paragraphs

In [4]:
# Split the dataset in training and test set:
docs_train, docs_test, y_train, y_test = train_test_split(
    para.data, para.target, test_size=0.3)

In [5]:
# This is helpful to visualize the data that we are feeding into the model & its classification
data_point = 1
print(f"Entry:\n{docs_train[data_point]}")
print(f"Language: { targets_df.iloc[1,y_train[data_point]] }" )

Entry:
Se han iniciado otros varios proyectos de enciclopedia con formato wiki, en gran parte bajo una filosofía diferente de la apertura y el modelo editorial del «punto de vista neutral» desarrollado por Wikipedia. Por ejemplo, Wikinfo, uno de los portales web, no requiere un punto de vista neutral y permite la investigación original.[44]​ También hubo nuevos proyectos inspirados en Wikipedia —como Citizendium, Scholarpedia, Conservapedia, y Knol de Google[45]​— donde algunos de los aspectos que son fundamentales en Wikipedia se abordan de manera diferenciada, como las políticas de revisión por pares, la investigación original, y la publicidad comercial.

Language: Spanish


In [6]:
# This is the learned way we could design this pipeline
learner_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='hinge', penalty='l2',
                          alpha=1e-3, random_state=42,
                          max_iter=5, tol=None)),
])

In [7]:
# Here is a new tool for simplifying the procedure of counting word occurence, 
# converting to a frequency, and normalizing the value with IDF

# The TfidfVectorizer will do all of those steps in one sweep
vectorizer = TfidfVectorizer(ngram_range=(1,3))

In [8]:
# This is a new pipeline using the optimized vectorizer & SGD Classification
sgd_clf = Pipeline([
    ('vect', vectorizer),
    ('clf', SGDClassifier(loss='hinge', penalty='l2',
                          alpha=1e-3, random_state=42,
                          max_iter=5, tol=None))
])

In [9]:
# To demonstrate, we can use any ML classifier for our pipeline, 
# In this example, we use a parameter-less Perceptron model
para_clf = Pipeline([
    ('vect', vectorizer),
    ('clf', Perceptron())
])

In [10]:
# Now let's fit both pipelines on the training set
sgd_clf.fit(docs_train, y_train)
para_clf.fit(docs_train, y_train)

In [11]:
# Now we'll predict the outcomes on the testing set
y_predicted_SGD = sgd_clf.predict(docs_test)
y_predicted_Per = para_clf.predict(docs_test)

In [12]:
# Print the classification reports
print(f"SGD Model:\n{metrics.classification_report(y_test, y_predicted_SGD,target_names=para.target_names)}")

print(f"Perceptron Model:\n{metrics.classification_report(y_test, y_predicted_Per,target_names=para.target_names)}")


SGD Model:
              precision    recall  f1-score   support

          ar       1.00      1.00      1.00        17
          de       0.90      1.00      0.95        44
          en       0.93      1.00      0.96        54
          es       1.00      1.00      1.00        42
          fr       1.00      1.00      1.00        38
          it       1.00      1.00      1.00        23
          ja       1.00      0.65      0.78        31
          nl       1.00      1.00      1.00        12
          pl       0.90      1.00      0.95        19
          pt       1.00      1.00      1.00        27
          ru       1.00      1.00      1.00        21

    accuracy                           0.97       328
   macro avg       0.98      0.97      0.97       328
weighted avg       0.97      0.97      0.96       328

Perceptron Model:
              precision    recall  f1-score   support

          ar       1.00      1.00      1.00        17
          de       0.96      1.00      0.98      

In [13]:
# Plot the confusion matrices
cm_SGD = metrics.confusion_matrix(y_test, y_predicted_SGD)
cm_Per = metrics.confusion_matrix(y_test, y_predicted_Per)

print(f"SGD CM:\n{cm_SGD}")
print(f"Perceptron CM:\n{cm_Per}")

SGD CM:
[[17  0  0  0  0  0  0  0  0  0  0]
 [ 0 44  0  0  0  0  0  0  0  0  0]
 [ 0  0 54  0  0  0  0  0  0  0  0]
 [ 0  0  0 42  0  0  0  0  0  0  0]
 [ 0  0  0  0 38  0  0  0  0  0  0]
 [ 0  0  0  0  0 23  0  0  0  0  0]
 [ 0  5  4  0  0  0 20  0  2  0  0]
 [ 0  0  0  0  0  0  0 12  0  0  0]
 [ 0  0  0  0  0  0  0  0 19  0  0]
 [ 0  0  0  0  0  0  0  0  0 27  0]
 [ 0  0  0  0  0  0  0  0  0  0 21]]
Perceptron CM:
[[17  0  0  0  0  0  0  0  0  0  0]
 [ 0 44  0  0  0  0  0  0  0  0  0]
 [ 0  0 54  0  0  0  0  0  0  0  0]
 [ 0  0  0 42  0  0  0  0  0  0  0]
 [ 0  0  0  0 38  0  0  0  0  0  0]
 [ 0  0  0  0  0 22  0  0  0  1  0]
 [ 0  2  3  0  0  1 24  0  0  0  1]
 [ 0  0  0  0  0  0  0 12  0  0  0]
 [ 0  0  0  0  0  0  0  0 19  0  0]
 [ 0  0  0  0  0  0  0  0  0 27  0]
 [ 0  0  0  0  0  0  1  0  0  0 20]]


In [14]:
# Create some new short sentences for testing
sentences = [
    'This is a language detection test.',
    'Ceci est un test de dètection de la langue.',
    'Dies ist ein Test, um die Sprache zu erkennen.',
    'Tengo un sueno a ver los nubes.'
]

# Predict the results on new short sentences:
predicted_sgd = sgd_clf.predict(sentences)
predicted_per = para_clf.predict(sentences)

In [15]:
# Lets combine the predictions into a list
predictions = [predicted_sgd,predicted_per]
# And pretty print our results
for prediction in predictions:
    for s, p in zip(sentences, prediction):
        print(f'The language of "{s}" is "{targets_df.iloc[1,p]}"' )
    print("---- Next model -----")

The language of "This is a language detection test." is "English"
The language of "Ceci est un test de dètection de la langue." is "French"
The language of "Dies ist ein Test, um die Sprache zu erkennen." is "German"
The language of "Tengo un sueno a ver los nubes." is "Spanish"
---- Next model -----
The language of "This is a language detection test." is "English"
The language of "Ceci est un test de dètection de la langue." is "French"
The language of "Dies ist ein Test, um die Sprache zu erkennen." is "German"
The language of "Tengo un sueno a ver los nubes." is "Japanese"
---- Next model -----


# Train Classifier on Short Paragraphs

In [16]:
# Split the dataset in training and test set:
docs_train, docs_test, y_train, y_test = train_test_split(
    short_para.data, short_para.target, test_size=0.3)

In [17]:
# This is helpful to visualize the data that we are feeding into the model & its classification
data_point = 3
print(f"Entry:\n{docs_train[data_point]}")
print(f"Language: { targets_df.iloc[1,y_train[data_point]] }" )

Entry:
samareño, vietnamita, japonés, árabe egipcio,
Language: Spanish


In [18]:
# Lets use the simplest model design & fit it on the short paragraph training data
vectorizer = TfidfVectorizer(ngram_range=(1,1))

sp_clf = Pipeline([
    ('vect', vectorizer),
    ('clf', Perceptron())
]).fit(docs_train, y_train)


In [19]:
# Make predictions with the short paragraph model
y_predicted = sp_clf.predict(docs_test)

In [20]:
# Here we have to modify the list of target names due to lack of Japanese data
mod_targets = short_para.target_names.copy()
# And then we print out a classification report
print(metrics.classification_report(y_predicted,y_test,
                                    labels=mod_targets.remove("ja"),
                                    target_names=mod_targets))

              precision    recall  f1-score   support

          ar       0.98      0.75      0.85       324
          de       0.97      0.97      0.97       339
          en       0.96      0.92      0.94       339
          es       0.90      0.89      0.90       313
          fr       0.93      0.95      0.94       304
          it       0.91      0.96      0.93       306
          nl       0.90      0.98      0.94       178
          pl       0.84      0.95      0.89       248
          pt       0.90      0.90      0.90       312
          ru       0.89      0.96      0.92       265

    accuracy                           0.92      2928
   macro avg       0.92      0.92      0.92      2928
weighted avg       0.92      0.92      0.92      2928



In [21]:
# Predict the result on some short new sentences:
sentences = [
    'This is a language detection test.',
    'Ceci est un test de dètection de la langue.',
    'Dies ist ein Test, um die Sprache zu erkennen.',
    'Tengo un sueno a ver los nubes.'
]
predicted = sp_clf.predict(sentences)
# And pretty print our results
for s, p in zip(sentences, predicted):
    print(f'The language of "{s}" is "{targets_df.iloc[1,p]}"' )

The language of "This is a language detection test." is "English"
The language of "Ceci est un test de dètection de la langue." is "French"
The language of "Dies ist ein Test, um die Sprache zu erkennen." is "German"
The language of "Tengo un sueno a ver los nubes." is "Spanish"


# Final Thoughts
Although we have achieved a high level of accuracy, our dataset is extremely limited. In order to improve the robustness of our  NLP pipeline, we should collect more textual data to train our model on. In the event that we double the amount of data for each language, we would expect to see a far more robust, and thus more accurate, classifier of new sentences. 