In [42]:
"""Build a language detector model

The goal of this exercise is to train a linear classifier on text features
that represent sequences of up to 3 consecutive characters so as to be
recognize natural languages by using the frequencies of short character
sequences as 'fingerprints'.

"""
# Author: Olivier Grisel <olivier.grisel@ensta.org>
# License: Simplified BSD

import sys

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import Perceptron
from sklearn.pipeline import Pipeline
from sklearn.datasets import load_files
from sklearn.cross_validation import train_test_split
from sklearn import metrics
%matplotlib inline


# The training data folder must be passed as first argument
languages_data_folder = "data/languages/paragraphs/"
dataset = load_files(languages_data_folder)

# Split the dataset in training and test set:
docs_train, docs_test, y_train, y_test = train_test_split(
    dataset.data, dataset.target, test_size=0.5)


# TASK: Build a an vectorizer that splits strings into sequence of 1 to 3
# characters instead of word tokens

tfidfVect = TfidfVectorizer(min_df=1)



# TASK: Build a vectorizer / classifier pipeline using the previous analyzer
# the pipeline instance should stored in a variable named clf
from sklearn.linear_model import SGDClassifier
clf = Pipeline([('tfidf', TfidfVectorizer(min_df=1)),
               ('clf', SGDClassifier(loss='hinge', alpha=1e-3, n_iter=5, random_state=42)),])


# TASK: Fit the pipeline on the training set
clf = clf.fit(docs_train, y_train)

# TASK: Predict the outcome on the testing set in a variable named y_predicted
y_predicted = clf.predict(docs_test)

# Print the classification report
print(metrics.classification_report(y_test, y_predicted,
                                    target_names=dataset.target_names))

# Plot the confusion matrix
cm = metrics.confusion_matrix(y_test, y_predicted)
print(cm)

#import pylab as pl
#pl.matshow(cm, cmap=pl.cm.jet)
#pl.show()

# Predict the result on some short new sentences:
sentences = [
    u'This is a language detection test.',
    u'Ceci est un test de d\xe9tection de la langue.',
    u'Dies ist ein Test, um die Sprache zu erkennen.',
]
predicted = clf.predict(sentences)

for s, p in zip(sentences, predicted):
    print(u'The language of "%s" is "%s"' % (s, dataset.target_names[p]))


             precision    recall  f1-score   support

         ar       0.82      1.00      0.90        14
         de       0.94      0.98      0.96        47
         en       0.99      1.00      0.99        75
         es       1.00      1.00      1.00        52
         fr       1.00      1.00      1.00        55
         it       1.00      0.98      0.99        41
         ja       1.00      0.94      0.97        33
         nl       1.00      1.00      1.00        18
         pl       0.94      0.89      0.92        19
         pt       1.00      0.96      0.98        50
         ru       0.97      0.97      0.97        38

avg / total       0.98      0.98      0.98       442

[[14  0  0  0  0  0  0  0  0  0  0]
 [ 1 46  0  0  0  0  0  0  0  0  0]
 [ 0  0 75  0  0  0  0  0  0  0  0]
 [ 0  0  0 52  0  0  0  0  0  0  0]
 [ 0  0  0  0 55  0  0  0  0  0  0]
 [ 0  1  0  0  0 40  0  0  0  0  0]
 [ 0  0  0  0  0  0 31  0  1  0  1]
 [ 0  0  0  0  0  0  0 18  0  0  0]
 [ 1  1  0  0  0  0 