In [19]:
"""Build a language detector model

The goal of this exercise is to train a linear classifier on text features
that represent sequences of up to 3 consecutive characters so as to be
recognize natural languages by using the frequencies of short character
sequences as 'fingerprints'.

"""
# Author: Olivier Grisel <olivier.grisel@ensta.org>
# License: Simplified BSD

import sys

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Perceptron
from sklearn.pipeline import Pipeline
from sklearn.datasets import load_files
from sklearn.cross_validation import train_test_split
from sklearn import metrics


# The training data folder must be passed as first argument
languages_data_folder = "/home/students/klugeh01/cs420/sklearn/scikit-learn/doc/tutorial/text_analytics/data/languages/paragraphs"
dataset = load_files(languages_data_folder)

# Split the dataset in training and test set:
docs_train, docs_test, y_train, y_test = train_test_split(
    dataset.data, dataset.target, test_size=0.5)


# TASK: Build a an vectorizer that splits strings into sequence of 1 to 3
# characters instead of word tokens
vectorizer = TfidfVectorizer(ngram_range=(1,3))

# TASK: Build a vectorizer / classifier pipeline using the previous analyzer
# the pipeline instance should stored in a variable named clf
clf = Pipeline([
    ('vec', vectorizer),
    ('clf', Perceptron()),
])

# TASK: Fit the pipeline on the training set
clf.fit(docs_train, y_train)

# TASK: Predict the outcome on the testing set in a variable named y_predicted
y_predicted = clf.predict(docs_test)


# Print the classification report
print(metrics.classification_report(y_test, y_predicted,
                                    target_names=dataset.target_names))

# Plot the confusion matrix
cm = metrics.confusion_matrix(y_test, y_predicted)
print(cm)

#import pylab as pl
#pl.matshow(cm, cmap=pl.cm.jet)
#pl.show()

# Predict the result on some short new sentences:
sentences = [
    u'This is a language detection test.',
    u'Ceci est un test de d\xe9tection de la langue.',
    u'Dies ist ein Test, um die Sprache zu erkennen.',
]
predicted = clf.predict(sentences)

for s, p in zip(sentences, predicted):
    print(u'The language of "%s" is "%s"' % (s, dataset.target_names[p]))

             precision    recall  f1-score   support

         ar       0.50      1.00      0.67         8
         de       0.96      0.96      0.96        48
         en       0.99      1.00      0.99        72
         es       1.00      1.00      1.00        62
         fr       1.00      1.00      1.00        62
         it       0.97      1.00      0.99        34
         ja       1.00      0.81      0.89        36
         nl       0.95      1.00      0.97        18
         pl       0.94      0.85      0.89        20
         pt       1.00      0.94      0.97        52
         ru       0.97      1.00      0.98        30

avg / total       0.98      0.97      0.97       442

[[ 8  0  0  0  0  0  0  0  0  0  0]
 [ 2 46  0  0  0  0  0  0  0  0  0]
 [ 0  0 72  0  0  0  0  0  0  0  0]
 [ 0  0  0 62  0  0  0  0  0  0  0]
 [ 0  0  0  0 62  0  0  0  0  0  0]
 [ 0  0  0  0  0 34  0  0  0  0  0]
 [ 1  1  1  0  0  1 29  1  1  0  1]
 [ 0  0  0  0  0  0  0 18  0  0  0]
 [ 2  1  0  0  0  0 