In [1]:
"""Build a language detector model

The goal of this exercise is to train a linear classifier on text features
that represent sequences of up to N consecutive characters so as to be
recognize natural languages by using the frequencies of short character
sequences as 'fingerprints'.

The script saves the trained model to disk for later use
"""
# Author: Olivier Grisel <olivier.grisel@ensta.org>
# License: Simplified BSD
# Adapted by: Francesco Mosconi

import numpy as np
from sklearn.datasets import load_files


# The training data folder must be passed as first argument
try:
    dataset = load_files('./wikidata/short_paragraphs')
except OSError as ex:
    print(ex)
    print("Couldn't import the data, did you unzip the wikidata.zip folder?")
    exit(-1)

In [2]:
# Get pre-scaled features (columns) and target (y)
docs = dataset.data
y = dataset.target

In [3]:
# TASK: Split the dataset in training and test set
# (use 20% of the data for test):

from sklearn.model_selection import train_test_split
docs_train, docs_test, y_train, y_test = train_test_split(docs, y, test_size=.20, random_state=42)

In [4]:
# TASK: Build a an vectorizer that splits
# strings into sequence of 1 to 3
# characters instead of word tokens
# using the class TfidfVectorizer

from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(ngram_range=(1, 3), analyzer='char')
vectorizer

TfidfVectorizer(analyzer='char', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [5]:
vectorizer.fit(docs_train)

TfidfVectorizer(analyzer='char', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [6]:
len(vectorizer.get_feature_names())

20998

In [7]:
# TASK: Use the function make_pipeline to build a
#       vectorizer / classifier pipeline
#       using the previous analyzer
#       and a classifier of choice.
#       The pipeline instance should be
#       stored in a variable named model

from sklearn.pipeline import make_pipeline
# from sklearn.linear_model import LogisticRegression
# from sklearn.neural_network import MLPClassifier ---> TAKES TOO LONG, TOO MANY FEATURES 
# from sklearn.naive_bayes import GaussianNB
# from sklearn.gaussian_process import GaussianProcessClassifier
# from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier

#classifier = LogisticRegression(C=10)
classifier = DecisionTreeClassifier(max_depth=5)
model = make_pipeline(vectorizer, classifier)
model

Pipeline(memory=None,
     steps=[('tfidfvectorizer', TfidfVectorizer(analyzer='char', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_i...      min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'))])

In [8]:
# TASK: Fit the pipeline on the training set

model.fit(docs_train, y_train)

Pipeline(memory=None,
     steps=[('tfidfvectorizer', TfidfVectorizer(analyzer='char', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_i...      min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'))])

In [9]:
# TASK: Predict the outcome on the testing set.
# Store the result in a variable named y_predicted

y_predicted = model.predict(docs_test)

In [10]:
# TASK: Print the classification report

from sklearn.metrics import classification_report
print(classification_report(y_test, y_predicted))

             precision    recall  f1-score   support

          0       0.00      0.00      0.00        62
          1       0.87      0.47      0.61       193
          2       0.90      0.40      0.55       221
          3       0.19      0.91      0.32       213
          4       0.81      0.47      0.59       223
          5       0.75      0.47      0.58       198
          6       0.00      0.00      0.00        78
          7       0.55      0.05      0.10       112
          8       0.82      0.15      0.26       207
          9       1.00      0.91      0.95       185

avg / total       0.68      0.46      0.47      1692



  'precision', 'predicted', average, warn_for)


In [11]:
# TASK: Print the confusion matrix. Bonus points if you make it pretty.

from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, y_predicted))

[[  0   0   0  62   0   0   0   0   0   0]
 [  0  91   2  93   0   5   0   2   0   0]
 [  0   1  88 131   0   1   0   0   0   0]
 [  0   0   0 194  10   4   0   1   4   0]
 [  0   0   3 107 104   8   0   0   1   0]
 [  0   0   2  99   1  94   0   0   2   0]
 [  0   5   0  69   1   1   0   2   0   0]
 [  0   1   0  97   0   8   0   6   0   0]
 [  0   7   2 149  13   4   0   0  32   0]
 [  0   0   1  15   0   0   0   0   0 169]]


In [12]:
# TASK: Is the score good? Can you improve it changing
#       the parameters or the classifier?
#       Try using cross validation and grid search

# TASK: Use dill and gzip to persist the trained model in memory.
#       1) gzip.open a file called my_model.dill.gz
#       2) dump to the file both your trained classifier
#          and the target_names of the dataset (for later use)
#    They should be passed as a list [model, dataset.target_names]
