Creación de un clasificador de texto con TextBlob
===

In [1]:
##
## Data loading
##
# Labelled training sentences, grouped by sentiment and flattened into
# (text, label) tuples: the five "pos" examples first, then the five "neg".
train = [
    (text, label)
    for label, texts in (
        (
            "pos",
            [
                "I love this sandwich.",
                "this is an amazing place!",
                "I feel very good about these beers.",
                "this is my best work.",
                "what an awesome view",
            ],
        ),
        (
            "neg",
            [
                "I do not like this restaurant",
                "I am tired of this stuff.",
                "I can't deal with this",
                "he is my sworn enemy!",
                "my boss is horrible.",
            ],
        ),
    )
    for text in texts
]

# Held-out evaluation sentences written as a text -> label mapping; the dict's
# insertion order is preserved in the resulting list of (text, label) tuples.
test = list(
    {
        "the beer was good.": "pos",
        "I do not enjoy my job": "neg",
        "I ain't feeling dandy today.": "neg",
        "I feel amazing!": "pos",
        "Gary is a friend of mine.": "pos",
        "I can't believe I'm doing this.": "neg",
    }.items()
)

In [2]:
##
## Creating and training a classifier
##
from textblob.classifiers import NaiveBayesClassifier

# Arguments passed to the constructor:
#   train_set -- the training set: either a list of (text, classification)
#                tuples or a filename; text may be a string or an iterable.
#   format    -- the file format ("csv", "json", ...) when train_set is a
#                filename; None means the format is detected automatically.
cl = NaiveBayesClassifier(train_set=train, format=None)

In [3]:
##
## Labels found in the data
##
cl.labels()

['pos', 'neg']

In [4]:
##
## Accuracy evaluation
##
# Arguments passed to accuracy():
#   test_set -- a list of (text, label) tuples, or a file pointer.
#   format   -- the file format ("csv", "json", ...) when test_set is a
#               filename; None means the format is detected automatically.
cl.accuracy(test_set=test, format=None)

0.8333333333333334

In [5]:
##
## Classifying new text
##
# classify() takes a single argument, text: the string to label.
cl.classify(text="This is an amazing library!")

'pos'

In [6]:
##
## Feature extraction
##
# Displays the feature dict derived from a text: boolean "contains(<word>)"
# entries, as shown in the recorded output below.
# NOTE(review): matching looks case-sensitive -- "This" in the input does not
# set contains(this) in the recorded output; confirm against the extractor.
cl.extract_features(text="This is an amazing library!")

{'contains(these)': False,
 'contains(ca)': False,
 'contains(he)': False,
 'contains(this)': False,
 'contains(boss)': False,
 'contains(tired)': False,
 'contains(good)': False,
 'contains(what)': False,
 'contains(with)': False,
 'contains(best)': False,
 'contains(awesome)': False,
 'contains(beers)': False,
 'contains(place)': False,
 'contains(work)': False,
 'contains(an)': True,
 'contains(feel)': False,
 'contains(do)': False,
 "contains(n't)": False,
 'contains(I)': False,
 'contains(is)': True,
 'contains(am)': False,
 'contains(like)': False,
 'contains(amazing)': True,
 'contains(restaurant)': False,
 'contains(enemy)': False,
 'contains(horrible)': False,
 'contains(sworn)': False,
 'contains(view)': False,
 'contains(my)': False,
 'contains(of)': False,
 'contains(about)': False,
 'contains(stuff)': False,
 'contains(sandwich)': False,
 'contains(love)': False,
 'contains(deal)': False,
 'contains(very)': False,
 'contains(not)': False}

In [7]:
##
## Creating and training a decision tree
##
from textblob.classifiers import DecisionTreeClassifier

# Same constructor arguments as the other classifiers in this notebook:
#   train_set -- the training set: either a list of (text, classification)
#                tuples or a filename; text may be a string or an iterable.
#   format    -- the file format ("csv", "json", ...) when train_set is a
#                filename; None means the format is detected automatically.
dt = DecisionTreeClassifier(train_set=train, format=None)

# Score the tree on the held-out sentences; the value is the cell's output.
dt.accuracy(test)

0.5

In [8]:
##
## Creating and training a maximum entropy classifier
##
from textblob.classifiers import MaxEntClassifier

# Same constructor arguments as the other classifiers in this notebook:
#   train_set -- the training set: either a list of (text, classification)
#                tuples or a filename; text may be a string or an iterable.
#   format    -- the file format ("csv", "json", ...) when train_set is a
#                filename; None means the format is detected automatically.
me = MaxEntClassifier(train_set=train, format=None)

# Score the model on the held-out sentences. The recorded output of this cell
# also contains a per-iteration training log ahead of the accuracy value.
me.accuracy(test)

  ==> Training (100 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -0.69315        0.500
             2          -0.64828        1.000
             3          -0.60834        1.000
             4          -0.57264        1.000
             5          -0.54060        1.000
             6          -0.51172        1.000
             7          -0.48557        1.000
             8          -0.46181        1.000
             9          -0.44014        1.000
            10          -0.42030        1.000
            11          -0.40207        1.000
            12          -0.38528        1.000
            13          -0.36976        1.000
            14          -0.35537        1.000
            15          -0.34201        1.000
            16          -0.32957        1.000
            17          -0.31795        1.000
            18          -0.30709        1.000
            19          -0.29691        1.000
 

0.8333333333333334

In [9]:
##
## Classifying TextBlobs
##
from textblob import TextBlob

# The classifier trained earlier is handed to the blob at construction time,
# so classify() below is called without naming it again.
blob = TextBlob("The beer is good. But the hangover is horrible.", classifier=cl)

blob.classify()

'pos'

In [10]:
##
## The advantage of the previous approach is that the classifier
## stays attached to the TextBlob object. Note that the following
## code does not explicitly use the classifier cl.
##
# Classify each sentence of the blob separately and print it next to its label.
for s in blob.sentences:
    print(s, " ---> ", s.classify())

The beer is good.  --->  pos
But the hangover is horrible.  --->  neg


In [11]:
##
## Updating with new data
##
# Extra labelled examples: three positive sentences followed by one negative.
new_data = [
    (text, "pos")
    for text in (
        "She is my best friend.",
        "I'm happy to have a new friend.",
        "Stay thirsty, my friend.",
    )
]
new_data.append(("He ain't from around here.", "neg"))

# Feeds the new examples to the existing classifier `cl`; cells evaluated
# after this one therefore see the updated classifier.
cl.update(new_data)

True

In [12]:
##
## Accuracy evaluation (repeated on the same test set after cl.update)
##
cl.accuracy(test)

1.0