In [1]:
import pandas as pd

#Cargamos el CSV haciendo uso de pandas
spam = pd.read_csv('spam.csv')
spam.head(10)

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [2]:
import spacy

#Creamos un modelo vacío
nlp = spacy.blank("en")

textcat = nlp.create_pipe(
              "textcat",
              config={
                "exclusive_classes": True,
                "architecture": "bow"})

# Add the TextCategorizer to the empty model
nlp.add_pipe(textcat)

In [3]:
#Añadimos las etiquetas al clasificador
textcat.add_label("ham")
textcat.add_label("spam")

1

In [4]:
train_texts = spam['text'].values
train_labels = [{'cats': {'ham': label == 'ham',
                          'spam': label == 'spam'}} 
                for label in spam['label']]

In [5]:
#Obtenemos los datos de entrenamiento
train_data = list(zip(train_texts, train_labels))
train_data[0]

('Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
 {'cats': {'ham': True, 'spam': False}})

In [6]:
from spacy.util import minibatch

spacy.util.fix_random_seed(1)
optimizer = nlp.begin_training()

batches = minibatch(train_data, size=8)

for batch in batches:
    texts, labels = zip(*batch)
    nlp.update(texts, labels, sgd=optimizer)

In [7]:
import random

random.seed(1)
spacy.util.fix_random_seed(1)
optimizer = nlp.begin_training()

losses = {}
for epoch in range(10):
    random.shuffle(train_data)
    batches = minibatch(train_data, size=8)
    for batch in batches:
        texts, labels = zip(*batch)
        nlp.update(texts, labels, sgd=optimizer, losses=losses)
    print(losses)

{'textcat': 0.43189741921099767}
{'textcat': 0.6474976215331196}
{'textcat': 0.7842154536487618}
{'textcat': 0.8716683716818165}
{'textcat': 0.9280939335008995}
{'textcat': 0.9655779922872296}
{'textcat': 0.9939651840090362}
{'textcat': 1.0127976631523663}
{'textcat': 1.0275637812859075}
{'textcat': 1.0378531470013608}


In [8]:
texts = ["Are you ready for the tea party????? It's gonna be wild",
         "URGENT Reply to this message for GUARANTEED FREE TEA"]
docs = [nlp.tokenizer(text) for text in texts]
    
#Utilizamos textcat para obtener las puntuaciones respecto a cada categoría de docs
textcat = nlp.get_pipe('textcat')
scores, _ = textcat.predict(docs)

print(scores)

[[9.9994397e-01 5.6023764e-05]
 [1.1491306e-02 9.8850864e-01]]


In [9]:
#Obtenemos la etiqueta con una mayor puntuación y la mostramos
predicted_labels = scores.argmax(axis=1)
print([textcat.labels[label] for label in predicted_labels])

['ham', 'spam']
