In [1]:
# https://www.kaggle.com/matleonard/text-classification

import pandas as pd

# Loading the spam data
# ham is the label for non-spam messages
spam = pd.read_csv('spam.csv')
spam.head(10)

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [2]:
#step 1: convert the text into numeric forms: machine learning models don't learn from raw text data
import spacy

# Create an empty model
nlp = spacy.blank("en")

# TextCategorizer is a spacy pipe. pipes are classes for processing and transforming tokens. 
# Create the TextCategorizer with exclusive classes and "bow" architecture
textcat = nlp.create_pipe(
              "textcat",
              config={
                "exclusive_classes": True, #Since the classes are either ham or spam, we set "exclusive_classes" to True.
                "architecture": "bow"})

# Add the TextCategorizer to the empty model
nlp.add_pipe(textcat) #

In [3]:
# Add labels to text classifier
textcat.add_label("ham")
textcat.add_label("spam")

1

In [10]:
# convert the labels in the data to the form TextCategorizer requires. 
# For each document, we'll create a dictionary of boolean values for each class
# For example, if a text is "ham", we need a dictionary {'ham': True, 'spam': False}. 
# The model is looking for these labels inside another dictionary with the key 'cats'.
train_texts = spam['text'].values
train_labels = [{'cats': {'ham': label == 'ham',
                          'spam': label == 'spam'}} 
                for label in spam['label']]

#combine the texts and labels into a single list.
train_data = list(zip(train_texts, train_labels))
train_data[:3]

[('Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
  {'cats': {'ham': True, 'spam': False}}),
 ('Ok lar... Joking wif u oni...', {'cats': {'ham': True, 'spam': False}}),
 ("Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
  {'cats': {'ham': False, 'spam': True}})]

In [11]:
# train the model 
# a. create an optimizer using nlp.begin_training(). spaCy uses this optimizer to update the model. 
# b. In general it's more efficient to train models in small batches. spaCy provides the minibatch function that returns 
# a generator yielding minibatches for training. 
# c. Finally, the minibatches are split into texts and labels, then used with nlp.update to update the model's parameters.

# The model will typically need multiple epochs. 
# Use another loop for more epochs, and optionally re-shuffle the training data at the begining of each loop.

from spacy.util import minibatch
import random

random.seed(1)
spacy.util.fix_random_seed(1)
optimizer = nlp.begin_training()
 
losses = {}
for epoch in range(10):
    random.shuffle(train_data)
    # Create the batch generator with batch size = 8
    batches = minibatch(train_data, size=8)
    # Iterate through minibatches
    for batch in batches:
        # Each batch is a list of (text, label) but we need to
        # send separate lists for texts and labels to update().
        # This is a quick way to split a list of tuples into lists
        texts, labels = zip(*batch)
        nlp.update(texts, labels, sgd=optimizer, losses=losses)
    print(losses)

{'textcat': 1.3387270107850782}
{'textcat': 1.673847830779323}
{'textcat': 1.8624543476085123}
{'textcat': 1.9832555457411587}
{'textcat': 2.0643615561263076}
{'textcat': 2.116428931871388}
{'textcat': 2.1529458115884115}
{'textcat': 2.177424438015182}
{'textcat': 2.1961398774591525}
{'textcat': 2.210127401154137}


In [12]:
# once trained, we can make predictions with the predict() method. 
# The input text needs to be tokenized with nlp.tokenizer. 
# Then you pass the tokens to the predict method which returns scores. 
# The scores are the probability the input text belongs to the classes.

texts = ["Are you ready for the tea party????? It's gonna be wild",
         "URGENT Reply to this message for GUARANTEED FREE TEA" ]
docs = [nlp.tokenizer(text) for text in texts]
    
# Use textcat to get the scores for each doc
textcat = nlp.get_pipe('textcat')
scores, _ = textcat.predict(docs)

print(scores)

[[9.9993169e-01 6.8340050e-05]
 [1.8197354e-02 9.8180264e-01]]


In [13]:
# From the scores, find the label with the highest score/probability
predicted_labels = scores.argmax(axis=1)
print([textcat.labels[label] for label in predicted_labels])

['ham', 'spam']
