In [1]:
import pandas as pd

# Loading the spam data
# ham is the label for non-spam messages
spam = pd.read_csv('input_data/spam.csv')
spam.head(10)

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [2]:
# Create an empty model
from spacy.lang.en import English
nlp = English()

# Create the TextCategorizer with exclusive classes and "bow" architecture
# https://spacy.io/api/language#create_pipe
# https://spacy.io/api/architectures#TextCatBOW
nlp.create_pipe(
    "textcat",
    config={
        'model': {
            '@architectures': 'spacy.TextCatBOW.v1',
            'exclusive_classes': True,
            'ngram_size': 1,
            'no_output_layer': False
        }
    }
)

textcat = nlp.add_pipe('textcat')

In [3]:
# Add labels to text classifier
textcat.add_label("ham")
textcat.add_label("spam")

1

In [5]:
from spacy.training import Example

train_data = []
for item in spam.iloc:
    doc = nlp.make_doc(item.text)
    example = Example.from_dict(
        doc,
        {
            'cats': {
                'ham': item.label == 'ham',
                'spam': item.label == 'spam'
            }
        }
    )
    train_data.append(example)

In [9]:
import random
import spacy
from spacy.util import minibatch


random.seed(1)
spacy.util.fix_random_seed(1)
optimizer = nlp.begin_training()

# Iterate through minibatches
# https://spacy.io/api/language#update
losses = {}
for epoch in range(10):
    random.shuffle(train_data)
    # Create the batch generator with batch size = 8
    batches = minibatch(train_data, size=8)
    # Iterate through minibatches
    for batch in batches:
        nlp.update(batch, sgd=optimizer, losses=losses)
    print(losses)

{'textcat': 0.5549327614952294}
{'textcat': 0.6942927120308386}
{'textcat': 0.7603908179353354}
{'textcat': 0.7798789901942472}
{'textcat': 0.8150759574802218}
{'textcat': 0.8868370061354872}
{'textcat': 0.9019728956993387}
{'textcat': 0.9116310472891854}
{'textcat': 0.9546235444465657}
{'textcat': 0.9629162331118234}


In [18]:
texts = ["Are you ready for the tea party????? It's gonna be wild",
         "URGENT Reply to this message for GUARANTEED FREE TEA" ]
docs = [nlp.make_doc(text) for text in texts]
    
# Use textcat to get the scores for each doc
textcat = nlp.get_pipe('textcat')
scores = textcat.predict(docs)

scores

array([[9.9998641e-01, 1.3548736e-05],
       [3.4928420e-03, 9.9650711e-01]], dtype=float32)

In [19]:
# From the scores, find the label with the highest score/probability
predicted_labels = scores.argmax(axis=1)
[textcat.labels[label] for label in predicted_labels]

['ham', 'spam']