In [2]:
import pandas as pd
spam = pd.read_csv("./spam.csv")
spam.head(10)

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [9]:
import spacy
nlp = spacy.blank('en') # 建立空模型
help(nlp.create_pipe)

# Create the TextCategorizer with exclusive classes 
#                        and "bow" architecture
textcat = nlp.create_pipe('textcat',config={
    "exclusive_classes": True,
    "architecture": "bow"
})

# Add the TextCategorizer to the empty model
nlp.add_pipe(textcat)

Help on method create_pipe in module spacy.language:

create_pipe(name, config={}) method of spacy.lang.en.English instance
    Create a pipeline component from a factory.
    
    name (unicode): Factory name to look up in `Language.factories`.
    config (dict): Configuration parameters to initialise component.
    RETURNS (callable): Pipeline component.
    
    DOCS: https://spacy.io/api/language#create_pipe



In [17]:
# Add labels to text classifier
textcat.add_label("ham")
textcat.add_label("spam")

0

In [26]:
train_texts = spam['text'].values
train_labels = [{'cats': {'ham': label == 'ham',
                          'spam': label == 'spam'}} 
                for label in spam['label']]

In [27]:
train_data = list(zip(train_texts, train_labels))
train_data[:3]

[('Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
  {'cats': {'ham': True, 'spam': False}}),
 ('Ok lar... Joking wif u oni...', {'cats': {'ham': True, 'spam': False}}),
 ("Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
  {'cats': {'ham': False, 'spam': True}})]

In [25]:
from spacy.util import minibatch

spacy.util.fix_random_seed(1)
optimizer = nlp.begin_training()

# 数据分批
batches = minibatch(train_data, size=8)
# 迭代
for batch in batches:
    texts, labels = zip(*batch)
    nlp.update(texts, labels, sgd=optimizer)
    
help(nlp.update)

Help on method update in module spacy.language:

update(docs, golds, drop=0.0, sgd=None, losses=None, component_cfg=None) method of spacy.lang.en.English instance
    Update the models in the pipeline.
    
    docs (iterable): A batch of `Doc` objects.
    golds (iterable): A batch of `GoldParse` objects.
    drop (float): The dropout rate.
    sgd (callable): An optimizer.
    losses (dict): Dictionary to update with the loss, keyed by component.
    component_cfg (dict): Config parameters for specific pipeline
        components, keyed by component name.
    
    DOCS: https://spacy.io/api/language#update



In [37]:
import random
random.seed(1)
spacy.util.fix_random_seed(1)
optimizer = nlp.begin_training()

loss = {}
for epoch in range(10):
    # 每次随机打乱数据
    random.shuffle(train_data)
    # 数据分批
    batches = minibatch(train_data, size=8)
    # 迭代
    for batch in batches:
        texts, labels = zip(*batch)
        nlp.update(texts, labels, drop=0.3, sgd=optimizer, losses=loss)
    print(loss)

{'textcat': 0.22436044702671132}
{'textcat': 0.41457826484549287}
{'textcat': 0.5661000985640895}
{'textcat': 0.7119002992385974}
{'textcat': 0.8301601885299159}
{'textcat': 0.9572314705652767}
{'textcat': 1.050187804254974}
{'textcat': 1.1268915971417424}
{'textcat': 1.2132206293363608}
{'textcat': 1.3000399094508472}


In [62]:
texts = ["Are you ready for the tea party????? It's gonna be wild",
         "URGENT Reply to this message for GUARANTEED FREE TEA"]
docs = [nlp.tokenizer(text) for text in texts]
print(docs)

[Are you ready for the tea party????? It's gonna be wild, URGENT Reply to this message for GUARANTEED FREE TEA]


In [63]:
textcat = nlp.get_pipe('textcat')
scores, _ = textcat.predict(docs)
print(scores)

[[9.9999392e-01 6.1252954e-06]
 [4.1843491e-04 9.9958152e-01]]


In [64]:
predicted_labels = scores.argmax(axis=1)
print([textcat.labels[label] for label in predicted_labels])

['ham', 'spam']
