# Introduction to SpaCy
Following allong [this course](https://course.spacy.io/chapter1)

In [1]:
import pprint
import spacy
from spacy.lang.en import English
TEXT = "Hello World! My Name is Cristobal, I live in Canada and am 29 years old"

In [2]:
spacy.prefer_gpu()

False

In [4]:
nlp = English()

In [5]:
doc = nlp(TEXT)

In [7]:
# Doc is a collection of tokens initialized with `nlp()`--each token is an object with attributes that describe it.

# Token attributes
print('Index:   ', [token.i for token in doc])
print('Text:    ', [token.text for token in doc])

print('is_alpha:', [token.is_alpha for token in doc])
print('is_punct:', [token.is_punct for token in doc])
print('like_num:', [token.like_num for token in doc])

Index:    [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
Text:     ['Hello', 'World', '!', 'My', 'Name', 'is', 'Cristobal', ',', 'I', 'live', 'in', 'Canada', 'and', 'am', '29', 'years', 'old']
is_alpha: [True, True, False, True, True, True, True, False, True, True, True, True, True, True, False, True, True]
is_punct: [False, False, True, False, False, False, False, True, False, False, False, False, False, False, False, False, False]
like_num: [False, False, False, False, False, False, False, False, False, False, False, False, False, False, True, False, False]


# Statistical Models
## What are statistical models?
- Enable spaCy to predict linguistic attributes in context
    - Part-of-speech tags
    - Syntactic dependencies
    - Named entities
- Trained on labeled example texts
- Can be updated with more examples to fine-tune predictions

## Installation
Download a trained model with `python -m spacy download en_core_web_sm`, which then allows you to
run `nlp = spacy.load('en_core_web_sm')` to load it.

In [3]:
nlp = spacy.load('en_core_web_sm')
doc = nlp(TEXT)

In [98]:
# Predicting Part-Of-Speech tags
pprint.pprint([(token.text, token.pos_) for token in doc])

[('New', 'PROPN'),
 ('Iphone', 'PROPN'),
 ('X', 'PROPN'),
 ('resease', 'NOUN'),
 ('date', 'NOUN'),
 ('leaked', 'VERB'),
 ('.', 'PUNCT'),
 ('I', 'PRON'),
 ('am', 'VERB'),
 ('going', 'VERB'),
 ('to', 'PART'),
 ('buy', 'VERB'),
 ('an', 'DET'),
 ('iphone', 'NOUN'),
 ('then', 'ADV'),
 ('.', 'PUNCT')]


In [99]:
# Predicting Dependency
# In addition to the part-of-speech tags, we can also predict how the words are related. For example, whether a word is the subject of the sentence or an object.
# The head attribute returns the syntactic head token. You can also think of it as the parent token this word is attached to.

pprint.pprint([(token.text, token.dep_, token.head.text) for token in doc])

[('New', 'compound', 'X'),
 ('Iphone', 'compound', 'X'),
 ('X', 'compound', 'date'),
 ('resease', 'compound', 'date'),
 ('date', 'nsubj', 'leaked'),
 ('leaked', 'ROOT', 'leaked'),
 ('.', 'punct', 'leaked'),
 ('I', 'nsubj', 'going'),
 ('am', 'aux', 'going'),
 ('going', 'ROOT', 'going'),
 ('to', 'aux', 'buy'),
 ('buy', 'xcomp', 'going'),
 ('an', 'det', 'iphone'),
 ('iphone', 'dobj', 'buy'),
 ('then', 'advmod', 'buy'),
 ('.', 'punct', 'going')]


![alt](spacy-dependency-scheme.PNG)

In [6]:
# Predicting named entities
# The ents attribute in doc allows access to the predicted named entities.
# It returns an iterator of Span objects, so we can print the entity text and the entity label using the label_ attribute.
print('Entities:   ', [(span.text, span.label_) for span in doc.ents])

# Can also get descriptions of the entity labels
print(spacy.explain('GPE'))

# Note that it get's my name wrong, probably because it's spanish and this is trained only on english text.

Entities:    [('Cristobal', 'GPE'), ('Canada', 'GPE'), ('29 years old', 'DATE')]
Countries, cities, states


# Rule Based Matching
Compared to regular expressions, the matcher works with Doc and Token objects instead of only strings.

It's also more flexible: you can search for texts but also other lexical attributes.

You can even write rules that use the model's predictions.

For example, find the word "duck" only if it's a verb, not a noun.

Match patterns are lists of dictionaries. Each dictionary describes one token. The keys are the names of token attributes, mapped to their expected values.

The matcher dot add method lets you add a pattern. The first argument is a unique ID to identify which pattern was matched. The second argument is an optional callback. We don't need one here, so we set it to None. The third argument is the pattern.

To match the pattern on a text, we can call the matcher on any doc.

This will return the matches.

In [77]:
"""
In this example, we're looking for two tokens with the text "iPhone" and "X".

We can also match on other token attributes. Here, we're looking for two tokens whose lowercase forms equal "iphone" and "x".

We can even write patterns using attributes predicted by the model. Here, we're matching a token with the lemma "buy", plus a noun. The lemma is the base form, so this pattern would match phrases like "buying milk" or "bought flowers".

The matcher is initialized with the shared vocabulary, nlp dot vocab. You'll learn more about this later – for now, just remember to always pass it in.

"OP" can have one of four values:
{'OP': '!'}	Negation: match 0 times
{'OP': '?'}	Optional: match 0 or 1 times
{'OP': '+'}	Match 1 or more times
{'OP': '*'}	Match 0 or more times
"""
from spacy.matcher import Matcher

nlp = spacy.load('en_core_web_md')
matcher = Matcher(nlp.vocab)
doc = nlp('New Iphone X resease date leaked. I am going to buy an iphone then.')

In [93]:
# Match exact token texts
pattern = [{'TEXT': 'iPhone'}, {'TEXT': 'X'}]

# Match lexical attributes
# Won't match because 
pattern_1 = [{'TEXT': 'Iphone'}, {'UPPER': 'X'}]

# Match any token attributes
pattern_2 = [{'LEMMA': 'buy'}, {'POS': 'DET', 'OP': '?'}, {'POS': 'NOUN'}]
          
matcher.add('IPHONE_PATTERN', None, pattern_1)
matcher.add('BUYING_PATTERN', None, pattern_2)
matches = matcher(doc)
for match_id, start, end in matches:
    matched_span = doc[start:end]
    print(matched_span.text)

Iphone X
buy an iphone


# Data Structures
## Vocab
spaCy stores all shared data in a vocabulary, the Vocab.

This includes words, but also the labels schemes for tags and entities.

To save memory, all strings are encoded to hash IDs. If a word occurs more than once, we don't need to save it every time.

Instead, spaCy uses a hash function to generate an ID and stores the string only once in the string store. The string store is available as `nlp.vocab.strings`.

It's a lookup table that works in both directions. You can look up a string and get its hash, and look up a hash to get its string value. Internally, spaCy only communicates in hash IDs.

Hash IDs can't be reversed, though. If a word in not in the vocabulary, there's no way to get its string. That's why we always need to pass around the shared vocab.

## Lexemes
Lexemes are context-independent entries in the vocabulary.

You can get a lexeme by looking up a string or a hash ID in the vocab.

Lexemes expose attributes, just like tokens.

They hold context-independent information about a word, like the text, or whether the the word consists of alphanumeric characters.

Lexemes don't have part-of-speech tags, dependencies or entity labels. Those depend on the context. 

In [9]:
doc = nlp("I love coffee")
lexeme = nlp.vocab['coffee']

# Print the lexical attributes
print(lexeme.text, lexeme.orth, lexeme.is_alpha)

coffee 3197928453018144401 True


# Word vectors and semantic similarity
spaCy can compare two objects and predict how similar they are – for example, documents, spans or single tokens.

The Doc, Token and Span objects have a dot similarity method that takes another object and returns a floating point number between 0 and 1, indicating how similar they are.

One thing that's very important: In order to use similarity, you need a larger spaCy model that has word vectors included.

For example, the medium or large English model – but not the small one. So if you want to use vectors, always go with a model that ends in "md" or "lg". You can find more details on this in the models documentation.

## Word Vectors
But how does spaCy do this under the hood?

Similarity is determined using word vectors, multi-dimensional representations of meanings of words.

You might have heard of Word2Vec, which is an algorithm that's often used to train word vectors from raw text.

Vectors can be added to spaCy's statistical models.

By default, the similarity returned by spaCy is the cosine similarity between two vectors – but this can be adjusted if necessary.

Vectors for objects consisting of several tokens, like the Doc and Span, default to the average of their token vectors.

That's also why you usually get more value out of shorter phrases with fewer irrelevant words.

In [8]:
# Word vectors
doc = list(nlp.pipe('banana'))
token = doc[0]
print(token.vector)


[-5.1171e-01 -1.0681e-01 -4.0689e-01 -3.2231e-01  1.8074e-01  3.7749e-01
  4.3708e-02 -2.8818e-01  2.7681e-01  1.0651e+00 -3.8360e-01  2.9665e-01
 -4.8123e-01  8.7665e-02 -1.1448e-01 -6.6952e-01 -3.7934e-02  1.9361e+00
 -5.5409e-01  2.3255e-01  5.6811e-01  1.2107e-01 -2.9582e-01  2.0801e-01
  3.5881e-01 -1.6022e-01 -3.7637e-01 -2.0065e-01 -1.4099e-01  5.8564e-01
 -5.5319e-01  8.7606e-02  4.0345e-01  3.2074e-01  4.8916e-01 -5.8248e-01
  2.5284e-01  4.5514e-01 -4.7540e-01 -2.2623e-01  1.9731e-01 -5.9291e-01
  8.0345e-02 -4.6022e-02 -1.7881e-01 -1.1827e-01 -5.1545e-03  9.0144e-02
 -3.2856e-02  2.3501e-01  1.9211e-01  2.4003e-01 -2.9716e-01  4.4344e-02
  1.0656e-01 -1.6923e-01  3.4344e-01  1.8520e-01 -2.6764e-01 -1.3614e-01
  1.0132e-01  1.5200e-01  1.5203e-01  2.6930e-01 -3.1428e-01  1.1998e-01
 -4.0277e-02  3.0855e-01  2.7413e-01  1.4967e-01  1.3133e-01  5.9081e-01
 -4.1874e-01 -5.3135e-01  3.2925e-01 -2.1868e-01 -8.1421e-01  3.5644e-01
  4.3644e-01 -7.0250e-01  5.7554e-01  3.8241e-01 -1

In [4]:
nlp = spacy.load('en_core_web_md')
doc1 = nlp('I like fast food')
doc2 = nlp('I love pizza')

# Doc-level similarity
print(doc1.similarity(doc2))
# Token-level similarity
print(doc1[1].similarity(doc2[1]))

[('I', True, 6.4231944, False), ('love', True, 6.04035, False), ('pizza', True, 7.0450306, False)]
0.810140967454893
0.657904


In [6]:
# Similarity and context
# the following phrases have high similarity. It makes sense because both express a feeling about cats,
# but if looking at sentiment, they should be considered very dissimilar
doc1 = nlp("I like cats")
doc2 = nlp("I hate cats")

print(doc1.similarity(doc2))

0.9501447503553421


# Combining models and rules

In [28]:
# PhraseMatcher
# Like Matcher, but takes a Doc object as the pattern
from spacy.matcher import PhraseMatcher

matcher = PhraseMatcher(nlp.vocab)

pattern = nlp("Golden Retriever")
matcher.add('DOG', None, pattern)
doc = nlp("I have a Golden Retriever")

# Iterate over the matches
for match_id, start, end in matcher(doc):
    # Get the matched span
    span = doc[start:end]
    print('Matched span:', span.text)

Matched span: Golden Retriever


# Processing Pipelines
![alt text](spacy-nlp-pipeline.PNG)

First, the tokenizer is applied to turn the string of text into a Doc object. Next, a series of pipeline components is applied to the Doc in order. In this case, the tagger, then the parser, then the entity recognizer. Finally, the processed Doc is returned, so you can work with it.

spaCy ships with the following built-in pipeline components.

- The part-of-speech tagger sets the token dot tag attribute.
- The depdendency parser adds the token dot dep and token dot head attributes and is also responsible for detecting sentences and base noun phrases, also known as noun chunks.
- The named entity recognizer adds the detected entities to the doc dot ents property. It also sets entity type attributes on the tokens that indicate if a token is part of an entity or not.
- Finally, the text classifier sets category labels that apply to the whole text, and adds them to the doc dot cats property. Because text categories are always very specific, the text classifier is not included in any of the pre-trained models by default. But you can use it to train your own system.

In [31]:
print(nlp.pipe_names)
print(nlp.pipeline)

['tagger', 'parser', 'ner']
[('tagger', <spacy.pipeline.pipes.Tagger object at 0x7ff7a65967f0>), ('parser', <spacy.pipeline.pipes.DependencyParser object at 0x7ff7d2861588>), ('ner', <spacy.pipeline.pipes.EntityRecognizer object at 0x7ff7d28615e8>)]


# Custom Components
## Anatomy of a component
Function that takes a doc, modifies it and returns it
Can be added using the `nlp.add_pipe` method
```python
def custom_component(doc):
    # Do something to the doc here
    return doc

nlp.add_pipe(custom_component)
```

To specify where to add the component in the pipeline, you can use the following keyword arguments:

|Argument | Description | Example|
|---------|--------------|------|
|last | If True, add last | `nlp.add_pipe(component, last=True`)|
|first | If True, add first | `nlp.add_pipe(component, first=True`)|
|before | Add before component | `nlp.add_pipe(component, before='ner'`)|
|after | Add after component | `nlp.add_pipe(component, after='tagger'`)|


In [53]:
# Custom component that uses PhraseMatcher to find animal names in Doc and add matched spans to the doc.ents
import spacy
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span

nlp = spacy.load("en_core_web_md")


In [54]:
animals = ['Golden Retriever', 'cat', 'turtle', 'bat']
# nlp.pipe method returns a generator of Doc objects, so animal_patterns here below is just a list of Doc objects.
# So it's just a much faster way of doing: docs = [nlp(text) for text in LOTS_OF_TEXTS]
animal_patterns = list(nlp.pipe(animals))
print(type(animal_patterns[0]))
matcher = PhraseMatcher(nlp.vocab)
matcher.add('ANIMAL', None, *animal_patterns)

<class 'spacy.tokens.doc.Doc'>


In [55]:
def animal_component(doc):
    # Apply matches to the doc
    matches = matcher(doc)
    spans = [Span(doc, start, end, label='ANIMAL') for _, start, end in matches]
    # Overwrite the doc.ents with the matched spans
    doc.ents = spans
    return doc

# Add component after the 'ner' component
nlp.add_pipe(animal_component, after='ner')
print(nlp.pipe_names)

['tagger', 'parser', 'ner', 'animal_component']


In [56]:
#Try it out
# doc = nlp("I have a cat, a Golden Retriever")
doc = nlp("I have a cat, a Golden Retriever and a bat")
print([(span.text, span.label_) for span in doc.ents])

[('cat', 'ANIMAL'), ('Golden Retriever', 'ANIMAL'), ('bat', 'ANIMAL')]


# Custom attributes and methods
Create and access custom properties on Doc, Token and Span objects via the _ attribute: 
```python
doc._.title = 'My Title'
token._.is_color = True
```

In [67]:
# Registered on the global Doc, Token or Span using the set_extension method

from spacy.tokens import Token

# Define getter function
"""
getter function must take one argument (the object to extend). Called only when the object
is retrieved. Whatever is returned, is what get's shown when the particular custom attribute is called
"""
def get_is_color(token):
    colors = ['red', 'yellow', 'blue']
    return token.text in colors

# Set extension on the Token with getter (force=True so that it overwrites if existing)
Token.set_extension('is_color', getter=get_is_color, force=True)

#doc = nlp("The sky is green.")
doc = nlp("The sky is blue.")
print(doc[3].text, '-', doc[3]._.is_color)

blue - True


In [68]:
from spacy.tokens import Span

def get_has_color(span):
    colors = ['red', 'yellow', 'blue']
    return any(token.text in colors for token in span)

Span.set_extension('has_color', getter=get_has_color)

doc = nlp("The sky is blue.")
print(doc[1:4]._.has_color, '-', doc[1:4].text)
print(doc[0:2]._.has_color, '-', doc[0:2].text)

True - sky is blue
False - The sky


In [69]:
# Method extensions
# Instead of passing a `getter` arg to `set_extension`, pass a callable in the `method` arg
from spacy.tokens import Doc

def has_token(doc, token_text):
    """ First argument is always de object itself, so it's like `self` """
    return token_text in [token.text for token in doc]

Doc.set_extension('has_token', method=has_token)

doc = nlp("The sky is blue.")
print(doc._.has_token('blue'), '- blue')
print(doc._.has_token('cloud'), '- cloud')

True - blue
False - cloud


# Performance improvement tips

## Context
`nlp.pipe` supports passing in tuples of (text, context) with the `as_tuples` kwarg, where context is additional metadata on the text (a dictionary). 
Then instead of returning a list of `Doc` objects, it returns a list of tuples `(Doc, context)`, where context is the given dictionary
```python
data = [
    ('This is a text', {'id': 1, 'page_number': 15}),
    ('And another text', {'id': 2, 'page_number': 16}),
]

for doc, context in nlp.pipe(data, as_tuples=True):
    print(doc.text, context['page_number'])
```

##  Using only the tokenizer
Use nlp.make_doc to turn a text in to a Doc object

BAD:
```python
doc = nlp("Hello world")
```

GOOD:
```python
doc = nlp.make_doc("Hello world!")
```

# Training and updating models
## How training works (for a new model)
1. Initialize the model weights randomly with `nlp.begin_training`
2. Predict a few examples with the current weights by calling `nlp.update`
3. Compare prediction with true labels
4. Calculate how to change weights to improve predictions
5. Update weights slightly
6. Go back to 2.

## Creating training data
spaCy expects as training data a list of entries---Each entry in `TRAIN_DATA` should be a tuple with the text as the first item, and a dictionary with the annotations. Example annotations:
```python
{
   "entities": [(0, 4, "ORG")],
   "heads": [1, 1, 1, 5, 5, 2, 7, 5],
   "deps": ["nsubj", "ROOT", "prt", "quantmod", "compound", "pobj", "det", "npadvmod"],
   "tags": ["PROPN", "VERB", "ADP", "SYM", "NUM", "NUM", "DET", "NOUN"],
   "cats": {"BUSINESS": 1.0},
}
```

### `entities`
Train/update the `ner` pipeline, to teach the model how to recognize new entities. values in the dictionary must be lists of tuples, each with
3 items: `start_position`, `end_position`, `entity_label`. Sample train data:
```python
# Note: If you're using an existing model, make sure to mix in examples of
# other entity types that spaCy correctly recognized before. Otherwise, your
# model might learn the new type, but "forget" what it previously knew.
# https://explosion.ai/blog/pseudo-rehearsal-catastrophic-forgetting
TRAIN_DATA = [
    (
        "Horses are too tall and they pretend to care about your feelings",
        {"entities": [(0, 6, LABEL)]},
    ),
    ("Do they bite?", {"entities": []}),
    (
        "horses are too tall and they pretend to care about your feelings",
        {"entities": [(0, 6, LABEL)]},
    ),
    ("horses pretend to care about your feelings", {"entities": [(0, 6, LABEL)]}),
    (
        "they pretend to care about your feelings, those horses",
        {"entities": [(48, 54, LABEL)]},
    ),
    ("horses?", {"entities": [(0, 6, LABEL)]}),
]
```
### `heads` & `deps`
Used to train the dependency parser. NOT SURE HOW THIS WORKS YET. Sample train data:
```python
TRAIN_DATA = [
    (
        "They trade mortgage-backed securities.",
        {
            "heads": [1, 1, 4, 4, 5, 1, 1],
            "deps": ["nsubj", "ROOT", "compound", "punct", "nmod", "dobj", "punct"],
        },
    ),
    (
        "I like London and Berlin.",
        {
            "heads": [1, 1, 1, 2, 2, 1],
            "deps": ["nsubj", "ROOT", "dobj", "cc", "conj", "punct"],
        },
    ),
]
```
### `tags`
Train the Part-of-speech tagger. Sample train data:
```python
# You need to define a mapping from your data's part-of-speech tag names to the
# Universal Part-of-Speech tag set, as spaCy includes an enum of these tags.
# See here for the Universal Tag Set:
# http://universaldependencies.github.io/docs/u/pos/index.html
# You may also specify morphological features for your tags, from the universal
# scheme.
TAG_MAP = {"N": {"pos": "NOUN"}, "V": {"pos": "VERB"}, "J": {"pos": "ADJ"}}

# Usually you'll read this in, of course. Data formats vary. Ensure your
# strings are unicode and that the number of tags assigned matches spaCy's
# tokenization. If not, you can always add a 'words' key to the annotations
# that specifies the gold-standard tokenization, e.g.:
# ("Eatblueham", {'words': ['Eat', 'blue', 'ham'], 'tags': ['V', 'J', 'N']})
TRAIN_DATA = [
    ("I like green eggs", {"tags": ["N", "V", "J", "N"]}),
    ("Eat blue ham", {"tags": ["V", "J", "N"]}),
]
```

### `cats`
Used when training a text classification model. data must be lists of tuples and corresponding class:
```python
TRAIN_DATA = [
    ("I like green eggs", 'POSITIVE'),
    ("I hate breakfast", 'NEGATIVE'),
]
```

### Training

```python
nlp = spacy.blank('en')
optimizer = nlp.begin_training()
for i in range(20):
    random.shuffle(TRAIN_DATA)
    for text, annotations in TRAIN_DATA:
        nlp.update([text], [annotations], sgd=optimizer)
nlp.to_disk("/model")
```

## Entity recognizer update
Teach the entity recognizer to recognize a new category. For example, you may want spacy to recognize instances of
"Iphone" and apply the entity label `GADGET`.

spaCy recognizes the following entities out of the box:

Type | Description
-----|------------
PERSON | People, including fictional.
NORP | Nationalities or religious or political groups.
FAC | Buildings, airports, highways, bridges, etc.
ORG | Companies, agencies, institutions, etc.
GPE | Countries, cities, states.
LOC | Non-GPE locations, mountain ranges, bodies of water.
PRODUCT | Objects, vehicles, foods, etc. (Not services.)
EVENT | Named hurricanes, battles, wars, sports events, etc.
WORK_OF_ART | Titles of books, songs, etc.
LAW | Named documents made into laws.
LANGUAGE | Any named language.
DATE | Absolute or relative dates or periods.
TIME | Times smaller than a day.
PERCENT | Percentage, including "%".
MONEY | Monetary values, including unit.
QUANTITY | Measurements, as of weight or distance.
ORDINAL | "first", "second", etc.
CARDINAL | Numerals that do not fall under another type



## Training a text classifier

[From this kaggle kernel](https://www.kaggle.com/poonaml/text-classification-using-spacy):
>SpaCy provides classification model with multiple, non-mutually exclusive labels. You can change the model architecture rather easily, but by default, the TextCategorizer class uses a convolutional neural network to assign position-sensitive vectors to each word in the document. The TextCategorizer uses its own CNN model, to avoid sharing weights with the other pipeline components. The document tensor is then summarized by concatenating max and mean pooling, and a multilayer perceptron is used to predict an output vector of length nr_class, before a logistic activation is applied elementwise. The value of each output neuron is the probability that some class is present.

### TextCategorizer
This is the component plugged into the pipeline that runs the classification. [Docs](https://spacy.io/api/textcategorizer). Usage:
```python
# If no other TextCategorizer exists:
textcat = nlp.create_pipe(
    "textcat",
    config={
        "exclusive_classes": True,
        "architecture": "simple_cnn",
    }
)
nlp.add_pipe(textcat, last=True)

# add label to text classifier
textcat.add_label("POSITIVE")
textcat.add_label("NEGATIVE")
```

config options:
- `exclusive classes`: Make categories mutually exclusive. Defaults to `False`.
- ` architecture`: Model architecture to use, see [architectures](https://spacy.io/api/textcategorizer#architectures) for details. Defaults to "ensemble".

### Training loop: `nlp.update`



In [10]:
import spacy
from spacy.util import minibatch, compounding
import numpy as np
nlp = spacy.load('en_core_web_sm')

In [6]:
#functions from spacy documentation
def load_data(train, limit=0, split=0.8):
    train_data = train
    np.random.shuffle(train_data)
    train_data = train_data[-limit:]
    texts, labels = zip(*train_data)
    cats = [{'POSITIVE': bool(label)} for label in labels]
    split = int(len(train_data) * split)
    return (texts[:split], cats[:split]), (texts[split:], cats[split:])

def evaluate(tokenizer, textcat, texts, cats):
    docs = (tokenizer(text) for text in texts)
    tp = 1e-8  # True positives
    fp = 1e-8  # False positives
    fn = 1e-8  # False negatives
    tn = 1e-8  # True negatives
    for i, doc in enumerate(textcat.pipe(docs)):
        gold = cats[i]
        for label, score in doc.cats.items():
            if label not in gold:
                continue
            if score >= 0.5 and gold[label] >= 0.5:
                tp += 1.
            elif score >= 0.5 and gold[label] < 0.5:
                fp += 1.
            elif score < 0.5 and gold[label] < 0.5:
                tn += 1
            elif score < 0.5 and gold[label] >= 0.5:
                fn += 1
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f_score = 2 * (precision * recall) / (precision + recall)
    return {'textcat_p': precision, 'textcat_r': recall, 'textcat_f': f_score}

#("Number of texts to train from", int)
n_texts=30000
#You can increase texts count if you have more computational power. Limits the number of samples to use.

#("Number of training iterations", int))
n_iter=10

In [12]:
train = [
    ("I ordered this product specifically to try out the three different types of products. However, when I received the product, I opened it up only to find three of the Barn Burner salsas - obviously not what I had ordered.<br /><br />Buy at your own risk - who knows what you'll end up getting.",
  0),
    ('Terrific - easy to use and great taste.  Only complaint is that the price is MUCH too high.',
  0),
    ('I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than  most.',
  1),
    ("I ordered this product specifically to try out the three different types of products. However, when I received the product, I opened it up only to find three of the Barn Burner salsas - obviously not what I had ordered.<br /><br />Buy at your own risk - who knows what you'll end up getting.",
  0),
    ('Terrific - easy to use and great taste.  Only complaint is that the price is MUCH too high.',
  0),
    ('I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than  most.',
  1),
    ("I ordered this product specifically to try out the three different types of products. However, when I received the product, I opened it up only to find three of the Barn Burner salsas - obviously not what I had ordered.<br /><br />Buy at your own risk - who knows what you'll end up getting.",
  0),
    ('Terrific - easy to use and great taste.  Only complaint is that the price is MUCH too high.',
  0),
    ('I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than  most.',
  1),
    ("I ordered this product specifically to try out the three different types of products. However, when I received the product, I opened it up only to find three of the Barn Burner salsas - obviously not what I had ordered.<br /><br />Buy at your own risk - who knows what you'll end up getting.",
  0),
    ('Terrific - easy to use and great taste.  Only complaint is that the price is MUCH too high.',
  0),
    ('I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than  most.',
  1),
    ("I ordered this product specifically to try out the three different types of products. However, when I received the product, I opened it up only to find three of the Barn Burner salsas - obviously not what I had ordered.<br /><br />Buy at your own risk - who knows what you'll end up getting.",
  0),
    ('Terrific - easy to use and great taste.  Only complaint is that the price is MUCH too high.',
  0),
    ('I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than  most.',
  1),
]

In [17]:
# add the text classifier to the pipeline if it doesn't exist
# nlp.create_pipe works for built-ins that are registered with spaCy
if 'textcat' not in nlp.pipe_names:
    textcat = nlp.create_pipe('textcat')
    nlp.add_pipe(textcat, last=True)
# otherwise, get it, so we can add labels to it
else:
    textcat = nlp.get_pipe('textcat')

# add label to text classifier
textcat.add_label('POSITIVE')

# load the dataset
print("Loading food reviews data...")
(train_texts, train_cats), (dev_texts, dev_cats) = load_data(train, limit=n_texts)
print("Using {} examples ({} training, {} evaluation)"
      .format(len(train_texts) + len(dev_texts), len(train_texts), len(dev_texts)))
train_data = list(zip(train_texts,
                      [{'cats': cats} for cats in train_cats]))

Loading food reviews data...
Using 15 examples (12 training, 3 evaluation)


In [14]:
# get names of other pipes to disable them during training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
with nlp.disable_pipes(*other_pipes):  # only train textcat
    optimizer = nlp.begin_training()
    print("Training the model...")
    print('{:^5}\t{:^5}\t{:^5}\t{:^5}'.format('LOSS', 'P', 'R', 'F'))
    for i in range(n_iter):
        losses = {}
        # batch up the examples using spaCy's minibatch
        batches = minibatch(train_data, size=compounding(4., 32., 1.001))
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, sgd=optimizer, drop=0.2,
                       losses=losses)
        with textcat.model.use_params(optimizer.averages):
            # evaluate on the dev data split off in load_data()
            scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
        print('{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}'  # print a simple table
              .format(losses['textcat'], scores['textcat_p'],
                      scores['textcat_r'], scores['textcat_f']))

Training the model...
LOSS 	  P  	  R  	  F  
0.048	0.500	0.000	0.000
0.022	0.500	0.000	0.000
0.018	1.000	1.000	1.000
0.015	1.000	1.000	1.000
0.013	1.000	1.000	1.000
0.004	1.000	1.000	1.000
0.004	1.000	1.000	1.000
0.000	1.000	1.000	1.000
0.001	1.000	1.000	1.000
0.001	1.000	1.000	1.000


In [16]:
# test the trained model
test_text1 = 'This tea is fun to watch as the flower expands in the water. Very smooth taste and can be used again and again in the same day. If you love tea, you gotta try these "flowering teas"'
test_text2="I bought this product at a local store, not from this seller.  I usually use Wellness canned food, but thought my cat was bored and wanted something new.  So I picked this up, knowing that Evo is a really good brand (like Wellness).<br /><br />It is one of the most disgusting smelling cat foods I've ever had the displeasure of using.  I was gagging while trying to put it into the bowl.  My cat took one taste and walked away, and chose to eat nothing until I replaced it 12 hours later with some dry food.  I would try another flavor of their food - since I know it's high quality - but I wouldn't buy the duck flavor again."
doc = nlp(test_text2)
test_text1, doc.cats

('This tea is fun to watch as the flower expands in the water. Very smooth taste and can be used again and again in the same day. If you love tea, you gotta try these "flowering teas"',
 {'POSITIVE': 0.16774575412273407})