# IMDB Sentiment Example

In [None]:
import spacy

# Set to control training:
#   True  -> train the text classifier in this notebook and save it to `model_path`
#   False -> skip training and load the previously saved model from `model_path`
do_train=False

## Download Pre-trained spaCy English Model

In [None]:
# Restart kernel after installing model and skip this cell
!python -m spacy download en_core_web_md

In [None]:
# Load the downloaded pipeline; the word-vector and displaCy cells below all run text through it
nlp_pre = spacy.load('en_core_web_md')

## Word Vectors are Fun!

In [None]:
# Run the pipeline on three words so each token (and its vector) can be picked out by position
doc = nlp_pre("Queen Man Women")

In [None]:
# Name the three tokens by their position in the text
queen_token, man_token, women_token = doc[0], doc[1], doc[2]

In [None]:
# Display the word vector attached to the first token
queen_token.vector

In [None]:
# Vector arithmetic on the three tokens: man - women + queen.
# NOTE(review): the input text uses the plural "Women"; the usual analogy
# (queen - woman + man) uses the singular "Woman" — confirm which is intended.
mystery = man_token.vector - women_token.vector + queen_token.vector

In [None]:
# Display the combined vector
mystery

## What word is most similar to 'mystery'?

In [None]:
from tqdm import tqdm
from scipy import spatial

def cosine_similarity(x, y):
    """Cosine similarity of two vectors: 1 minus SciPy's cosine distance."""
    return 1 - spatial.distance.cosine(x, y)


# Score every vocabulary entry that has a vector against `mystery`,
# collecting (entry, similarity) pairs; tqdm reports progress over the vocab.
similarities = [
    (word, cosine_similarity(mystery, word.vector))
    for word in tqdm(nlp_pre.vocab)
    if word.has_vector
]

In [None]:
# Order the pairs from most to least similar (negating the score turns the ascending sort into a descending one)
similarities = sorted(similarities, key=lambda item: -item[1])

In [None]:
# Show the text of the ten best-scoring vocabulary entries
print([entry.text for entry, _score in similarities[:10]])

## Other Fun spaCy Things

In [None]:
from spacy import displacy

In [None]:
# Sample news sentence (company, money amount, place names) used for the entity visualisation in the next cell
doc = nlp_pre(u"Only months following AT&T's whopper $2.2 billion sale-leaseback deal involving property in Manhattan's Hudson Yards, the telecommunications giant has broadened its strategy to sell off even more of its real estate portfolio in similar sale-leaseback deals throughout the country.")

In [None]:
# Render the document with displaCy's entity ('ent') style
displacy.render(doc, style='ent')

In [None]:
# Short sentence used for the dependency visualisation in the next cell
doc = nlp_pre(u"This is a great first apartment.")

In [None]:
# Render the document with displaCy's dependency ('dep') style
displacy.render(doc, style='dep')

## Sentiment Analysis with spaCy

In [None]:
import thinc.extra.datasets as datasets

In [None]:
# Load the IMDB dataset through thinc's dataset helpers; `load_data` below unpacks it as (train, test).
# presumably downloads the data on first use — confirm against the thinc version in use
imdb = datasets.imdb()

In [None]:
# Peek at the first example of the first split (the one `load_data` treats as training data)
imdb[0][0]

## Load Dataset into spaCy format

In [None]:
def load_data():
    """Split the module-level `imdb` dataset into texts and category dicts.

    `imdb` is read as a (train, test) pair, each a sequence of
    (text, label) examples.

    Returns:
        ((train_texts, train_cats), (test_texts, test_cats)), where the
        texts are tuples and each cats entry is {"SENTIMENT": bool(label)}.
    """
    def _texts_and_cats(examples):
        # Unzip the examples, then turn every label into a one-key category dict
        texts, labels = zip(*examples)
        return texts, [{"SENTIMENT": bool(label)} for label in labels]

    train_split, test_split = imdb
    return _texts_and_cats(train_split), _texts_and_cats(test_split)

In [None]:
# Load the IMDB splits and report how many examples each one holds
print("Loading IMDB data...")
(train_texts, train_cats), (test_texts, test_cats) = load_data()
n_train, n_test = len(train_texts), len(test_texts)
n_texts = n_train + n_test
print("Using {} examples ({} training, {} test)".format(n_texts, n_train, n_test))

# Pair every text with its annotation dict; the training loop later unzips
# these pairs into texts and annotations for nlp.update
train_data = [(text, {"cats": cats}) for text, cats in zip(train_texts, train_cats)]
test_data = [(text, {"cats": cats}) for text, cats in zip(test_texts, test_cats)]

In [None]:
# Show the first training example: a (text, {"cats": {...}}) pair
train_data[0]

## Create spaCy Pipeline

In [None]:
# Start from a blank English pipeline
# NOTE(review): create_pipe + add_pipe(component object) looks like the spaCy 2.x API — confirm the pinned spaCy version
nlp = spacy.blank('en')

# Create the text-categorizer component
textcat = nlp.create_pipe('textcat')
# add label to text classifier
textcat.add_label('SENTIMENT')

# Append the component as the last step of the pipeline
nlp.add_pipe(textcat, last=True)

## Train Model

In [None]:
import random
import json
import os
# Directory the pipeline is saved to and loaded from: ../models/sentiment_model, relative to the working directory
model_path = os.path.join(os.path.pardir, 'models', 'sentiment_model')

In [None]:
%%time

# Training only runs when the `do_train` switch is on; otherwise this cell does nothing
if do_train:

    # Number of passes over the training data
    n_iter = 20

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "textcat"]
    with nlp.disable_pipes(*other_pipes):  # only train textcat
        optimizer = nlp.begin_training()
        print("Training the model...")
        # Created once, outside the epoch loop, so the same batch-size generator is shared by all epochs.
        # presumably yields sizes growing from 4 towards 32 by a factor of 1.001 — confirm against spacy.util.compounding
        batch_sizes = spacy.util.compounding(4.0, 32.0, 1.001)
        for i in tqdm(range(n_iter)):
            # Reset per epoch, so the loss printed below covers a single epoch
            losses = {}
            # batch up the examples using spaCy's minibatch
            # NOTE(review): shuffles the global `train_data` in place and no random seed is set,
            # so the example order differs between runs
            random.shuffle(train_data)
            batches = spacy.util.minibatch(train_data, size=batch_sizes)
            for batch in batches:
                # Unzip the (text, annotations) pairs into two parallel tuples
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)
            print('Training loss: ' + json.dumps(losses))

In [None]:
# Persist the freshly trained pipeline so later runs can load it instead of retraining
if do_train:
    # Make sure the parent directory (../models) exists before saving, so a long
    # training run is not lost to a missing-folder error; harmless if it already exists
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    nlp.to_disk(model_path)

In [None]:
# When training is skipped, replace the untrained pipeline built above with the one saved at `model_path`
if not do_train:
    nlp = spacy.load(model_path)

## Evaluate Model

In [None]:
from sklearn import metrics

In [None]:
def evaluate(test_data, model):
    """Print a scikit-learn classification report for `model` on `test_data`.

    Args:
        test_data: iterable of (text, annotations) pairs, where
            annotations['cats']['SENTIMENT'] holds the gold label.
        model: object whose pipe(texts) yields docs exposing
            doc.cats['SENTIMENT'] as a score.

    A score strictly above 0.5 counts as a positive prediction.
    The report is printed; nothing is returned.
    """
    # Walk test_data exactly once, keeping each text next to its gold label
    gold_pairs = [
        (text, annotations['cats']['SENTIMENT'])
        for text, annotations in test_data
    ]
    texts = [text for text, _ in gold_pairs]
    y_true = [label for _, label in gold_pairs]

    # Threshold the predicted score to get a boolean prediction per document
    y_pred = [doc.cats['SENTIMENT'] > 0.5 for doc in model.pipe(texts)]

    print(metrics.classification_report(y_true, y_pred))

In [None]:
# Score the model on the first 10,000 test examples only
evaluate(test_data[:10000], nlp)

## Try it out!

In [None]:
# Interactive prompt: print the model's category scores as JSON for each line typed.
# Keeps prompting until the input is exactly 'exit'.
for text_in in iter(lambda: input('Text:'), 'exit'):
    print(json.dumps(nlp(text_in).cats))