# Custom Text Classification in SpaCy

In [1]:
import pandas as pd

In [2]:
reviews = pd.read_csv('./dataset/Reviews.csv')
reviews.head()

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses


In [3]:
# extract meaningful columns
reviews = reviews[['Review Text','Recommended IND']].dropna()

In [4]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [5]:
reviews.head()

Unnamed: 0,Review Text,Recommended IND
0,Absolutely wonderful - silky and sexy and comf...,1
1,Love this dress! it's sooo pretty. i happene...,1
2,I had such high hopes for this dress and reall...,0
3,"I love, love, love this jumpsuit. it's fun, fl...",1
4,This shirt is very flattering to all due to th...,1


In [6]:
nlp.pipe_names

['tagger', 'parser', 'ner']

In [7]:
# add textcat

textcat = nlp.create_pipe("textcat", config={"exclusive_classes":True, "architecture":"simple_cnn"})
nlp.add_pipe(textcat)
nlp.pipe_names

['tagger', 'parser', 'ner', 'textcat']

In [8]:
# add labels to textcat
textcat.add_label('POSITIVE')
textcat.add_label('NEGATIVE')

1

In [9]:
# convert dataframe into list of tuples
reviews['tuples'] = reviews.apply(lambda row: (row['Review Text'], row['Recommended IND']), axis=1)
train = reviews['tuples'].tolist()
train[:10]

[('Absolutely wonderful - silky and sexy and comfortable', 1),
 ('Love this dress!  it\'s sooo pretty.  i happened to find it in a store, and i\'m glad i did bc i never would have ordered it online bc it\'s petite.  i bought a petite and am 5\'8".  i love the length on me- hits just a little below the knee.  would definitely be a true midi on someone who is truly petite.',
  1),
 ('I had such high hopes for this dress and really wanted it to work for me. i initially ordered the petite small (my usual size) but i found this to be outrageously small. so small in fact that i could not zip it up! i reordered it in petite medium, which was just ok. overall, the top half was comfortable and fit nicely, but the bottom half had a very tight under layer and several somewhat cheap (net) over layers. imo, a major design flaw was the net over layer sewn directly into the zipper - it c',
  0),
 ("I love, love, love this jumpsuit. it's fun, flirty, and fabulous! every time i wear it, i get nothing b

In [13]:
# function to load data and perform pre-processing

import random

def load_data(limit=0, split=0.8):
    train_data = train
    # shuffle data
    random.shuffle(train_data)
    texts,labels = zip(*train_data)
    # get the categories for each reviews
    cats = [{'POSITIVE': bool(y), 'NEGATIVE': not bool(y)} for y in labels]
    #spliting the training and evaluation data
    split = int(len(train_data) * split)
    return (texts[:split], cats[:split]), (texts[split:], cats[split:])

In [14]:
n_texts = 23486

(train_texts, train_cats),(dev_texts, dev_cats) = load_data(limit=n_texts)

In [15]:
# processing final format of the text data
train_data = list(zip(train_texts,[{'cats': cats} for cats in train_cats]))
train_data[:10]

[('Wow. had to check label to see if a m was shipped by accident.\r\n\r\nnice details. \r\n\r\ntoo bad!',
  {'cats': {'POSITIVE': True, 'NEGATIVE': False}}),
 ('I bought this blouse in a size 4 (my normal size) to pair with winter white slacks and gun metal pumps. it was a hit. i felt beautiful and professional. and received many compliments at work as well as at an after work gathering the quality is surprisingly sturdy given the lace work. the collar and cuffs really set the design off. this top is much more impressive in person.',
  {'cats': {'POSITIVE': True, 'NEGATIVE': False}}),
 ("I tried on this skirt in stores and loved it. the only thing is, i'm usually a 2 or a 26 and had to buy a 6. for reference, i'm 5' 7 and about 120 pounds. if you want to tuck a light sweater or really anything in, i'd recommend you go up a size as well!",
  {'cats': {'POSITIVE': True, 'NEGATIVE': False}}),
 ("This bra is obviously very pretty and feminine, but i was surprised that it's also really comf

In [17]:
# evaluation function

def evaluate(tokenizer, textcat, texts, cats):
    docs = (tokenizer(text) for text in texts)
    tp = 0.0
    tn = 0.0
    fp = 1e-8
    fn = 1e-8
    for i, doc in enumerate(textcat.pipe(cats)):
        gold = cats[i]
        for label,score in doc.cats.items():
            if label not in gold:
                continue
            if label == 'NEGATIVE':
                continue
            if score >=0.5 and gold[label] >= 0.5:
                tp += 1.0
            elif score >=0.5 and gold[label] <0.5:
                fp += 1.0
            elif score <0.5 and gold[label] < 0.5:
                tn += 1
            elif score <0.5 and gold[label] >= 0.5:
                fn += 1
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    if (precision + recall) == 0:
        f_score = 0.0
    else:
        f_score = 2 * (precision * recall) / (precision + recall)
    return {"textcat_p": precision, "textcat_r": recall, "textcat_f": f_score}

In [18]:
n_iter = 10

In [19]:
# train the model and print evaluation

from spacy.util import minibatch, compounding

# Disabling other components
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
with nlp.disable_pipes(*other_pipes):  # only train textcat
    optimizer = nlp.begin_training()

    print("Training the model...")
    print('{:^5}\t{:^5}\t{:^5}\t{:^5}'.format('LOSS', 'P', 'R', 'F'))

    # Performing training
    for i in range(n_iter):
        losses = {}
        batches = minibatch(train_data, size=compounding(4., 32., 1.001))
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, sgd=optimizer, drop=0.2,
                       losses=losses)

      # Calling the evaluate() function and printing the scores
        with textcat.model.use_params(optimizer.averages):
            scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
        print('{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}'  
              .format(losses['textcat'], scores['textcat_p'],
                      scores['textcat_r'], scores['textcat_f']))

Training the model...
LOSS 	  P  	  R  	  F  


AttributeError: 'dict' object has no attribute 'tensor'

In [20]:
# Testing the model
test_text="I hate this dress"
doc=nlp(test_text)
doc.cats 

{'POSITIVE': 0.02931312657892704, 'NEGATIVE': 0.9706868529319763}