In [1]:
import pandas as pd

In [2]:
def load_data(csv_file, split=0.9):
    data = pd.read_csv(csv_file)
    
    # Shuffle data
    train_data = data.sample(frac=1, random_state=7)
    
    texts = train_data.text.values
    labels = [{"POSITIVE": bool(y), "NEGATIVE": not bool(y)}
              for y in train_data.sentiment.values]
    split = int(len(train_data) * split)
    
    train_labels = [{"cats": labels} for labels in labels[:split]]
    val_labels = [{"cats": labels} for labels in labels[split:]]
    
    return texts[:split], train_labels, texts[split:], val_labels

train_texts, train_labels, val_texts, val_labels = load_data('./input/yelp_ratings.csv')

In [3]:
print('Texts from training data\n------')
print(train_texts[:2])
print('\nLabels from training data\n------')
train_labels[:2]

Texts from training data
------
["Some of the best sushi I've ever had....and I come from the East Coast.  Unreal toro, have some of it's available."
 "One of the best burgers I've ever had and very well priced. I got the tortilla burger and is was delicious especially with there tortilla soup!"]

Labels from training data
------


[{'cats': {'POSITIVE': True, 'NEGATIVE': False}},
 {'cats': {'POSITIVE': True, 'NEGATIVE': False}}]

In [4]:
import spacy

# Create an empty model
nlp = spacy.blank("en")

# Create the TextCategorizer with exclusive classes and "bow" architecture
textcat = nlp.create_pipe(
            "textcat",
            config={
                "exclusive_classes": True,
                "architecture": "bow"})
nlp.add_pipe(textcat)

# Add NEGATIVE and POSITIVE labels to text classifier
textcat.add_label("NEGATIVE")
textcat.add_label("POSITIVE")

1

In [5]:
from spacy.util import minibatch
import random

def train(model, train_data, optimizer, batch_size=8):
        losses = {}
        random.shuffle(train_data)
        batches = minibatch(train_data, size=batch_size)
        for batch in batches:
            texts, labels = zip(*batch)
            model.update(texts, labels, sgd=optimizer, losses=losses)
        return losses

In [6]:
# Fix seed for reproducibility
spacy.util.fix_random_seed(1)
random.seed(1)

optimizer = nlp.begin_training()
train_data = list(zip(train_texts, train_labels))
losses = train(nlp, train_data, optimizer)
print(losses['textcat'])

8.704142065520301


In [7]:
text = "This tea cup was full of holes. Do not recommend."
doc = nlp(text)
print(doc.cats)

{'NEGATIVE': 0.7737048864364624, 'POSITIVE': 0.22629515826702118}


In [8]:
def predict(model, texts):
            # Use the tokenizer to tokenize each input text example
            docs = [model.tokenizer(text) for text in texts]

            # Use textcat to get the scores for each doc
            textcat = model.get_pipe('textcat')
            scores, _ = textcat.predict(docs)

            # From the scores, find the class with the highest score/probability
            predicted_class = scores.argmax(axis=1)

            return predicted_class

In [9]:
texts = val_texts[34:38]
predictions = predict(nlp, texts)

for p, t in zip(predictions, texts):
    print(f"{textcat.labels[p]}: {t} \n")

POSITIVE: Came over and had their "Pick 2" lunch combo and chose their best selling 1/2 chicken sandwich with quinoa.  Both were tasty, the chicken salad is a bit creamy but was perfect with quinoa on the side.  This is a good lunch joint, casual and clean! 

POSITIVE: Went here last night and got oysters, fried okra, fries, and onion rings. I cannot complain. The portions were great and tasty!!! I will definitely be back for more. I cannot wait to try the crawfish boudin and soft shell crab. 

POSITIVE: This restaurant was fantastic! 
The concept of eating without vision was intriguing. The dinner was filled with laughs and good conversation. 

We were lead in a line to our table and each person to their seat. This was not just dark but you could not see something right in front of your face. 

The waiters/waitresses were all blind and allowed us to see how aware you need to be without the vision. 

Taking away one sense is said to increase your other senses so as taste and hearing wh

In [10]:
def evaluate(model, texts, labels):
        # Get predictions from textcat model
        predicted_class = predict(model, texts)

        # From labels, get the true class as a list of integers (POSITIVE -> 1, NEGATIVE -> 0)
        true_class = [int(each['cats']['POSITIVE']) for each in labels]

        # A boolean or int array indicating correct predictions
        correct_predictions = predicted_class == true_class

        # The accuracy, number of correct predictions divided by all predictions
        accuracy = correct_predictions.mean()

        return accuracy

In [11]:
accuracy = evaluate(nlp, val_texts, val_labels)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.9488


In [12]:
n_iters = 5
for i in range(n_iters):
    losses = train(nlp, train_data, optimizer)
    accuracy = evaluate(nlp, val_texts, val_labels)
    print(f"Loss: {losses['textcat']:.3f} \t Accuracy: {accuracy:.3f}")

Loss: 4.544 	 Accuracy: 0.947
Loss: 3.211 	 Accuracy: 0.948
Loss: 2.454 	 Accuracy: 0.943
Loss: 2.018 	 Accuracy: 0.944
Loss: 1.614 	 Accuracy: 0.943
