# Objective
***
Our objective here is to classify whether restaurant reviews on Yelp are positive or negative using a perceptron. 

# Imports
***
Here are the imports necessary for this notebook:

In [5]:
from argparse import Namespace
from collections import Counter
import json
import os
import re
import string

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm_notebook

In [53]:
def compute_accuracy(y_pred, y_target):
    """
    compute the percentage of binary predictions that match the targets

    Args:
        y_pred (torch.Tensor) - raw logits; a prediction counts as class 1
            when sigmoid(logit) > 0.5
        y_target (torch.Tensor) - the 0/1 labels, one per prediction
    Returns:
        accuracy as a float between 0 and 100
    """
    # flatten both sides so that a 0-d logit (a squeezed batch of one) or a
    # (batch, 1) column lines up element-for-element with the targets instead
    # of failing or broadcasting to a (batch, batch) comparison
    y_target = y_target.cpu().reshape(-1)
    y_pred_indices = (torch.sigmoid(y_pred) > 0.5).cpu().long().reshape(-1)
    n_correct = torch.eq(y_pred_indices, y_target).sum().item()
    # numel() rather than len(): len() raises TypeError on a 0-d tensor
    return n_correct / y_pred_indices.numel() * 100

# Data Vectorization Classes
***
These classes are used to transform text data into a vectorized form.

## `Vocabulary`
***
The `Vocabulary` object manages the bijection between each token and its integer index.

The user can add new tokens with the index autoincrementing. We allow for there to be a "UNK" token, which stands for the "unknown" tokens. This is useful for handling tokens that were never seen in training. 

We can use `add_token()` to add new tokens to the `Vocabulary`, `lookup_token()` to retrieve the index for a token, and `lookup_index()` to retrieve a token given the index.

In [41]:
class Vocabulary(object):
    """
    Class to process text and extract Vocabulary for mapping

    Keeps two dictionaries in sync: token -> index and index -> token.
    New tokens receive the next free index (the current vocabulary size).
    """
    def __init__(self, token_to_idx=None, add_unk=True, unk_token="<UNK>"):
        """
        Args:
            token_to_idx (dict) - a map from tokens to indices
            add_unk (bool) - a flag that indicates whether to add the UNK token
            unk_token (str) - the UNK token to add into the Vocabulary
        """
        # if no dictionary passed, start from an empty one
        if token_to_idx is None:
            token_to_idx = {}
        # the mapping is stored by reference (not copied)
        self._token_to_idx = token_to_idx
        # invert dictionary to have an idx -> token mapping
        self._idx_to_token = {idx: token for token, idx in self._token_to_idx.items()}
        # whether or not to keep a token for unknown words
        self._add_unk = add_unk
        # token to be used for storing unknown words
        self._unk_token = unk_token
        # -1 means "no UNK token"; lookup_token() checks for this sentinel
        self.unk_index = -1
        if add_unk:
            # add_token() returns the existing index when the UNK token is
            # already present (e.g. when rebuilt from a serialized mapping)
            self.unk_index = self.add_token(unk_token)

    def to_serializable(self):
        """
        returns a dictionary that can be serialized

        The keys match the keyword arguments of __init__ so that
        from_serializable() can rebuild the object with cls(**contents).
        """
        # no backslash continuations: the braces already continue the line
        return {"token_to_idx": self._token_to_idx,
                "add_unk": self._add_unk,
                "unk_token": self._unk_token}

    @classmethod
    def from_serializable(cls, contents):
        """
        instantiates the Vocabulary from a serialized dictionary
        Args:
            contents (dict): a dictionary of the form returned by to_serializable
        Returns:
            an instance of Vocabulary initialized with contents
        """
        return cls(**contents)

    def add_token(self, token):
        """
        updates the mapping dict based on the token

        Args:
            token - (str) the item to add into the vocabulary
        Returns:
            index - (int) the integer corresponding to the token
        """
        # if token already added, reuse its index
        if token in self._token_to_idx:
            index = self._token_to_idx[token]
        else:
            # next available index is the current vocabulary size
            index = len(self._token_to_idx)
            # keep both directions of the mapping in sync
            self._token_to_idx[token] = index
            self._idx_to_token[index] = token
        return index

    def add_many(self, tokens):
        """
        add a list of tokens into the vocabulary

        Args:
            tokens - (list) a list of tokens as strings
        Returns:
            indices- (list) a list of indices corresponding to the tokens
        """
        return [self.add_token(token) for token in tokens]

    def lookup_token(self, token):
        """
        retrieve the index associated with the token or the UNK index if the token isn't present

        Args:
            token - (str) the token to look up
        Returns:
            index - (int) the index corresponding to the token
        Raises:
            KeyError - if the token is missing and the Vocabulary has no UNK token
        """
        if self.unk_index >= 0:
            return self._token_to_idx.get(token, self.unk_index)
        else:
            return self._token_to_idx[token]

    def lookup_index(self, index):
        """
        return the token associated with the index

        Args:
            index - (int) the index to look up
        Returns:
            token - (str) the token corresponding to the index
        Raises:
            KeyError - if the index is not in the Vocabulary
        """
        if index not in self._idx_to_token:
            # "{}" rather than "{0:d}": the latter raises ValueError for a
            # non-integer index, masking the intended KeyError
            raise KeyError("the index {} is not in the Vocabulary".format(index))
        return self._idx_to_token[index]

    def __str__(self):
        return "Vocabulary(size={0:d})".format(len(self))

    def __len__(self):
        return len(self._token_to_idx)

## `ReviewVectorizer`
***
The second stage of going from a text dataset to a vectorized minibatch is to iterate through the tokens of an input data point and convert each token to its integer form. The result of this iteration should be a vector. Because this vector will be combined with vectors from other data points, there is a constraint that the vectors produced by the `Vectorizer` should always have the same length. 

The `Vectorizer` must encapsulate the review `Vocabulary`, which maps words in the review to integers. 

The `Vectorizer` uses `@classmethod` decorator for `from_dataframe()` to indicate an entry point to instantiating the `Vectorizer`. 

`from_dataframe()` iterates over the rows of a Pandas DataFrame to count the frequency of every token in the dataset, and creates a `Vocabulary` that keeps only the tokens occurring more than `cutoff` times.

`vectorize()` encapsulates the core functionality of the `Vectorizer`. It takes a review and returns a vectorized representation of the review. In this example, we use a collapsed one-hot representation: a single binary vector with a 1 at the index of every vocabulary word that appears in the review.

This representation creates a binary vector that has length equal to the size of the vocabulary. This representation has some limitations - it is sparse (the number of unique words in a given single review will always be much smaller than the number of unique words in the `Vocabulary`) and it discards any information about the order of the words.

In [42]:
class ReviewVectorizer(object):
    """
    pairs a review-word Vocabulary with a rating-label Vocabulary and uses
    them to turn review text into fixed-length vectors
    """
    def __init__(self, review_vocab, rating_vocab):
        """
        Args:
            review_vocab - (Vocabulary) maps words to integers
            rating_vocab - (Vocabulary) maps class labels to integers
        """
        self.review_vocab = review_vocab
        self.rating_vocab = rating_vocab

    def vectorize(self, review):
        """
        build the collapsed one-hot vector for a review

        inputs:
        review - (str) the review, tokens separated by single spaces
        outputs:
        one_hot - (np.array) float32 vector as long as the review vocabulary,
                  1 at the index of every token present and 0 elsewhere
        """
        vector = np.zeros(len(self.review_vocab), dtype=np.float32)
        for piece in review.split(" "):
            # substring test against string.punctuation: skips punctuation
            # tokens and also the empty tokens left by repeated spaces
            if piece in string.punctuation:
                continue
            vector[self.review_vocab.lookup_token(piece)] = 1
        return vector

    @classmethod
    def from_dataframe(cls, review_df, cutoff=25):
        """
        build a vectorizer from the dataset dataframe

        Args:
            review_df - (DataFrame) review dataset with "rating" and "review" columns
            cutoff - (int) a word is kept only if it occurs more than this many times
        Returns:
            ReviewVectorizer instance
        """
        review_vocab = Vocabulary(add_unk=True)
        rating_vocab = Vocabulary(add_unk=False)

        # labels are indexed in sorted order
        for label in sorted(set(review_df["rating"])):
            rating_vocab.add_token(label)

        # count every non-punctuation token across all reviews
        frequencies = Counter(
            word
            for text in review_df["review"]
            for word in text.split(" ")
            if word not in string.punctuation
        )

        # keep the words that are strictly more frequent than the cutoff,
        # in order of first appearance
        for word in (w for w, seen in frequencies.items() if seen > cutoff):
            review_vocab.add_token(word)

        return cls(review_vocab, rating_vocab)

    @classmethod
    def from_serializable(cls, contents):
        """
        rebuild a ReviewVectorizer from a serializable dictionary

        Args:
            contents - (dict) dictionary of the form returned by to_serializable
        Returns:
            ReviewVectorizer instance
        """
        vocabs = {
            name: Vocabulary.from_serializable(contents[name])
            for name in ("review_vocab", "rating_vocab")
        }
        return cls(**vocabs)

    def to_serializable(self):
        """
        create the serializable dictionary for caching

        outputs:
        contents - (dict) the serialized form of both vocabularies
        """
        review_part = self.review_vocab.to_serializable()
        rating_part = self.rating_vocab.to_serializable()
        return {"review_vocab": review_part, "rating_vocab": rating_part}

## `ReviewDataset`
***
The `ReviewDataset` assumes that the dataset has been minimally cleaned and split into three partitions (train, validation, and test). It also assumes that a review can be tokenized by splitting on whitespace, and that each row carries an annotation naming the split it belongs to. The class inherits from PyTorch's `Dataset` class so that PyTorch's utilities (such as `DataLoader`) can work with the dataset; to satisfy that API we implement the `__getitem__` and `__len__` methods.

In [40]:
class ReviewDataset(Dataset):
    """
    PyTorch Dataset over a review DataFrame that carries a "split" column

    Only one split ("train", "val" or "test") is active at a time; __len__
    and __getitem__ operate on the active split, selected with set_split().
    """
    def __init__(self, review_df, vectorizer):
        """
        Args:
            review_df (pandas.DataFrame) - the dataset
            vectorizer (ReviewVectorizer) - vectorizer instantiated from dataset
        """
        self.review_df = review_df
        self._vectorizer = vectorizer

        self.train_df = self.review_df[self.review_df['split'] == "train"]
        self.train_size = len(self.train_df)

        self.val_df = self.review_df[self.review_df["split"] == 'val']
        self.validation_size = len(self.val_df)

        self.test_df = self.review_df[self.review_df["split"] == "test"]
        self.test_size = len(self.test_df)

        # split name -> (rows of that split, number of rows)
        self._lookup_dict = {"train": (self.train_df, self.train_size),
                             "val"  : (self.val_df, self.validation_size),
                             "test" : (self.test_df, self.test_size)}
        # select the split in the dataset to be the training set
        self.set_split("train")

    @classmethod
    def load_dataset_and_make_vectorizer(cls, review_csv):
        """
        load dataset and make new vectorizer

        Args:
            review_csv - (str) location of dataset
        Returns:
            instance of ReviewDataset
        """
        review_df = pd.read_csv(review_csv)
        # NOTE(review): the vectorizer is built from every row, so the
        # vocabulary also sees the val/test reviews; consider building it
        # from the "train" rows only.
        return cls(review_df, ReviewVectorizer.from_dataframe(review_df))

    def get_vectorizer(self):
        """
        returns the vectorizer used by this dataset
        """
        return self._vectorizer

    def set_split(self, split="train"):
        """
        selects the splits in the dataset using a column in the dataframe

        Args:
            split - (str) one of 'train', 'val', or 'test'
        Raises:
            KeyError - if split is not one of the three known names
        """
        self._target_split = split
        self._target_df, self._target_size = self._lookup_dict[split]

    def __len__(self):
        # number of rows in the currently selected split
        return self._target_size

    def __getitem__(self, index):
        """
        primary entrypoint for PyTorch Dataset API

        Args:
            index - (int) index to the data point
        Returns:
            a dictionary holding the data points features (x_data) and label (y_target)
        """
        # positional lookup within the active split
        row = self._target_df.iloc[index]
        review_vector = self._vectorizer.vectorize(row.review)
        rating_index = self._vectorizer.rating_vocab.lookup_token(row.rating)
        return {"x_data": review_vector, "y_target": rating_index}

    def get_num_batches(self, batch_size):
        """
        given a batch size, return number of batches in dataset

        Args:
            batch_size - (int)
        Returns:
            number of full batches in the active split (a trailing partial
            batch is not counted)
        """
        return len(self) // batch_size

    @classmethod
    def load_dataset_and_load_vectorizer(cls, review_csv, vectorizer_filepath):
        """
        this loads the dataset, as well as the corresponding vectorizer
        this is used after the vectorizer has been cached for repeated use

        inputs:
        review_csv - (str) location of dataset
        vectorizer_filepath - (str) location of saved vectorizer
        outputs:
        an instance of ReviewDataset
        """
        review_df = pd.read_csv(review_csv)
        vectorizer = cls.load_vectorizer_only(vectorizer_filepath)
        return cls(review_df, vectorizer)

    @staticmethod
    def load_vectorizer_only(vectorizer_filepath):
        """
        static method for loading the vectorizer from file

        inputs:
        vectorizer_filepath - (str) location of serialized vectorizer
        outputs:
        instance of ReviewVectorizer
        """
        with open(vectorizer_filepath) as fp:
            return ReviewVectorizer.from_serializable(json.load(fp))

    def save_vectorizer(self, vectorizer_filepath):
        """
        save vectorizer to disk using json

        inputs:
        vectorizer_filepath - (str) location to save the vectorizer
        """
        with open(vectorizer_filepath, "w") as fp:
            # the method is to_serializable(); the previous spelling
            # ("to_serialzable") raised AttributeError on every call
            json.dump(self._vectorizer.to_serializable(), fp)

## DataLoader
***
The final stage of the text-to-vectorized-minibatch pipeline is to actually group the vectorized data points. 

PyTorch provides a built-in class called `DataLoader` for coordinating this process. 

We wrap the `DataLoader` inside the `generate_batches()` function, a generator that moves every tensor of each batch onto the requested device (CPU or GPU) before yielding it.

In [43]:
def generate_batches( dataset, batch_size, shuffle=True, drop_last=True, device="cpu"):
    """
    a generator function wrapping PyTorch DataLoader.
    Yields one dictionary per batch, with every tensor moved to `device`.

    Args:
        dataset - the dataset handed to DataLoader
        batch_size - (int) number of data points per batch
        shuffle - (bool) forwarded to DataLoader
        drop_last - (bool) forwarded to DataLoader
        device - target passed to each tensor's .to()
    """
    loader = DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
    )

    for batch in loader:
        yield {field: values.to(device) for field, values in batch.items()}

# Preprocessing
***


Here, we set the Namespace full of parameters for our workspace

In [14]:
# Settings for the preprocessing stage: where the raw Yelp CSVs live and
# where the munged file (reviews plus a "split" column) is read from.
# NOTE(review): train_proportion, val_proportion and seed are not consumed
# by any cell visible in this notebook - presumably they belong to the
# splitting step that produced output_munged_csv; confirm before relying
# on them.
args = Namespace(
    raw_train_dataset_csv="data/yelp/raw_train.csv",
    raw_test_dataset_csv="data/yelp/raw_test.csv",
    train_proportion=0.7,
    val_proportion=0.3,
    output_munged_csv="data/yelp/reviews_with_splits_lite.csv",
    seed=1337
)

Now we can read in the raw data into DataFrames

In [8]:
# the raw files have no header row: name the two columns ourselves, then
# keep only the rows whose review text is present
train_reviews = pd.read_csv(args.raw_train_dataset_csv, header=None, names=["rating", "review"])
train_reviews = train_reviews[train_reviews.review.notnull()]
test_reviews = pd.read_csv(args.raw_test_dataset_csv, header=None, names=["rating", "review"])
test_reviews = test_reviews[test_reviews.review.notnull()]

We can look at the first few entries of our different subsets of the data

In [9]:
# preview the first rows of the raw training reviews
train_reviews.head()

Unnamed: 0,rating,review
0,1,"Unfortunately, the frustration of being Dr. Go..."
1,2,Been going to Dr. Goldberg for over 10 years. ...
2,1,I don't know what Dr. Goldberg was like before...
3,1,I'm writing this review to give you a heads up...
4,2,All the food is great here. But the best thing...


In [10]:
# preview the first rows of the raw test reviews
test_reviews.head()

Unnamed: 0,rating,review
0,1,Ordered a large Mango-Pineapple smoothie. Stay...
1,2,Quite a surprise! \n\nMy wife and I loved thi...
2,1,"First I will say, this is a nice atmosphere an..."
3,2,I was overall pretty impressed by this hotel. ...
4,1,Video link at bottom review. Worst service I h...


We can look at the number of each different kind of class in the training set

In [11]:
# class balance check: number of training rows per raw rating value
train_reviews.rating.value_counts()

2    280000
1    280000
Name: rating, dtype: int64

We define a simplistic method for preprocessing the text

In [23]:
def preprocess_text( text ):
    """
    normalize a review so that it can be tokenized by splitting on spaces

    Args:
        text (str) - the raw review text
    Returns:
        the lower-cased text with . , ! ? set off by spaces and every run of
        other non-letter characters collapsed into a single space
    """
    lowered = text.lower()
    # surround the four kept punctuation marks with spaces
    spaced = re.sub(r"([.,!?])", r" \1 ", lowered)
    # anything that is neither a letter nor kept punctuation becomes one space
    return re.sub(r"[^a-zA-Z.,!?]+", r" ", spaced)

In [36]:
# load the munged review file; the preview further down shows rating,
# review and split columns
final_reviews = pd.read_csv(args.output_munged_csv)

We can apply our custom preprocessing function by using `Series.apply`

In [37]:
# normalize every review in place.
# NOTE(review): this frame is not written back to disk in the cells shown,
# and the training setup re-reads the CSV from training_args.review_csv, so
# the model is trained on whatever text is stored in that file - confirm
# the file is already preprocessed.
final_reviews.review = final_reviews.review.apply(preprocess_text)

In [38]:
# class balance check: number of rows per rating label
final_reviews.rating.value_counts()

negative    28000
positive    28000
Name: rating, dtype: int64

In [39]:
# preview the first rows after preprocessing
final_reviews.head()

Unnamed: 0,rating,review,split
0,negative,terrible place to work for i just heard a stor...,train
1,negative,"hours , minutes total time for an extremely s...",train
2,negative,my less than stellar review is for service . w...,train
3,negative,i m granting one star because there s no way t...,train
4,negative,the food here is mediocre at best . i went aft...,train


# The Binary Classifier
***

In [50]:
class ReviewClassifier(nn.Module):
    """
    simple perceptron-based classifier: one linear layer producing one logit
    """
    def __init__(self, num_features):
        """
        Args:
            num_features (int) - the size of the input feature vector
        """
        super(ReviewClassifier, self).__init__()
        self.fc1 = nn.Linear(in_features=num_features, out_features=1)

    def forward(self, x_in, apply_sigmoid=False):
        """
        compute forward pass of classifier

        Args:
            x_in (torch.Tensor) - an input data tensor
                x_in.shape should be (batch, num_features)
            apply_sigmoid (bool) - a flag for the sigmoid activation should be false if used with
                cross-entropy losses
        Returns:
            the resulting tensor.
            tensor.shape should be (batch,)
        """
        # drop only the trailing unit dimension: a bare squeeze() would also
        # remove the batch dimension when batch == 1 and return a 0-d tensor,
        # which no longer matches a (1,)-shaped target
        y_out = self.fc1(x_in).squeeze(dim=-1)
        if apply_sigmoid:
            # torch.sigmoid replaces the deprecated F.sigmoid
            y_out = torch.sigmoid(y_out)
        return y_out

In [48]:
# hyperparameters and file locations for the training run
# (one setting per line; the parentheses already continue the statement)
training_args = Namespace(
    frequency_cutoff=25,
    model_state_file="model.pth",
    review_csv="data/yelp/reviews_with_splits_lite.csv",
    save_dir="data/yelp/",
    vectorizer_file="vectorizer.json",
    batch_size=128,
    early_stopping_criteria=5,
    learning_rate=0.001,
    num_epochs=100,
    seed=1337,
    cuda=True,
)

In [46]:
def make_train_state(args):
    """
    create the dictionary that tracks the progress of a training run

    Args:
        args (Namespace) - must provide learning_rate and model_state_file
    Returns:
        a fresh dict with empty per-epoch loss/accuracy histories, the test
        metrics set to -1 and the early-stopping bookkeeping reset
    """
    # read the settings from the argument that was passed in; the previous
    # version ignored `args` and silently used the global training_args
    return {'stop_early': False,
            'early_stopping_step': 0,
            'early_stopping_best_val': 1e8,
            'learning_rate': args.learning_rate,
            'epoch_index': 0,
            'train_loss': [],
            'train_acc': [],
            'val_loss': [],
            'val_acc': [],
            'test_loss': -1,
            'test_acc': -1,
            'model_filename': args.model_state_file}

# Training Loop
***
We instantiate the necessary objects to work with during training

In [70]:
train_state = make_train_state(training_args)

# fall back to the CPU when no GPU is available
if not torch.cuda.is_available():
    training_args.cuda = False
training_args.device = torch.device("cuda" if training_args.cuda else "cpu")

# apply the configured seed (it was defined but never used) so that weight
# initialization and batch shuffling are repeatable across kernel restarts
np.random.seed(training_args.seed)
torch.manual_seed(training_args.seed)
if training_args.cuda:
    torch.cuda.manual_seed_all(training_args.seed)

# dataset and vectorizer
# NOTE(review): training_args.frequency_cutoff is not passed along here;
# the vectorizer uses ReviewVectorizer.from_dataframe's default cutoff.
dataset = ReviewDataset.load_dataset_and_make_vectorizer(training_args.review_csv)
vectorizer = dataset.get_vectorizer()

# model: one input feature per vocabulary entry
classifier = ReviewClassifier(num_features=len(vectorizer.review_vocab))
classifier = classifier.to(training_args.device)

# loss and optimizer; BCEWithLogitsLoss expects raw logits, so the
# classifier is called without apply_sigmoid during training
loss_func = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(classifier.parameters(), lr=training_args.learning_rate)

# NOTE(review): the scheduler only takes effect if scheduler.step(val_loss)
# is called once per epoch; no visible cell does so.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,mode='min', factor=0.5,patience=1)

# create status bars for the training progresses
epoch_bar = tqdm_notebook(desc='training routine', 
                          total=training_args.num_epochs,
                          position=0)

dataset.set_split('train')
train_bar = tqdm_notebook(desc='split=train',
                          total=dataset.get_num_batches(training_args.batch_size), 
                          position=1, 
                          leave=True)
dataset.set_split('val')
val_bar = tqdm_notebook(desc='split=val',
                        total=dataset.get_num_batches(training_args.batch_size), 
                        position=1, 
                        leave=True)

HBox(children=(IntProgress(value=0, description='training routine', style=ProgressStyle(description_width='ini…

HBox(children=(IntProgress(value=0, description='split=train', max=306, style=ProgressStyle(description_width=…

HBox(children=(IntProgress(value=0, description='split=val', max=65, style=ProgressStyle(description_width='in…

In [None]:
# NOTE(review): this loop only trains. The validation pass lives in a
# separate cell outside the loop, so it runs once after the final epoch and
# neither the LR scheduler nor early stopping is driven per epoch.
for epoch_index in range(training_args.num_epochs):
    train_state["epoch_index"] = epoch_index
    
    dataset.set_split("train")
    batch_generator = generate_batches(dataset, batch_size=training_args.batch_size, device=training_args.device)
    running_loss = 0.0
    running_acc = 0.0
    classifier.train()
    # restart the per-batch bar; without this it keeps counting past its
    # total from the second epoch onwards
    train_bar.n = 0
    
    for batch_index, batch_dict in enumerate(batch_generator):
        # step 1: zero the gradients
        optimizer.zero_grad()
        # step 2: compute the output
        predictions = classifier(x_in=batch_dict["x_data"].float())
        # step 3: compute loss
        loss = loss_func(predictions, batch_dict["y_target"].float())
        loss_batch = loss.item()
        # incremental mean of the batch losses seen so far in this epoch
        running_loss += (loss_batch - running_loss) / (batch_index + 1)
        # step 4: use loss to produce gradients
        loss.backward()
        # step 5: use optimizer to take gradient step
        optimizer.step()
        
        # compute accuracy (incremental mean, same scheme as the loss)
        acc_batch = compute_accuracy(predictions, batch_dict["y_target"])
        running_acc += (acc_batch - running_acc) / (batch_index + 1)
        
        # update bar
        train_bar.set_postfix(loss=running_loss, acc=running_acc, epoch=epoch_index)
        train_bar.update()
        
    train_state["train_loss"].append(running_loss)
    train_state["train_acc"].append(running_acc)
    # advance the epoch-level bar, which was created but never updated
    epoch_bar.update()

Now we can iterate over the validation set

In [55]:
# now we iterate over the validation set
dataset.set_split("val")
batch_generator = generate_batches(dataset, batch_size=training_args.batch_size, device=training_args.device)

running_loss = 0.
running_acc = 0.
classifier.eval()

# evaluation only: no gradients are needed
with torch.no_grad():
    for batch_index, batch_dict in enumerate(batch_generator):
        # step 1: compute output
        predictions = classifier(x_in=batch_dict["x_data"].float())
        # step 2: compute loss (incremental mean over the batches)
        loss = loss_func(predictions, batch_dict["y_target"].float())
        loss_batch = loss.item()
        running_loss += (loss_batch - running_loss) / (batch_index + 1)
        # step 3: compute accuracy; this was missing, so val_acc was always
        # recorded as 0
        acc_batch = compute_accuracy(predictions, batch_dict["y_target"])
        running_acc += (acc_batch - running_acc) / (batch_index + 1)

train_state["val_loss"].append(running_loss)
train_state["val_acc"].append(running_acc)

Now we can report our test accuracy:

In [58]:
dataset.set_split("test")
# NOTE(review): generate_batches defaults to shuffle=True and drop_last=True,
# so a trailing partial batch of test reviews is left out of these metrics.
batch_generator = generate_batches(dataset, batch_size=training_args.batch_size, device=training_args.device)

running_loss = 0.
running_acc = 0.
classifier.eval()

# evaluation only: skip autograd bookkeeping (the metrics are unchanged)
with torch.no_grad():
    for batch_index, batch_dict in enumerate(batch_generator):
        predictions = classifier(x_in=batch_dict["x_data"].float())
        loss = loss_func(predictions, batch_dict["y_target"].float())
        loss_batch = loss.item()
        # incremental mean of the per-batch values
        running_loss += (loss_batch - running_loss) / (batch_index + 1)

        acc_batch = compute_accuracy(predictions, batch_dict["y_target"])
        running_acc += (acc_batch - running_acc) / (batch_index + 1)

train_state["test_loss"] = running_loss
train_state["test_acc"] = running_acc

print("Test loss: {:.3f}".format(train_state["test_loss"]))
print("Test accuracy: {:.2f}".format(train_state["test_acc"]))

Test loss: 0.321
Test accuracy: 90.49


# Perform Inference on a New Unseen Example
***

In [64]:
def predict_rating( review, classifier, vectorizer, decision_threshold=0.5):
    """
    classify a single, previously unseen review

    Args:
        review (str) - text of the review
        classifier (ReviewClassifier) - trained model
        vectorizer (ReviewVectorizer) - corresponding vectorizer
        decision_threshold (float) - probabilities below this value map to
            rating index 0, all others to rating index 1
    Returns:
        the rating label stored in vectorizer.rating_vocab for the chosen index
    """
    cleaned = preprocess_text(review)
    # a batch holding just this one review: shape (1, num_features)
    features = torch.tensor(vectorizer.vectorize(cleaned)).view(1,-1)
    logit = classifier(features)
    probability_value = torch.sigmoid(logit).item()
    index = 0 if probability_value < decision_threshold else 1
    return vectorizer.rating_vocab.lookup_index(index)

We hope that the review "this is a pretty awesome book" will yield a "positive" class label

In [65]:
# run the trained classifier on one hand-written review and print the label
test_review = "this is a pretty awesome book"
classifier = classifier.cpu() # move the model to the CPU; predict_rating builds its input tensor there
prediction = predict_rating(test_review, classifier, vectorizer)
print("{} -> {}".format(test_review,prediction))

this is a pretty awesome book -> positive


# Examine Weights Going into Decision Making
***

In [66]:
# one weight per vocabulary entry (the single output row of the linear layer)
fc1_weights = classifier.fc1.weight.detach()[0]
# vocabulary indices ordered from the largest weight to the smallest
indices = torch.sort(fc1_weights, dim=0, descending=True).indices.tolist()

print("Influential Words in Positive Reviews:")
print("--------------------------------------")
# the 20 largest weights push the logit furthest towards rating index 1
for rank in range(20):
    word_index = indices[rank]
    print(vectorizer.review_vocab.lookup_index(word_index))

Influential Words in Positive Reviews:
--------------------------------------
chinatown
pleasantly
mmmmmm
deliciousness
nclean
eclectic
hooked
artsy
amazed
nexcellent
heavenly
spotless
nhighly
stunning
keeper
awesomeness
chapel
coma
delectable
mmmm


In [67]:
print("Influential Words in Negative Reviews:")
print("--------------------------------------")
# read the descending-sorted index list from its tail instead of reversing
# it in place: the in-place reverse flipped the list on every run, so
# re-running this cell alternated between negative and positive words
for word_index in indices[::-1][:20]:
    print(vectorizer.review_vocab.lookup_index(word_index))

Influential Words in Negative Reviews:
--------------------------------------
slowest
cancelled
unacceptable
nmaybe
underwhelmed
operator
worst
meh
subject
canceled
gossiping
insulting
blech
mediocre
horrendous
awful
embarrassing
burden
receipts
inexcusable
