In [1]:
from argparse import Namespace
from collections import Counter
from enum import Enum
import json
import os
import re
import string

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm_notebook

In [2]:
from typing import *

In [3]:
class Vocabulary(object):
    def __init__(self, token_to_idx:dict=None, add_unk:bool=True, unk_token:str="<UNK>"):
        """Create vocabulary and index dictionaries."""
        self._token_to_idx = {} if (token_to_idx is None) else token_to_idx
        self._idx_to_token = {idx:token for token, idx in self._token_to_idx.items()}
    
        self._add_unk = add_unk
        self._unk_token = unk_token
        
        self.unk_index = self.add_token(unk_token) if add_unk else -1
        
    def to_serializable(self):
        """Return serializable dictionary."""
        return {"token_to_idx": self._token_to_idx, "add_unk": self._add_unk, "unk_token": self._unk_token}
    
    @classmethod
    def from_serializable(cls, contents:dict):
        """Instantiate Vocabulary from serialized dict."""
        return cls(**contents)
    
    def add_token(self, token:str):
        """Add word and return its index."""
        if token in self._token_to_idx:
            index = self._token_to_idx[token]
        else:
            index = len(self._token_to_idx)
            self._token_to_idx[token] = index
            self._idx_to_token[index] = token
    
        return index
    
    def add_many(self, tokens:List[str]):
        """Add words and get their indexes."""
        return [self.add_token(token) for token in tokens]
    
    def lookup_token(self, token:str):
        """Return the index or UNK index for a given word."""
        if self.unk_index >= 0:
            idx = self._token_to_idx.get(token, self.unk_index)
        else: 
            idx = self._token_to_idx[token]
            
        return idx
    
    def lookup_index(self, index:int):
        """Return the word for a given index."""
        if index not in self._idx_to_token:
            raise KeyError(f"The index {index} is not in the Vocabulary.")
            
        return self._idx_to_token[index]
    
    def __str__(self):
        return f"<Vocabulary(size={len(self)})>"
    
    def __len__(self):
        return len(self._token_to_idx)

In [4]:
class ReviewVectorizer(object):
    def __init__(self, review_vocabulary:Vocabulary, rating_vocabulary:Vocabulary):
        """Create a vectorizer to one-hot encode words in reviews."""
        self.review_vocabulary = review_vocabulary
        self.rating_vocabulary = rating_vocabulary
        
    def vectorize(self, review:str):
        """Create one-hot encodings of words in review."""
        one_hot_encoding = np.zeros(len(self.review_vocabulary), dtype=np.float32)
        
        for token in review.split(" "):
            if token not in string.punctuation:
                one_hot_encoding[self.review_vocabulary.lookup_token(token)] = 1
                
        return one_hot_encoding
        
    @classmethod
    def from_dataframe(cls, dataset:pd.DataFrame, word_freq_filter:int=25):  
        """Instantiate vectorizer from dataset."""
        review_vocabulary = Vocabulary(add_unk=True)
        rating_vocabulary = Vocabulary(add_unk=False)
        
        for rating in sorted(set(dataset.rating)):
            rating_vocabulary.add_token(rating)
            
        word_counter = Counter()
        for review in dataset.review:
            for word in review.split(" "):
                if word not in string.punctuation:
                    word_counter[word] += 1
                    
        for word, count in word_counter.items():
            if count > word_freq_filter:
                review_vocabulary.add_token(word)
                
        return cls(review_vocabulary, rating_vocabulary)
    
    @classmethod
    def from_serializable(cls, contents:dict):
        """Instantiate vectorizer from serializable object."""
        review_vocabulary = Vocabulary.from_serializable(contents["review_vocabulary"])
        rating_vocabulary = Vocabulary.from_serializable(contents["rating_vocabulary"])
        
        return cls(review_vocabulary, rating_vocabulary)
        
    def to_serializable(self):
        """Create serializable object."""
        return {"review_vocabulary":self.review_vocabulary.to_serializable(), 
                "rating_vocabulary":self.rating_vocabulary.to_serializable()}

In [5]:
class ReviewDataset(Dataset):
    def __init__(self, dataset:pd.DataFrame, vectorizer:ReviewVectorizer):
        """Create lookup dictionary containing Tr-Val-Te datasets, and store the vocabulary vectorizer."""
        self.dataset = dataset
        self._vectorizer = vectorizer
        
        train = dataset[dataset.split=="train"]
        validation = dataset[dataset.split=="val"]
        test = dataset[dataset.split=="test"]
        
        self._lookup_dict = {
            "train": (train, len(train)),
            "validation": (validation, len(validation)),
            "test": (test, len(test))
        }
        
        self.set_split("train")
    
    @classmethod
    def load_dataset_and_make_vectorizer(cls, dataset_path:str):
        """Load dataset from file, and create vectorizer."""
        dataset = pd.read_csv(dataset_path)
        train_dataset = dataset[dataset.split == "train"]
        return cls(dataset, ReviewVectorizer.from_dataframe(train_dataset))
    
    def load_dataset_and_vectorizer(cls, dataset_path:str, vectorizer_path:str):
        """Load dataset and vectorizer from files."""
        dataset = pd.read_csv(dataset_path)
        vectorizer = self.load_vectorizer(vectorizer_path)
        return cls(dataset, vectorizer)

    @staticmethod
    def load_vectorizer(vectorizer_path:str):
        """Load vectorizer from file."""
        with open(vectorizer_path) as f:
            return ReviewVectorizer.from_serializable(json.load(f))
    
    def save_vectorizer(self, vectorizer_path:str):
        """Save serializable vectorizer."""
        with open(vectorizer_path, "w") as f:
            json.dump(self._vectorizer.to_serializable(), f)
            
    def get_vectorizer(self):
        """Return vocabulary vectorizer."""
        return self._vectorizer
    
    def set_split(self, split:str="train"):
        """Set dataset split (train, validation or test)."""
        self._target_split = split
        self._target_data, self._target_size = self._lookup_dict[split]
        
    def __len__(self):
        return self._target_size
    
    def __getitem__(self, index:int):
        """Get features and label for a review at index."""
        row = self._target_data.iloc[index]
        vector_review = self._vectorizer.vectorize(row.review)
        index_rating = self._vectorizer.rating_vocab.lookup_token(row.rating)
        return {"x":vector_review, "y":index_rating}
    
    def get_number_batches(self, batch_size:int):
        """Return number of batches according to length of the lookup dictionary split."""
        return len(self) // batch_size
    
def generate_batches(dataset:pd.DataFrame, batch_size:int, shuffle:bool=True, drop_last:bool=True, device:str="cpu"):
    dataloader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last)

    for data_dict in dataloader:
        out_data_dict = {}
        for name, tensor in data_dict.items():
            out_data_dict[name] = data_dict[name].to(device)
        yield out_data_dict

In [6]:
class ReviewClassifier(nn.Module):
    def __init__(self, num_features:int):
        """Instantiate classifier, and its layers."""
        super(ReviewClassifier, self).__init__()
        self.fc1 = nn.Linear(in_features=num_features, out_features=1)
        
    def forward(self, x:torch.Tensor, apply_sigmoid:bool=False):
        """Define forward pass."""
        y_out = self.fc1(x).squeeze()
        
        if apply_sigmoid:
            y_out = torch.sigmoid(y_out)
            
        return y_out

In [7]:
def make_train_state(args:dict):
    return {'stop_early': False,
            'early_stopping_step': 0,
            'early_stopping_best_val': 1e8,
            'learning_rate': args.learning_rate,
            'epoch_index': 0,
            'train_loss': [],
            'train_acc': [],
            'validation_loss': [],
            'validation_acc': [],
            'test_loss': [],
            'test_acc': [],
            'model_filename': args.model_state_file}

def update_train_state(args:dict, model:ReviewClassifier, train_state:dict):
    if train_state['epoch_index'] == 0:     # Save one model
        torch.save(model.state_dict(), train_state['model_filename'])
        train_state['stop_early'] = False
    elif train_state['epoch_index'] >= 1:   # Save model if improved.
        loss_t = train_state['val_loss'][-1:]

        if loss_t >= train_state['early_stopping_best_val']: # Loss is worse
            train_state['early_stopping_step'] += 1
        else: 
            torch.save(model.state_dict(), train_state['model_filename'])
            train_state['early_stopping_step'] = 0

        train_state['stop_early'] = train_state['early_stopping_step'] >= args.early_stopping_criteria

    return train_state

def compute_accuracy(y_pred:torch.Tensor, y_target:torch.Tensor):
    y_target = y_target.cpu()
    y_pred_indices = (torch.sigmoid(y_pred)>0.5).cpu().long()#.max(dim=1)[1]
    n_correct = torch.eq(y_pred_indices, y_target).sum().item()
    return n_correct / len(y_pred_indices) * 100

def set_seed_everywhere(seed:int, cuda:bool):
    np.random.seed(seed)
    torch.manual_seed(seed)
    if cuda:
        torch.cuda.manual_seed_all(seed)

def handle_dirs(dirpath:str):
    if not os.path.exists(dirpath):
        os.makedirs(dirpath)
        
args = Namespace(
    # Data and Path information
    frequency_cutoff=25,
    model_state_file='model.pth',
    review_csv='../../Chapter3/data/preprocess_data.csv',
    save_dir='../output/',
    vectorizer_file='vectorizer.json',
    # Training hyper parameters
    batch_size=128,
    early_stopping_criteria=5,
    learning_rate=0.001,
    num_epochs=100,
    seed=1337,
    # Runtime options
    catch_keyboard_interrupt=True,
    cuda=False,
    expand_filepaths_to_save_dir=True,
    reload_from_files=False,
)

if args.expand_filepaths_to_save_dir:
    args.vectorizer_file = os.path.join(args.save_dir,
                                        args.vectorizer_file)

    args.model_state_file = os.path.join(args.save_dir,
                                         args.model_state_file)
    
    print("Expanded filepaths: ")
    print("\t{}".format(args.vectorizer_file))
    print("\t{}".format(args.model_state_file))
    
# Check CUDA
if not torch.cuda.is_available():
    args.cuda = False

print("Using CUDA: {}".format(args.cuda))

args.device = torch.device("cuda" if args.cuda else "cpu")

# Set seed for reproducibility
set_seed_everywhere(args.seed, args.cuda)

# handle dirs
handle_dirs(args.save_dir)

Expanded filepaths: 
	../output/vectorizer.json
	../output/model.pth
Using CUDA: False


In [8]:
if args.reload_from_files:
    # training from a checkpoint
    print("Loading dataset and vectorizer")
    dataset = ReviewDataset.load_dataset_and_load_vectorizer(args.review_csv,
                                                            args.vectorizer_file)
else:
    print("Loading dataset and creating vectorizer")
    # create dataset and vectorizer
    dataset = ReviewDataset.load_dataset_and_make_vectorizer(args.review_csv)
    dataset.save_vectorizer(args.vectorizer_file)    
vectorizer = dataset.get_vectorizer()

classifier = ReviewClassifier(num_features=len(vectorizer.review_vocabulary))

Loading dataset and creating vectorizer


In [9]:
## Train setup ##
classifier = classifier.to(args.device)

loss_fn = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(classifier.parameters(), lr=args.learning_rate)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer, mode="min", factor=0.5, patience=1)

train_state = make_train_state(args)

epoch_bar = tqdm_notebook(desc="Training", total=args.num_epochs, position=0)

dataset.set_split("train")
train_bar = tqdm_notebook(desc="Train", total=dataset.get_number_batches(args.batch_size), position=1, leave=True)

dataset.set_split("validation")
val_bar = tqdm_notebook(desc="Validation", total=dataset.get_number_batches(args.batch_size), position=1, leave=True)

HBox(children=(IntProgress(value=0, description='Training', style=ProgressStyle(description_width='initial')),…

HBox(children=(IntProgress(value=1, bar_style='info', description='Train', max=1, style=ProgressStyle(descript…

HBox(children=(IntProgress(value=1, bar_style='info', description='Validation', max=1, style=ProgressStyle(des…

In [12]:
def step(dataset, args, split, bar=None):
    dataset.set_split(split)
    batch_generator = generate_batches(dataset=dataset, batch_size=args.batch_size, device=args.device)
    epoch_loss = 0.0
    epoch_accuracy = 0.0

    if split == "train":
        classifier.train()
    else:
        classifier.eval()

    for step, batch in enumerate(batch_generator):
        if split == "train":
            optimizer.zero_grad()
            
        y_pred = classifier(x=batch["x"].float())

        loss = loss_fn(y_pred, batch["y"].float())
        epoch_loss += (loss.item() - epoch_loss) / (step+1)

        if split == "train":
            loss.backward()
            optimizer.step()

        accuracy = compute_accuracy(y_pred, batch["y"].float())
        epoch_accuracy += (accuracy - epoch_accuracy) / (step+1)

        if bar is not None:
            bar.set_postfix(loss=epoch_loss, acc=epoch_accuracy, epoch=epoch)
            bar.update()

    train_state[split+"_loss"].append(epoch_loss)
    train_state[split+"_acc"].append(epoch_accuracy)

In [None]:
try:
    for epoch in range(args.num_epochs):
        train_state["epoch_index"] = epoch
        step(dataset=dataset, args=args, split="train", bar=train_bar)
        step(dataset=dataset, args=args, split="validation", bar=val_bar)
        
        train_state = update_train_state(args=args, model=classifier, train_state=train_state)
        
        scheduler.step(train_state["validation_loss"][-1])
        train_bar.n = 0
        val_bar.n = 0
        epoch_bar.update()
        
        if train_state["stop_early"]:
            break
        
except KeyboardInterrupt:
    print("Exiting loop")

In [18]:
#classifier.load_state_dict(torch.load(train_state["model_filename"]))
classifier = classifier.to(args.device)

step(dataset=dataset, args=args, split="test", bar=None)

print(f"Test loss: {train_state['test_loss'][-1]}")
print(f"Test accuracy: {train_state['test_acc'][-1]}")

test_review = "this is a pretty awesome book"
      
classifier = classifier.cpu()
prediction = predict_rating(review=test_review, classifier=classifier, vectorizer=vectorizer, class_threshold=0.5)

print(f"{test_review} -> {prediction}")

Test loss: 0.0
Test accuracy: 0.0
this is a pretty awesome book -> negative


In [15]:
def preprocess_text(text:str):
    text = text.lower()
    text = re.sub(r"([.,!?])", r" \1 ", text)
    text = re.sub(r"[^a-zA-Z.,!?]+", r" ", text)
    return text

In [17]:
def predict_rating(review:str, classifier:ReviewClassifier, vectorizer:ReviewVectorizer, class_threshold:float=0.5):
    review = preprocess_text(review)
    vectorized_review = torch.tensor(vectorizer.vectorize(review))
    
    result = classifier(vectorized_review.view(1, -1))
    probability = torch.sigmoid(result).item()
    index = 0 if probability < class_threshold else 1
    
    return vectorizer.rating_vocabulary.lookup_index(index)

In [74]:
opinion = Enum("opinion", "positive, negative")

def list_influential_words_for_class(op, vectorizer, num_of_words=20, word_weights=None):
    if op == opinion.positive:
        intro = "Most"
        _, indices = torch.sort(word_weights, dim=0, descending=True)
    else:
        intro = "Least"
        _, indices = torch.sort(word_weights, dim=0, descending=False)
    
    indices = indices.numpy().tolist()
    
    print(f"{intro} influential words for {op.name} Reviews:")
    print("----------------------------------------------")
    for i in range(num_of_words):
        print(vectorizer.review_vocabulary.lookup_index(indices[i]))
    print("----------\n\n\n")

In [75]:
weights_of_words = classifier.fc1.weight.detach()[0]
list_influential_words_for_class(opinion.positive, vectorizer, num_of_words=20, word_weights=weights_of_words)

Most influential words for positive Reviews:
----------------------------------------------
go
from
and
not
food
us
this
or
that
are
back
you
be
was
it
service
what
t
when
so
----------





In [76]:
list_influential_words_for_class(opinion.negative, vectorizer, num_of_words=20, word_weights=weights_of_words)

Least influential words for negative Reviews:
----------------------------------------------
just
would
a
very
good
she
my
<UNK>
as
no
were
our
n
all
had
is
only
can
to
there
----------





In [None]:
try:
    for epoch in range(args.num_epochs):
        train_state["epoch_index"] = epoch
        
        dataset.set_split("train")
        batch_generator = generate_batches(dataset=dataset, batch_size=args.batch_size, device=args.device)
        epoch_loss = 0.0
        epoch_accuracy = 0.0
        
        classifier.train()
        
        for step, batch in enumerate(batch_generator):
            optimizer.zero_grad()
            y_pred = classifier(x=batch["x"].float())
            
            loss = loss_fn(y_pred, batch["y"].float())
            epoch_loss += (loss.item() - epoch_loss) / (step+1)
        
            loss.backward()
            optimizer.step()
        
            accuracy = compute_accuracy(y_pred, batch["y"].float())
            epoch_accuracy += (accuracy - epoch_accuracy) / (step+1)
            
            train_bar.set_postfix(loss=epoch_loss, acc=epoch_accuracy, epoch=epoch)
            train_bar.update()
        
        train_state["train_loss"].append(epoch_loss)
        train_state["train_acc"].append(epoch_accuracy)
        
        
        dataset.set_split("validation")
        batch_generator = generate_batches(dataset=dataset, batch_size=args.batch_size, device=args.device)
        epoch_loss = 0.0
        epoch_accuracy = 0.0
        
        classifier.eval()
        
        for step, batch in enumerate(batch_generator):
            y_pred = classifier(x=batch["x"].float())
            
            loss = loss_fn(y_pred, batch["y"].float())
            epoch_loss += (loss.item() - epoch_loss) / (step+1)
        
            accuracy = compute_accuracy(y_pred, batch["y"].float())
            epoch_accuracy += (accuracy - epoch_accuracy) / (step+1)
            
            val_bar.set_postfix(loss=epoch_loss, acc=epoch_accuracy, epoch=epoch)
            val_bar.update()
        
        train_state["validation_loss"].append(epoch_loss)
        train_state["validation_acc"].append(epoch_accuracy)

        train_state = update_train_state(args=args, model=classifier, train_state=train_state)
        
        scheduler.step(train_state["validation_loss"][-1])
        train_bar.n = 0
        val_bar.n = 0
        epoch_bar.update()
        
        if train_state["stop_early"]:
            break
        
except KeyboardInterrupt:
    print("Exiting loop")