## YELP DATASET - Sentiment Analysis Draft

In [1]:
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
import re
import nltk
import string
from collections import Counter
from torch.utils.data import DataLoader
import numpy as np

In [2]:
cols = ["sentiment", "review"]
train_orig = pd.read_csv("../input/yelp-review-dataset/yelp_review_polarity_csv/train.csv", names=cols)
test = pd.read_csv("../input/yelp-review-dataset/yelp_review_polarity_csv/test.csv", names=cols)

In [3]:
train_ = train_orig.groupby('sentiment',as_index=False).apply(lambda x: x.sample(frac=0.3))
train_ = train_.reset_index( level = 0, drop=True)

In [4]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"([.,!?])", r" \1 ", text)
    #text = re.sub(r"[^a-zA-Z.,!?]+", r" ", text) 
    return text

In [5]:
train = train_.copy()
train.review = train.review.apply(clean_text)
test.review = test.review.apply(clean_text)

In [6]:
X_train, X_valid, y_train, y_valid = train_test_split(train["review"], train["sentiment"], test_size=0.15, random_state=42, stratify=train["sentiment"])
train = pd.concat([X_train, y_train], axis=1)
valid = pd.concat([X_valid, y_valid], axis=1)

In [7]:
train["split"] = "train"
test["split"] = "test"
valid["split"] = "val"
reviews_df = pd.concat([train, test, valid])
reviews_df.rename(columns={"sentiment": "rating"}, inplace=True)
reviews_df.head()

Unnamed: 0,review,rating,split
329348,i should have checked yelp before coming here ...,1,train
510482,when was the last time you heard rock-n-roll b...,2,train
132069,the service here just was not good . it seeme...,1,train
115598,"upon walking into the store today , i observe...",1,train
173477,just had lunch there with the homie . \n\nthe ...,2,train


In [8]:
reviews_df.to_csv("./reviews.csv")

# Dataset class

In [9]:
from torch.utils.data import Dataset

class ReviewDataset(Dataset):
    def __init__(self, review_df, vectorizer):

        self.review_df = review_df
        self._vectorizer = vectorizer

        self.train_df = self.review_df[self.review_df.split=='train']
        self.train_size = len(self.train_df)

        self.val_df = self.review_df[self.review_df.split=='val']
        self.validation_size = len(self.val_df)

        self.test_df = self.review_df[self.review_df.split=='test']
        self.test_size = len(self.test_df)

        self._lookup_dict = {'train': (self.train_df, self.train_size),
                             'val': (self.val_df, self.validation_size),
                             'test': (self.test_df, self.test_size)}

        self.set_split('train')

    @classmethod
    def load_dataset_and_make_vectorizer(cls, review_csv):

        review_df = pd.read_csv(review_csv)
        return cls(review_df, ReviewVectorizer.from_dataframe(review_df))

    def get_vectorizer(self):
        return self._vectorizer

    def set_split(self, split="train"):

        self._target_split = split
        self._target_df, self._target_size = self._lookup_dict[split]

    def __len__(self):
        return self._target_size

    def __getitem__(self, index):
        row = self._target_df.iloc[index]
        review_vector = \
            self._vectorizer.vectorize(row.review)
        rating_index = \
            self._vectorizer.rating_vocab.lookup_token(row.rating)
        return {'x_data': review_vector,
                'y_target': rating_index}

    def get_num_batches(self, batch_size):
        return len(self) // batch_size

### Vocabulary

In [10]:
class Vocabulary(object):

    def __init__(self, token_to_idx=None, add_unk=True, unk_token="<UNK>"):
        if token_to_idx is None:
            token_to_idx = {}
        self._token_to_idx = token_to_idx

        self._idx_to_token = {idx: token 
                              for token, idx in self._token_to_idx.items()}

        self._add_unk = add_unk
        self._unk_token = unk_token
        
        self.unk_index = -1
        if add_unk:
            self.unk_index = self.add_token(unk_token) 
        
        
    def to_serializable(self):
        return {'token_to_idx': self._token_to_idx, 
                'add_unk': self._add_unk, 
                'unk_token': self._unk_token}

    @classmethod
    def from_serializable(cls, contents):
        return cls(**contents)

    def add_token(self, token):
        if token in self._token_to_idx:
            index = self._token_to_idx[token]
        else:
            index = len(self._token_to_idx)
            self._token_to_idx[token] = index
            self._idx_to_token[index] = token
        return index

    def lookup_token(self, token):
        if self._add_unk:
            return self._token_to_idx.get(token, self.unk_index)
        else:
            return self._token_to_idx[token]

    def lookup_index(self, index):
        if index not in self._idx_to_token:
            raise KeyError("the index (%d) is not in the Vocabulary" % index)
        return self._idx_to_token[index]

    def __str__(self):
        return "<Vocabulary(size=%d)>" % len(self)

    def __len__(self):
        return len(self._token_to_idx)

### Vectorizer

In [11]:
class ReviewVectorizer(object):
    def __init__(self, review_vocab, rating_vocab):
        self.review_vocab = review_vocab
        self.rating_vocab = rating_vocab

    def vectorize(self, review):
        one_hot = np.zeros(len(self.review_vocab), dtype=np.float32)
        
        for token in review.split(" "):
            if token not in string.punctuation:
                one_hot[self.review_vocab.lookup_token(token)] = 1

        return one_hot

    @classmethod
    def from_dataframe(cls, review_df, cutoff=25):
        review_vocab = Vocabulary(add_unk=True)
        rating_vocab = Vocabulary(add_unk=False)

        for rating in sorted(set(review_df.rating)):
            rating_vocab.add_token(rating)

        word_counts = Counter()
        for review in review_df.review:
            for word in review.split(" "):
                if word not in string.punctuation:
                    word_counts[word] += 1
               
        for word, count in word_counts.items():
            if count > cutoff:
                review_vocab.add_token(word)

        return cls(review_vocab, rating_vocab)

    @classmethod
    def from_serializable(cls, contents):
        review_vocab = Vocabulary.from_serializable(contents['review_vocab'])
        rating_vocab =  Vocabulary.from_serializable(contents['rating_vocab'])

        return cls(review_vocab=review_vocab, rating_vocab=rating_vocab)

    def to_serializable(self):
        return {'review_vocab': self.review_vocab.to_serializable(),
                'rating_vocab': self.rating_vocab.to_serializable()}

### Dataloader

In [12]:
from torch.utils.data import DataLoader

def generate_batches(dataset, batch_size, shuffle=True,
                     drop_last=True, device="cpu"):
    dataloader = DataLoader(dataset=dataset, batch_size=batch_size,
                            shuffle=shuffle, drop_last=drop_last)

    for data_dict in dataloader:
        out_data_dict = {}
        for name, tensor in data_dict.items():
            out_data_dict[name] = data_dict[name].to(device)
        yield out_data_dict

### Model

In [13]:
import torch.nn as nn
import torch.nn.functional as F

class ReviewClassifier(nn.Module):
    def __init__(self, num_features):
        super(ReviewClassifier, self).__init__()
        self.fc1 = nn.Linear(in_features=num_features, 
                             out_features=1)

    def forward(self, x_in, apply_sigmoid=False):
        y_out = self.fc1(x_in).squeeze()
        if apply_sigmoid:
            y_out = F.sigmoid(y_out)
        return y_out

In [14]:
from argparse import Namespace

args = Namespace(

    frequency_cutoff=25,
    model_state_file='model.pth',
    review_csv='./reviews.csv',
    save_dir='./',
    vectorizer_file='vectorizer.json',
    # No model hyperparameters
    # Training hyperparameters
    batch_size=128,
    early_stopping_criteria=5,
    learning_rate=0.001,
    num_epochs=100,
    seed=1337,
)
args

Namespace(batch_size=128, early_stopping_criteria=5, frequency_cutoff=25, learning_rate=0.001, model_state_file='model.pth', num_epochs=100, review_csv='./reviews.csv', save_dir='./', seed=1337, vectorizer_file='vectorizer.json')

In [15]:
def compute_accuracy(y_pred, y_target):
    y_target = y_target.cpu()
    y_pred_indices = (torch.sigmoid(y_pred)>0.5).cpu().long()
    n_correct = torch.eq(y_pred_indices, y_target).sum().item()
    return n_correct / len(y_pred_indices) * 100

In [16]:
import torch.optim as optim 

def make_train_state(args):
    return {'epoch_index': 0,
            'train_loss': [],
            'train_acc': [],
            'val_loss': [],
            'val_acc': [],
            'test_loss': -1,
            'test_acc': -1}
train_state = make_train_state(args)

if not torch.cuda.is_available():
    args.cuda = False
args.device = torch.device("cuda" if args.cuda else "cpu")

dataset = ReviewDataset.load_dataset_and_make_vectorizer(args.review_csv)
vectorizer = dataset.get_vectorizer()

classifier = ReviewClassifier(num_features=len(vectorizer.review_vocab))
classifier = classifier.to(args.device)

loss_func = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(classifier.parameters(), lr=args.learning_rate)

In [17]:
for epoch_index in range(args.num_epochs):
    train_state['epoch_index'] = epoch_index
    dataset.set_split('train')
    batch_generator = generate_batches(dataset, 
                                       batch_size=args.batch_size, 
                                       device=args.device)
    running_loss = 0.0
    running_acc = 0.0
    classifier.train()
    
    for batch_index, batch_dict in enumerate(batch_generator):

        optimizer.zero_grad()

        y_pred = classifier(x_in=batch_dict['x_data'].float())

        loss = loss_func(y_pred, batch_dict['y_target'].float())
        loss_batch = loss.item()
        running_loss += (loss_batch - running_loss) / (batch_index + 1)

        loss.backward()

        optimizer.step()
        acc_batch = compute_accuracy(y_pred, batch_dict['y_target'])
        running_acc += (acc_batch - running_acc) / (batch_index + 1)

    train_state['train_loss'].append(running_loss)
    train_state['train_acc'].append(running_acc)
    print("Accuracy: {} \nLoss: {}".format(running_acc, running_loss))

    dataset.set_split('val')
    batch_generator = generate_batches(dataset, 
                                       batch_size=args.batch_size, 
                                       device=args.device)
    running_loss = 0.
    running_acc = 0.
    classifier.eval()

    for batch_index, batch_dict in enumerate(batch_generator):

        y_pred = classifier(x_in=batch_dict['x_data'].float())
        loss = loss_func(y_pred, batch_dict['y_target'].float())
        loss_batch = loss.item()
        running_loss += (loss_batch - running_loss) / (batch_index + 1)
        acc_batch = compute_accuracy(y_pred, batch_dict['y_target'])
        running_acc += (acc_batch - running_acc) / (batch_index + 1)

    train_state['val_loss'].append(running_loss)
    train_state['val_acc'].append(running_acc)
    

Accuracy: 89.17741031390136 
Loss: 0.3456679702606969
Accuracy: 92.76135089686103 
Loss: 0.22152302182308756
Accuracy: 93.48094170403587 
Loss: 0.19178128307309386
Accuracy: 93.92586883408062 
Loss: 0.17610216936589362
Accuracy: 94.20053251121068 
Loss: 0.16571850479985567
Accuracy: 94.49341367713001 
Loss: 0.1580053976699377
Accuracy: 94.65036434977569 
Loss: 0.15198434374792127
Accuracy: 94.83674327354271 
Loss: 0.146935841675029
Accuracy: 95.01331278026917 
Loss: 0.14257804758078288
Accuracy: 95.13663116591913 
Loss: 0.13891067819611366
Accuracy: 95.25854820627808 
Loss: 0.13561707294040748
Accuracy: 95.3685538116593 
Loss: 0.13274019763140932
Accuracy: 95.44913116591918 
Loss: 0.13010793130828116
Accuracy: 95.54932735426006 
Loss: 0.12764659672975504
Accuracy: 95.63340807174882 
Loss: 0.12548207038921663
Accuracy: 95.70838004484297 
Loss: 0.12344878842143737
Accuracy: 95.77144058295963 
Loss: 0.12158011188354731
Accuracy: 95.81978699551584 
Loss: 0.11975544942668198
Accuracy: 95.92

KeyboardInterrupt: 

In [18]:
dataset.set_split('test')
batch_generator = generate_batches(dataset, 
                                   batch_size=args.batch_size, 
                                   device=args.device)
running_loss = 0.
running_acc = 0.
classifier.eval()

for batch_index, batch_dict in enumerate(batch_generator):
    y_pred = classifier(x_in=batch_dict['x_data'].float())
    loss = loss_func(y_pred, batch_dict['y_target'].float())
    loss_batch = loss.item()
    running_loss += (loss_batch - running_loss) / (batch_index + 1)

    acc_batch = compute_accuracy(y_pred, batch_dict['y_target'])
    running_acc += (acc_batch - running_acc) / (batch_index + 1)

train_state['test_loss'] = running_loss
train_state['test_acc'] = running_acc

In [19]:
def predict_rating(review, classifier, vectorizer,
                   decision_threshold=0.5):

    review = preprocess_text(review)
    vectorized_review = torch.tensor(vectorizer.vectorize(review))
    result = classifier(vectorized_review.view(1, -1))

    probability_value = F.sigmoid(result).item()

    index =  1
    if probability_value < decision_threshold:
        index = 0

    return vectorizer.rating_vocab.lookup_index(index)

test_review = "this is a pretty awesome book"
prediction = predict_rating(test_review, classifier, vectorizer)
print("{} -> {}".format(test_review, prediction)

SyntaxError: unexpected EOF while parsing (<ipython-input-19-67492dcc9e32>, line 27)

In [20]:
fc1_weights = classifier.fc1.weight.detach()[0]
_, indices = torch.sort(fc1_weights, dim=0, descending=True)
indices = indices.numpy().tolist()
print("Influential words in Positive Reviews:")
print("--------------------------------------")
for i in range(20):
    print(vectorizer.review_vocab.lookup_index(indices[i]))

Influential words in Positive Reviews:
--------------------------------------
fanciest
\n\nwe'll
sinful
yummm
\n\nexcellent
disappoint
perfection
=d
vos
hells
gem
exceeded
\nclean
d\u00e9licieux
hesitate
ftw
super-friendly
tastiest
yumm
\nawesome


In [21]:
print("Influential words in Negative Reviews:")
print("--------------------------------------")
indices.reverse()
for i in range(20):
    print(vectorizer.review_vocab.lookup_index(indices[i]))

Influential words in Negative Reviews:
--------------------------------------
ripoff
letzten
slowest
furious
\"meh
meh
cockroach
inedible
marginal
aggravating
sham
mediocre
worst
aweful
\n\nmeh
poisoning
underwhelmed
overrated
absurdly
unacceptable
