# Yelp Review Classification

Note: this task is completed via Google Colab

In [0]:
# all the necessary imports
import nltk
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from torch import optim
import torch
import torchtext
from torchtext.data import Field, LabelField
from torchtext.data import TabularDataset
from torchtext.data import Iterator, BucketIterator
import gensim
import os

In [0]:
# set the seed
manual_seed = 123
torch.manual_seed(manual_seed)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
if n_gpu > 0:
    torch.cuda.manual_seed(manual_seed)

In [4]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [0]:
# Define tokenizer

# option 1: spacy
import spacy
spacy_en = spacy.load('en_core_web_sm')
def tokenize_en(text):
    """
    Tokenizes English text from a string into a list of strings (tokens)
    """
    return [tok.text for tok in spacy_en.tokenizer(text)]

#option 2: nltk
from nltk.tokenize import word_tokenize


In [0]:
# pretrained model

from torchtext.vocab import Vectors
vectors = Vectors(name = 'glove.840B.300d.txt', cache = "./drive/My Drive/yelp_review/")

In [0]:
# remove stopwords if needed
from nltk.corpus import stopwords
stop = set(stopwords.words('english'))

# define batch size

BATCH_SIZE = 32

# define field
TEXT = Field(sequential=True, tokenize=word_tokenize, lower=True)
LABEL = Field(sequential=False, unk_token = None)



In [0]:
# Load train and validation set
train, val= TabularDataset.splits(
               path="./drive/My Drive/yelp_review/", # the root directory where the data lies
               train='train.tsv', validation="val.tsv",  # file names
               format='tsv',
               skip_header=True, # if your tsv file has a header, make sure to pass this to ensure it doesn't get proceesed as data!
               fields=[('tweet', TEXT), ('label', LABEL)])

# Build vocabulary

TEXT.build_vocab(train, vectors = vectors, min_freq = 3)
LABEL.build_vocab(train)

# split train and val

train_iter, val_iter = BucketIterator.splits(
 (train, val), # we pass in the datasets we want the iterator to draw data from
 batch_sizes=(BATCH_SIZE,BATCH_SIZE),
 sort_key=lambda x: len(x.tweet), 
#  device = device, 
 sort=True,
# A key to use for sorting examples in order to batch together examples with similar lengths and minimize padding. 
 sort_within_batch=True
)



In [0]:
# prepare test set

test = TabularDataset(
    path = "./drive/My Drive/yelp_review/test.tsv",
    format = 'tsv',
    skip_header = True,
    fields = [('tweet', TEXT)]
)

test_iter = Iterator(
    dataset = test,
    batch_size = BATCH_SIZE,
    sort_key = None,
    sort = False,
    shuffle = False,
    sort_within_batch = False,
    device = device,
    train = False
)



In [0]:
class LSTMmodel(nn.Module):
  
  def __init__(self, embedding_size, vocab_size, output_size, hidden_size, num_layers):
    # In the constructor we define the layers for our model (same as our previous RNN)
    super(LSTMmodel, self).__init__()
    # word embedding lookup table
    self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_size).from_pretrained(TEXT.vocab.vectors, freeze = False)
    #self.embedding.weight.data.normal_(0.0,0.05) # mean=0.0, mu=0.05
    #self.embedding.weight.copy_(torch.from_numpy(pretrained_weight)) #if have pre-trained weights
    
    # core LSTM module
    DROPOUT_RATE = 0.2
    #baseline:
    #self.lstm_rnn = nn.LSTM(input_size=embedding_size, hidden_size=hidden_size, num_layers=num_layers) # input_size, hidden_size, num_layers

    #with droptout:
    self.lstm_rnn = nn.LSTM(input_size=embedding_size, dropout=DROPOUT_RATE, hidden_size=hidden_size, num_layers=num_layers) # input_size, hidden_size, num_layers

    # to get bi-direction:
    #self.lstm_rnn = nn.LSTM(input_size=embedding_size, bidirectional=True, dropout=DROPOUT_RATE, hidden_size=hidden_size, num_layers=num_layers) # input_size, hidden_size, num_layers
    
    self.activation_fn = nn.Sigmoid()
    self.linear_layer = nn.Linear(hidden_size, output_size) 

    self.softmax_layer = nn.LogSoftmax(dim=0)
  
  def forward(self, x):
    # In the forward function we define the forward propagation logic
    out = self.embedding(x)
    #out, (h_state, c_state) = self.lstm_rnn(out) # h_0 initialized to zeros by default
    out, _ = self.lstm_rnn(out, None) # h_0 initialized to zeros by default
    # classify based on the hidden representation at the last token
    out = out[-1] # unsqueeze converts 1D input (D dimension) into 2D input (1xD) 
    #out = self.activation_fn(out)
    out = self.linear_layer(out)
    out = self.softmax_layer(out) # accepts 2D or more dimensional inputs
    return out


In [11]:
# define hyper-parameters
EMBEDDING_SIZE = 300 
VOCAB_SIZE = len(TEXT.vocab.stoi)
NUM_CLASSES = 5
HIDDEN_SIZE = 512
NUM_LAYERS = 4
model = LSTMmodel(EMBEDDING_SIZE, VOCAB_SIZE, NUM_CLASSES, HIDDEN_SIZE, NUM_LAYERS)
model = model.to(device)
print(model)

LEARNING_RATE = 0.2
criterion = nn.NLLLoss()


LSTMmodel(
  (embedding): Embedding(18608, 300)
  (lstm_rnn): LSTM(300, 512, num_layers=4, dropout=0.2)
  (activation_fn): Sigmoid()
  (linear_layer): Linear(in_features=512, out_features=5, bias=True)
  (softmax_layer): LogSoftmax()
)


In [0]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

def train(loader):
    total_loss = 0.0
    # iterate throught the data loader
    num_sample = 0
    for batch in loader:
        # load the current batch
        batch_input = batch.tweet
        batch_output = batch.label
        
        batch_input = batch_input.to(device)
        batch_output = batch_output.to(device)
        # forward propagation
        # pass the data through the model
        model_outputs = model(batch_input)
        # compute the loss
        
        cur_loss = criterion(model_outputs, batch_output)
        
        
        total_loss += cur_loss.item() #+ len(batch)*lambda_1*l1_pen

        # backward propagation (compute the gradients and update the model)
        # clear the buffer
        optimizer.zero_grad()
        # compute the gradients
        cur_loss.backward()
        # update the weights
        optimizer.step()

        num_sample += batch_output.shape[0]
    return total_loss/num_sample

# evaluation logic based on classification accuracy
def evaluate(loader):
    all_pred=[]
    all_label = []
    with torch.no_grad(): # impacts the autograd engine and deactivate it. reduces memory usage and speeds up computation
        for batch in loader:
             # load the current batch
            batch_input = batch.tweet
            batch_output = batch.label

            batch_input = batch_input.to(device)
            # forward propagation
            # pass the data through the model
            model_outputs = model(batch_input)
            # identify the predicted class for each example in the batch
            probabilities, predicted = torch.max(model_outputs.cpu().data, 1)
            # put all the true labels and predictions to two lists
            all_pred.extend(predicted)
            all_label.extend(batch_output)

            
    accuracy = accuracy_score(all_label, all_pred)
    f1score = f1_score(all_label, all_pred, average='macro') 
    return accuracy,f1score

In [13]:
manual_seed = 123
torch.manual_seed(manual_seed)
if n_gpu > 0:
    torch.cuda.manual_seed(manual_seed)

max_epoch = 20


optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE, momentum=.9, weight_decay=.00001)

# start the training
for epoch in range(max_epoch):
    # train the model for one pass over the data
    train_loss = train(train_iter)  
    # compute the training accuracy
    train_acc,f1t = evaluate(train_iter)
    # compute the validation accuracy
    val_acc,f1v = evaluate(val_iter)
    
    # print the loss for every epoch
    print('Epoch [{}/{}], Loss: {:.4f}, Training Accuracy: {:.4f}, Validation Accuracy: {:.4f}, F1 score :{:.4f}'.format(epoch+1, max_epoch, train_loss, train_acc, val_acc, f1v))

    model_save = {
            'epoch': epoch,  # number of epoch
            'model_state_dict': model.state_dict(), # model parameters 
            'optimizer_state_dict': optimizer.state_dict(), # save optimizer 
            'loss': train_loss # training loss
            }
    
    # use torch.save to store 
    torch.save(model_save, "./ckpt/model_{}.pt".format(epoch+1))
    

Epoch [1/20], Loss: 0.1065, Training Accuracy: 0.2040, Validation Accuracy: 0.2066, F1 score :0.2000
Epoch [2/20], Loss: 0.1076, Training Accuracy: 0.2615, Validation Accuracy: 0.2743, F1 score :0.2573
Epoch [3/20], Loss: 0.1042, Training Accuracy: 0.2579, Validation Accuracy: 0.2434, F1 score :0.2193
Epoch [4/20], Loss: 0.0968, Training Accuracy: 0.4396, Validation Accuracy: 0.4260, F1 score :0.4041
Epoch [5/20], Loss: 0.0909, Training Accuracy: 0.5383, Validation Accuracy: 0.5274, F1 score :0.5170
Epoch [6/20], Loss: 0.0883, Training Accuracy: 0.5641, Validation Accuracy: 0.5297, F1 score :0.5128
Epoch [7/20], Loss: 0.0866, Training Accuracy: 0.5857, Validation Accuracy: 0.5574, F1 score :0.5471
Epoch [8/20], Loss: 0.0851, Training Accuracy: 0.6371, Validation Accuracy: 0.5829, F1 score :0.5794
Epoch [9/20], Loss: 0.0836, Training Accuracy: 0.6528, Validation Accuracy: 0.5766, F1 score :0.5756
Epoch [10/20], Loss: 0.0825, Training Accuracy: 0.6803, Validation Accuracy: 0.6046, F1 sco