This notebook performs sentiment analysis on restaurant reviews.
It uses GloVe embeddings and an LSTM model with PyTorch.
The process includes data loading, preprocessing, model definition, training, and evaluation.


In [1]:
#Import necessary libraries
!pip install scipy==1.13.1 numpy==1.26.4

Collecting scipy==1.13.1
  Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting numpy==1.26.4
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (38.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.6/38.6 MB[0m [31m56.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.3/18.3 MB[0m [31m111.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy, scipy
  Attempt

In [None]:
# Restart the runtime after installing libraries. This is often necessary in Colab
# to ensure that the newly installed packages are used.
import os
os.kill(os.getpid(), 9)

In [1]:
# Install the gensim library
!pip install gensim
import gensim.downloader as api
glove_vectors = api.load("glove-wiki-gigaword-100")

Collecting gensim
  Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.7/26.7 MB[0m [31m83.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.3.3


In [2]:
# Import pandas for data manipulation and numpy for numerical operations
import pandas as pd
import numpy as np

In [3]:
# Mount Google Drive to access files
from google.colab import drive
drive.mount('RestaurantReview')

Mounted at RestaurantReview


In [4]:
# Load the dataset from Google Drive into a pandas DataFrame
df = pd.read_csv('RestaurantReview/MyDrive/pytorch practice notebooks/RestaurantsReview.csv')

In [5]:
df.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [9]:
# Rename the column ' Review' to 'Review' to remove leading space
df.rename(columns={' Review':'Review'}, inplace=True)

In [11]:
# Remove rows where the 'Review' column is empty after splitting into words
df1 = df.drop(df[df['Review'].astype(str).apply(lambda x: len(x.split()) == 0 )].index)

In [13]:
# Perform one-hot encoding on the 'Liked' column
df2 = pd.get_dummies(df1, columns = ['Liked'])

In [14]:
# Rename the one-hot encoded columns for clarity
df2.rename(columns={'Liked_0':'NotLiked', 'Liked_1':'Liked'}, inplace=True)

In [16]:
# Replace boolean values (False, True) with integers (0, 1)
df2.replace([False, True],[0,1], inplace=True)

  df2.replace([False, True],[0,1], inplace=True)


In [20]:
# Import the torch library
import re
import torch

In [21]:
# Define a function to convert text to GloVe embeddings
def text_to_embedding(text):
  text = re.sub(r'[^\w\s]', '', text)
  text = str(text).lower()
  embeddings = []
  for word in text.split():
    try:
      embeddings.append(glove_vectors.get_vector(word))
    except:
      embeddings.append(np.zeros(100))
  return torch.tensor(embeddings)

In [22]:
# Apply the text_to_embedding function to each review and store the results in a list
word_level_sentence_embedding_list = []
for text in df2['Review'].values:
    word_level_sentence_embedding_list.append(text_to_embedding(text))

  return torch.tensor(embeddings)


In [25]:
from torch.nn.utils.rnn import pad_sequence


In [26]:
# Pad the sequences of word embeddings to a fixed length (32)
padded_inputs = pad_sequence(word_level_sentence_embedding_list, batch_first=True, padding_value=1)

In [29]:
# Create an attention mask to indicate which parts of the padded sequences are real data
# 1 indicates original data, 0 indicates padding
attention_mask_list = []
for i in range(len(word_level_sentence_embedding_list)):
    attention_mask_list.append(torch.cat((torch.ones(len(word_level_sentence_embedding_list[i])), torch.zeros(32 - len(word_level_sentence_embedding_list[i])))))

In [31]:
# Define the columns representing the output (Liked and NotLiked)
output_columns = ['NotLiked',	'Liked']

In [32]:
# Convert the output columns to a NumPy array of float32 type
output_array = df2[output_columns].to_numpy(dtype='float32')

In [34]:
# Convert the output array to a PyTorch tensor
targets = torch.tensor(output_array)

In [35]:
from torch.utils.data import Dataset

In [36]:
# Define a custom Dataset class for the review data
class ReviewDataset(Dataset):
    def __init__(self, padded_inputs, attention_mask_list,targets, is_test=False):
        self.padded_inputs = padded_inputs
        self.attention_mask_list = attention_mask_list
        self.targets = targets

    def __getitem__(self, index):
        return self.padded_inputs[index],self.attention_mask_list[index] ,self.targets[index]

    def __len__(self):
        return len(self.padded_inputs)

In [37]:
# Create an instance of the ReviewDataset
dataset = ReviewDataset(padded_inputs, attention_mask_list, targets)

In [39]:
# Import DataLoader and random_split from torch.utils.data
from torch.utils.data import DataLoader, random_split

In [40]:
# Define the size of the training and validation sets (90% for training)
train_size =int(len(dataset)*0.9)
valid_size = len(dataset) - train_size

In [41]:
# Split the dataset into training and validation sets randomly
train_data, valid_data = random_split(dataset, [train_size, valid_size])

In [42]:
BATCH_SIZE = 50

In [43]:
# Create DataLoader for the training data
train_dl = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True, num_workers=8, pin_memory=True)
val_dl = DataLoader(valid_data, batch_size=BATCH_SIZE, num_workers=8, pin_memory=True)



In [44]:
import torch.nn as nn
import torch.nn.functional as F

In [45]:
# Define a function to calculate accuracy
def accuracy(out, target):
  _ , pred = torch.max(out, dim=1)
  _ , max_index = torch.max(target,dim=1)
  return torch.tensor(torch.eq(pred, max_index).sum().item()/len(pred))

In [46]:
# Define the Review sentiment analysis model using an LSTM
class Review(nn.Module):

  def __init__(self):
    super().__init__()
    self.lstm = nn.LSTM(100, 50, num_layers=1, batch_first=True, bidirectional=False)
    self.linear1 = nn.Linear(50, 25)
    self.linear2 = nn.Linear(25, 2)

  def forward(self, xb, attention_mask):
    out, (hn, _) = self.lstm(xb)
    last_index_output = torch.argmin(attention_mask).item()-1
    out = F.relu(out[:,last_index_output,:])
    out = self.linear1(out)
    out = self.linear2(out)
    return out

  def training_step(self, batch):
    inputs, attention_mask, targets = batch
    outputs = self(inputs, attention_mask)
    probs = torch.sigmoid(outputs)
    loss = F.binary_cross_entropy(probs, targets)
    return loss

  def validation_step(self, batch):
    inputs, attention_mask, targets = batch
    outputs = self(inputs, attention_mask)
    probs = torch.sigmoid(outputs)
    loss = F.binary_cross_entropy(probs, targets)
    acc = accuracy(probs, targets)
    return{'valid_loss' : loss, 'valid_acc': acc}

  def validation_epoch_end(self, outputs):
    batch_losses = [x['valid_loss'] for x in outputs]
    epoch_loss = torch.stack(batch_losses).mean()
    batch_accs = [x['valid_acc'] for x in outputs]
    epoch_acc = torch.stack(batch_accs).mean()
    return{'valid_loss': epoch_loss.item() ,'valid_acc': epoch_acc.item()}

  def epoch_end(self, epoch, result, epochs):
    if ((epoch+1) % 1 == 0 or epoch == epochs-1):
      print("Epoch [{}], val_loss: {:.4f}, train_loss: {:.4f}, val_acc: {:.4f}, lrs: {:.5f}".format(epoch, result['valid_loss'], result['train_loss'], result['valid_acc'], result['lrs'][-1]))

In [47]:
# Define a function to evaluate the model on the validation set
def evaluate(model, valid_loader):
  outputs = [model.validation_step(batch) for batch in valid_loader]
  return model.validation_epoch_end(outputs)

In [48]:
# Decorator to disable gradient calculation (for inference or getting learning rate)
@torch.no_grad()
def get_lr(optimizer):
    for param_group in optimizer.param_groups:
        return param_group['lr']

# Define a function to train the model using the one-cycle learning rate policy
def fit_one_cycle(epochs, max_lr, model, train_loader, val_loader,
                  weight_decay=0, grad_clip=None, opt_func=torch.optim.SGD):
    torch.cuda.empty_cache()
    history = []

    # Set up cutom optimizer with weight decay
    optimizer = opt_func(model.parameters(), max_lr, weight_decay=weight_decay)
    # Set up one-cycle learning rate scheduler
    sched = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr, epochs=epochs,
                                                steps_per_epoch=len(train_loader))

    for epoch in range(epochs):
        # Training Phase
        model.train()
        train_losses = []
        lrs = []
        for batch in train_loader:
            loss = model.training_step(batch)
            train_losses.append(loss)
            loss.backward()

            # Gradient clipping
            if grad_clip:
                nn.utils.clip_grad_value_(model.parameters(), grad_clip)

            optimizer.step()
            optimizer.zero_grad()

            # Record & update learning rate
            lrs.append(get_lr(optimizer))
            sched.step()

        # Validation phase
        result = evaluate(model, val_loader)
        result['train_loss'] = torch.stack(train_losses).mean().item()
        result['lrs'] = lrs
        model.epoch_end(epoch, result, epochs)
        history.append(result)
    return history

In [49]:
# Instantiate the Review model
model = Review()

In [51]:
# Define hyperparameters for training
epochs = 15
max_lr = 0.01
grad_clip = 0.1
weight_decay = 1e-4
opt_func = torch.optim.Adam

In [52]:
# Train the model
%%time
history = fit_one_cycle(epochs, max_lr, model, train_dl, val_dl,
                             grad_clip=grad_clip,
                             weight_decay=weight_decay,
                             opt_func=opt_func)



Epoch [0], val_loss: 0.6925, train_loss: 0.6937, val_acc: 0.4800, lrs: 0.00143
Epoch [1], val_loss: 0.7027, train_loss: 0.6893, val_acc: 0.4900, lrs: 0.00426
Epoch [2], val_loss: 0.6863, train_loss: 0.6927, val_acc: 0.5300, lrs: 0.00755
Epoch [3], val_loss: 0.6813, train_loss: 0.6934, val_acc: 0.5600, lrs: 0.00970
Epoch [4], val_loss: 0.6608, train_loss: 0.6189, val_acc: 0.6600, lrs: 0.00994
Epoch [5], val_loss: 0.6099, train_loss: 0.5287, val_acc: 0.6700, lrs: 0.00950
Epoch [6], val_loss: 0.6583, train_loss: 0.4643, val_acc: 0.6600, lrs: 0.00867
Epoch [7], val_loss: 0.6198, train_loss: 0.4396, val_acc: 0.6700, lrs: 0.00750
Epoch [8], val_loss: 0.5719, train_loss: 0.4192, val_acc: 0.7100, lrs: 0.00611
Epoch [9], val_loss: 0.5523, train_loss: 0.3661, val_acc: 0.6900, lrs: 0.00463
Epoch [10], val_loss: 0.5455, train_loss: 0.2932, val_acc: 0.6900, lrs: 0.00317
Epoch [11], val_loss: 0.5498, train_loss: 0.3181, val_acc: 0.6900, lrs: 0.00188
Epoch [12], val_loss: 0.5528, train_loss: 0.2792, 