RESTAURANT REVIEW ANALYSIS USING ML & NLP METHODS

IMPLEMENTATION OF DistilBERT

IMPORT REQUIRED LIBRARIES & DATASET

In [None]:
!pip install transformers



In [None]:
import numpy as np
import pandas as pd
import torch.nn as nn
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
# Transformers
from transformers import AutoModel, DistilBertTokenizerFast

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU so
# the notebook still runs (more slowly) on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Show whether PyTorch can see a CUDA device in this runtime.
torch.cuda.is_available()

True

In [None]:
# Load the tab-separated review file. quoting=3 is csv.QUOTE_NONE, so quote
# characters inside a review are kept as literal text instead of being parsed.
# NOTE(review): the path is absolute ('/content/...'); presumably a Colab
# upload location - confirm before running elsewhere.
df = pd.read_csv('/content/Restaurant_Reviews.tsv', delimiter = '\t', quoting = 3)

In [None]:
# Preview the first rows: free-text `Review` and its `Liked` label.
df.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [None]:
# (rows, columns) of the loaded frame.
df.shape

(1000, 2)

In [None]:
# Check the class distribution: normalize=True reports each label's share of
# the rows rather than raw counts.
df['Liked'].value_counts(normalize = True)

1    0.5
0    0.5
Name: Liked, dtype: float64

In [None]:
# Stratified 70 / 15 / 15 split: first hold out 30% of the reviews, then cut
# that hold-out in half to obtain the validation and test sets.
reviews, sentiment = df['Review'], df['Liked']

train_text, temp_text, train_labels, temp_labels = train_test_split(
    reviews,
    sentiment,
    test_size=0.3,
    random_state=2018,
    stratify=sentiment,
)

val_text, test_text, val_labels, test_labels = train_test_split(
    temp_text,
    temp_labels,
    test_size=0.5,
    random_state=2018,
    stratify=temp_labels,
)

IMPORT DistilBERT-base-uncased

In [None]:
# Encoder and tokenizer are loaded from the same checkpoint name so that the
# vocabulary matches the model's embeddings.
checkpoint_name = 'distilbert-base-uncased'

# Pretrained model
destilbert = AutoModel.from_pretrained(checkpoint_name)
# Tokenizer
tokenizer_db = DistilBertTokenizerFast.from_pretrained(checkpoint_name)

TOKENIZE & ENCODE THE SEQUENCES

In [None]:
# Every review is padded or truncated to exactly this many tokens.
MAX_SEQ_LEN = 25


def encode_texts(texts, max_length=MAX_SEQ_LEN):
    """Tokenize and encode a pandas Series of strings with `tokenizer_db`.

    Args:
        texts: pandas Series of review strings (converted with `.tolist()`).
        max_length: fixed sequence length; shorter inputs are padded and
            longer ones truncated.

    Returns:
        The tokenizer's batch encoding, holding `input_ids` and
        `attention_mask` (token type ids are not requested).
    """
    return tokenizer_db.batch_encode_plus(
        texts.tolist(),
        max_length=max_length,
        # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
        padding='max_length',
        truncation=True,
        return_token_type_ids=False,
    )


# tokenize and encode sequences in the training set
tokens_train = encode_texts(train_text)

# tokenize and encode sequences in the validation set
tokens_val = encode_texts(val_text)

# tokenize and encode sequences in the test set
tokens_test = encode_texts(test_text)



LIST TO TENSORS

In [None]:
# convert the encoded lists and the label Series to tensors

def _as_tensors(encoded, labels):
    """Return (input_ids, attention_mask, labels) as three torch tensors."""
    return (
        torch.tensor(encoded['input_ids']),
        torch.tensor(encoded['attention_mask']),
        torch.tensor(labels.tolist()),
    )


train_seq, train_mask, train_y = _as_tensors(tokens_train, train_labels)
val_seq, val_mask, val_y = _as_tensors(tokens_val, val_labels)
test_seq, test_mask, test_y = _as_tensors(tokens_test, test_labels)

DATA LOADER

In [None]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# number of reviews per mini-batch
batch_size = 32

# datasets: each item is (input_ids, attention_mask, label)
train_data = TensorDataset(train_seq, train_mask, train_y)
val_data = TensorDataset(val_seq, val_mask, val_y)

# samplers: random order while training, fixed order while validating
train_sampler = RandomSampler(train_data)
val_sampler = SequentialSampler(val_data)

# loaders that yield the mini-batches
train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)
val_dataloader = DataLoader(val_data, batch_size=batch_size, sampler=val_sampler)

MODEL ARCHITECTURE

In [None]:
# Freeze the pretrained encoder: none of its parameters will receive gradients,
# so only layers added on top of it can be trained.
destilbert.requires_grad_(False)

In [None]:
# DistilBERT encoder followed by a small feed-forward classification head
class DestilBERT(nn.Module):
    """Classifier that feeds the encoder's first-token hidden state to an MLP.

    Args:
        destilbert: encoder module. It is called with `input_ids`,
            `attention_mask` and `return_dict=False`, and must return a tuple
            whose first element has shape (batch, seq_len, input_dim).
        input_dim: size of the encoder's hidden states (768 by default).
        hidden_dim: width of the intermediate dense layer.
        num_classes: number of output classes.
        dropout: dropout probability applied after the hidden layer.

    The defaults reproduce the original fixed architecture
    (768 -> 512 -> 2, dropout 0.1), and the attribute names are unchanged so
    previously saved state dicts still load.
    """

    def __init__(self, destilbert, input_dim=768, hidden_dim=512, num_classes=2, dropout=0.1):
        super().__init__()

        self.destilbert = destilbert

        # dropout layer
        self.dropout = nn.Dropout(dropout)

        # relu activation function
        self.relu = nn.ReLU()

        # dense layer 1
        self.fc1 = nn.Linear(input_dim, hidden_dim)

        # dense layer 2 (output layer)
        self.fc2 = nn.Linear(hidden_dim, num_classes)

        # log-softmax over the class dimension (pairs with nn.NLLLoss)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, sent_id, mask):
        """Return per-class log-probabilities of shape (batch, num_classes).

        Args:
            sent_id: token-id tensor passed to the encoder as `input_ids`.
            mask: attention-mask tensor passed as `attention_mask`.
        """
        # return_dict=False makes the encoder return a plain tuple
        output = self.destilbert(input_ids=sent_id, attention_mask=mask, return_dict=False)

        # hidden state at the first token position of every sequence
        cls_hs = output[0][:, 0, :]

        x = self.fc1(cls_hs)
        x = self.relu(x)
        x = self.dropout(x)

        # output layer
        x = self.fc2(x)

        # convert logits to log-probabilities
        return self.softmax(x)

In [None]:
# wrap the pre-trained DistilBERT in our classifier and move it to `device`
model = DestilBERT(destilbert).to(device)

In [None]:
from sklearn.utils.class_weight import compute_class_weight
# NOTE(review): redundant with the import above; kept only so the
# `class_weight` module name stays bound in case another cell uses it.
from sklearn.utils import class_weight

# compute the class weights ('balanced' weights classes inversely to frequency)
class_weights = compute_class_weight(class_weight='balanced',
                                     classes=np.unique(train_labels),
                                     y=train_labels)
print("Class Weights:",class_weights)

# convert class weights to a float tensor on the training device
weights = torch.tensor(class_weights,dtype=torch.float)
weights = weights.to(device)

# loss function: negative log-likelihood over the model's log-probabilities
cross_entropy = nn.NLLLoss(weight=weights)

# define the optimizer
# `transformers.AdamW` is deprecated in favour of `torch.optim.AdamW` and has
# been dropped from newer transformers releases, so use the PyTorch optimizer.
# eps and weight_decay are set explicitly to the former transformers defaults
# (torch's own defaults are eps=1e-8, weight_decay=1e-2).
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, eps=1e-6, weight_decay=0.0)

Class Weights: [1. 1.]




FINE-TUNE (CLASSIFIER HEAD ONLY; DistilBERT WEIGHTS ARE FROZEN)

In [None]:
# one training epoch over `train_dataloader`
def train():
    """Train the global `model` for one pass over `train_dataloader`.

    Uses the notebook-level `model`, `cross_entropy`, `optimizer` and `device`.

    Returns:
        (avg_loss, total_preds): the mean batch loss of the epoch, and a numpy
        array with the model outputs of all batches concatenated along axis 0.
    """
    model.train()

    running_loss = 0
    batch_outputs = []
    n_batches = len(train_dataloader)

    for step, batch in enumerate(train_dataloader):

        # progress update after every 50 batches (but not before the first)
        if step != 0 and step % 50 == 0:
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, n_batches))

        # move the batch to the training device and unpack it
        sent_id, mask, labels = [tensor.to(device) for tensor in batch]

        # discard gradients left over from the previous step
        model.zero_grad()

        # forward pass and loss for this batch
        outputs = model(sent_id, mask)
        loss = cross_entropy(outputs, labels)
        running_loss += loss.item()

        # backward pass
        loss.backward()

        # clip the gradient norm to 1.0 to guard against exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # update parameters
        optimizer.step()

        # keep a detached CPU copy of this batch's outputs
        batch_outputs.append(outputs.detach().cpu().numpy())

    # mean loss per batch, and outputs stacked to (number of samples, no. of classes)
    return running_loss / n_batches, np.concatenate(batch_outputs, axis=0)


In [None]:
# function for evaluating the model
def evaluate():
    """Evaluate the global `model` on `val_dataloader` without updating it.

    Uses the notebook-level `model`, `cross_entropy` and `device`.

    Returns:
        (avg_loss, total_preds): the mean batch loss over the validation
        loader, and a numpy array with the model outputs of all batches
        concatenated along axis 0.
    """
    print("\nEvaluating...")

    # deactivate dropout layers
    model.eval()

    total_loss = 0

    # empty list to save the model predictions
    total_preds = []

    # iterate over batches
    for step, batch in enumerate(val_dataloader):

        # Progress update every 50 batches.
        # (The former elapsed-time computation referenced `format_time`,
        # `time` and `t0`, none of which are defined in this notebook, and
        # would have raised NameError as soon as step reached 50.)
        if step % 50 == 0 and not step == 0:
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(val_dataloader)))

        # push the batch to the device
        batch = [t.to(device) for t in batch]

        sent_id, mask, labels = batch

        # deactivate autograd
        with torch.no_grad():

            # model predictions
            preds = model(sent_id, mask)

            # compute the validation loss between actual and predicted values
            loss = cross_entropy(preds, labels)

            total_loss = total_loss + loss.item()

            preds = preds.detach().cpu().numpy()

            total_preds.append(preds)

    # compute the validation loss of the epoch
    avg_loss = total_loss / len(val_dataloader)

    # reshape the predictions in form of (number of samples, no. of classes)
    total_preds = np.concatenate(total_preds, axis=0)

    return avg_loss, total_preds


In [None]:
# Number of full passes over the training set.
epochs = 10
# set initial loss to infinite so the first epoch always counts as an improvement
best_valid_loss = float('inf')

# empty lists to store training and validation loss of each epoch
train_losses=[]
valid_losses=[]

#for each epoch
for epoch in range(epochs):

    print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))

    #train model (the returned predictions are not needed here)
    train_loss, _ = train()

    #evaluate model on the validation loader
    valid_loss, _ = evaluate()

    #save the best model: the checkpoint file is overwritten only when the
    #validation loss improves on the best value seen so far
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'saved_weights.pt')

    # append training and validation loss
    train_losses.append(train_loss)
    valid_losses.append(valid_loss)

    print(f'\nTraining Loss: {train_loss:.3f}')
    print(f'Validation Loss: {valid_loss:.3f}')


 Epoch 1 / 10

Evaluating...

Training Loss: 0.490
Validation Loss: 0.409

 Epoch 2 / 10

Evaluating...

Training Loss: 0.301
Validation Loss: 0.445

 Epoch 3 / 10

Evaluating...

Training Loss: 0.232
Validation Loss: 0.488

 Epoch 4 / 10

Evaluating...

Training Loss: 0.228
Validation Loss: 0.438

 Epoch 5 / 10

Evaluating...

Training Loss: 0.192
Validation Loss: 0.455

 Epoch 6 / 10

Evaluating...

Training Loss: 0.222
Validation Loss: 0.524

 Epoch 7 / 10

Evaluating...

Training Loss: 0.248
Validation Loss: 0.442

 Epoch 8 / 10

Evaluating...

Training Loss: 0.186
Validation Loss: 0.461

 Epoch 9 / 10

Evaluating...

Training Loss: 0.167
Validation Loss: 0.468

 Epoch 10 / 10

Evaluating...

Training Loss: 0.217
Validation Loss: 0.624


In [None]:
# load weights of the best model (lowest validation loss)
path = 'saved_weights.pt'
# map_location remaps the stored tensors onto the current device, so a
# checkpoint written on a GPU can still be loaded in a CPU-only session.
model.load_state_dict(torch.load(path, map_location=device))

<All keys matched successfully>

MAKE PREDICTIONS

In [None]:
# get predictions for test data
# Switch to eval mode explicitly so dropout is disabled no matter which cell
# last touched the model's train/eval state.
model.eval()
with torch.no_grad():
    preds = model(test_seq.to(device), test_mask.to(device))
    preds = preds.detach().cpu().numpy()

In [None]:
# model's performance
# `preds` arrives as a 2-D array of per-class scores and is reduced to a 1-D
# array of class indices. The ndim guard makes the cell safe to re-run: without
# it a second run would call argmax(axis=1) on the already 1-D array and fail.
if preds.ndim == 2:
    preds = np.argmax(preds, axis = 1)
print(classification_report(test_y, preds))

              precision    recall  f1-score   support

           0       0.79      0.79      0.79        75
           1       0.79      0.79      0.79        75

    accuracy                           0.79       150
   macro avg       0.79      0.79      0.79       150
weighted avg       0.79      0.79      0.79       150



In [None]:
# confusion matrix: rows are the true labels (test_y), columns are the
# predicted labels (preds)
pd.crosstab(test_y , preds)

col_0,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,59,16
1,16,59
