# Sentiment Analysis with BERT

In [None]:
!pip install datasets huggingface_hub transformers

In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap

# Torch ML libraries
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader

# Misc.
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Set initial variables and constants
%config InlineBackend.figure_format='retina'

# Graph Designs
sns.set(style='whitegrid', palette='muted', font_scale=1.2)
HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]
sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))
rcParams['figure.figsize'] = 12, 8

# Random seed for reproducibility
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Set GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
!kaggle datasets download -d prakharrathi25/google-play-store-reviews

In [None]:
!unzip /content/google-play-store-reviews.zip

In [None]:
df = pd.read_csv("reviews.csv")
df.shape

In [None]:
df.head()

In [None]:
df.isnull().sum()

In [None]:
# Distribution of the raw review scores (they are mapped to three sentiment classes in the next cell)
sns.countplot(data=df, x="score")
plt.xlabel("review_score")
plt.show()

In [None]:
def _score_to_sentiment(score):
  """Map a review score to a class id: <= 2 -> 0, == 3 -> 1, anything else -> 2."""
  if score <= 2:
    return 0
  if score == 3:
    return 1
  return 2

# Class ids index into this list (0 = negative, 1 = neutral, 2 = positive)
class_names = ["negative", "neutral", "positive"]
df['sentiment'] = df.score.apply(_score_to_sentiment)

# Class balance after collapsing the scores
ax = sns.countplot(data=df, x="sentiment")
plt.xlabel("review_sentiment")
ax.set_xticklabels(class_names)

In [None]:
MODEL_NAME = "bert-base-cased"
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

In [None]:
# Some of the common BERT tokens
print(tokenizer.sep_token, tokenizer.sep_token_id) # marker for ending of a sentence
print(tokenizer.cls_token, tokenizer.cls_token_id) # start of each sentence, so BERT knows we’re doing classification
print(tokenizer.pad_token, tokenizer.pad_token_id) # special token for padding
print(tokenizer.unk_token, tokenizer.unk_token_id) # tokens not found in training set

In [None]:
# Token count of every review, capped at 512 tokens.
# truncation=True makes the cap explicit instead of relying on the tokenizer's
# implicit fallback; str() mirrors GPReviewDataset so non-string cells do not crash.
token_lens = [
  len(tokenizer.encode(str(txt), max_length=512, truncation=True))
  for txt in df.content
]

In [None]:
# Preview the encoding of the first review.
# `padding="max_length"` + `truncation=True` replace the deprecated
# `pad_to_max_length=True` and make the fixed length of 160 explicit.
tokenizer.encode_plus(
    df.content.iloc[0],
    add_special_tokens=True,
    max_length=160,
    padding="max_length",
    truncation=True,
    return_token_type_ids=False,
    return_attention_mask=True,
    return_tensors="pt",
    )

In [None]:
# `distplot` is deprecated in seaborn; `histplot` with a KDE overlay and
# density normalisation draws the equivalent figure.
sns.histplot(token_lens, kde=True, stat="density")
plt.xlim([0, 256])
plt.xlabel("Token count")

In [None]:
MAX_LEN = 160

# Data Generator Class
class GPReviewDataset(Dataset):
  """Map-style dataset that tokenizes one review per item.

  Args:
    reviews: indexable collection of review texts.
    targets: indexable collection of class labels, aligned with ``reviews``.
    tokenizer: object exposing ``encode_plus`` (e.g. a Hugging Face tokenizer).
    max_len: length every encoded review is padded or truncated to.
  """

  def __init__(self, reviews, targets, tokenizer, max_len):
    self.reviews = reviews
    self.targets = targets
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    """Return the number of reviews."""
    return len(self.reviews)

  def __getitem__(self, item):
    """Return text, token ids, attention mask and label for index ``item``."""
    # str() so that non-string cells are still tokenizable
    review = str(self.reviews[item])
    target = self.targets[item]

    # `padding`/`truncation` replace the deprecated `pad_to_max_length` flag and
    # state explicitly that every item is exactly `max_len` tokens long, which
    # is what lets the DataLoader stack items into a batch.
    encoding = self.tokenizer.encode_plus(
        review,
        add_special_tokens=True,
        max_length=self.max_len,
        padding="max_length",
        truncation=True,
        return_token_type_ids=False,
        return_attention_mask=True,
        return_tensors="pt",
    )

    return {
        "review_text" : review,
        # return_tensors="pt" adds a leading batch axis; flatten() removes it
        "input_ids" : encoding["input_ids"].flatten(),
        "attention_mask" : encoding["attention_mask"].flatten(),
        "targets" : torch.tensor(target, dtype=torch.long)
    }

# Hold out 20% of the rows, then halve the hold-out into validation and test sets
df_train, df_holdout = train_test_split(df, test_size=0.2, random_state=RANDOM_SEED)
df_val, df_test = train_test_split(df_holdout, test_size=0.5, random_state=RANDOM_SEED)

split_report = "Training set shape: {}\nTest set shape: {}\nValidation set shape: {}"
print(split_report.format(df_train.shape, df_test.shape, df_val.shape))

In [None]:
# Create Dataloader
def create_dataloader(df, tokenizer, max_len, batch_size, shuffle=False):
  """Wrap a reviews DataFrame in a ``DataLoader`` of tokenized batches.

  Args:
    df: DataFrame with ``content`` (text) and ``sentiment`` (label) columns.
    tokenizer: tokenizer handed to ``GPReviewDataset``.
    max_len: fixed token length of every encoded review.
    batch_size: number of reviews per batch.
    shuffle: reshuffle the rows at the start of every epoch. Defaults to
      ``False`` so existing callers keep their deterministic order.

  Returns:
    A ``DataLoader`` over a ``GPReviewDataset`` built from ``df``.
  """
  ds = GPReviewDataset(
      reviews=df.content.to_numpy(),
      targets=df.sentiment.to_numpy(),
      tokenizer=tokenizer,
      max_len=max_len
  )

  return DataLoader(
      ds,
      batch_size=batch_size,
      shuffle=shuffle,
      num_workers=0
  )

BATCH_SIZE = 16
# Only the training loader is reshuffled each epoch; validation and test keep a
# fixed order so their predictions stay aligned with the DataFrame rows.
train_data_loader = create_dataloader(df_train, tokenizer, MAX_LEN, BATCH_SIZE, shuffle=True)
val_data_loader = create_dataloader(df_val, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = create_dataloader(df_test, tokenizer, MAX_LEN, BATCH_SIZE)

In [None]:
data = next(iter(train_data_loader))
print(data.keys())

print("Input shape: {}\nAttention mask: {}\nTarget shape: {}".format(data['input_ids'].shape, data['attention_mask'].shape, data['targets'].shape))

In [None]:
bert_model = BertModel.from_pretrained(MODEL_NAME)

In [None]:
class SentimentClassifier(nn.Module):
  """Pre-trained BERT encoder followed by dropout and a linear classification head."""

  def __init__(self, n_classes):
    super().__init__()
    self.bert = BertModel.from_pretrained(MODEL_NAME)
    self.drop = nn.Dropout(p=0.3)
    # One output logit per class, fed from BERT's hidden size
    self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

  def forward(self, input_ids=None, attention_mask=None):
    """Return the unnormalised class scores for a batch of encoded reviews."""
    # return_dict=False yields a tuple; its second element is the pooled output
    bert_outputs = self.bert(
        input_ids=input_ids,
        attention_mask=attention_mask,
        return_dict=False
    )
    pooled_output = bert_outputs[1]
    return self.out(self.drop(pooled_output))

In [None]:
model = SentimentClassifier(len(class_names))
model = model.to(device)

In [None]:
print(bert_model.config.hidden_size)

In [None]:
# Number of passes over the training set
EPOCHS = 10

# NOTE(review): AdamW is imported at the top of the notebook, but plain Adam
# (no weight decay) is what is actually used here — confirm this is intended.
optimizer = optim.Adam(model.parameters(), lr=2e-5)
# train_epoch steps the scheduler once per batch, so the schedule has to span
# every batch of every epoch.
total_steps = len(train_data_loader) * EPOCHS

# using scheduler without warmup so there is no ramp up with a smaller learning rate
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

# Takes the raw class scores from the model together with integer class targets
loss_fn = nn.CrossEntropyLoss().to(device)

In [None]:
# training
# Function for a single training iteration
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
  """Train ``model`` for one pass over ``data_loader``.

  Args:
    model: module called as ``model(input_ids=..., attention_mask=...)``.
    data_loader: iterable of dicts with ``input_ids``, ``attention_mask``
      and ``targets`` tensors.
    loss_fn: callable taking ``(outputs, targets)`` and returning a scalar loss.
    optimizer: optimizer over the model's parameters.
    device: device the batch tensors are moved to.
    scheduler: learning-rate scheduler, stepped once per batch.
    n_examples: number of examples used as the accuracy denominator.

  Returns:
    ``(accuracy, mean_loss)``: accuracy is a float64 tensor, mean_loss is the
    average of the per-batch losses (``nan`` when the loader yields no batch).
  """
  model = model.train()
  losses = []
  # A tensor accumulator keeps the return type stable even when the loader is
  # empty (a plain int has no .double()).
  correct_predictions = torch.zeros((), dtype=torch.long, device=device)

  # Discard gradients left behind by an interrupted or earlier run so they
  # cannot leak into the first update of this epoch.
  optimizer.zero_grad()

  for d in data_loader:
    input_ids = d["input_ids"].to(device)
    attention_mask = d["attention_mask"].to(device)
    targets = d["targets"].to(device)

    outputs = model(
        input_ids=input_ids,
        attention_mask=attention_mask
    )

    # Predicted class = index of the largest score in each row
    _, preds = torch.max(outputs, dim=1)
    loss = loss_fn(outputs, targets)
    correct_predictions += torch.sum(preds == targets)
    losses.append(loss.item())

    # Backward prop
    loss.backward()

    # Gradient descent, with the gradient norm clipped to 1.0 first
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()

  mean_loss = np.mean(losses) if losses else float("nan")
  return correct_predictions.double() / n_examples, mean_loss

# evaluation
def eval_model(model, data_loader, loss_fn, device, n_examples):
  """Evaluate ``model`` on ``data_loader`` without updating its weights.

  Args:
    model: module called as ``model(input_ids=..., attention_mask=...)``.
    data_loader: iterable of dicts with ``input_ids``, ``attention_mask``
      and ``targets`` tensors.
    loss_fn: callable taking ``(outputs, targets)`` and returning a scalar loss.
    device: device the batch tensors are moved to.
    n_examples: number of examples used as the accuracy denominator.

  Returns:
    ``(accuracy, mean_loss)``: accuracy is a float64 tensor, mean_loss is the
    average of the per-batch losses (``nan`` when the loader yields no batch).
  """
  model = model.eval()

  losses = []
  # A tensor accumulator keeps the return type stable even when the loader is
  # empty (a plain int has no .double()).
  correct_predictions = torch.zeros((), dtype=torch.long, device=device)

  with torch.no_grad():
    for d in data_loader:
      input_ids = d["input_ids"].to(device)
      attention_mask = d["attention_mask"].to(device)
      targets = d["targets"].to(device)

      # Get model outputs
      outputs = model(
          input_ids=input_ids,
          attention_mask=attention_mask
      )

      # Predicted class = index of the largest score in each row
      _, preds = torch.max(outputs, dim=1)
      loss = loss_fn(outputs, targets)

      correct_predictions += torch.sum(preds == targets)
      losses.append(loss.item())

  mean_loss = np.mean(losses) if losses else float("nan")
  return correct_predictions.double() / n_examples, mean_loss

In [None]:
%%time

# Per-epoch metrics, keyed by 'train_acc', 'train_loss', 'val_acc', 'val_loss'
history = defaultdict(list)
# Highest validation accuracy seen so far; decides when a checkpoint is written
best_accuracy = 0

for epoch in range(EPOCHS):
  print(f"Epoch {epoch + 1} / {EPOCHS}")
  print("-" * 10)

  # One optimisation pass over the training set
  train_acc, train_loss = train_epoch(
      model,
      train_data_loader,
      loss_fn,
      optimizer,
      device,
      scheduler,
      len(df_train)
  )

  print(f"Train loss {train_loss} accuracy {train_acc}")

  # Score the updated weights on the validation set
  val_acc, val_loss = eval_model(
      model,
      val_data_loader,
      loss_fn,
      device,
      len(df_val)
  )
  print(f"Val loss: {val_loss}, Val Accuracy: {val_acc}")
  print()

  # The accuracies are tensors (as returned by train_epoch / eval_model)
  history['train_acc'].append(train_acc)
  history['train_loss'].append(train_loss)
  history['val_acc'].append(val_acc)
  history['val_loss'].append(val_loss)

  # Checkpoint only when validation accuracy improves.
  # NOTE(review): the checkpoint is written to disk but no reload of
  # "best_model_state.bin" is visible later in this notebook, so the in-memory
  # model keeps the final-epoch weights — confirm that is intended.
  if val_acc > best_accuracy:
    torch.save(model.state_dict(), "best_model_state.bin")
    best_accuracy = val_acc

In [None]:
def move_to_cpu(data):
    """Convert tensors, including those nested in lists, to NumPy arrays.

    Args:
        data: a ``torch.Tensor``, a (possibly nested) list, or any other object.

    Returns:
        A NumPy array for a tensor, a new list with every element converted
        recursively for a list, and ``data`` unchanged for anything else.
    """
    if isinstance(data, torch.Tensor):
        # detach() first: .numpy() raises on a tensor that still requires grad
        return data.detach().cpu().numpy()
    if isinstance(data, list):
        return [move_to_cpu(item) for item in data]
    return data
history_cpu = {k: move_to_cpu(v) for k, v in history.items()}

# Plot training and validation accuracy
plt.plot(history_cpu['train_acc'], label='train accuracy')
plt.plot(history_cpu['val_acc'], label='validation accuracy')

# Graph chars
plt.title('Training history')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend()
plt.ylim([0, 1]);

In [None]:
test_acc, _ = eval_model(
    model,
    test_data_loader,
    loss_fn,
    device,
    len(df_test)
)
test_acc.item()

In [None]:
def get_predictions(model, data_loader):
    """Run ``model`` over ``data_loader`` and collect its predictions.

    Args:
        model: module called as ``model(input_ids=..., attention_mask=...)``
            that returns one row of class scores per example.
        data_loader: iterable of dicts with ``review_text``, ``input_ids``,
            ``attention_mask`` and ``targets`` entries.

    Returns:
        ``(review_texts, predictions, prediction_probs, real_values)``:
        the review texts as a list, the predicted class ids, the softmax
        class probabilities (one row per review) and the true class ids.
        All three tensors live on the CPU.
    """
    model = model.eval()

    # Send inputs to whichever device the model's weights live on, rather
    # than relying on a module-level `device` variable.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    review_texts = []
    pred_batches = []
    prob_batches = []
    target_batches = []

    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(model_device)
            attention_mask = d["attention_mask"].to(model_device)

            # Get outputs
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            _, preds = torch.max(outputs, dim=1)

            review_texts.extend(d["review_text"])
            pred_batches.append(preds.cpu())
            # The model returns raw scores; softmax turns them into the
            # probabilities that the `prediction_probs` name promises.
            prob_batches.append(torch.softmax(outputs, dim=1).cpu())
            target_batches.append(d["targets"].cpu())

    if not pred_batches:
        # Empty loader: return empty tensors instead of failing in torch.cat
        empty_ids = torch.empty(0, dtype=torch.long)
        return review_texts, empty_ids, torch.empty(0, 0), empty_ids.clone()

    predictions = torch.cat(pred_batches)
    prediction_probs = torch.cat(prob_batches)
    real_values = torch.cat(target_batches)

    return review_texts, predictions, prediction_probs, real_values


In [None]:
y_review_texts, y_pred, y_pred_probs, y_test = get_predictions(
    model,
    test_data_loader
)

print(classification_report(y_test, y_pred, target_names=class_names))

In [None]:
def show_confusion_matrix(confusion_matrix):
    """Draw ``confusion_matrix`` as an annotated blue heatmap with labelled axes."""
    heatmap = sns.heatmap(confusion_matrix, annot=True, fmt="d", cmap="Blues")
    # Row labels stay horizontal; column labels are tilted by 30 degrees
    for axis, angle in ((heatmap.yaxis, 0), (heatmap.xaxis, 30)):
        axis.set_ticklabels(axis.get_ticklabels(), rotation=angle, ha='right')
    plt.ylabel('True sentiment')
    plt.xlabel('Predicted sentiment')

cm = confusion_matrix(y_test, y_pred)
df_cm = pd.DataFrame(cm, index=class_names, columns=class_names)
show_confusion_matrix(df_cm)


In [None]:
review_text = "I love completing my todos! Best app ever!!!"
# `padding="max_length"` + `truncation=True` replace the deprecated
# `pad_to_max_length=True`, matching the fixed length used for training.
encoded_review = tokenizer.encode_plus(
    review_text,
    max_length=MAX_LEN,
    add_special_tokens=True,
    return_token_type_ids=False,
    padding="max_length",
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt',
)
input_ids = encoded_review['input_ids'].to(device)
attention_mask = encoded_review['attention_mask'].to(device)

# Inference only: switch dropout off and skip gradient tracking, so the result
# does not depend on which cell last touched the model's train/eval mode.
model.eval()
with torch.no_grad():
  output = model(input_ids, attention_mask)
_, prediction = torch.max(output, dim=1)

print(f'Review text: {review_text}')
print(f'Sentiment  : {class_names[prediction.item()]}')

In [None]:
# save the model
!dir
torch.save(model.state_dict(), "./saved_model/google_review_finetuned_BERT_model.pt")

# save the tokenizer
tokenizer.save_pretrained("./saved_model/google_review_BERT_tokenizer.pt")

In [None]:
from google.colab import files
files.download("/content/saved_model/google_review_finetuned_BERT_model.pt")

In [None]:
!zip "/content/saved_model/google_review_BERT_tokenizer.zip" "/content/saved_model/google_review_BERT_tokenizer"

In [None]:
files.download("/content/saved_model/google_review_BERT_tokenizer.zip")

# Finetuning: Hugging Face

In [None]:
from huggingface_hub import notebook_login

notebook_login()

In [None]:
! pip install transformers datasets evaluate bitsandbytes

In [None]:
from datasets import load_dataset

dataset = load_dataset("fancyzhx/yelp_polarity")
dataset["train"][0]

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")

def tokenize_function(example):
  """Tokenize the ``text`` field with ``max_length`` padding and truncation enabled."""
  texts = example["text"]
  return tokenizer(texts, truncation=True, padding="max_length")

tokenized_datasets = dataset.map(tokenize_function, batched=True)

In [None]:
id2label = {0: "NEGATIVE", 1:"POSITIVE"}
label2id = {"NEGATIVE": 0, "POSITIVE": 1}

In [None]:
# select just a small batch to work with
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(10000))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))

In [None]:
small_train_dataset

In [None]:
# create model
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-cased",
                                                           num_labels=2, id2label=id2label,
                                                           label2id=label2id)


In [None]:
# set evaluation
import numpy as np
import evaluate

metric = evaluate.load("accuracy")

def compute_metric(eval_pred):
  """Score a ``(logits, labels)`` pair with the loaded accuracy metric."""
  logits, labels = eval_pred
  # Predicted class = position of the largest logit along the last axis
  predicted_ids = np.argmax(logits, axis=-1)
  return metric.compute(references=labels, predictions=predicted_ids)

In [None]:
# set training hyperparameters
from transformers import TrainingArguments, Trainer

# Evaluation and checkpointing both run once per epoch; the two strategies are
# kept identical, which load_best_model_at_end relies on.
# NOTE(review): output_dir says "bart" although the model loaded above is
# bert-base-cased; with push_to_hub=True this name presumably becomes the Hub
# repository name — confirm it is intended.
# NOTE(review): optim="adamw_bnb_8bit" is set here, but the Trainer below is
# also handed an explicit 8-bit Adam optimizer — presumably the explicit one
# takes precedence; confirm which is meant to be used.
training_args = TrainingArguments(output_dir="yelp_polarity_tuned_bart_base_10K",
                                  eval_strategy="epoch",
                                  learning_rate=2e-5,
                                  per_device_train_batch_size=16,
                                  per_device_eval_batch_size=16,
                                  num_train_epochs=2,
                                  weight_decay=0.01,
                                  save_strategy="epoch",
                                  optim="adamw_bnb_8bit",
                                  gradient_checkpointing=True,
                                  fp16=True,
                                  load_best_model_at_end=True,
                                  push_to_hub=True,
)

In [None]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
import bitsandbytes as bnb
from torch import nn
from transformers.trainer_pt_utils import get_parameter_names

# Names of the parameters that receive weight decay: everything outside
# LayerNorm modules whose name does not contain "bias".
decay_parameters = [
    name
    for name in get_parameter_names(model, [nn.LayerNorm])
    if "bias" not in name
]

# Split the model's parameters into the decayed and the non-decayed group
decay_params, no_decay_params = [], []
for param_name, param in model.named_parameters():
    if param_name in decay_parameters:
        decay_params.append(param)
    else:
        no_decay_params.append(param)

optimizer_grouped_parameters = [
    {"params": decay_params, "weight_decay": training_args.weight_decay},
    {"params": no_decay_params, "weight_decay": 0.0},
]

# Adam hyperparameters are taken from the training arguments
optimizer_kwargs = {
    "betas": (training_args.adam_beta1, training_args.adam_beta2),
    "eps": training_args.adam_epsilon,
    "lr": training_args.learning_rate,
}
adam_bnb_optim = bnb.optim.Adam8bit(optimizer_grouped_parameters, **optimizer_kwargs)

In [None]:
# create Trainer object
# `data_collator` is built in an earlier cell but was never handed to the
# Trainer; pass it explicitly so batching does not depend on the Trainer's
# implicit default.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metric,
    tokenizer=tokenizer,
    data_collator=data_collator,
    # (optimizer, scheduler): None leaves the scheduler choice to the Trainer
    optimizers=(adam_bnb_optim, None),
)

In [None]:
trainer.train()

In [None]:
# saving tokenizer
tokenizer.save_pretrained("yelp_polarity_bert_tokenizer")

In [None]:
from google.colab import files

!zip -r yelp_tokenizer.zip yelp_polarity_bert_tokenizer

In [None]:
files.download("./yelp_tokenizer.zip")