In [1]:
# Import necessary libraries

# Data manipulation
import pandas as pd
import numpy as np

# Statistical functions
from scipy.stats import zscore

# For concurrency (running functions in parallel)
from concurrent.futures import ThreadPoolExecutor

# For caching (to speed up repeated function calls)
from functools import lru_cache

# For progress tracking
from tqdm import tqdm

# Plotting and Visualisation
import matplotlib.pyplot as plt
import seaborn as sns

# Language Detection packages
# `langdetect` for detecting language
from langdetect import detect as langdetect_detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException
# `langid` for an alternative language detection method
from langid import classify as langid_classify

# Text Preprocessing and NLP
# Stopwords (common words to ignore) from NLTK
from nltk.corpus import stopwords
# Tokenizing sentences/words
from nltk.tokenize import word_tokenize
# Part-of-speech tagging
from nltk import pos_tag
# Lemmatization (converting words to their base form)
from nltk.stem import WordNetLemmatizer
import nltk
# Regular expressions for text pattern matching
import re

# Word Cloud generation
from wordcloud import WordCloud

In [None]:
data = pd.read_csv('../final_df.csv')

In [3]:
data

Unnamed: 0,year,month,sentiment,processed_full_review
0,2024,3,Neutral,ok use airlin go singapor london heathrow issu...
1,2024,3,Negative,don give money book paid receiv email confirm ...
2,2024,3,Positive,best airlin world best airlin world seat food ...
3,2024,3,Negative,premium economi seat singapor airlin not worth...
4,2024,3,Negative,imposs get promis refund book flight full mont...
...,...,...,...,...
11513,2021,11,Negative,websit buggi paid first busi class ticket webs...
11514,2021,10,Negative,reduc level qualiti servic fear futur airlin t...
11515,2021,10,Negative,chang would cost usd book ticket singapor airl...
11516,2021,8,Negative,disappoint flight check secur check frankfurt ...


# Basic DistilBERT

DistilBERT is a smaller, faster, and lighter version of BERT, designed to retain most of BERT's language understanding capabilities while being more computationally efficient.

DistilBERT has only 6 layers instead of BERT's 12, which makes it half the size of BERT in terms of layers. However, it retains the same hidden size (768), meaning it still processes and represents data similarly to BERT but with fewer computational steps. This results in a smaller number of parameters, making DistilBERT about 40-60% faster to train and use in inference.

DistilBERT is trained using knowledge distillation, a technique where a smaller model (the "student", DistilBERT) learns to mimic a larger model (the "teacher", BERT) rather than directly learning from the raw data. During training, the student model doesn't just learn from the labelled dataset but also from the "soft labels" (probabilities) provided by the teacher model. This method allows DistilBERT to capture much of the knowledge from the original BERT model even with fewer layers, preserving 97% of BERT's performance on many NLP tasks.

In [4]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
import tensorflow as tf
import numpy as np
import random

# Set random seed for reproducibility
seed = 42
tf.random.set_seed(seed)
np.random.seed(seed)
random.seed(seed)

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Labels mapping
sentiment_dict = {'Negative': 0, 'Neutral': 1, 'Positive': 2}
y = data['sentiment'].map(sentiment_dict).values  # Convert sentiments to numeric labels

train_d, val_d, train_labels, val_labels = train_test_split(data['processed_full_review'], y, test_size=0.2, random_state=42)

In [6]:
texts_train = list(train_d)
texts_val = list(val_d)

max_length = 64

In [7]:
tokenized_texts_train = tokenizer(texts_train, padding=True, truncation=True, return_tensors="pt", max_length=max_length)
tokenized_texts_val = tokenizer(texts_val, padding=True, truncation=True, return_tensors="pt", max_length=max_length)

In [8]:
tokenized_texts_train['input_ids'][0]

tensor([  101,  3532, 23488, 24209, 11475,  3775,  2833,  3532, 18151,  2278,
         3532,  2835,  2067,  2234,  2835,  2053, 18151,  2278,  4521,  2435,
         2053, 18064,  3462,  5463,  2729,  3482,  3256,  6949,  2452,  3198,
         2296,  2239,  2151,  2239,  2215,  2028,  2025,  2053,  2300, 14262,
         2615,  2053,  2522, 16020,  5572,  2445,  4983,  3198,  2028,  3730,
         4392, 14262,  2615,  2048,  2093,  2711,  4511,  2025,  2130,  5254,
          102,     0,     0,     0])

In [9]:
tokenized_texts_train['attention_mask'][0]

tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0])

In [10]:
import torch

In [11]:
train_labels = torch.tensor(list(train_labels))
val_labels = torch.tensor(list(val_labels)) 

In [12]:
train_dataset = TensorDataset(tokenized_texts_train['input_ids'], tokenized_texts_train['attention_mask'], train_labels)
val_dataset = TensorDataset(tokenized_texts_val['input_ids'], tokenized_texts_val['attention_mask'], val_labels)

In [13]:
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=3)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
from torch.nn.utils import clip_grad_norm_
from torch.optim.lr_scheduler import StepLR

optimizer = AdamW(model.parameters(), lr=5e-6)
criterion = torch.nn.CrossEntropyLoss()
scheduler = StepLR(optimizer, step_size=1, gamma=0.1)



In [15]:
batch_size = 64

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=True, prefetch_factor=2)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=4, pin_memory=True, prefetch_factor=2)

In [16]:
num_epochs = 5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [17]:
train_losses = []
val_losses = []
val_accuracies = []
train_accuracies = []

In [18]:
from tqdm import tqdm

for epoch in range(num_epochs):
    # Training phase
    model.train()
    train_loss = 0.0
    tr_correct_preds = 0
    all_tr_labels = []
    all_tr_preds = []

    # Use tqdm to create a progress bar for the training loop
    for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1}/{num_epochs} - Training"):
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        tr_loss = outputs.loss
        train_loss += tr_loss.item()
        tr_loss.backward()

        tr_logits = outputs.logits
        tr_preds = torch.argmax(tr_logits, dim=1)
        tr_correct_preds += torch.sum(tr_preds == labels).item()

        # Collect predictions and true labels
        all_tr_labels.extend(labels.cpu().numpy())
        all_tr_preds.extend(tr_preds.cpu().numpy())

        clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

    scheduler.step()

    # Calculate average training loss and accuracy
    avg_train_loss = train_loss / len(train_loader)
    train_losses.append(avg_train_loss)
    train_accuracy = tr_correct_preds / len(train_d)
    train_accuracies.append(train_accuracy)

    # Calculate Precision, Recall, F1 for training
    train_precision = precision_score(all_tr_labels, all_tr_preds, average='weighted')
    train_recall = recall_score(all_tr_labels, all_tr_preds, average='weighted')
    train_f1 = f1_score(all_tr_labels, all_tr_preds, average='weighted')

    # Validation phase with tqdm progress bar
    model.eval()
    val_loss = 0.0
    correct_preds = 0
    all_val_labels = []
    all_val_preds = []

    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Epoch {epoch + 1}/{num_epochs} - Validation"):
            input_ids, attention_mask, labels = batch
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            val_loss += loss.item()

            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)
            correct_preds += torch.sum(preds == labels).item()

            # Collect predictions and true labels
            all_val_labels.extend(labels.cpu().numpy())
            all_val_preds.extend(preds.cpu().numpy())

    # Calculate average validation loss and accuracy
    avg_val_loss = val_loss / len(val_loader)
    val_losses.append(avg_val_loss)
    val_accuracy = correct_preds / len(val_d)
    val_accuracies.append(val_accuracy)

    # Calculate Precision, Recall, F1 for validation
    val_precision = precision_score(all_val_labels, all_val_preds, average='weighted')
    val_recall = recall_score(all_val_labels, all_val_preds, average='weighted')
    val_f1 = f1_score(all_val_labels, all_val_preds, average='weighted')

    # Print metrics
    print(f"\nEpoch {epoch + 1}/{num_epochs}")
    print(f"Training Loss: {avg_train_loss:.4f}, Training Accuracy: {train_accuracy:.4f}")
    print(f"Training Precision: {train_precision:.4f}, Training Recall: {train_recall:.4f}, Training F1: {train_f1:.4f}")
    print(f"Validation Loss: {avg_val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")
    print(f"Validation Precision: {val_precision:.4f}, Validation Recall: {val_recall:.4f}, Validation F1: {val_f1:.4f}")


Epoch 1/5 - Training: 100%|██████████| 144/144 [00:52<00:00,  2.73it/s]
Epoch 1/5 - Validation: 100%|██████████| 36/36 [00:07<00:00,  4.88it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Epoch 1/5
Training Loss: 0.7198, Training Accuracy: 0.7244
Training Precision: 0.6440, Training Recall: 0.7244, Training F1: 0.6611
Validation Loss: 0.4831, Validation Accuracy: 0.8320
Validation Precision: 0.7508, Validation Recall: 0.8320, Validation F1: 0.7893


Epoch 2/5 - Training: 100%|██████████| 144/144 [00:53<00:00,  2.70it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Epoch 2/5 - Validation: 100%|██████████| 36/36 [00:07<00:00,  4.78it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Epoch 2/5
Training Loss: 0.4789, Training Accuracy: 0.8293
Training Precision: 0.7529, Training Recall: 0.8293, Training F1: 0.7880
Validation Loss: 0.4603, Validation Accuracy: 0.8381
Validation Precision: 0.7619, Validation Recall: 0.8381, Validation F1: 0.7972


Epoch 3/5 - Training: 100%|██████████| 144/144 [00:54<00:00,  2.66it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Epoch 3/5 - Validation: 100%|██████████| 36/36 [00:07<00:00,  4.67it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Epoch 3/5
Training Loss: 0.4673, Training Accuracy: 0.8342
Training Precision: 0.7560, Training Recall: 0.8342, Training F1: 0.7922
Validation Loss: 0.4581, Validation Accuracy: 0.8342
Validation Precision: 0.7610, Validation Recall: 0.8342, Validation F1: 0.7943


Epoch 4/5 - Training: 100%|██████████| 144/144 [00:55<00:00,  2.60it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Epoch 4/5 - Validation: 100%|██████████| 36/36 [00:07<00:00,  4.76it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Epoch 4/5
Training Loss: 0.4674, Training Accuracy: 0.8345
Training Precision: 0.7575, Training Recall: 0.8345, Training F1: 0.7929
Validation Loss: 0.4580, Validation Accuracy: 0.8342
Validation Precision: 0.7610, Validation Recall: 0.8342, Validation F1: 0.7943


Epoch 5/5 - Training: 100%|██████████| 144/144 [00:55<00:00,  2.61it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Epoch 5/5 - Validation: 100%|██████████| 36/36 [00:07<00:00,  4.76it/s]


Epoch 5/5
Training Loss: 0.4677, Training Accuracy: 0.8341
Training Precision: 0.7568, Training Recall: 0.8341, Training F1: 0.7923
Validation Loss: 0.4580, Validation Accuracy: 0.8342
Validation Precision: 0.7610, Validation Recall: 0.8342, Validation F1: 0.7943



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
