In [None]:
# !pip3 install -r requirements.txt

In [1]:
# Import necessary libraries

# Data manipulation
import pandas as pd
import numpy as np
from datetime import datetime 

# Statistical functions
from scipy.stats import zscore

# Text Preprocessing and NLP
import nltk
# Stopwords (common words to ignore) from NLTK
from nltk.corpus import stopwords

# Tokenizing sentences/words
from nltk.corpus import wordnet

# Tokenizing sentences/words
from nltk.tokenize import word_tokenize
# Lemmatization (converting words to their base form)
from nltk.stem import WordNetLemmatizer


# For generating n-grams
from nltk.util import ngrams
from collections import Counter

## Data Preparation (Loading CSV)

Load the three CSV files into a pandas DataFrame `data`.

In [2]:
data = pd.read_csv('final_df.csv')

In [3]:
data.head()

Unnamed: 0,year,month,sentiment,processed_full_review
0,2024,3,Neutral,ok use airlin go singapor london heathrow issu...
1,2024,3,Negative,don give money book paid receiv email confirm ...
2,2024,3,Positive,best airlin world best airlin world seat food ...
3,2024,3,Negative,premium economi seat singapor airlin not worth...
4,2024,3,Negative,imposs get promis refund book flight full mont...


In [4]:
data['sentiment'].value_counts()

sentiment
Positive    7913
Negative    2441
Neutral     1164
Name: count, dtype: int64

In [5]:
data['year'].value_counts()

year
2019    5129
2018    2596
2022    1184
2023    1111
2020     888
2024     514
2021      96
Name: count, dtype: int64

In [12]:
# You can verify your data structure
print(data.columns)  # Should include 'processed_full_review' and 'sentiment'
print(data['sentiment'].unique())  # Should show your 3 classes

Index(['year', 'month', 'sentiment', 'processed_full_review'], dtype='object')
['Neutral' 'Negative' 'Positive']


In [13]:
# Optional: Check your text lengths
lengths = data['processed_full_review'].str.len()
print(f"Average length: {lengths.mean():.0f}")
print(f"95th percentile length: {lengths.quantile(0.95):.0f}")
print(f"Max length: {lengths.max()}")

Average length: 292
95th percentile length: 752
Max length: 1452


# ELECTRA (Efficiently Learning an Encoder that Classifies Token Replacements Accurately)

### Replaced Token Detection (RTD)
- Unlike BERT, which masks words in the input and then tries to predict the, ELECTRA randomly replaces certain tokens with plausible alternatives generated by a small generator model and trains a larger discriminator model to detect whether each token is "real" (original) or "replaced" (fake).

- This setup means that every token in the input sequence is used during training, not just the masked ones, which leads to more efficient training.

### Two-Part Architecture
- **Generator:** A smaller model (often a smaller BER) that replaces tokens with plausible alternatives. It essentially "corrupts" the input sentence by substituting some tokens with similar words.

- **Discriminator:** The main ELECTRA model, which learns to classify each token as either "real" or "fake" based on whether the token was replaced by the generator. This part of the model is fine-tuned for downstream tasks after pretraining.

### Performance
- Typically outperforms BERT on NLP tasks. ELECTRA-small can perform similarly to BERT-based, and ELECTRA-base often surpasses BERT-base while being more efficient.


We'll be using ELECTRA-based, which is a standard-size model, comparable to BERT-base in terms of size and performance. It has 110M parameters.


### Imports and Setup

In [None]:
# !pip install transformers
# !pip install tqdm
# !pip install scikit-learn

In [25]:
import numpy as np
import torch
import random
from transformers import ElectraTokenizer, ElectraForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
import torch.nn.functional as F
from tqdm import tqdm

# Set random seeds for reproducibility
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

set_seed(42)

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


## Load model and tokenizer

In [26]:
# Load tokenizer and model
model_name = "google/electra-base-discriminator"
tokenizer = ElectraTokenizer.from_pretrained(model_name)
model = ElectraForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3,
    problem_type="single_label_classification"
)
model.to(device)

# Initialize label encoder
le = LabelEncoder()

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Preparing the data

In [27]:
# Prepare data
MAX_LENGTH = 512 # Average length of the reviews is 292 words

def prepare_data(data):
    # Encode labels
    labels = le.fit_transform(data['sentiment'])
    
    # Tokenize texts
    encodings = tokenizer(
        data['processed_full_review'].tolist(),
        truncation=True,
        padding=True,
        max_length=MAX_LENGTH,
        return_tensors='pt'
    )
    
    # Create dataset
    input_ids = encodings['input_ids']
    attention_mask = encodings['attention_mask']
    labels = torch.tensor(labels)
    
    return input_ids, attention_mask, labels

# Split data
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42, stratify=data['sentiment'])

# Prepare train and test sets
train_input_ids, train_attention_mask, train_labels = prepare_data(train_data)
test_input_ids, test_attention_mask, test_labels = prepare_data(test_data)

### Create DataLoader

In [28]:
# Calculate class weights
class_counts = data['sentiment'].value_counts()
total_samples = len(data)
class_weights = torch.tensor([
    total_samples / (3 * class_counts['Positive']),
    total_samples / (3 * class_counts['Negative']),
    total_samples / (3 * class_counts['Neutral'])
], dtype=torch.float).to(device)

# Create datasets and dataloaders
batch_size = 16

train_dataset = TensorDataset(train_input_ids, train_attention_mask, train_labels)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

test_dataset = TensorDataset(test_input_ids, test_attention_mask, test_labels)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [29]:
print("Number of training batches:", len(train_loader))
print("Number of test batches:", len(test_loader))
print("\nClass weights:", class_weights)

Number of training batches: 576
Number of test batches: 144

Class weights: tensor([0.4852, 1.5729, 3.2984], device='cuda:0')


### Training

In [30]:
from tqdm import tqdm

def train_epoch(model, train_loader, optimizer, scheduler):
    model.train()
    total_loss = 0
    
    progress_bar = tqdm(train_loader, desc="Training")
    
    for batch in progress_bar:
        batch_input_ids = batch[0].to(device)
        batch_attention_mask = batch[1].to(device)
        batch_labels = batch[2].to(device)
        
        optimizer.zero_grad()
        
        outputs = model(
            input_ids=batch_input_ids,
            attention_mask=batch_attention_mask
        )
        
        # Use CrossEntropyLoss for multi-class classification
        loss = F.cross_entropy(
            outputs.logits,
            batch_labels,
            weight=class_weights
        )
        
        total_loss += loss.item()
        
        loss.backward()
        optimizer.step()
        scheduler.step()
        
        progress_bar.set_postfix({'loss': f'{loss.item():.4f}'})
    
    return total_loss / len(train_loader)

### Evaluation Function

In [None]:
def evaluate(model, test_loader):
    model.eval()
    predictions = []
    true_labels = []
    
    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Evaluating"):
            batch_input_ids = batch[0].to(device)
            batch_attention_mask = batch[1].to(device)
            batch_labels = batch[2].to(device)
            
            outputs = model(
                input_ids=batch_input_ids,
                attention_mask=batch_attention_mask
            )
            
            # For multi-class, we use argmax directly on logits
            preds = torch.argmax(outputs.logits, dim=1)
            
            predictions.extend(preds.cpu().numpy())
            true_labels.extend(batch_labels.cpu().numpy())
    
    return classification_report(
        true_labels, 
        predictions, 
        target_names=le.classes_,
        digits=4
    )

### Training Loop

In [32]:
from transformers import get_linear_schedule_with_warmup

# Training parameters
epochs = 3
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

# Create scheduler
total_steps = len(train_loader) * epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

# Training loop
for epoch in range(epochs):
    print(f"\nEpoch {epoch+1}/{epochs}")
    
    avg_train_loss = train_epoch(model, train_loader, optimizer, scheduler)
    print(f"Average training loss: {avg_train_loss:.4f}")
    
    print("\nEvaluation on test set:")
    eval_report = evaluate(model, test_loader)
    print(eval_report)

# Save the model
torch.save(model.state_dict(), 'electra_sentiment_model.pth')


Epoch 1/3


Training: 100%|██████████| 576/576 [02:42<00:00,  3.55it/s, loss=0.1595]


Average training loss: 0.2791

Evaluation on test set:


Evaluating: 100%|██████████| 144/144 [00:11<00:00, 12.71it/s]


              precision    recall  f1-score   support

    Negative     0.8848    0.6455    0.7464       488
     Neutral     0.3679    0.4421    0.4016       233
    Positive     0.9149    0.9640    0.9388      1583

    accuracy                         0.8438      2304
   macro avg     0.7225    0.6838    0.6956      2304
weighted avg     0.8532    0.8438    0.8437      2304


Epoch 2/3


Training: 100%|██████████| 576/576 [02:38<00:00,  3.64it/s, loss=0.0888]


Average training loss: 0.1731

Evaluation on test set:


Evaluating: 100%|██████████| 144/144 [00:11<00:00, 12.55it/s]


              precision    recall  f1-score   support

    Negative     0.8861    0.7172    0.7928       488
     Neutral     0.4462    0.4979    0.4706       233
    Positive     0.9248    0.9634    0.9437      1583

    accuracy                         0.8641      2304
   macro avg     0.7523    0.7261    0.7357      2304
weighted avg     0.8682    0.8641    0.8639      2304


Epoch 3/3


Training: 100%|██████████| 576/576 [02:38<00:00,  3.63it/s, loss=0.0493]


Average training loss: 0.1340

Evaluation on test set:


Evaluating: 100%|██████████| 144/144 [00:11<00:00, 12.49it/s]


              precision    recall  f1-score   support

    Negative     0.9117    0.6557    0.7628       488
     Neutral     0.4140    0.5579    0.4753       233
    Positive     0.9292    0.9621    0.9454      1583

    accuracy                         0.8563      2304
   macro avg     0.7516    0.7253    0.7278      2304
weighted avg     0.8734    0.8563    0.8592      2304

