In [3]:
# %pip installs into the environment of the running kernel (a bare !pip may target another one).
# nlpaug is pinned to the release this notebook was last run with.
%pip install -q nlpaug==1.1.11 optuna

Collecting nlpaug
  Downloading nlpaug-1.1.11-py3-none-any.whl.metadata (14 kB)
Downloading nlpaug-1.1.11-py3-none-any.whl (410 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.5/410.5 kB[0m [31m16.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: nlpaug
Successfully installed nlpaug-1.1.11


In [4]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import re
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score
import nlpaug.augmenter.word as naw
import os
import optuna
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

- source of data: https://huggingface.co/datasets/QuotaClimat/frugalaichallenge-text-train

In [5]:
# Directory holding the CSV splits; change it here if the dataset is mounted elsewhere.
DATA_DIR = '/kaggle/input/d/rafechang/balanced'

# Four training splits plus the held-out test split.
train1 = pd.read_csv(os.path.join(DATA_DIR, 'train1.csv'))
train2 = pd.read_csv(os.path.join(DATA_DIR, 'train2.csv'))
train3 = pd.read_csv(os.path.join(DATA_DIR, 'train3.csv'))
train4 = pd.read_csv(os.path.join(DATA_DIR, 'train4.csv'))
test = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))

In [6]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


- DistilBERT should consume less energy than BERT because it has fewer parameters.
- The uncased (lower-cased) checkpoint is used with the aim of further reducing the parameter count.

**split data**

In [7]:
# Pull the text column ('quote') and the target column ('numeric_label') out of every split.
train1_texts, train1_labels = train1['quote'], train1['numeric_label']
train2_texts, train2_labels = train2['quote'], train2['numeric_label']
train3_texts, train3_labels = train3['quote'], train3['numeric_label']
train4_texts, train4_labels = train4['quote'], train4['numeric_label']

# Cumulative training sets: splits 1-2, then splits 1-3 (indices are renumbered from 0).
train12_texts = pd.concat([train1_texts, train2_texts], ignore_index=True)
train12_labels = pd.concat([train1_labels, train2_labels], ignore_index=True)
train123_texts = pd.concat([train1_texts, train2_texts, train3_texts], ignore_index=True)
train123_labels = pd.concat([train1_labels, train2_labels, train3_labels], ignore_index=True)

test_texts, test_labels = test['quote'], test['numeric_label']

**Tokenize** 

In [8]:
# Load the pretrained DistilBERT tokenizer (uncased checkpoint, lower-casing enabled).
tokenizer = DistilBertTokenizer.from_pretrained(
    'distilbert-base-uncased',
    do_lower_case=True,
)

# Function to tokenize data
def tokenize_data(texts, labels, max_length=367):
    """
    Tokenizes texts with the module-level `tokenizer` and wraps the result,
    together with the labels, in a CustomTextDataset.

    Parameters:
    - texts: list or pandas Series of strings to tokenize.
    - labels: list or pandas Series with one label per text.
    - max_length: maximum number of tokens kept per text; longer texts are
      truncated (default 367, the value that used to be hard-coded here).

    Returns:
    - A CustomTextDataset built from the encodings and the labels.

    Raises:
    - ValueError: if texts and labels are sized sequences of different length.
    - Any error raised by the tokenizer is propagated unchanged. Previously it
      was printed and None was returned, which only surfaced later as an
      unrelated failure when the None dataset was handed to a DataLoader.
    """
    if isinstance(texts, pd.Series):
        texts = texts.tolist()
    if isinstance(labels, pd.Series):
        labels = labels.tolist()

    # Fail early on misaligned inputs instead of producing a dataset whose
    # length (taken from labels) disagrees with the number of encodings.
    if isinstance(texts, (list, tuple)) and hasattr(labels, '__len__'):
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, got {len(texts)} and {len(labels)}"
            )

    encodings = tokenizer(
        texts,
        padding=True,  # pad to the longest text passed in this call
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )

    return CustomTextDataset(encodings, labels)
# Custom Dataset class
class CustomTextDataset(Dataset):
    """Map-style dataset that pairs tokenizer encodings with integer labels."""

    def __init__(self, encodings, labels):
        # encodings: mapping from field name to an indexable of per-example values.
        self.encodings = encodings
        # Labels are coerced to plain Python ints once, up front.
        self.labels = list(map(int, labels))

    def __len__(self):
        # The number of examples is defined by the labels.
        return len(self.labels)

    def __getitem__(self, idx):
        # Slice every encoding field at idx and attach the label as a long tensor.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [9]:
# Tokenize each individual split, in order.
train1_dataset, train2_dataset, train3_dataset, train4_dataset = (
    tokenize_data(split_texts, split_labels)
    for split_texts, split_labels in (
        (train1_texts, train1_labels),
        (train2_texts, train2_labels),
        (train3_texts, train3_labels),
        (train4_texts, train4_labels),
    )
)

# Tokenize the cumulative training sets (splits 1-2 and splits 1-3).
train12_dataset, train123_dataset = (
    tokenize_data(merged_texts, merged_labels)
    for merged_texts, merged_labels in (
        (train12_texts, train12_labels),
        (train123_texts, train123_labels),
    )
)

test_dataset = tokenize_data(test_texts, test_labels)

In [10]:
# One batch size shared by every loader so it can be tuned in a single place.
BATCH_SIZE = 32

# Training splits are reshuffled every epoch.
train1_loader = DataLoader(train1_dataset, batch_size=BATCH_SIZE, shuffle=True)
train2_loader = DataLoader(train2_dataset, batch_size=BATCH_SIZE, shuffle=True)
train3_loader = DataLoader(train3_dataset, batch_size=BATCH_SIZE, shuffle=True)
train4_loader = DataLoader(train4_dataset, batch_size=BATCH_SIZE, shuffle=True)
train12_loader = DataLoader(train12_dataset, batch_size=BATCH_SIZE, shuffle=True)
train123_loader = DataLoader(train123_dataset, batch_size=BATCH_SIZE, shuffle=True)

# The test set is only evaluated, so keep its row order: predictions then line
# up with the rows of `test`. Aggregate metrics do not depend on the order.
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [11]:
# Pretrained DistilBERT encoder with a newly initialized 8-class classification head.
model1 = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=8)
model1.to(device)
# torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and
# weight_decay are set explicitly to the transformers defaults, because the
# PyTorch defaults differ (eps=1e-8, weight_decay=0.01).
optimizer1 = torch.optim.AdamW(model1.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
def _run_epoch(model, loader, device, optimizer=None):
    """
    Runs one full pass over `loader` and accumulates loss and accuracy.

    With an optimizer the model is put in training mode and its weights are
    updated after every batch; without one the model is put in evaluation mode
    and gradient tracking is switched off.

    Returns a tuple (mean loss per batch, accuracy in percent, predictions,
    true labels). The two lists are only filled in evaluation mode.
    """
    is_training = optimizer is not None
    model.train(is_training)  # train(False) is the same as eval()

    loss_sum = 0.0
    correct = 0
    seen = 0
    predicted_labels = []
    true_labels = []

    with torch.set_grad_enabled(is_training):
        for batch in loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            if is_training:
                optimizer.zero_grad()  # Clear gradients left over from the previous step
            outputs = model(**batch)
            if is_training:
                outputs.loss.backward()  # Backpropagation
                optimizer.step()  # Update parameters

            loss_sum += outputs.loss.item()
            predictions = torch.argmax(outputs.logits, dim=-1)
            correct += (predictions == batch['labels']).sum().item()
            seen += predictions.size(0)

            if not is_training:
                predicted_labels.extend(predictions.cpu().tolist())
                true_labels.extend(batch['labels'].cpu().tolist())

    return loss_sum / len(loader), 100 * correct / seen, predicted_labels, true_labels


def train_and_evaluate_model(model, train_loader, val_loader, optimizer, device, num_epochs=2):
    """
    Trains a given model for a specified number of epochs and evaluates it on a validation set
    after every epoch, printing out the loss, accuracy, and weighted F1 score.

    Parameters:
    - model: the PyTorch model to be trained; called as model(**batch) and expected to
      return an object with `.loss` and `.logits`.
    - train_loader: DataLoader for the training data; each batch is a dict of tensors
      that includes a 'labels' entry.
    - val_loader: DataLoader for the validation data, same batch format.
    - optimizer: the optimizer used to update the model's weights.
    - device: the device (e.g., 'cuda' or 'cpu') to perform training on.
    - num_epochs: the number of epochs to train for (default is 2).

    Returns:
    - A list with one dict per epoch holding 'epoch', 'train_loss', 'train_accuracy',
      'val_loss', 'val_accuracy' and 'val_f1', so runs can be compared in code
      instead of by reading the printed log.

    Raises:
    - ValueError: if either loader yields no batches. Checked up front; previously this
      surfaced as a ZeroDivisionError, for the validation loader only after a full
      training pass.
    """
    if len(train_loader) == 0 or len(val_loader) == 0:
        raise ValueError("train_loader and val_loader must each contain at least one batch")

    model.to(device)
    history = []

    for epoch in range(num_epochs):  # Loop over the dataset multiple times
        avg_loss, avg_accuracy, _, _ = _run_epoch(model, train_loader, device, optimizer)
        print(f"Epoch {epoch + 1}, Training Loss: {avg_loss:.2f}, Training Accuracy: {avg_accuracy:.2f}%")

        # Validation pass: no optimizer, so no weight updates and no gradients.
        val_avg_loss, val_avg_accuracy, val_predictions, val_true_labels = _run_epoch(
            model, val_loader, device
        )
        val_f1 = f1_score(val_true_labels, val_predictions, average='weighted')

        print(f"Epoch {epoch + 1}, Validation Loss: {val_avg_loss:.2f}, Validation Accuracy: {val_avg_accuracy:.2f}%, F1 Score: {val_f1:.2f}")

        history.append({
            'epoch': epoch + 1,
            'train_loss': avg_loss,
            'train_accuracy': avg_accuracy,
            'val_loss': val_avg_loss,
            'val_accuracy': val_avg_accuracy,
            'val_f1': val_f1,
        })

    return history

Step 1: train on train1 and validate on train2

In [13]:
# Fit on split 1 and validate on split 2, using the default number of epochs.
train_and_evaluate_model(
    model=model1,
    train_loader=train1_loader,
    val_loader=train2_loader,
    optimizer=optimizer1,
    device=device,
)

Epoch 1, Training Loss: 1.52, Training Accuracy: 47.70%
Epoch 1, Validation Loss: 1.21, Validation Accuracy: 57.73%, F1 Score: 0.57
Epoch 2, Training Loss: 0.69, Training Accuracy: 79.61%
Epoch 2, Validation Loss: 1.29, Validation Accuracy: 55.04%, F1 Score: 0.55


Step 2: train on train1+train2, validate on train3

In [14]:
# Continue with the same model and optimizer on splits 1-2; validate on split 3.
train_and_evaluate_model(
    model=model1,
    train_loader=train12_loader,
    val_loader=train3_loader,
    optimizer=optimizer1,
    device=device,
)

Epoch 1, Training Loss: 0.68, Training Accuracy: 78.39%
Epoch 1, Validation Loss: 1.11, Validation Accuracy: 62.20%, F1 Score: 0.62
Epoch 2, Training Loss: 0.24, Training Accuracy: 93.30%
Epoch 2, Validation Loss: 1.41, Validation Accuracy: 60.94%, F1 Score: 0.61


Step 3: train on train1+train2+train3, validate on train4 

In [15]:
# Continue with the same model and optimizer on splits 1-3; validate on split 4.
train_and_evaluate_model(
    model=model1,
    train_loader=train123_loader,
    val_loader=train4_loader,
    optimizer=optimizer1,
    device=device,
)

Epoch 1, Training Loss: 0.42, Training Accuracy: 87.57%
Epoch 1, Validation Loss: 1.36, Validation Accuracy: 60.45%, F1 Score: 0.61
Epoch 2, Training Loss: 0.13, Training Accuracy: 96.51%
Epoch 2, Validation Loss: 1.69, Validation Accuracy: 60.93%, F1 Score: 0.61


Before hyperparameter optimization, compare the step-wise method above against training on all of the data at once.

In [16]:
# Second, untouched DistilBERT classifier (8 classes) for the train-everything-at-once comparison.
model2 = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=8)
model2.to(device)
# torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and
# weight_decay are set explicitly to the transformers defaults, because the
# PyTorch defaults differ (eps=1e-8, weight_decay=0.01).
optimizer2 = torch.optim.AdamW(model2.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step 4: hyperparameter optimization (done only at this stage because it can be resource-intensive); use train4 for validation.

In [17]:
def objective(trial):
    """
    Optuna objective: samples one hyperparameter configuration and returns the
    loss that train_model reports for it.

    Parameters:
    - trial: the Optuna trial used to sample the hyperparameters.

    Returns:
    - The value returned by train_model; the study is meant to minimize it.
    """
    # suggest_float (with log=True for the log-uniform case) replaces
    # suggest_loguniform / suggest_uniform, which are deprecated in Optuna.
    lr = trial.suggest_float('lr', 1e-5, 1e-3, log=True)
    batch_size = trial.suggest_categorical('batch_size', [4, 8, 16])
    num_trainable_layers = trial.suggest_int('num_trainable_layers', 1, 6)
    dropout_rate = trial.suggest_float('dropout_rate', 0.1, 0.5)

    # NOTE(review): train_model and dataset are not defined in the visible part
    # of this notebook; confirm both exist before a study calls this function.
    return train_model(lr, batch_size, num_trainable_layers, dropout_rate, dataset)

TODO: early stopping has not been implemented yet.