In [1]:
# !pip install nlpaug
!pip install optuna

Collecting nlpaug
  Downloading nlpaug-1.1.11-py3-none-any.whl.metadata (14 kB)
Downloading nlpaug-1.1.11-py3-none-any.whl (410 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.5/410.5 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hInstalling collected packages: nlpaug
Successfully installed nlpaug-1.1.11


In [2]:
import os
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, DistilBertConfig
from sklearn.metrics import f1_score, confusion_matrix, balanced_accuracy_score, precision_score, recall_score, accuracy_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from torch.optim import AdamW, lr_scheduler
# import nlpaug.augmenter.word as naw
import optuna
import shutil
import zipfile

To compare, this is the result of the original run: 
- Trial 0 finished with value: 0.6890894175553732 and parameters: {'learning_rate': 1.4818151091980784e-05, 'num_trainable_layers': 4, 'dropout_rate': 0.119110690825033, 'batch_size': 16, 'step_size': 4, 'gamma': 0.8960630325737292, 'epochs': 3}. Best is trial 0 with value: 0.6890894175553732. 3083.8s 66 [I 2025-01-24 09:37:48,516]
- Trial 1 finished with value: 0.8728465955701394 and parameters: {'learning_rate': 2.517549631081625e-05, 'num_trainable_layers': 6, 'dropout_rate': 0.41403406363825646, 'batch_size': 16, 'step_size': 4, 'gamma': 0.42358721860354653, 'epochs': 5}. Best is trial 1 with value: 0.8728465955701394. 4771.2s 67 [I 2025-01-24 10:05:55,931]
- Trial 2 finished with value: 0.8884331419196062 and parameters: {'learning_rate': 5.202379803067906e-05, 'num_trainable_layers': 3, 'dropout_rate': 0.298865675518514, 'batch_size': 32, 'step_size': 5, 'gamma': 0.7444522192227857, 'epochs': 5}. Best is trial 2 with value: 0.8884331419196062. 5828.5s 68 [I 2025-01-24 10:23:33,239]
- Trial 3 finished with value: 0.7875307629204266 and parameters: {'learning_rate': 0.00016564874914610164, 'num_trainable_layers': 4, 'dropout_rate': 0.24484638353248456, 'batch_size': 32, 'step_size': 8, 'gamma': 0.7133676325508109, 'epochs': 3}. Best is trial 2 with value: 0.8884331419196062. 7368.5s 69 [I 2025-01-24 10:49:13,274]
- Trial 4 finished with value: 0.8310090237899918 and parameters: {'learning_rate': 6.24880645646062e-05, 'num_trainable_layers': 6, 'dropout_rate': 0.1787526406980196, 'batch_size': 16, 'step_size': 9, 'gamma': 0.4218077557604636, 'epochs': 4}. Best is trial 2 with value: 0.8884331419196062. 9207.9s 70 [I 2025-01-24 11:19:52,632] 

In [3]:
df1 = pd.read_csv("/kaggle/input/balancedfull/df1.csv")
df2 = pd.read_csv("/kaggle/input/balancedfull/df2.csv")
df3 = pd.read_csv("/kaggle/input/balancedfull/df3.csv")
df4 = pd.read_csv("/kaggle/input/balancedfull/df4.csv")
df5 = pd.read_csv("/kaggle/input/balancedfull/df5.csv")

df_balanced1 = pd.read_csv("/kaggle/input/balancedfull/df_balanced1.csv")
df_balanced2 = pd.read_csv("/kaggle/input/balancedfull/df_balanced2.csv")
df_balanced3 = pd.read_csv("/kaggle/input/balancedfull/df_balanced3.csv")
df_balanced4 = pd.read_csv("/kaggle/input/balancedfull/df_balanced4.csv")
df_balanced5 = pd.read_csv("/kaggle/input/balancedfull/df_balanced5.csv")

df1_text = df1['quote']
df2_text = df2['quote']
df3_text = df3['quote']
df4_text = df4['quote']
df5_text = df5['quote']

df1_label = df1['numeric_label']
df2_label = df2['numeric_label']
df3_label = df3['numeric_label']
df4_label = df4['numeric_label']
df5_label = df5['numeric_label']

df_balanced1_text = df_balanced1['quote']
df_balanced2_text = df_balanced2['quote']
df_balanced3_text = df_balanced3['quote']
df_balanced4_text = df_balanced4['quote']
df_balanced5_text = df_balanced5['quote']

df_balanced1_label = df_balanced1['numeric_label']
df_balanced2_label = df_balanced2['numeric_label']
df_balanced3_label = df_balanced3['numeric_label']
df_balanced4_label = df_balanced4['numeric_label']
df_balanced5_label = df_balanced5['numeric_label']

In [4]:
datasets = [df_balanced1, df_balanced2, df_balanced3, df_balanced4]

# Extract quotes and labels using list comprehension
texts = [ds['quote'] for ds in datasets]
labels = [ds['numeric_label'] for ds in datasets]

# Concatenate all texts and labels using pandas.concat
train_texts = pd.concat(texts, ignore_index=True)
train_labels = pd.concat(labels, ignore_index=True)

val_texts = df5_text = df5['quote']
val_labels = df5_label = df5['numeric_label']

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}") 

Using device: cuda


In [6]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', do_lower_case=True)
MAX_LENGTH = 365

# Dataset and DataLoader preparation
class QuotesDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

def encode_data(tokenizer, texts, labels, max_length):
    try:
        if isinstance(texts, pd.Series):
            texts = texts.tolist()
        if isinstance(labels, pd.Series):
            labels = labels.tolist()
            
        encodings = tokenizer(texts, truncation=True, padding='max_length', max_length=max_length, return_tensors='pt')
        return QuotesDataset(encodings, labels)

    except Exception as e:
        print(f"Error during tokenization: {e}")
        return None

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [7]:
train_dataset = encode_data(tokenizer, train_texts, train_labels, MAX_LENGTH)
val_dataset = encode_data(tokenizer, val_texts, val_labels, MAX_LENGTH) 

**Objective1: Run a few trials of original model + early stopping + higher epochs max (increase from 5 to 10)**

Optimization: 

In [8]:
def modify_model(model, num_trainable_layers, dropout_rate):
    # Freeze layers: only the last 'num_trainable_layers' are trainable
    total_layers = len(model.distilbert.transformer.layer)
    for layer_index, layer in enumerate(model.distilbert.transformer.layer):
        if layer_index < total_layers - num_trainable_layers:
            for param in layer.parameters():
                param.requires_grad = False

    # Adjust dropout rates in applicable transformer layers
    for layer in model.distilbert.transformer.layer:
        layer.attention.dropout.p = dropout_rate
        layer.ffn.dropout.p = dropout_rate

    return model

In [9]:
def train_one_epoch(model, train_loader, optimizer, device):
    model.train()
    train_loss = 0
    correct_train = 0
    total_train = 0
    for batch in train_loader:
        optimizer.zero_grad()
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        predictions = torch.argmax(outputs.logits, dim=-1)
        correct_train += (predictions == batch['labels']).sum().item()
        total_train += batch['labels'].size(0)
    average_loss = train_loss / len(train_loader)
    accuracy = correct_train / total_train
    return average_loss, accuracy

In [10]:
def validate_model(model, val_loader, device):
    model.eval()
    val_loss = 0
    correct_val = 0
    total_val = 0
    all_predictions = []
    all_true_labels = []
    with torch.no_grad():
        for batch in val_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            val_loss += outputs.loss.item()
            predictions = torch.argmax(outputs.logits, dim=-1)
            all_predictions.extend(predictions.cpu().numpy())
            all_true_labels.extend(batch['labels'].cpu().numpy())
            correct_val += (predictions == batch['labels']).sum().item()
            total_val += batch['labels'].size(0)
    average_val_loss = val_loss / len(val_loader)
    accuracy = correct_val / total_val
    return average_val_loss, accuracy, all_predictions, all_true_labels

In [13]:
def objective(trial):
    # Suggest hyperparameters
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-3, log=True)
    num_trainable_layers = trial.suggest_int('num_trainable_layers', 1, 6)
    dropout_rate = trial.suggest_float('dropout_rate', 0.1, 0.5)
    batch_size = trial.suggest_categorical('batch_size', [16, 32, 64])
    step_size = trial.suggest_int('step_size', 1, 10)
    gamma = trial.suggest_float('gamma', 0.1, 0.9)
    epochs = trial.suggest_int('epochs', 2, 10) 

    # Early stopping criteria
    patience = 3
    min_delta = 0.001
    best_val_accuracy = 0
    no_improve_epochs = 0
    
    # Model setup and modification
    model_config = DistilBertConfig.from_pretrained('distilbert-base-uncased', num_labels=8)
    model = DistilBertForSequenceClassification(model_config)
    model = modify_model(model, num_trainable_layers, dropout_rate)
    model.to(device)

    # Optimizer and scheduler
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    scheduler = lr_scheduler.StepLR(optimizer, step_size=step_size, gamma=gamma)

    # Training and validation
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    
    val_accuracies = []

    for epoch in range(epochs):
        train_loss, train_accuracy = train_one_epoch(model, train_loader, optimizer, device)
        val_loss, val_accuracy, _, _ = validate_model(model, val_loader, device)
        scheduler.step()

        if val_accuracy > best_val_accuracy + min_delta:
            best_val_accuracy = val_accuracy
            no_improve_epochs = 0
        else:
            no_improve_epochs += 1

        if no_improve_epochs >= patience:
            print(f"Stopping early at epoch {epoch+1}")
            break
    
    # for epoch in range(epochs):
    #     train_loss, train_accuracy = train_one_epoch(model, train_loader, optimizer, device)
    #     val_loss, val_accuracy, all_predictions, all_true_labels = validate_model(model, val_loader, device)
    #     scheduler.step()

    #     # Collect metrics
    #     val_accuracies.append(val_accuracy)

    # file_path = f"/kaggle/working/output_{trial.number}.pth"
    # torch.save(model.state_dict(), file_path)

    # Store the best or last validation accuracy
    # best_val_accuracy = max(val_accuracies)  # or you could use val_accuracies[-1] for the last

    return best_val_accuracy

In [14]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=5)  # Adjust the number of trials as needed

print("Best trial:")
print(study.best_trial.params)

[I 2025-01-28 16:45:24,144] A new study created in memory with name: no-name-83b9c072-a99a-4d9c-8b83-5a61d6bb641e
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
[I 2025-01-28 17:30:02,491] Trial 0 finished with value: 0.2947454844006568 and parameters: {'learning_rate': 0.0003786550610492913, 'num_trainable_layers': 4, 'dropout_rate': 0.32010594371594264, 'batch_size': 16, 'step_size': 8, 'gamma': 0.33452792460926456, 'epochs': 10}. Best is trial 0 with value: 0.2947454844006568.


Stopping early at epoch 8


[I 2025-01-28 17:40:43,969] Trial 1 finished with value: 0.12643678160919541 and parameters: {'learning_rate': 0.000638182315640936, 'num_trainable_layers': 3, 'dropout_rate': 0.1336002179576891, 'batch_size': 64, 'step_size': 5, 'gamma': 0.6350661291473785, 'epochs': 2}. Best is trial 0 with value: 0.2947454844006568.
[I 2025-01-28 18:10:15,157] Trial 2 finished with value: 0.5566502463054187 and parameters: {'learning_rate': 2.05713683503022e-05, 'num_trainable_layers': 5, 'dropout_rate': 0.41690115957387186, 'batch_size': 16, 'step_size': 9, 'gamma': 0.40762971238383405, 'epochs': 5}. Best is trial 2 with value: 0.5566502463054187.
[I 2025-01-28 18:39:30,692] Trial 3 finished with value: 0.5665024630541872 and parameters: {'learning_rate': 0.00015533133816326115, 'num_trainable_layers': 1, 'dropout_rate': 0.143048848905019, 'batch_size': 16, 'step_size': 7, 'gamma': 0.5469729079801496, 'epochs': 8}. Best is trial 3 with value: 0.5665024630541872.


Stopping early at epoch 6


[I 2025-01-28 19:03:49,407] Trial 4 finished with value: 0.5796387520525451 and parameters: {'learning_rate': 0.00011601732158263483, 'num_trainable_layers': 1, 'dropout_rate': 0.25521230309869525, 'batch_size': 32, 'step_size': 9, 'gamma': 0.10224710801514512, 'epochs': 7}. Best is trial 4 with value: 0.5796387520525451.


Stopping early at epoch 5
Best trial:
{'learning_rate': 0.00011601732158263483, 'num_trainable_layers': 1, 'dropout_rate': 0.25521230309869525, 'batch_size': 32, 'step_size': 9, 'gamma': 0.10224710801514512, 'epochs': 7}


**Objective2: Run a few trials of original model + gradient clipping**

In [15]:
def train_one_epoch_clipping(model, train_loader, optimizer, grad_clip, device):
    model.train()
    train_loss = 0
    correct_train = 0
    total_train = 0
    for batch in train_loader:
        optimizer.zero_grad()
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
        optimizer.step()
        train_loss += loss.item()
        predictions = torch.argmax(outputs.logits, dim=-1)
        correct_train += (predictions == batch['labels']).sum().item()
        total_train += batch['labels'].size(0)
    average_loss = train_loss / len(train_loader)
    accuracy = correct_train / total_train
    return average_loss, accuracy

In [18]:
def objective(trial):
    # Suggest hyperparameters
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-3, log=True)
    num_trainable_layers = trial.suggest_int('num_trainable_layers', 1, 6)
    dropout_rate = trial.suggest_float('dropout_rate', 0.1, 0.5)
    batch_size = trial.suggest_categorical('batch_size', [16, 32, 64])
    step_size = trial.suggest_int('step_size', 1, 10)
    gamma = trial.suggest_float('gamma', 0.1, 0.9)
    epochs = trial.suggest_int('epochs', 2, 5) 
    grad_clip = 1.0

    # Model setup and modification
    model_config = DistilBertConfig.from_pretrained('distilbert-base-uncased', num_labels=8)
    model = DistilBertForSequenceClassification(model_config)
    model = modify_model(model, num_trainable_layers, dropout_rate)
    model.to(device)

    # Optimizer and scheduler
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    scheduler = lr_scheduler.StepLR(optimizer, step_size=step_size, gamma=gamma)

    # Training and validation
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    
    val_accuracies = []

    for epoch in range(epochs):
        train_loss, train_accuracy = train_one_epoch_clipping(model, train_loader, optimizer, grad_clip, device)
        val_loss, val_accuracy, all_predictions, all_true_labels = validate_model(model, val_loader, device)
        scheduler.step()

        # Collect metrics
        val_accuracies.append(val_accuracy)

    best_val_accuracy = max(val_accuracies)

    return best_val_accuracy

In [19]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=5)  # Adjust the number of trials as needed

print("Best trial:")
print(study.best_trial.params)

[I 2025-01-28 19:04:27,405] A new study created in memory with name: no-name-4fce4f5c-7b3b-4912-898c-9e423bba40c2
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
[I 2025-01-28 19:19:55,118] Trial 0 finished with value: 0.5279146141215106 and parameters: {'learning_rate': 0.00023971978599227062, 'num_trainable_layers': 2, 'dropout_rate': 0.4892850221795769, 'batch_size': 16, 'step_size': 2, 'gamma': 0.11922139670221972, 'epochs': 3}. Best is trial 0 with value: 0.5279146141215106.
[I 2025-01-28 19:43:10,357] Trial 1 finished with value: 0.5188834154351396 and parameters: {'learning_rate': 0.00019622431960104585, 'num_trainable_layers': 5, 'dropout_rate': 0.13237435636268813, 'batch_size': 64, 'step_size': 3, 'gamma': 0.8425947552179673, 'epochs': 4}. Best is trial 0 with value: 0.5279146141215106.
[I 2025-01-28 19:58:30,561] Trial 2 finished with value: 0.26929392446633826 and parameters: {'learning_rate': 0.0004739541634733061, 'num_trainable_layers': 2, '

Best trial:
{'learning_rate': 0.00023971978599227062, 'num_trainable_layers': 2, 'dropout_rate': 0.4892850221795769, 'batch_size': 16, 'step_size': 2, 'gamma': 0.11922139670221972, 'epochs': 3}


**Objective 3: Run a few trials of original model + switching training / testing**

This aproach is aiming to increase the hyperparameter's generalizability. However, due to the limit time left, this will be left to the end of the project as a stretch goal. 