In [1]:
!pip install optuna



In [2]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn.utils.prune as prune
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW, lr_scheduler
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, DistilBertConfig
from sklearn.metrics import f1_score, confusion_matrix, balanced_accuracy_score, precision_score, recall_score, accuracy_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import optuna
import shutil
import zipfile

In [3]:
# Ensure output directory exists
output_dir = "/kaggle/working/output"
os.makedirs(output_dir, exist_ok=True)

In [4]:
df_balanced1 = pd.read_csv("/kaggle/input/balancedfull/df_balanced1.csv")
df_balanced2 = pd.read_csv("/kaggle/input/balancedfull/df_balanced2.csv")
df_balanced3 = pd.read_csv("/kaggle/input/balancedfull/df_balanced3.csv")
df_balanced4 = pd.read_csv("/kaggle/input/balancedfull/df_balanced4.csv")
df_balanced5 = pd.read_csv("/kaggle/input/balancedfull/df_balanced5.csv")

In [5]:
df_balanced1_text = df_balanced1['quote']
df_balanced2_text = df_balanced2['quote']
df_balanced3_text = df_balanced3['quote']
df_balanced4_text = df_balanced4['quote']
df_balanced5_text = df_balanced5['quote']
df_balanced1_label = df_balanced1['numeric_label']
df_balanced2_label = df_balanced2['numeric_label']
df_balanced3_label = df_balanced3['numeric_label']
df_balanced4_label = df_balanced4['numeric_label']
df_balanced5_label = df_balanced5['numeric_label']

In [6]:
text_combined = pd.concat([df_balanced1_text, df_balanced2_text, df_balanced3_text, df_balanced4_text, df_balanced5_text], ignore_index=True)
label_combined = pd.concat([df_balanced1_label, df_balanced2_label, df_balanced3_label, df_balanced4_label, df_balanced5_label], ignore_index = True) 

In [7]:
# Load and prepare data
# test
df = pd.read_parquet("/kaggle/input/test-parquet/test-00000-of-00001.parquet")
df['label_int'] = df['label'].str.split("_").str[0].astype('int')

texts = df["quote"].to_list()
labels = df["label_int"].to_list()

In [8]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [9]:
# Create dictionary for label names
# label_dict = df1[['numeric_label', 'label']].drop_duplicates().set_index('numeric_label')['label'].to_dict()

In [10]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', do_lower_case=True)
MAX_LENGTH = 365

# Dataset and DataLoader preparation
class QuotesDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

def encode_data(tokenizer, texts, labels, max_length):
    try:
        if isinstance(texts, pd.Series):
            texts = texts.tolist()
        if isinstance(labels, pd.Series):
            labels = labels.tolist()
            
        encodings = tokenizer(texts, truncation=True, padding='max_length', max_length=max_length, return_tensors='pt')
        return QuotesDataset(encodings, labels)

    except Exception as e:
        print(f"Error during tokenization: {e}")
        return None

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [11]:
train_dataset = encode_data(tokenizer, text_combined, label_combined, MAX_LENGTH)
val_dataset = encode_data(tokenizer, texts, labels, MAX_LENGTH)

- Above is the basic data setup. Now train + hyperparam tune the model
- Start with defining some functions to be used in objective

In [12]:
def setup_model_for_hyperopt(num_trainable_layers, dropout_rate):
    config = DistilBertConfig.from_pretrained(
        'distilbert-base-uncased',
        num_labels=8,
        dropout=dropout_rate,
        attention_dropout=dropout_rate
    )
    model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', config=config)
    for name, param in model.named_parameters():
        param.requires_grad = False
    for layer_idx in range(6 - num_trainable_layers, 6):
        for name, param in model.distilbert.transformer.layer[layer_idx].named_parameters():
            param.requires_grad = True
    for name, param in model.classifier.named_parameters():
        param.requires_grad = True
    return model

In [13]:
# Model training 
def train_one_epoch(model, train_loader, optimizer, device):
    model.train()
    train_loss = 0
    correct_train = 0
    total_train = 0
    for batch in train_loader:
        optimizer.zero_grad()
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        predictions = torch.argmax(outputs.logits, dim=-1)
        correct_train += (predictions == batch['labels']).sum().item()
        total_train += batch['labels'].size(0)
    average_loss = train_loss / len(train_loader)
    accuracy = correct_train / total_train
    return average_loss, accuracy

In [14]:
# Model validation 
def validate_model(model, val_loader, device):
    model.eval()
    val_loss = 0
    correct_val = 0
    total_val = 0
    all_predictions = []
    all_true_labels = []
    with torch.no_grad():
        for batch in val_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            val_loss += outputs.loss.item()
            predictions = torch.argmax(outputs.logits, dim=-1)
            all_predictions.extend(predictions.cpu().numpy())
            all_true_labels.extend(batch['labels'].cpu().numpy())
            correct_val += (predictions == batch['labels']).sum().item()
            total_val += batch['labels'].size(0)
    average_val_loss = val_loss / len(val_loader)
    accuracy = correct_val / total_val
    return average_val_loss, accuracy, all_predictions, all_true_labels

In [15]:
def objective(trial):
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-3, log=True)
    num_trainable_layers = trial.suggest_int('num_trainable_layers', 1, 6)
    dropout_rate = trial.suggest_float('dropout_rate', 0.1, 0.5)
    batch_size = trial.suggest_categorical('batch_size', [16, 32, 64])
    epochs = trial.suggest_int('epochs', 2, 5)
    step_size = trial.suggest_int('step_size', 1, 10)
    gamma = trial.suggest_float('gamma', 0.1, 0.9)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    model = setup_model_for_hyperopt(num_trainable_layers, dropout_rate)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    optimizer = AdamW(model.parameters(), lr=learning_rate)
    scheduler = lr_scheduler.StepLR(optimizer, step_size=step_size, gamma=gamma)

    val_accuracies = []
    for epoch in range(epochs):
        train_loss, train_accuracy = train_one_epoch(model, train_loader, optimizer, device)
        val_loss, val_accuracy, all_predictions, all_true_labels = validate_model(model, val_loader, device)
        scheduler.step()
        val_accuracies.append(val_accuracy)

    best_val_accuracy = max(val_accuracies)
    return best_val_accuracy


In [16]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)  # Adjust the number of trials as needed

print("Best trial:")
print(study.best_trial.params)

[I 2025-01-28 23:15:02,554] A new study created in memory with name: no-name-40d3678b-8040-48e1-a8be-b779813a3cc4


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
[I 2025-01-28 23:27:36,002] Trial 0 finished with value: 0.8424938474159147 and parameters: {'learning_rate': 6.29611775347733e-05, 'num_trainable_layers': 2, 'dropout_rate': 0.18975583316602163, 'batch_size': 64, 'epochs': 3, 'step_size': 3, 'gamma': 0.13679076288910175}. Best is trial 0 with value: 0.8424938474159147.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You s

Best trial:
{'learning_rate': 8.509924985836568e-05, 'num_trainable_layers': 4, 'dropout_rate': 0.19553005296446951, 'batch_size': 16, 'epochs': 5, 'step_size': 4, 'gamma': 0.8523421613311146}


For comparism, run the below on augmented data. 

Run the blow code once have best hyperparams 