# In [None]:
import os
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, DistilBertConfig
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from torch.optim import AdamW, lr_scheduler
import optuna

# Kaggle setup: make sure the writable output folder exists before anything is saved
output_dir = "/kaggle/working/output"
os.makedirs(output_dir, exist_ok=True)

# Load the training data; the integer class id is the part of `label` before the first "_"
df = pd.read_parquet("/kaggle/input/climatetext/train.parquet")
df['label_int'] = df['label'].str.split("_").str.get(0).astype('int')

# Lookup table: integer class id -> original label string
label_dict = (
    df[['label_int', 'label']]
    .drop_duplicates()
    .set_index('label_int')['label']
    .to_dict()
)

tokenizer = DistilBertTokenizer.from_pretrained(
    'distilbert-base-uncased',
    do_lower_case=True,
)

class QuotesDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with integer class labels.

    Args:
        encodings: Mapping from feature name to a per-example indexable
            container. Values may be tensors (as produced by a tokenizer
            called with ``return_tensors='pt'``) or nested Python lists.
        labels: Sequence of integer class labels, one per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return the features and label of example ``idx`` as a dict of tensors.

        The label is stored under the ``'labels'`` key as a ``torch.long``
        tensor; every other key comes from ``self.encodings``.
        """
        item = {}
        for key, values in self.encodings.items():
            row = values[idx]
            if isinstance(row, torch.Tensor):
                # Calling ``torch.tensor`` on an existing tensor emits a
                # copy-construct UserWarning for every item; copy explicitly.
                item[key] = row.detach().clone()
            else:
                item[key] = torch.tensor(row)
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        """Return the number of examples (one per label)."""
        return len(self.labels)

def encode_data(tokenizer, texts, labels, max_length):
    """Tokenize ``texts`` and wrap the result with ``labels`` in a QuotesDataset.

    The tokenizer is called with truncation enabled, ``'max_length'`` padding
    at ``max_length`` tokens, and PyTorch tensors requested as output.
    """
    tokenizer_options = {
        'truncation': True,
        'padding': 'max_length',
        'max_length': max_length,
        'return_tensors': 'pt',
    }
    encoded = tokenizer(texts, **tokenizer_options)
    return QuotesDataset(encoded, labels)

# Plain Python lists of the quote texts and their integer class ids
texts = df["quote"].tolist()
labels = df["label_int"].tolist()

# 80/20 split, stratified on the labels, with a fixed seed
X_train, X_val, y_train, y_val = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

def _freeze_all_but_last_layers(model, num_trainable_layers):
    """Freeze ``model.distilbert`` except its last ``num_trainable_layers`` blocks.

    ``model.classifier`` is explicitly marked trainable. Parameters that live
    outside ``model.distilbert`` are not frozen by this helper.
    """
    for param in model.distilbert.parameters():
        param.requires_grad = False

    layers = model.distilbert.transformer.layer
    first_trainable = max(len(layers) - num_trainable_layers, 0)
    for layer in layers[first_trainable:]:
        for param in layer.parameters():
            param.requires_grad = True

    for param in model.classifier.parameters():
        param.requires_grad = True


def _weighted_f1(model, data_loader, device):
    """Return the weighted F1 score of ``model``'s argmax predictions on ``data_loader``."""
    model.eval()
    predictions, targets = [], []
    with torch.no_grad():
        for batch in data_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            logits = model(**batch).logits
            predictions.extend(torch.argmax(logits, dim=-1).cpu().numpy())
            targets.extend(batch['labels'].cpu().numpy())
    return f1_score(targets, predictions, average='weighted')


def objective(trial):
    """Optuna objective: briefly fine-tune DistilBERT and score it on validation.

    Samples the hyperparameters from ``trial``, tokenizes the module-level
    train/validation split at the sampled ``max_length``, trains for 3 epochs
    and evaluates after each one.

    Args:
        trial: The ``optuna`` trial used to sample hyperparameters.

    Returns:
        The best weighted validation F1 score observed across the epochs.
    """
    # Hyperparameters to optimize. ``suggest_float`` replaces the deprecated
    # ``suggest_loguniform`` / ``suggest_uniform`` with the same distributions.
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 5e-4, log=True)
    dropout_rate = trial.suggest_float('dropout_rate', 0.1, 0.5)
    num_trainable_layers = trial.suggest_int('num_trainable_layers', 1, 6)
    train_batch_size = trial.suggest_categorical('train_batch_size', [16, 32, 64])
    grad_clip = trial.suggest_float('grad_clip', 0.5, 5.0)
    step_size = trial.suggest_int('step_size', 5, 10)
    gamma = trial.suggest_float('gamma', 0.7, 0.9)
    max_length = trial.suggest_int('max_length', 128, 512)

    # Datasets are rebuilt per trial because the padded length is a hyperparameter.
    train_dataset = encode_data(tokenizer, X_train, y_train, max_length)
    val_dataset = encode_data(tokenizer, X_val, y_val, max_length)

    train_loader = DataLoader(train_dataset, batch_size=train_batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=train_batch_size, shuffle=False)

    # Model configuration
    config = DistilBertConfig.from_pretrained(
        'distilbert-base-uncased',
        num_labels=len(label_dict),
        dropout=dropout_rate,
        attention_dropout=dropout_rate,
    )
    model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', config=config)

    _freeze_all_but_last_layers(model, num_trainable_layers)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Only hand the optimizer (and the gradient clipper) parameters that can
    # actually receive gradients.
    trainable_params = [p for p in model.parameters() if p.requires_grad]
    optimizer = AdamW(trainable_params, lr=learning_rate)
    # NOTE(review): the scheduler is stepped once per epoch and only 3 epochs
    # run, so with step_size >= 5 the StepLR decay never triggers; as written,
    # `step_size` and `gamma` cannot influence the score. The search space is
    # kept unchanged here -- consider narrowing it or stepping more often.
    scheduler = lr_scheduler.StepLR(optimizer, step_size=step_size, gamma=gamma)

    # Training loop (only 3 epochs for HPO efficiency)
    epochs = 3
    best_val_f1 = 0.0
    for _ in range(epochs):
        model.train()
        for batch in train_loader:
            optimizer.zero_grad()
            batch = {k: v.to(device) for k, v in batch.items()}
            loss = model(**batch).loss
            loss.backward()
            torch.nn.utils.clip_grad_norm_(trainable_params, grad_clip)
            optimizer.step()
        scheduler.step()

        # Validation after every epoch; keep the best score seen so far.
        val_f1 = _weighted_f1(model, val_loader, device)
        best_val_f1 = max(best_val_f1, val_f1)

    return best_val_f1

# Hyperparameter search: maximise the score returned by `objective`
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)  # raise or lower the trial budget as needed

# Report the winning configuration and persist it in the output folder
print("Best Hyperparameters:", study.best_params)
with open(f"{output_dir}/best_hyperparameters.txt", "w") as f:
    f.write(f"{study.best_params}")
