In [1]:
!pip install nlpaug
!pip install optuna

Collecting nlpaug
  Downloading nlpaug-1.1.11-py3-none-any.whl.metadata (14 kB)
Downloading nlpaug-1.1.11-py3-none-any.whl (410 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.5/410.5 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: nlpaug
Successfully installed nlpaug-1.1.11


In [2]:
import os
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, DistilBertConfig
from sklearn.metrics import f1_score, confusion_matrix, balanced_accuracy_score, precision_score, recall_score, accuracy_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from torch.optim import AdamW, lr_scheduler
import nlpaug.augmenter.word as naw
import optuna
import shutil
import zipfile

In [3]:
# Ensure output directory exists
output_dir = "/kaggle/working/output"
os.makedirs(output_dir, exist_ok=True)

In [4]:
# Load and prepare data
# original
df = pd.read_parquet("/kaggle/input/train-parquet")
df['label_int'] = df['label'].str.split("_").str[0].astype('int')

texts = df["quote"].to_list()
labels = df["label_int"].to_list()

X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42, stratify=labels)

In [5]:
# augmented 
train1 = pd.read_csv('/kaggle/input/balanced/train1.csv')
train2 = pd.read_csv('/kaggle/input/balanced/train2.csv')
train3 = pd.read_csv('/kaggle/input/balanced/train3.csv')
train4 = pd.read_csv('/kaggle/input/balanced/train4.csv')

datasets = [train1, train2, train3, train4]

# Extract quotes and labels using list comprehension
texts = [ds['quote'] for ds in datasets]
labels = [ds['numeric_label'] for ds in datasets]

# Concatenate all texts and labels using pandas.concat
train1234_texts = pd.concat(texts, ignore_index=True)
train1234_labels = pd.concat(labels, ignore_index=True)

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [7]:
# Create dictionary for label names
label_dict = df[['label_int', 'label']].drop_duplicates().set_index('label_int')['label'].to_dict()

In [8]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', do_lower_case=True)
MAX_LENGTH = 365

# Dataset and DataLoader preparation
class QuotesDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

def encode_data(tokenizer, texts, labels, max_length):
    try:
        if isinstance(texts, pd.Series):
            texts = texts.tolist()
        if isinstance(labels, pd.Series):
            labels = labels.tolist()
            
        encodings = tokenizer(texts, truncation=True, padding='max_length', max_length=max_length, return_tensors='pt')
        return QuotesDataset(encodings, labels)

    except Exception as e:
        print(f"Error during tokenization: {e}")
        return None

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [10]:
train_dataset = encode_data(tokenizer, X_train, y_train, MAX_LENGTH)
train1234_dataset = encode_data(tokenizer, train1234_texts, train1234_labels, MAX_LENGTH)
val_dataset = encode_data(tokenizer, X_test, y_test, MAX_LENGTH)

# train_loader = DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True)
# train1234_loader = DataLoader(train1234_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True)
# val_loader = DataLoader(val_dataset, batch_size=VAL_BATCH_SIZE, shuffle=False)

- Above is the basic data setup. Now train + hyperparam tune the model
- Start with defining some functions to be used in objective

In [11]:
# Changes the layers freezed and drop out rate 
def modify_model(model, num_trainable_layers, dropout_rate):
    # Freeze layers: only the last 'num_trainable_layers' are trainable
    total_layers = len(model.distilbert.transformer.layer)
    for layer_index, layer in enumerate(model.distilbert.transformer.layer):
        if layer_index < total_layers - num_trainable_layers:
            for param in layer.parameters():
                param.requires_grad = False

    # Adjust dropout rates in applicable transformer layers
    for layer in model.distilbert.transformer.layer:
        layer.attention.dropout.p = dropout_rate
        layer.ffn.dropout.p = dropout_rate

    return model

In [12]:
# Model training 
def train_one_epoch(model, train_loader, optimizer, device):
    model.train()
    train_loss = 0
    correct_train = 0
    total_train = 0
    for batch in train_loader:
        optimizer.zero_grad()
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        predictions = torch.argmax(outputs.logits, dim=-1)
        correct_train += (predictions == batch['labels']).sum().item()
        total_train += batch['labels'].size(0)
    average_loss = train_loss / len(train_loader)
    accuracy = correct_train / total_train
    return average_loss, accuracy

In [13]:
# Model validation 
def validate_model(model, val_loader, device):
    model.eval()
    val_loss = 0
    correct_val = 0
    total_val = 0
    all_predictions = []
    all_true_labels = []
    with torch.no_grad():
        for batch in val_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            val_loss += outputs.loss.item()
            predictions = torch.argmax(outputs.logits, dim=-1)
            all_predictions.extend(predictions.cpu().numpy())
            all_true_labels.extend(batch['labels'].cpu().numpy())
            correct_val += (predictions == batch['labels']).sum().item()
            total_val += batch['labels'].size(0)
    average_val_loss = val_loss / len(val_loader)
    accuracy = correct_val / total_val
    return average_val_loss, accuracy, all_predictions, all_true_labels

In [14]:
# Hyperparam tune
def objective(trial):
    # Suggest hyperparameters
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-3, log=True)
    num_trainable_layers = trial.suggest_int('num_trainable_layers', 1, 6)
    dropout_rate = trial.suggest_float('dropout_rate', 0.1, 0.5)
    batch_size = trial.suggest_categorical('batch_size', [16, 32, 64])
    step_size = trial.suggest_int('step_size', 1, 10)
    gamma = trial.suggest_float('gamma', 0.1, 0.9)
    epochs = trial.suggest_int('epochs', 2, 5)  # Allowing optimization of number of epochs

    # Model setup and modification
    model_config = DistilBertConfig.from_pretrained('distilbert-base-uncased', num_labels=8)
    model = DistilBertForSequenceClassification(model_config)
    model = modify_model(model, num_trainable_layers, dropout_rate)
    model.to(device)

    # Optimizer and scheduler
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    scheduler = lr_scheduler.StepLR(optimizer, step_size=step_size, gamma=gamma)

    # Training and validation
    train_loader = DataLoader(train1234_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    
    val_accuracies = []
    for epoch in range(epochs):
        train_loss, train_accuracy = train_one_epoch(model, train_loader, optimizer, device)
        val_loss, val_accuracy, all_predictions, all_true_labels = validate_model(model, val_loader, device)
        scheduler.step()

        # Collect metrics
        val_accuracies.append(val_accuracy)

    file_path = f"/kaggle/working/output_{trial.number}.pth"
    torch.save(model.state_dict(), file_path)

    # Store the best or last validation accuracy
    best_val_accuracy = max(val_accuracies)  # or you could use val_accuracies[-1] for the last

    return best_val_accuracy

In [None]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)  # Adjust the number of trials as needed

print("Best trial:")
print(study.best_trial.params)

[I 2025-01-24 07:58:27,168] A new study created in memory with name: no-name-bbcb4c2c-82ec-4197-affb-e21df8ff774f
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
[I 2025-01-24 08:12:47,827] Trial 0 finished with value: 0.6333059885151764 and parameters: {'learning_rate': 0.0009534888137944198, 'num_trainable_layers': 1, 'dropout_rate': 0.20731170379209174, 'batch_size': 64, 'step_size': 9, 'gamma': 0.6609460758555095, 'epochs': 3}. Best is trial 0 with value: 0.6333059885151764.
[I 2025-01-24 08:27:56,886] Trial 1 finished with value: 0.8859721082854799 and parameters: {'learning_rate': 0.0001524517384759535, 'num_trainable_layers': 2, 'dropout_rate': 0.355217129984266, 'batch_size': 32, 'step_size': 5, 'gamma': 0.772201387432277, 'epochs': 3}. Best is trial 1 with value: 0.8859721082854799.


For comparism, run the below on augmented data. 

In [None]:
# def objective1(trial):
#     # Suggest hyperparameters
#     learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-3, log=True)
#     num_trainable_layers = trial.suggest_int('num_trainable_layers', 1, 6)
#     dropout_rate = trial.suggest_float('dropout_rate', 0.1, 0.5)
#     batch_size = trial.suggest_categorical('batch_size', [16, 32, 64])
#     step_size = trial.suggest_int('step_size', 1, 10)
#     gamma = trial.suggest_float('gamma', 0.1, 0.9)
#     epochs = trial.suggest_int('epochs', 2, 5)  # Allowing optimization of number of epochs

#     # Model setup and modification
#     model_config = DistilBertConfig.from_pretrained('distilbert-base-uncased', num_labels=8)
#     model = DistilBertForSequenceClassification(model_config)
#     model = modify_model(model, num_trainable_layers, dropout_rate)
#     model.to(device)

#     # Optimizer and scheduler
#     optimizer = AdamW(model.parameters(), lr=learning_rate)
#     scheduler = lr_scheduler.StepLR(optimizer, step_size=step_size, gamma=gamma)

#     # Training and validation
#     train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
#     val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    
#     val_accuracies = []
#     for epoch in range(epochs):
#         train_loss, train_accuracy = train_one_epoch(model, train_loader, optimizer, device)
#         val_loss, val_accuracy, all_predictions, all_true_labels = validate_model(model, val_loader, device)
#         scheduler.step()

#         # Collect metrics
#         val_accuracies.append(val_accuracy)

#     file_path = f"/kaggle/working/output1_{trial.number}.pth"
#     torch.save(model.state_dict(), file_path)

#     # Store the best or last validation accuracy
#     best_val_accuracy = max(val_accuracies)  # or you could use val_accuracies[-1] for the last

#     return best_val_accuracy

In [None]:
# study1 = optuna.create_study(direction='maximize')
# study1.optimize(objective1, n_trials=50)  # Adjust the number of trials as needed

# print("Best trial:")
# print(study1.best_trial.params)

Run the blow code once have best hyperparams 

In [None]:
# store this in csv. Try not to re-run the code above. 
# study.best_trial.params

In [None]:
# Training params
# MAX_LENGTH = 365


# TRAIN_BATCH_SIZE = 16
# VAL_BATCH_SIZE = 64
# LEARNING_RATE = 1e-5
# STEP_SIZE = 2
# GAMMA = 0.1
# EPOCHS = 2

In [None]:
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=8)
# model.to(device)
# optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
# scheduler = lr_scheduler.StepLR(optimizer, step_size=STEP_SIZE, gamma=GAMMA)

In [None]:
# # Plotting function for accuracy
# def plot_accuracies(training_accuracies, validation_accuracies):
#     plt.figure(figsize=(8, 6))
#     plt.plot(training_accuracies, label='Training Accuracy')
#     plt.plot(validation_accuracies, label='Validation Accuracy')
#     plt.title('Training and Validation Accuracy')
#     plt.xlabel('Epoch')
#     plt.ylabel('Accuracy')
#     plt.legend()
#     plt.savefig(f"{output_dir}/accuracy_plot.png")
#     plt.close()

# def plot_confusion_matrix(cm, class_labels, epoch, output_dir):
#     plt.figure(figsize=(10, 8))
#     sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_labels, yticklabels=class_labels)
#     plt.title(f'Confusion Matrix for Epoch {epoch + 1}')
#     plt.ylabel('True Label')
#     plt.xlabel('Predicted Label')
#     plt.savefig(os.path.join(output_dir, f'confusion_matrix_epoch_{epoch + 1}.png'))
#     plt.close()

In [None]:
# # Training and validation loop
# train_losses = []
# val_losses = []
# train_accuracies = []
# val_accuracies = []
# metrics_df = pd.DataFrame()

# for epoch in range(EPOCHS):
#     model.train()
#     train_loss = 0
#     correct_train = 0
#     total_train = 0

#     for batch in train_loader:
#         optimizer.zero_grad()
#         batch = {k: v.to(device) for k, v in batch.items()}
#         outputs = model(**batch)
#         loss = outputs.loss
#         loss.backward()
#         optimizer.step()
#         train_loss += loss.item()

#         predictions = torch.argmax(outputs.logits, dim=-1)
#         correct_train += (predictions == batch['labels']).sum().item()
#         total_train += batch['labels'].size(0)

#     scheduler.step()
#     train_losses.append(train_loss / len(train_loader))
#     train_accuracies.append(correct_train / total_train)

#     model.eval()
#     val_loss = 0
#     correct_val = 0
#     total_val = 0
#     all_predictions, all_true_labels = [], []

#     with torch.no_grad():
#         for batch in val_loader:
#             batch = {k: v.to(device) for k, v in batch.items()}
#             outputs = model(**batch)
#             val_loss += outputs.loss.item()
#             predictions = torch.argmax(outputs.logits, dim=-1)
#             all_predictions.extend(predictions.cpu().numpy())
#             all_true_labels.extend(batch['labels'].cpu().numpy())

#             correct_val += (predictions == batch['labels']).sum().item()
#             total_val += batch['labels'].size(0)

#     val_losses.append(val_loss / len(val_loader))
#     val_accuracies.append(correct_val / total_val)

#     # Compute confusion matrix
#     cm = confusion_matrix(all_true_labels, all_predictions)
#     class_labels = [label_dict.get(i, f'Class {i}') for i in range(len(np.unique(all_true_labels)))]
#     plot_confusion_matrix(cm, class_labels, epoch, output_dir)

#     # Calculate metrics
#     balanced_acc = balanced_accuracy_score(all_true_labels, all_predictions)
#     average_f1 = f1_score(all_true_labels, all_predictions, average='macro')
#     weighted_f1 = f1_score(all_true_labels, all_predictions, average='weighted')
#     f1_scores_per_class = f1_score(all_true_labels, all_predictions, average=None)
#     precision_per_class = precision_score(all_true_labels, all_predictions, average=None, zero_division=0)
#     recall_per_class = recall_score(all_true_labels, all_predictions, average=None, zero_division=0)

#     # Append metrics to DataFrame with class labels
#     epoch_metrics = {
#         "Epoch": epoch + 1,
#         "Train Loss": train_losses[-1],
#         "Validation Loss": val_losses[-1],
#         "Train Accuracy": train_accuracies[-1],
#         "Validation Accuracy": val_accuracies[-1],
#         "Balanced Accuracy": balanced_acc,
#         "Average F1": average_f1,
#         "Weighted F1": weighted_f1
#     }
#     epoch_metrics.update({f"{label_dict[i]} F1": f1_scores_per_class[i] for i in range(len(f1_scores_per_class))})
#     epoch_metrics.update({f"{label_dict[i]} Precision": precision_per_class[i] for i in range(len(precision_per_class))})
#     epoch_metrics.update({f"{label_dict[i]} Recall": recall_per_class[i] for i in range(len(recall_per_class))})

#     metrics_df = pd.concat([metrics_df, pd.DataFrame([epoch_metrics])], ignore_index=True)

# # Save metrics to CSV and plot accuracies
# metrics_df.to_csv(f"{output_dir}/training_metrics.csv", index=False)
# plot_accuracies(train_accuracies, val_accuracies)

# # Print overall model accuracy
# overall_accuracy = accuracy_score(y_test, all_predictions)
# print(f"Overall Model Accuracy: {overall_accuracy:.4f}")

In [None]:
# def zip_directory(folder_path, output_path):
#     """Zip the contents of an entire directory and save the archive to the specified output path."""
#     with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
#         for root, dirs, files in os.walk(folder_path):
#             for file in files:
#                 # Create a relative path for files to preserve the directory structure
#                 zipf.write(os.path.join(root, file),
#                            os.path.relpath(os.path.join(root, file),
#                                            os.path.join(folder_path, '..')))

In [None]:
# # Directory to be zipped
# input_dir = '/kaggle/working/output'

# # Output path for the zip file
# zip_file_path = '/kaggle/working/output.zip'

# # Creating the ZIP file
# zip_directory(input_dir, zip_file_path)

# print(f"Created zip file at: {zip_file_path}")

In [None]:
# study = optuna.create_study(direction='minimize')
# study.optimize(objective, n_trials=20)  # Adjust the number of trials as needed

# print("Best trial:")
# print(study.best_trial.params)