1. Create a baseline model 

In [1]:
import os
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, DistilBertConfig
from sklearn.metrics import f1_score, confusion_matrix, balanced_accuracy_score, precision_score, recall_score, accuracy_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from torch.optim import AdamW, lr_scheduler
import shutil
import zipfile

In [2]:
learning_rate = 8.509924985836568e-05 
num_trainable_layers = 4 
dropout_rate = 0.19553005296446951 
batch_size = 16 
epochs = 5 
step_size = 4 
gamma = 0.8523421613311146 

In [3]:
df_balanced1 = pd.read_csv("/kaggle/input/balancedfull/df_balanced1.csv")
df_balanced2 = pd.read_csv("/kaggle/input/balancedfull/df_balanced2.csv")
df_balanced3 = pd.read_csv("/kaggle/input/balancedfull/df_balanced3.csv")
df_balanced4 = pd.read_csv("/kaggle/input/balancedfull/df_balanced4.csv")
df_balanced5 = pd.read_csv("/kaggle/input/balancedfull/df_balanced5.csv")

df_balanced1_text = df_balanced1['quote']
df_balanced2_text = df_balanced2['quote']
df_balanced3_text = df_balanced3['quote']
df_balanced4_text = df_balanced4['quote']
df_balanced5_text = df_balanced5['quote']
df_balanced1_label = df_balanced1['numeric_label']
df_balanced2_label = df_balanced2['numeric_label']
df_balanced3_label = df_balanced3['numeric_label']
df_balanced4_label = df_balanced4['numeric_label']
df_balanced5_label = df_balanced5['numeric_label']

text_combined = pd.concat([df_balanced1_text, df_balanced2_text, df_balanced3_text, df_balanced4_text, df_balanced5_text], ignore_index=True)
label_combined = pd.concat([df_balanced1_label, df_balanced2_label, df_balanced3_label, df_balanced4_label, df_balanced5_label], ignore_index = True) 

In [4]:
df = pd.read_parquet("/kaggle/input/test-parquet/test-00000-of-00001.parquet")
df['label_int'] = df['label'].str.split("_").str[0].astype('int')

texts = df["quote"].to_list()
labels = df["label_int"].to_list()

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}") 

Using device: cuda


In [6]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', do_lower_case=True)
MAX_LENGTH = 365

# Dataset and DataLoader preparation
class QuotesDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

def encode_data(tokenizer, texts, labels, max_length):
    try:
        if isinstance(texts, pd.Series):
            texts = texts.tolist()
        if isinstance(labels, pd.Series):
            labels = labels.tolist()
            
        encodings = tokenizer(texts, truncation=True, padding='max_length', max_length=max_length, return_tensors='pt')
        return QuotesDataset(encodings, labels)

    except Exception as e:
        print(f"Error during tokenization: {e}")
        return None

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [7]:
train_dataset = encode_data(tokenizer, text_combined, label_combined, MAX_LENGTH)
val_dataset = encode_data(tokenizer, texts, labels, MAX_LENGTH)

In [8]:
def setup_model_for_hyperopt(num_trainable_layers, dropout_rate):
    config = DistilBertConfig.from_pretrained(
        'distilbert-base-uncased',
        num_labels=8,
        dropout=dropout_rate,
        attention_dropout=dropout_rate
    )
    model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', config=config)
    for name, param in model.named_parameters():
        param.requires_grad = False
    for layer_idx in range(6 - num_trainable_layers, 6):
        for name, param in model.distilbert.transformer.layer[layer_idx].named_parameters():
            param.requires_grad = True
    for name, param in model.classifier.named_parameters():
        param.requires_grad = True
    return model

In [9]:
def train_one_epoch(model, train_loader, optimizer, device):
    model.train()
    train_loss = 0
    correct_train = 0
    total_train = 0
    for batch in train_loader:
        optimizer.zero_grad()
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        predictions = torch.argmax(outputs.logits, dim=-1)
        correct_train += (predictions == batch['labels']).sum().item()
        total_train += batch['labels'].size(0)
    average_loss = train_loss / len(train_loader)
    accuracy = correct_train / total_train
    return average_loss, accuracy

In [10]:
def validate_model(model, val_loader, device):
    model.eval()
    val_loss = 0
    correct_val = 0
    total_val = 0
    all_predictions = []
    all_true_labels = []
    with torch.no_grad():
        for batch in val_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            val_loss += outputs.loss.item()
            predictions = torch.argmax(outputs.logits, dim=-1)
            all_predictions.extend(predictions.cpu().numpy())
            all_true_labels.extend(batch['labels'].cpu().numpy())
            correct_val += (predictions == batch['labels']).sum().item()
            total_val += batch['labels'].size(0)
    average_val_loss = val_loss / len(val_loader)
    accuracy = correct_val / total_val
    return average_val_loss, accuracy, all_predictions, all_true_labels

In [11]:
model = setup_model_for_hyperopt(num_trainable_layers, dropout_rate)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.19553005296446951, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.19553005296446951, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout)

In [12]:
optimizer = AdamW(model.parameters(), lr= learning_rate)
scheduler = lr_scheduler.StepLR(optimizer, step_size= step_size, gamma= gamma)

In [13]:
train_loader = DataLoader(train_dataset, batch_size= batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size= batch_size, shuffle=False) 
val_accuracies = []

for epoch in range(epochs):
    train_loss, train_accuracy = train_one_epoch(model, train_loader, optimizer, device)
    val_loss, val_accuracy, all_predictions, all_true_labels = validate_model(model, val_loader, device)
    scheduler.step()

    val_accuracies.append(val_accuracy)
    print(val_accuracy)

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


0.7957342083675144
0.8802296964725185
0.8974569319114027
0.9401148482362592
0.9794913863822805


2. Save the moodel so don't have to retrain it. 

In [14]:
model_save_path = '/kaggle/working/distilbert_trained.pth'
torch.save(model.state_dict(), model_save_path)
print(f'Model saved to {model_save_path}')

Model saved to /kaggle/working/distilbert_trained.pth
