In [1]:
pip install nlpaug

Collecting nlpaug
  Downloading nlpaug-1.1.11-py3-none-any.whl.metadata (14 kB)
Downloading nlpaug-1.1.11-py3-none-any.whl (410 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.5/410.5 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hInstalling collected packages: nlpaug
Successfully installed nlpaug-1.1.11
Note: you may need to restart the kernel to use updated packages.


In [2]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import re
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score
import nlpaug.augmenter.word as naw
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

- Source of data: https://huggingface.co/datasets/QuotaClimat/frugalaichallenge-text-train

In [7]:
# Read the four training CSV files from the Kaggle input directory into DataFrames.
train1, train2, train3, train4 = map(
    pd.read_csv,
    (
        '/kaggle/input/balanced/train1.csv',
        '/kaggle/input/balanced/train2.csv',
        '/kaggle/input/balanced/train3.csv',
        '/kaggle/input/balanced/train4.csv',
    ),
)

In [5]:
# Use the GPU when PyTorch reports one as available; otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(f"Using device: {device}")

Using device: cuda


- DistilBERT should consume less energy, since it has fewer parameters.
- The uncased (lower-casing) variant is used, with the aim of having fewer parameters.

**Split data into texts and labels**

In [8]:
# For each training frame, pull out the text column ('quote') and the
# label column ('numeric_label') as separate Series.
train1_texts, train1_labels = train1['quote'], train1['numeric_label']
train2_texts, train2_labels = train2['quote'], train2['numeric_label']
train3_texts, train3_labels = train3['quote'], train3['numeric_label']
train4_texts, train4_labels = train4['quote'], train4['numeric_label']

**Tokenize** 

In [9]:
# Initialize the DistilBERT tokenizer from the 'distilbert-base-uncased' checkpoint
# (the model loaded later uses the same checkpoint, so vocabulary and model match).
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', do_lower_case=True)

# Function to tokenize data
def tokenize_data(texts, labels, max_length=367):
    """Tokenize `texts` and pair the encodings with `labels` in a CustomTextDataset.

    Args:
        texts: pandas Series or list of strings to tokenize.
        labels: pandas Series or list of labels, one per text; each must be
            convertible with int().
        max_length: maximum number of tokens kept per text; longer texts are
            truncated. Defaults to 367, the value previously hard-coded here.

    Returns:
        CustomTextDataset wrapping the padded encodings and the labels.

    Raises:
        ValueError: if `texts` and `labels` have different lengths.
        RuntimeError: if the tokenizer call fails (original error is chained).
    """
    if isinstance(texts, pd.Series):
        texts = texts.tolist()
    if isinstance(labels, pd.Series):
        labels = labels.tolist()

    # A length mismatch would otherwise only surface later as an IndexError
    # (or silently dropped samples) while iterating the dataset.
    if len(texts) != len(labels):
        raise ValueError(
            f"texts and labels must have the same length, got {len(texts)} and {len(labels)}"
        )

    try:
        encodings = tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors="pt"
        )
    except Exception as e:
        # Fail loudly here: returning None would only defer the failure to the
        # DataLoader construction, with a much less helpful error message.
        raise RuntimeError(f"Error during tokenization: {e}") from e

    return CustomTextDataset(encodings, labels)
# Custom Dataset class
class CustomTextDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    `encodings` is a mapping from field name to an indexable container holding
    one row per sample; `labels` is an iterable of values convertible to int.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        # Normalise every label to a plain Python int up front.
        self.labels = list(map(int, labels))

    def __len__(self):
        # The number of samples is defined by the number of labels.
        return len(self.labels)

    def __getitem__(self, idx):
        # Take row `idx` from every encoding field, then attach the label
        # as a long tensor under the 'labels' key.
        sample = {}
        for field_name, field_values in self.encodings.items():
            sample[field_name] = field_values[idx]
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [10]:
# Tokenize each (texts, labels) pair into its own dataset.
train1_dataset, train2_dataset, train3_dataset, train4_dataset = (
    tokenize_data(part_texts, part_labels)
    for part_texts, part_labels in (
        (train1_texts, train1_labels),
        (train2_texts, train2_labels),
        (train3_texts, train3_labels),
        (train4_texts, train4_labels),
    )
)

In [18]:
# Wrap every dataset in a shuffling DataLoader that yields batches of 32 samples.
train1_loader, train2_loader, train3_loader, train4_loader = (
    DataLoader(part_dataset, batch_size=32, shuffle=True)
    for part_dataset in (train1_dataset, train2_dataset, train3_dataset, train4_dataset)
)

In [24]:
# Load pretrained DistilBERT with an 8-class sequence-classification head
# and move it to the selected device.
model1 = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels = 8)
model1.to(device)
# Use PyTorch's own AdamW: the AdamW class shipped with transformers is
# deprecated in favour of it. eps and weight_decay are set explicitly to the
# defaults of the transformers implementation so the optimizer configuration
# stays the same as before.
optimizer1 = torch.optim.AdamW(model1.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# Sanity check: show the device that the model was moved to.
print(device)

cuda


Step 1: train on train1 and validate on train2

In [25]:
# Put the model into training mode before fine-tuning on train1.
model1.train()

for epoch in range(2):  # Two passes over train1
    # Running totals, reset at the start of every epoch.
    total_loss, total_correct, total_examples = 0, 0, 0

    for batch in train1_loader:
        # Move every tensor of the batch onto the training device.
        batch = {name: tensor.to(device) for name, tensor in batch.items()}

        # Forward pass; the batch contains 'labels', so the model output carries a loss.
        outputs = model1(**batch)
        loss = outputs.loss
        logits = outputs.logits

        # Backward pass, parameter update, then clear gradients for the next batch.
        loss.backward()
        optimizer1.step()
        optimizer1.zero_grad()

        # Accumulate the batch loss.
        total_loss += loss.item()

        # Accumulate the number of correct predictions and of examples seen.
        predictions = logits.argmax(dim=-1)
        total_correct += predictions.eq(batch['labels']).sum().item()
        total_examples += predictions.shape[0]

    # Epoch summary: mean of the per-batch losses, accuracy over all examples (in percent).
    avg_loss = total_loss / len(train1_loader)
    avg_accuracy = 100 * total_correct / total_examples

    print(f"Epoch {epoch + 1}, Loss: {avg_loss:.2f}, Accuracy: {avg_accuracy:.2f}%")

Epoch 1, Loss: 1.52, Accuracy: 47.01%
Epoch 2, Loss: 0.71, Accuracy: 78.58%


In [26]:
# Evaluate model1 on train2 (used as the validation set for step 1).
model1.eval()

total1_eval_loss = 0
total1_correct = 0
total1_examples = 0
predictions1, true_labels1 = [], []

# One pass over the validation loader collects everything that is needed:
# the loss, the correct/total counts for accuracy, and the predictions and
# labels for the F1 score. This avoids running inference on the set twice.
for batch in train2_loader:
    # Move batch to the appropriate device
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model1(**batch)

    logits = outputs.logits
    loss = outputs.loss
    total1_eval_loss += loss.item()

    pred_labels = torch.argmax(logits, dim=-1)

    # Count correct predictions per example instead of averaging per-batch
    # percentages: a smaller final batch would otherwise get the same weight
    # as a full one. This matches how accuracy is computed in the training cell.
    total1_correct += (pred_labels == batch['labels']).sum().item()
    total1_examples += pred_labels.size(0)

    # Collect predictions and true labels
    predictions1.extend(pred_labels.cpu().numpy())
    true_labels1.extend(batch['labels'].cpu().numpy())

avg1_test_accuracy = 100 * total1_correct / total1_examples
# Mean of the per-batch losses, the same convention as in the training cell.
avg1_test_loss = total1_eval_loss / len(train2_loader)

print(f"Test Loss: {avg1_test_loss}, Test Accuracy: {avg1_test_accuracy}")

f1 = f1_score(true_labels1, predictions1, average='weighted')  # Change 'weighted' to 'macro' if needed

print(f"F1 Score: {f1}")

Test Loss: 1.3220378246652074, Test Accuracy: 54.66867469879518
F1 Score: 0.5439709223591096


Step 2: train on train1+train2, validate on train3

In [27]:
# Stack train1 and train2 (texts and labels separately) with a fresh 0..n-1 index,
# then tokenize the combined set and wrap it in a shuffling loader.
train12_texts = pd.concat((train1_texts, train2_texts), ignore_index=True)
train12_labels = pd.concat((train1_labels, train2_labels), ignore_index=True)

train12_dataset = tokenize_data(train12_texts, train12_labels)
train12_loader = DataLoader(
    train12_dataset,
    batch_size=32,
    shuffle=True,
)

In [28]:
# Put the model back into training mode and continue fine-tuning on train1+train2.
model1.train()

for epoch in range(2):  # Two passes over the combined set
    # Running totals, reset at the start of every epoch.
    total_loss, total_correct, total_examples = 0, 0, 0

    for batch in train12_loader:
        # Move every tensor of the batch onto the training device.
        batch = {name: tensor.to(device) for name, tensor in batch.items()}

        # Forward pass; the batch contains 'labels', so the model output carries a loss.
        outputs = model1(**batch)
        loss = outputs.loss
        logits = outputs.logits

        # Backward pass, parameter update, then clear gradients for the next batch.
        loss.backward()
        optimizer1.step()
        optimizer1.zero_grad()

        # Accumulate the batch loss.
        total_loss += loss.item()

        # Accumulate the number of correct predictions and of examples seen.
        predictions = logits.argmax(dim=-1)
        total_correct += predictions.eq(batch['labels']).sum().item()
        total_examples += predictions.shape[0]

    # Epoch summary: mean of the per-batch losses, accuracy over all examples (in percent).
    avg_loss = total_loss / len(train12_loader)
    avg_accuracy = 100 * total_correct / total_examples

    print(f"Epoch {epoch + 1}, Loss: {avg_loss:.2f}, Accuracy: {avg_accuracy:.2f}%")

Epoch 1, Loss: 0.68, Accuracy: 78.05%
Epoch 2, Loss: 0.26, Accuracy: 92.11%


In [31]:
# Evaluate model1 on train3 (used as the validation set for step 2).
model1.eval()

total2_eval_loss = 0
total2_correct = 0
total2_examples = 0
predictions2, true_labels2 = [], []

# One pass over the validation loader collects everything that is needed:
# the loss, the correct/total counts for accuracy, and the predictions and
# labels for the F1 score. This avoids running inference on the set twice.
for batch in train3_loader:
    # Move batch to the appropriate device
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model1(**batch)

    logits = outputs.logits
    loss = outputs.loss
    total2_eval_loss += loss.item()

    pred_labels = torch.argmax(logits, dim=-1)

    # Count correct predictions per example instead of averaging per-batch
    # percentages: a smaller final batch would otherwise get the same weight
    # as a full one. This matches how accuracy is computed in the training cell.
    total2_correct += (pred_labels == batch['labels']).sum().item()
    total2_examples += pred_labels.size(0)

    # Collect predictions and true labels
    predictions2.extend(pred_labels.cpu().numpy())
    true_labels2.extend(batch['labels'].cpu().numpy())

avg2_test_accuracy = 100 * total2_correct / total2_examples
# Mean of the per-batch losses, the same convention as in the training cell.
avg2_test_loss = total2_eval_loss / len(train3_loader)

print(f"Test Loss: {avg2_test_loss}, Test Accuracy: {avg2_test_accuracy}")

f12 = f1_score(true_labels2, predictions2, average='weighted')  # Change 'weighted' to 'macro' if needed

print(f"F1 Score: {f12}")

Test Loss: 0.9829255719500852, Test Accuracy: 73.26807228915662
F1 Score: 0.733328190754836


Step 3: train on train1+train2+train3, validate on train4 

Step 4: hyperparameter optimization (done only at this stage, since it can be resource-intensive)