In [1]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import re
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score

- Add a column to turn the label column to numerical 

In [2]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [3]:
def extract_number(label):
    match = re.match(r'(\d+)_', label)
    if match:
        return int(match.group(1))  
    return None

- source of data: https://huggingface.co/datasets/QuotaClimat/frugalaichallenge-text-train

In [4]:
df = pd.read_parquet('../input/train-parquet')
df['numeric_label'] = df['label'].apply(extract_number)
# print(df.head())

In [5]:
train_texts, test_texts, train_labels, test_labels = train_test_split(df['quote'], df['numeric_label'], test_size=0.2, random_state=42)

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


- Distilbert should be less energy consuming, it has less params 
- Lower case so less params 

In [7]:
# Initialize the BERT tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', do_lower_case=True)

# Function to tokenize data
def tokenize_data(texts, labels):
    encodings = tokenizer(texts.tolist(), padding=True, truncation=True, max_length=367, return_tensors="pt")
    dataset = CustomTextDataset(encodings, labels.tolist())
    return dataset

# Custom Dataset class
class CustomTextDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = [int(label) for label in labels]

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

# Prepare datasets
train_dataset = tokenize_data(train_texts, train_labels)
test_dataset = tokenize_data(test_texts, test_labels)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]



In [8]:
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [9]:
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels = 8)
model.to(device)  # Move model to GPU if available
optimizer = AdamW(model.parameters(), lr=5e-5)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
print(device)

cuda


In [11]:
# Training loop
model.train()

for epoch in range(4):  # Train for 4 epochs
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    print(f"Epoch {epoch + 1}, Loss: {loss.item()}")

Epoch 1, Loss: 0.8415148258209229
Epoch 2, Loss: 0.5734939575195312
Epoch 3, Loss: 0.7246766090393066
Epoch 4, Loss: 0.09047771990299225


In [12]:
model.eval()
total_eval_accuracy = 0
total_eval_loss = 0

for batch in test_loader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    loss = outputs.loss
    total_eval_loss += loss.item()

    predictions = torch.argmax(logits, dim=-1)
    accuracy = (predictions == batch['labels']).cpu().numpy().mean() * 100
    total_eval_accuracy += accuracy

avg_test_accuracy = total_eval_accuracy / len(test_loader)
avg_test_loss = total_eval_loss / len(test_loader)

print(f"Test Loss: {avg_test_loss}, Test Accuracy: {avg_test_accuracy}")

Test Loss: 1.1808117994895349, Test Accuracy: 66.07905982905983


In [13]:
# Assuming you have the test_loader set up and the model in evaluation mode
predictions, true_labels = [], []

for batch in test_loader:
    # Move batch to the appropriate device
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    pred_labels = torch.argmax(logits, dim=-1)

    # Collect predictions and true labels
    predictions.extend(pred_labels.cpu().numpy())
    true_labels.extend(batch['labels'].cpu().numpy())

# Now predictions and true_labels are complete lists of all test data


In [14]:
f1 = f1_score(true_labels, predictions, average='weighted')  # Change 'weighted' to 'macro' if needed

print(f"F1 Score: {f1}")

F1 Score: 0.6681854882211213


In [15]:
model1 = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels = 8)
for name, param in model1.named_parameters():
    if 'classifier' not in name:  # Freeze layers that are not part of the classifier
        param.requires_grad = False

model1.to(device)  # Move model to GPU if available
optimizer = AdamW(model1.parameters(), lr=5e-5)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
model1.train()

for epoch in range(4):  # Train for 4 epochs
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model1(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    print(f"Epoch for model1 {epoch + 1}, Loss for model1: {loss.item()}")

Epoch for model1 1, Loss for model1: 1.5037622451782227
Epoch for model1 2, Loss for model1: 1.338724136352539
Epoch for model1 3, Loss for model1: 1.685149073600769
Epoch for model1 4, Loss for model1: 1.957392692565918


In [22]:
model1.eval()
total_eval_accuracy = 0
total_eval_loss = 0

for batch in test_loader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model1(**batch)

    logits = outputs.logits
    loss = outputs.loss
    total_eval_loss += loss.item()

    predictions1 = torch.argmax(logits, dim=-1)
    accuracy = (predictions1 == batch['labels']).cpu().numpy().mean() * 100
    total_eval_accuracy += accuracy

avg_test_accuracy = total_eval_accuracy / len(test_loader)
avg_test_loss = total_eval_loss / len(test_loader)

print(f"Test Loss for model1: {avg_test_loss}, Test Accuracy for model1: {avg_test_accuracy}")

Test Loss for model1: 1.3746979847932472, Test Accuracy for model1: 49.83974358974359


In [23]:
# Assuming you have the test_loader set up and the model in evaluation mode
predictions1, true_labels = [], []

for batch in test_loader:
    # Move batch to the appropriate device
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    pred_labels = torch.argmax(logits, dim=-1)

    # Collect predictions and true labels
    predictions1.extend(pred_labels.cpu().numpy())
    true_labels.extend(batch['labels'].cpu().numpy())

# Now predictions and true_labels are complete lists of all test data


In [24]:
f1 = f1_score(true_labels, predictions1, average='weighted')  # Change 'weighted' to 'macro' if needed

print(f"F1 Score for model1: {f1}")

F1 Score for model1: 0.6681854882211213
