In [2]:
import matplotlib.pyplot as plt
from datasets import load_dataset
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    get_linear_schedule_with_warmup,
)
import torch
from torch import nn
from torch.utils.data import DataLoader
import torchmetrics
from torchinfo import summary

In [3]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [4]:
dataset = load_dataset("quotaclimat/frugalaichallenge-text-train")

In [5]:
tokenizer = AutoTokenizer.from_pretrained("distilroberta-base")


def tokenize_quotes(example):
    return tokenizer(
        example["quote"],
        truncation=True,
        padding=True,
        max_length=512,
        return_tensors=None,
    )


def label_to_int(example):
    label_to_int = {
        "0_not_relevant": 0,
        "1_not_happening": 1,
        "2_not_human": 2,
        "3_not_bad": 3,
        "4_solutions_harmful_unnecessary": 4,
        "5_science_unreliable": 5,
        "6_proponents_biased": 6,
        "7_fossil_fuels_needed": 7,
    }
    example["label"] = label_to_int[example["label"]]
    return example


train_ds = dataset["train"].remove_columns(
    ["source", "url", "language", "subsource", "id", "__index_level_0__"]
)
test_ds = dataset["test"].remove_columns(
    ["source", "url", "language", "subsource", "id", "__index_level_0__"]
)

train_ds = train_ds.map(tokenize_quotes).remove_columns(["quote"])
test_ds = test_ds.map(tokenize_quotes).remove_columns(["quote"])

train_ds = train_ds.map(label_to_int)
test_ds = test_ds.map(label_to_int)

Map:   0%|          | 0/1219 [00:00<?, ? examples/s]

Map:   0%|          | 0/1219 [00:00<?, ? examples/s]

In [6]:
model = AutoModelForSequenceClassification.from_pretrained(
    "distilroberta-base", num_labels=8
)
model

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
           

In [None]:
summary(
    model=model,
    input_size=(32, 512, 512)
    col_names=["input_size", "output_size", "num_params", "trainable"],
    col_width=20,
    row_settings=["var_names"],
)

In [36]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [11]:
BATCH_SIZE = 32
train_dataloader = DataLoader(
    train_ds, batch_size=BATCH_SIZE, shuffle=True, collate_fn=data_collator
)

test_dataloader = DataLoader(
    test_ds, batch_size=BATCH_SIZE, shuffle=False, collate_fn=data_collator
)

print(f"Dataloaders: {train_dataloader, test_dataloader}")
print(f"Length of train dataloader: {len(train_dataloader)} batches of {BATCH_SIZE}")
print(f"Length of test dataloader: {len(test_dataloader)} batches of {BATCH_SIZE}")

Dataloaders: (<torch.utils.data.dataloader.DataLoader object at 0x74933c3c9040>, <torch.utils.data.dataloader.DataLoader object at 0x7493972bbb30>)
Length of train dataloader: 153 batches of 32
Length of test dataloader: 39 batches of 32


In [40]:
# training setup
EPOCHS = 10
RANDOM_SEED = 42
torch.manual_seed(RANDOM_SEED)
model = model.to(device)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
total_steps = len(train_dataloader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=int(0.1 * total_steps), num_training_steps=total_steps
)
accuracy_fn = torchmetrics.Accuracy(task="multiclass", num_classes=8).to(device)

In [38]:
datum = next(iter(train_dataloader))

summary(
    model=model,
    input_data=(datum["input_ids"].to(device), datum["attention_mask"].to(device)),
    col_names=["input_size", "output_size", "num_params", "trainable"],
    col_width=20,
    row_settings=["var_names"],
)

Layer (type (var_name))                                                Input Shape          Output Shape         Param #              Trainable
RobertaForSequenceClassification (RobertaForSequenceClassification)    [32, 145]            [32, 8]              --                   True
├─RobertaModel (roberta)                                               [32, 145]            [32, 145, 768]       --                   True
│    └─RobertaEmbeddings (embeddings)                                  --                   [32, 145, 768]       --                   True
│    │    └─Embedding (word_embeddings)                                [32, 145]            [32, 145, 768]       38,603,520           True
│    │    └─Embedding (token_type_embeddings)                          [32, 145]            [32, 145, 768]       768                  True
│    │    └─Embedding (position_embeddings)                            [32, 145]            [32, 145, 768]       394,752              True
│    │    └─LayerNorm 

In [None]:
for _ in tqdm(range(EPOCHS)):
    # train step
    train_loss = 0

    for batch, components in enumerate(train_dataloader):
        model.train()

        # move batch to device and extract components
        input_ids = components["input_ids"].to(device)
        attention_mask = components["attention_mask"].to(device)
        labels = components["labels"].to(device)

        # forward pass
        y_logits = model(input_ids, attention_mask).logits
        y_preds = torch.softmax(y_logits, dim=1)
        predicted_labels = torch.argmax(y_preds, dim=1)
        
        # calculate loss
        loss = loss_fn(y_logits, labels)
        train_loss += loss

        # zero out gradients, loss backward, step optimizer
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()

    # avg loss per epoch
    train_loss /= len(train_dataloader)

    # test step
    test_loss, test_acc = 0, 0

    model.eval()
    with torch.inference_mode():
        for batch in test_dataloader:
            # move batch to device and extract components
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = bath["labels"].to(device)
            
            # forward pass
            test_logits = model(input_ids, attention_mask=attention_mask).logits
            test_preds = torch.softmax(test_logits, dim=1)
            test_labels = torch.argmax(test_preds, dim=1)

            # calculate loss & accumulate
            test_loss += loss_fn(test_preds, labels)

            # calculate accuracy
            test_acc += accuracy_fn(labels, test_labels)

        # Divide total test loss by length of test dataloader (per batch)
        test_loss /= len(test_dataloader)

        # Divide total accuracy by length of test dataloader (per batch)
        test_acc /= len(test_dataloader)

    # Print out what's happening
    print(
        f"Train loss: {train_loss:.3f} | Test loss: {test_loss:.3f}, Test acc: {test_acc:.3f}%"
    )

  0%|          | 0/10 [00:00<?, ?it/s]

Train loss: 1.740 | Test loss: 1.791, Test acc: 0.595%
Train loss: 0.985 | Test loss: 1.681, Test acc: 0.665%
Train loss: 0.710 | Test loss: 1.621, Test acc: 0.711%
Train loss: 0.518 | Test loss: 1.597, Test acc: 0.706%
Train loss: 0.365 | Test loss: 1.585, Test acc: 0.720%
Train loss: 0.254 | Test loss: 1.580, Test acc: 0.714%
Train loss: 0.181 | Test loss: 1.570, Test acc: 0.729%
Train loss: 0.133 | Test loss: 1.570, Test acc: 0.723%
Train loss: 0.111 | Test loss: 1.567, Test acc: 0.720%
Train loss: 0.089 | Test loss: 1.565, Test acc: 0.724%


In [42]:
torch.save(model.state_dict(), "distilroberta_climate_classifier.pt")

In [44]:
model = AutoModelForSequenceClassification.from_pretrained(
    "distilroberta-base", num_labels=8
)
state_dict = torch.load(("./distilroberta_climate_classifier.pt"))
model.load_state_dict(state_dict)
model.to(device)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
           

In [50]:
from sklearn.metrics import classification_report

labels = [
    "0_not_relevant",
    "1_not_happening",
    "2_not_human",
    "3_not_bad",
    "4_solutions_harmful_unnecessary",
    "5_science_unreliable",
    "6_proponents_biased",
    "7_fossil_fuels_needed",
]

model.eval()
all_preds = []
all_labels = []

with torch.inference_mode():
    for batch in test_dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        test_labels = batch["labels"].to(device)

        logits = model(input_ids, attention_mask).logits
        preds = torch.softmax(logits, dim=1)
        pred_labels = torch.argmax(preds, dim=1)

        all_preds.extend(pred_labels.cpu().numpy())
        all_labels.extend(test_labels.cpu().numpy())

print(classification_report(all_labels, all_preds, labels=range(8), target_names=labels))


                                 precision    recall  f1-score   support

                 0_not_relevant       0.79      0.80      0.79       307
                1_not_happening       0.75      0.79      0.77       154
                    2_not_human       0.65      0.68      0.66       137
                      3_not_bad       0.71      0.72      0.71        97
4_solutions_harmful_unnecessary       0.68      0.70      0.69       160
           5_science_unreliable       0.64      0.65      0.64       160
            6_proponents_biased       0.74      0.65      0.69       139
          7_fossil_fuels_needed       0.68      0.60      0.64        65

                       accuracy                           0.72      1219
                      macro avg       0.71      0.70      0.70      1219
                   weighted avg       0.72      0.72      0.72      1219

