In [4]:
import torch
import numpy as np
import datasets
import pandas as pd
import evaluate
from torch.utils.data import DataLoader
from torch.optim import Adam
from torch import nn
from tqdm import tqdm
from transformers import BertForSequenceClassification
from sklearn.utils.class_weight import compute_class_weight
from transformers import BertTokenizer, AutoTokenizer, DataCollatorWithPadding
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt

In [5]:
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
print("Device:", device)

Device: cuda:0


In [6]:
train_dataset, val_dataset = datasets.load_dataset(
    "csv",
    data_files="../data/train_data.csv",
    split=[datasets.ReadInstruction("train", to=90, unit="%"), datasets.ReadInstruction("train", from_=-10, unit="%")],
)
train_dataset, val_dataset = train_dataset.rename_column("rating", "label"), val_dataset.rename_column(
    "rating", "label"
)

In [7]:
class_weights = compute_class_weight(
    class_weight="balanced", classes=np.unique(train_dataset["label"]), y=train_dataset["label"]
)
class_weights = torch.tensor(np.array(class_weights).astype("float32")).to(device)

In [8]:
for class_idx, class_weight in zip(np.unique(train_dataset["label"]), class_weights):
    print(class_idx, class_weight)

0 tensor(2.9041, device='cuda:0')
1 tensor(2.2662, device='cuda:0')
2 tensor(1.8866, device='cuda:0')
3 tensor(0.6817, device='cuda:0')
4 tensor(0.4510, device='cuda:0')


In [9]:
PRETRAINED_MODEL_NAME = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)


def tokenize_function(data):
    return tokenizer(data["review"], padding="max_length", truncation=True)


tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_val = val_dataset.map(tokenize_function, batched=True)

tokenized_train.set_format("torch", columns=["input_ids", "attention_mask", "label"])
tokenized_val.set_format("torch", columns=["input_ids", "attention_mask", "label"])

train_loader = DataLoader(tokenized_train, shuffle=True, batch_size=16)
val_loader = DataLoader(tokenized_val, batch_size=16)

In [10]:
model = BertForSequenceClassification.from_pretrained(PRETRAINED_MODEL_NAME, num_labels=5)
optimizer = Adam(model.parameters(), lr=5e-5)
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [11]:
num_epochs = 10
loss_fun = nn.CrossEntropyLoss(class_weights)

for epoch in range(num_epochs):
    losses = []
    for batch in tqdm(train_loader):
        labels = batch["label"].to(device)
        batch = {"attention_mask": batch["attention_mask"].to(device), "input_ids": batch["input_ids"].to(device)}
        outputs = model(**batch)
        loss = loss_fun(outputs.logits, labels)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        losses.append(loss.item())
    print(np.mean(losses))

100%|██████████| 923/923 [18:09<00:00,  1.18s/it]


1.038550151937832


100%|██████████| 923/923 [17:57<00:00,  1.17s/it]


0.7651179581396241


100%|██████████| 923/923 [17:46<00:00,  1.16s/it]


0.5639060639321223


100%|██████████| 923/923 [17:46<00:00,  1.16s/it]


0.3892068857991037


100%|██████████| 923/923 [17:46<00:00,  1.16s/it]


0.2650824172517137


100%|██████████| 923/923 [18:07<00:00,  1.18s/it]


0.16949297182508144


100%|██████████| 923/923 [18:04<00:00,  1.18s/it]


0.12687728242173726


100%|██████████| 923/923 [17:49<00:00,  1.16s/it]


0.09915844894207021


100%|██████████| 923/923 [17:49<00:00,  1.16s/it]


0.0902167347812004


100%|██████████| 923/923 [17:49<00:00,  1.16s/it]

0.07389660193438552





In [12]:
metric = evaluate.load("accuracy")
model.eval()
for batch in val_loader:
    labels = batch["label"].to(device)
    batch = {
        "attention_mask": batch["attention_mask"].to(device),
        "input_ids": batch["input_ids"].to(device),
    }  # , "token_type_ids":batch['token_type_ids'].to(device)}

    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=labels)

metric.compute()

{'accuracy': 0.5771812080536913}

In [13]:
import evaluate

accuracy_metric = evaluate.load('accuracy')
precision_metric = evaluate.load('precision')
recall_metric = evaluate.load('recall')
f1_metric = evaluate.load('f1')

model.eval()
for batch in val_loader:
    labels = batch["label"].to(device)
    batch = {
        "attention_mask": batch["attention_mask"].to(device),
        "input_ids": batch["input_ids"].to(device),
    }  # , "token_type_ids":batch['token_type_ids'].to(device)}

    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    
    accuracy_metric.add_batch(predictions=predictions, references=labels)
    precision_metric.add_batch(predictions=predictions, references=labels)
    recall_metric.add_batch(predictions=predictions, references=labels)
    f1_metric.add_batch(predictions=predictions, references=labels)


accuracy = accuracy_metric.compute()
f1 = f1_metric.compute()
precision = precision_metric.compute()
recall = recall_metric.compute()

Downloading builder script: 100%|██████████| 7.55k/7.55k [00:00<00:00, 14.8MB/s]


AttributeError: module 'evaluate' has no attribute 'jsonld'

In [None]:
torch.load(model.state_dict(), "bert_model2.pth")

In [1]:
preds = []
references = []
for batch in val_loader:
    labels = batch["label"].to(device)
    batch = {
        "attention_mask": batch["attention_mask"].to(device),
        "input_ids": batch["input_ids"].to(device),
    }  # , "token_type_ids":batch['token_type_ids'].to(device)}

    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    preds.extend((logits.argmax(-1).tolist()))
    references.extend(labels.tolist())

confusion_matrix = evaluate.confusion_matrix(predictions, references)
ConfusionMatrixDisplay(confusion_matrix).plot()
plt.show()

NameError: name 'val_loader' is not defined

In [None]:
mca_eval = MulticlassAccuracy(num_classes=5, average=None)
mca_average = MulticlassAccuracy(num_classes=5, average='micro')

In [None]:
mca_eval(torch.tensor(preds), torch.tensor(references))

In [None]:
mca_average(torch.tensor(preds), torch.tensor(references))


In [None]:
torch.save(model.state_dict(), "bert_model2.pth")