In [2]:
import torch
import numpy as np
import datasets
import pandas as pd
import evaluate
from torch.utils.data import DataLoader
from torch.optim import Adam
from torch import nn
from tqdm import tqdm
from transformers import BertForSequenceClassification
from sklearn.utils.class_weight import compute_class_weight
from transformers import BertTokenizer, AutoTokenizer, DataCollatorWithPadding

In [None]:
# Run on the first CUDA GPU when one is visible to PyTorch; otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Device:", device)

In [1]:
# Slice the single CSV "train" split by percentage: everything up to 90% is used
# for training and the final 10% for validation.
split_instructions = [
    datasets.ReadInstruction("train", to=90, unit="%"),
    datasets.ReadInstruction("train", from_=-10, unit="%"),
]

# The "rating" column is renamed to "label" in both splits; later cells read it under that name.
train_dataset, val_dataset = (
    split.rename_column("rating", "label")
    for split in datasets.load_dataset("csv", data_files="../data/train_data.csv", split=split_instructions)
)

In [None]:
# "balanced" weights from scikit-learn, computed over the training labels only.
train_labels = train_dataset["label"]
balanced_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.unique(train_labels),
    y=train_labels,
)

# Stored as a float32 tensor on the training device so it can be passed to the loss function.
class_weights = torch.tensor(np.array(balanced_weights).astype(np.float32), device=device)

In [None]:
# Show each distinct training label next to the weight assigned to it.
observed_classes = np.unique(train_dataset["label"])
for label_value, weight in zip(observed_classes, class_weights):
    print(label_value, weight)

In [None]:
# Name of the pretrained checkpoint; the tokenizer below is loaded from it.
PRETRAINED_MODEL_NAME = "bert-base-uncased"

tokenizer = BertTokenizer.from_pretrained(pretrained_model_name_or_path=PRETRAINED_MODEL_NAME)


def tokenize_function(data):
    """Tokenize the ``review`` field of ``data`` with the notebook-level ``tokenizer``.

    Truncation is enabled and ``padding="max_length"`` is requested, and the
    tokenizer's output is returned unchanged.
    """
    reviews = data["review"]
    encoded = tokenizer(reviews, truncation=True, padding="max_length")
    return encoded


# Tokenize both splits in batches.
tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_val = val_dataset.map(tokenize_function, batched=True)

# Expose only the columns used by the training / evaluation loops, as torch tensors.
for tokenized_split in (tokenized_train, tokenized_val):
    tokenized_split.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

# Only the training loader shuffles; validation keeps the dataset order.
train_loader = DataLoader(tokenized_train, batch_size=16, shuffle=True)
val_loader = DataLoader(tokenized_val, batch_size=16)

In [None]:
# Sequence-classification head on top of the pretrained checkpoint.
# NOTE(review): num_labels is hard-coded to 5; it has to match the number of
# distinct values in the "label" column (and the length of class_weights) —
# confirm against the data.
model = BertForSequenceClassification.from_pretrained(PRETRAINED_MODEL_NAME, num_labels=5)

# Move the model to its device *before* constructing the optimizer, as the
# torch.optim guidance recommends, so the optimizer is built over parameters
# that already live where they will be used.
model.to(device)
optimizer = Adam(model.parameters(), lr=5e-5)

# Keep the model as the cell's last expression so its architecture is still displayed.
model

In [None]:
num_epochs = 10
# Cross-entropy weighted per class with the weights computed from the training labels.
loss_fun = nn.CrossEntropyLoss(weight=class_weights)

for epoch in range(num_epochs):
    # from_pretrained() hands the model back in evaluation mode (dropout disabled),
    # so training mode has to be enabled explicitly. Doing it at the start of every
    # epoch also keeps this cell correct when it is re-run after the evaluation
    # cell has called model.eval().
    model.train()

    losses = []
    for batch in tqdm(train_loader):
        # Labels are kept out of the model call; the class-weighted loss is computed explicitly below.
        labels = batch["label"].to(device)
        batch = {"attention_mask": batch["attention_mask"].to(device), "input_ids": batch["input_ids"].to(device)}

        outputs = model(**batch)
        loss = loss_fun(outputs.logits, labels)

        loss.backward()
        optimizer.step()
        # Clear gradients right after the update so the next batch starts from zero.
        optimizer.zero_grad()

        losses.append(loss.item())

    # Mean training loss over all batches of this epoch.
    print(np.mean(losses))

In [None]:
metric = evaluate.load("accuracy")
model.eval()

# No gradients are needed while scoring the validation set.
with torch.no_grad():
    for batch in val_loader:
        labels = batch["label"].to(device)
        model_inputs = {key: batch[key].to(device) for key in ("attention_mask", "input_ids")}

        # Predicted class = index of the largest logit for each example.
        predictions = model(**model_inputs).logits.argmax(dim=-1)
        metric.add_batch(predictions=predictions, references=labels)

# Last expression of the cell: displays the aggregated accuracy.
metric.compute()

In [None]:
# Persist only the learned parameters (the state_dict), not the whole module object.
# The destination must be a non-empty file path: an empty string cannot be opened
# for writing, so the save would fail after the whole training run.
# NOTE(review): the file name below is a placeholder default — point it at the
# project's preferred checkpoint location.
MODEL_SAVE_PATH = "bert_review_rating_classifier.pt"
torch.save(model.state_dict(), MODEL_SAVE_PATH)