In [None]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import AutoModel
from transformers import DataCollatorWithPadding
from datasets import load_dataset
from torch.utils.data import DataLoader
import pandas as pd
from transformers import AdamW
from transformers import get_scheduler
import torch
from tqdm.auto import tqdm

In [None]:
# Split the single CSV into train/validation: the first 80% of the rows are
# used for training and the remaining 20% are held out for validation.
raw_train_dataset, raw_validation_dataset = (
    load_dataset('csv', data_files='cople_ortho.csv', split=portion)
    for portion in ('train[:80%]', 'train[80%:]')
)

# Distinct values of the 'Proficiency' column over the whole file (both
# splits); their count is what later sizes the classification head.
labels = (
    pd.read_csv('cople_ortho.csv')['Proficiency']
    .unique()
    .tolist()
)
print(labels)

In [None]:
# Pretrained checkpoint shared by the model and its tokenizer.
checkpoint = 'neuralmind/bert-base-portuguese-cased'

# Sequence-classification model with one output per distinct label found in the CSV.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=len(labels),
)

# The checkpoint is cased, so lower-casing is explicitly disabled.
tokenizer = AutoTokenizer.from_pretrained(
    checkpoint,
    do_lower_case=False,
)

In [None]:
# Show the schema and row count of both raw splits, one per line.
print(raw_train_dataset, raw_validation_dataset, sep='\n')

In [None]:
def class_string_to_int(s):
    """Convert a proficiency label string into its integer class id.

    The mapping is ordinal: 'A1' -> 0, 'A2' -> 1, 'B1' -> 2, 'B2' -> 3,
    'C1' -> 4, 'C2' -> 5.

    Args:
        s: Proficiency label, one of 'A1', 'A2', 'B1', 'B2', 'C1', 'C2'.

    Returns:
        int: The class index in the range 0..5.

    Raises:
        ValueError: If ``s`` is not one of the six known labels. Failing here
            avoids silently emitting ``None`` as a training label.
    """
    known_levels = ('A1', 'A2', 'B1', 'B2', 'C1', 'C2')
    try:
        # Position in the ordered tuple doubles as the class id.
        return known_levels.index(s)
    except ValueError:
        raise ValueError(
            f"Unknown proficiency label {s!r}; expected one of {known_levels}"
        ) from None

In [None]:

def tokenize_function(example):
    """Tokenize a batch of texts and attach integer class labels.

    Intended for ``Dataset.map(..., batched=True)``, so every value in
    ``example`` is a list with one entry per row.

    Args:
        example: Batch mapping with a "Text" column (strings to tokenize) and
            a 'Proficiency' column (label strings).

    Returns:
        The tokenizer output for the batch, extended with a 'labels' entry
        holding one integer class id per row.
    """
    encoded = tokenizer(example["Text"], truncation=True)
    # Return the labels as part of the result instead of writing them into the
    # incoming batch, so the new column does not depend on in-place mutation
    # of the function's input.
    encoded['labels'] = [class_string_to_int(proficiency) for proficiency in example['Proficiency']]
    return encoded


# Tokenize both splits in batches; tokenize_function also adds the 'labels' column.
tokenized_train_datasets = raw_train_dataset.map(
    function=tokenize_function,
    batched=True,
)
tokenized_validation_datasets = raw_validation_dataset.map(
    function=tokenize_function,
    batched=True,
)

# Texts are only truncated during tokenization, so padding is left to the
# collator when batches are assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Inspect the columns produced by tokenization for both splits, one per line.
print(tokenized_train_datasets, tokenized_validation_datasets, sep='\n')

In [None]:
# Drop the raw CSV columns and have the remaining ones returned as torch tensors.
tokenized_train_datasets = (
    tokenized_train_datasets
    .remove_columns(["Text", "Id", "Proficiency"])
    .with_format("torch")
)
tokenized_train_datasets.column_names

In [None]:
# Same cleanup for the validation split: drop the raw CSV columns and return torch tensors.
tokenized_validation_datasets = (
    tokenized_validation_datasets
    .remove_columns(["Text", "Id", "Proficiency"])
    .with_format("torch")
)
tokenized_validation_datasets.column_names

In [None]:
# Training batches are reshuffled on every pass; the collator pads each batch.
train_dataloader = DataLoader(
    tokenized_train_datasets,
    batch_size=8,
    shuffle=True,
    collate_fn=data_collator,
)

# Validation batches keep the dataset order (no shuffling).
eval_dataloader = DataLoader(
    tokenized_validation_datasets,
    batch_size=8,
    collate_fn=data_collator,
)

In [None]:
# Pull a single batch from the training loader and show the shape of each tensor in it.
batch = next(iter(train_dataloader))
{name: tensor.shape for name, tensor in batch.items()}

In [None]:
# Sanity check: run the sampled batch through the model once and look at the
# loss and the size of the logits tensor.
outputs = model(**batch)
print(outputs.loss, outputs.logits.size())

In [None]:
# Use PyTorch's AdamW: the AdamW class shipped with transformers is deprecated
# in favour of torch.optim.AdamW. eps and weight_decay are passed explicitly
# to keep the values the deprecated class used by default (1e-6 and 0.0)
# instead of silently switching to PyTorch's defaults.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=5e-5,
    eps=1e-6,
    weight_decay=0.0,
)

In [None]:
# One optimizer step is taken per batch, so the schedule length is
# (batches per epoch) x (number of epochs).
num_epochs = 3
num_training_steps = len(train_dataloader) * num_epochs

# "linear" schedule with no warmup steps, spanning the whole training run.
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

In [None]:
# Train on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
device

In [None]:
# One progress-bar tick per optimizer step across all epochs.
progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        # Move every tensor of the batch onto the same device as the model.
        batch = {name: tensor.to(device) for name, tensor in batch.items()}

        # Forward pass, then backpropagate the loss returned by the model.
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        # Apply the update, advance the learning-rate schedule, and clear the
        # gradients before the next batch.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        progress_bar.update()

In [None]:
import evaluate

# `metric` must exist before the loop below calls metric.add_batch. Plain
# accuracy is used because the model predicts one of several proficiency
# classes; the GLUE/MRPC metric is meant for a binary task.
metric = evaluate.load("accuracy")

model.eval()
for batch in eval_dataloader:
    # Move every tensor of the batch onto the same device as the model.
    batch = {k: v.to(device) for k, v in batch.items()}

    # No gradients are needed for evaluation.
    with torch.no_grad():
        outputs = model(**batch)

    # Predicted class = index of the largest logit for each example.
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

# Aggregate the metric over all validation batches.
metric.compute()