In [1]:
from collections import defaultdict

import nltk
import numpy as np
import pandas as pd
import torch
import transformers

from utils import Model, TweetsDataset

# Fix the global RNG seeds before any model weights are created so that
# repeated "Restart & Run All" executions start from the same random state.
# NOTE(review): whether the train/valid/test split and the data loaders in
# `utils` draw from these global generators is not visible here — confirm.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

# Use the first CUDA GPU when one is available; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [9]:
# Tokenizer is taken from the Newtral XLM-R checkpoint (identified by its
# Hugging Face repository id).
tokenizer_checkpoint = "Newtral/xlm-r-finetuned-toxic-political-tweets-es"
tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer_checkpoint)

# Load the dataset from a CSV addressed relative to the notebook's directory.
# NOTE(review): presumably the preprocessed tweets consumed by TweetsDataset — confirm.
dataset_csv = "../../data/procesed.csv"
df = pd.read_csv(dataset_csv)

In [3]:
# Model card of the checkpoint the tokenizer is loaded from:
# https://huggingface.co/Newtral/xlm-r-finetuned-toxic-political-tweets-es

In [4]:
# Build a sequence classifier on top of the "xlm-roberta-base" weights with a
# 4-output head and the multi-label problem type.
# NOTE(review): the tokenizer is loaded from the Newtral checkpoint and the
# weights are later saved as 'xlm_newtral_pretrained.bin', but the model itself
# starts from "xlm-roberta-base" — confirm this is the intended starting point.
head_options = {
    "num_labels": 4,
    "problem_type": "multi_label_classification",
}
model = transformers.XLMRobertaForSequenceClassification.from_pretrained(
    "xlm-roberta-base",
    **head_options,
)

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.out_p

In [5]:
# Split the full frame into train / validation / test parts. Both fractions are
# 0.20; how the helper applies them is defined in utils — TODO confirm.
# NOTE(review): df_test is not used in the cells visible here.
df_train, df_valid, df_test = TweetsDataset.split_test_val(
    df,
    test_size=0.20,
    valid_size=0.20,
)

# Train and validation loaders share the tokenizer and the batch size.
batch_size = 8
train_data_loader = TweetsDataset.create_data_loader(df_train, tokenizer, batch_size=batch_size)
valid_data_loader = TweetsDataset.create_data_loader(df_valid, tokenizer, batch_size=batch_size)

In [6]:
EPOCHS = 5

# Move the network to the selected device before handing its parameters to
# the optimizer.
model = model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Schedule length = batches per epoch * number of epochs.
# NOTE(review): presumably Model.train steps the scheduler once per batch — verify.
total_steps = EPOCHS * len(train_data_loader)

# Linear schedule with zero warmup steps over the whole run.
# NOTE(review): the logged train/val loss stays at about 0.54 for all 5 epochs;
# the learning rate and warmup settings may be worth revisiting.
scheduler = transformers.get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

In [7]:
# `defaultdict` and `Model` are imported in the notebook's top import cell
# rather than here, so all dependencies are visible in one place.

# Container handed to the training helper for metrics. The displayed result
# holds four lists (train_acc, train_loss, val_acc, val_loss), one entry per epoch.
history = defaultdict(list)

# Run the training helper for EPOCHS epochs. The dataset sizes are passed
# alongside the loaders. The call is deliberately left as the cell's last
# expression so that its return value is rendered as the cell output.
Model.train(
    EPOCHS,
    model,
    train_data_loader,
    optimizer,
    device,
    scheduler,
    history,
    len(df_train),
    len(df_valid),
    valid_data_loader,
)

Epoch 1/5
----------
[KTrain loss 0.5407546909381389 Accuracy 0.3506343713956171
Val loss 0.5379155062966876 Accuracy 0.3277777777777778

Epoch 2/5
----------
[KTrain loss 0.5373690957568592 Accuracy 0.3543829296424452
Val loss 0.535550054024767 Accuracy 0.3638888888888889

Epoch 3/5
----------
[KTrain loss 0.5367523357558489 Accuracy 0.3502499038831219
Val loss 0.5361981568513093 Accuracy 0.3638888888888889

Epoch 4/5
----------
[KTrain loss 0.53607926544641 Accuracy 0.3636101499423299
Val loss 0.5356593176170632 Accuracy 0.3638888888888889

Epoch 5/5
----------
[KTrain loss 0.5356174074485611 Accuracy 0.36034217608612074
Val loss 0.5348692771461275 Accuracy 0.3638888888888889



defaultdict(list,
            {'train_acc': [0.3506343713956171,
              0.3543829296424452,
              0.3502499038831219,
              0.3636101499423299,
              0.36034217608612074],
             'train_loss': [0.5407546909381389,
              0.5373690957568592,
              0.5367523357558489,
              0.53607926544641,
              0.5356174074485611],
             'val_acc': [0.3277777777777778,
              0.3638888888888889,
              0.3638888888888889,
              0.3638888888888889,
              0.3638888888888889],
             'val_loss': [0.5379155062966876,
              0.535550054024767,
              0.5361981568513093,
              0.5356593176170632,
              0.5348692771461275]})

In [8]:
# Persist only the model's state dict (not the module object itself) to a
# file in the current working directory.
trained_weights = model.state_dict()
torch.save(trained_weights, 'xlm_newtral_pretrained.bin')