In [1]:
!pip install -qq transformers
import pandas as pd
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
import transformers
import torch
from nltk.tokenize import word_tokenize
from torch.utils.data import Dataset, DataLoader
from utils import TweetsDataset

# Prefer the first CUDA GPU when one is visible to PyTorch; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [2]:
# Load the development split; later cells read its "tweet" and
# "ideology_multiclass" columns.
df_train = pd.read_csv("../../data/development.csv")

In [3]:
def preprocess_tweet(tweet):
    """Tokenize ``tweet`` with ``word_tokenize`` and return the lower-cased
    tokens joined by single spaces."""
    lowered_tokens = map(str.lower, word_tokenize(tweet))
    return " ".join(lowered_tokens)
# Cleaned view of the training data: normalized tweet text plus an integer class id.
# NOTE(review): no later cell visible in this notebook reads df_train_clean
# (the split below works on df_train) — confirm whether that is intended.
df_train_clean = pd.DataFrame({
    "tweet": df_train["tweet"].apply(preprocess_tweet),
    "label": df_train["ideology_multiclass"].map({
        'moderate_left': 0,
        'moderate_right': 1,
        "left": 2,
        "right": 3,
    }),
})

In [2]:
# Tokenizer for the same BERTIN checkpoint that the classification model is loaded from.
tokenizer = transformers.AutoTokenizer.from_pretrained(
    "bertin-project/bertin-roberta-base-spanish"
)

In [3]:
# RoBERTa sequence classifier with four output labels, initialized from the BERTIN checkpoint.
# The training/evaluation functions in this notebook feed one-hot float labels,
# which matches problem_type="multi_label_classification".
# NOTE(review): predictions are taken with argmax over four classes, so the task
# looks single-label; "single_label_classification" with integer labels may be
# the better fit — confirm before changing, since the label encoding in
# train_epoch/eval_model would have to change with it.
model = transformers.RobertaForSequenceClassification.from_pretrained(
    "bertin-project/bertin-roberta-base-spanish",
    num_labels=4,
    problem_type="multi_label_classification",
)

Some weights of the model checkpoint at bertin-project/bertin-roberta-base-spanish were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at bertin-project/bertin-roberta-base-spanish and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.bias', 'classif

In [8]:
# Split the dataset into training and validation sets (the test set is loaded
# from its own CSV further down).
# NOTE(review): df_train is rebound to the training portion here, so re-running
# this cell splits the already-split frame again.
df_train, df_valid = TweetsDataset.split_train_val(df_train)

In [5]:
# Wrap both splits in data loaders built with the same tokenizer.
train_data_loader, valid_data_loader = (
    TweetsDataset.create_data_loader(split_frame, tokenizer)
    for split_frame in (df_train, df_valid)
)

NameError: name 'TweetsDataset' is not defined

In [10]:
# Sanity check: pull one batch from the training loader and display the tensor
# type of its first "input_ids" row.
data = next(iter(train_data_loader)) 
data['input_ids'][0].type()

'torch.LongTensor'

In [11]:
# Fine-tuning setup: AdamW plus a linear learning-rate schedule without warm-up.
EPOCHS = 5

model = model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# The scheduler is stepped once per batch in train_epoch, so the schedule spans
# (batches per epoch) x (epochs) steps.
total_steps = EPOCHS * len(train_data_loader)
scheduler = transformers.get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

In [12]:
import torch.nn.functional as F
import sys

def train_epoch(model, data_loader, optimizer, device, scheduler, n_examples, num_classes=4):
    """Train ``model`` for one pass over ``data_loader``.

    Every batch must be a mapping with ``"input_ids"``, ``"attention_mask"`` and
    ``"labels"`` tensors, where the labels are integer class ids. The labels are
    one-hot encoded as floats before being passed to the model, which is called
    with ``return_dict=False`` and must return ``(loss, logits)``.

    Args:
        model: Module to optimize; it is switched to training mode.
        data_loader: Iterable of batches as described above.
        optimizer: Optimizer stepped once per batch.
        device: Device the inputs and the one-hot labels are moved to.
        scheduler: Learning-rate scheduler stepped once per batch.
        n_examples: Denominator used for the accuracy.
        num_classes: Width of the one-hot label encoding (defaults to 4).

    Returns:
        ``(accuracy, mean_loss)`` where accuracy is the number of argmax
        predictions equal to the label divided by ``n_examples`` and
        ``mean_loss`` is the mean of the per-batch losses.
    """
    model = model.train()
    losses = []
    correct_predictions = 0
    for i, d in enumerate(data_loader):
        print("Entrenando " + "."*(i%4), end="\r")
        # Flatten once so (batch,) and (batch, 1) label tensors are handled alike;
        # the same tensor feeds both the loss target and the accuracy count.
        labels = d['labels'].flatten()
        loss, logits = model(
            input_ids=d["input_ids"].to(device),
            attention_mask=d["attention_mask"].to(device),
            labels=F.one_hot(labels.to(device), num_classes=num_classes).float(),
            return_dict=False
        )
        preds = np.argmax(logits.detach().cpu().numpy(), axis=1).flatten()
        # Compare against an explicit CPU copy: calling .numpy() on the raw labels
        # fails when the loader yields tensors that are not on the CPU.
        label_ids = labels.cpu().numpy()
        correct_predictions += np.sum(preds == label_ids)
        losses.append(loss.item())
        loss.backward()
        # Clip the global gradient norm to 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        # ANSI "erase line": clears the progress text printed above.
        sys.stdout.write("\033[K")
    return correct_predictions / n_examples, np.mean(losses)

def eval_model(model, data_loader, device, n_examples, num_classes=4):
    """Evaluate ``model`` on ``data_loader`` without updating any parameters.

    Every batch must be a mapping with ``"input_ids"``, ``"attention_mask"`` and
    ``"labels"`` tensors, where the labels are integer class ids. The labels are
    one-hot encoded as floats before being passed to the model, which is called
    with ``return_dict=False`` and must return ``(loss, logits)``.

    Args:
        model: Module to evaluate; it is switched to evaluation mode.
        data_loader: Iterable of batches as described above.
        device: Device the inputs and the one-hot labels are moved to.
        n_examples: Denominator used for the accuracy.
        num_classes: Width of the one-hot label encoding (defaults to 4).

    Returns:
        ``(accuracy, mean_loss)`` where accuracy is the number of argmax
        predictions equal to the label divided by ``n_examples`` and
        ``mean_loss`` is the mean of the per-batch losses.
    """
    model = model.eval()
    losses = []
    correct_predictions = 0
    with torch.no_grad():
        for d in data_loader:
            # Flatten once so (batch,) and (batch, 1) label tensors are handled
            # alike; the same tensor feeds the loss target and the accuracy count.
            labels = d['labels'].flatten()
            loss, logits = model(
                input_ids=d["input_ids"].to(device),
                attention_mask=d["attention_mask"].to(device),
                labels=F.one_hot(labels.to(device), num_classes=num_classes).float(),
                return_dict=False
            )
            preds = np.argmax(logits.detach().cpu().numpy(), axis=1).flatten()
            # Compare against an explicit CPU copy: calling .numpy() on the raw
            # labels fails when the loader yields tensors that are not on the CPU.
            label_ids = labels.cpu().numpy()
            correct_predictions += np.sum(preds == label_ids)
            losses.append(loss.item())
    return correct_predictions / n_examples, np.mean(losses)

In [13]:
# Training
from collections import defaultdict

# Per-epoch metrics, keyed by metric name.
history = defaultdict(list)
best_acc = 0

for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-'*10)

    train_acc, train_loss = train_epoch(model, train_data_loader, optimizer, device, scheduler, len(df_train))
    print(f'Train loss {train_loss} Accuracy {train_acc}')
    val_acc, val_loss = eval_model(model, valid_data_loader, device, len(df_valid))
    print(f'Val loss {val_loss} Accuracy {val_acc}')
    print()

    epoch_metrics = {
        'train_acc': train_acc,
        'train_loss': train_loss,
        'val_acc': val_acc,
        'val_loss': val_loss,
    }
    for metric_name, metric_value in epoch_metrics.items():
        history[metric_name].append(metric_value)

    # Track the best validation accuracy seen so far. Checkpointing is currently
    # disabled; the original call was:
    #     torch.save(model.state_dict(), 'best_model_state_a5.bin')
    if val_acc > best_acc:
        best_acc = val_acc

Epoch 1/5
----------
[KTrain loss 0.5230305357829034 Accuracy 0.40993848519800075
Val loss 0.5240743909193122 Accuracy 0.40032679738562094

Epoch 2/5
----------
[KTrain loss 0.4429614149297254 Accuracy 0.5465205690119185
Val loss 0.5954099766586138 Accuracy 0.3899782135076253

Epoch 3/5
----------
[KTrain loss 0.33107872045214093 Accuracy 0.6872356785851595
Val loss 0.6647299211958181 Accuracy 0.3888888888888889

Epoch 4/5
----------
[KTrain loss 0.203827803828589 Accuracy 0.838715878508266
Val loss 0.7787875862225242 Accuracy 0.45588235294117646

Epoch 5/5
----------
[KEntrenando ...

KeyboardInterrupt: 

In [4]:
# Test
# Restore previously saved weights and switch the model to inference mode.
# NOTE(review): torch.load unpickles the file, so only load checkpoints that
# come from a trusted source.
model.to(device)
checkpoint_state = torch.load('./bertin-base-v1.bin', map_location=torch.device(device))
model.load_state_dict(checkpoint_state)
model.eval()

# Rebuild the held-out frame with "tweets"/"labels" columns, using the same
# class-to-id mapping as the training-data cell.
df_test_raw = pd.read_csv("../../data/development_test.csv")
df_test = pd.DataFrame({
    "tweets": df_test_raw["tweet"],
    "labels": df_test_raw["ideology_multiclass"].map({
        'moderate_left': 0,
        'moderate_right': 1,
        "left": 2,
        "right": 3,
    }),
})
test_data_loader = TweetsDataset.create_data_loader(df_test, tokenizer)

In [5]:
# Evaluate the restored model on the test loader; Model.test's result is unpacked
# as (accuracy, loss, F1), which is then handed to the project's Results helper.
from utils import Model, Results
acc, loss, f1 = Model.test(model, test_data_loader, device, len(df_test))
# NOTE(review): the recorded output shows this call failing on '../../results.csv'
# while the next cell writes '../results.csv' — confirm which path is correct.
Results.add_result("bertin-base", "ideology-multiclass", acc, loss, f1)

Test loss: 0.827 Accuracy: 0.392


FileNotFoundError: [Errno 2] No such file or directory: '../../results.csv'

In [13]:
# Append this run's test metrics to the shared results table.
# DataFrame.append is deprecated (and removed in pandas 2.0); pd.concat is the
# documented replacement and produces the same frame here.
results = pd.read_csv("../results.csv")
new_result = pd.DataFrame({
    "model": "bertin-base",
    "class": "ideology-multiclass",
    "loss": loss,
    "f1score": f1,
    "accuracy": acc,
}, index=[0])
results = pd.concat([results, new_result])
# NOTE(review): to_csv writes the index as an unnamed first column, which comes
# back as an extra "Unnamed: 0" column on the next read — confirm whether
# index=False is wanted. Re-running this cell also adds the same row again.
results.to_csv("../results.csv")

  results = results.append(pd.DataFrame({


In [None]:
# Same call as the one made right after Model.test in the earlier test cell.
# NOTE(review): presumably this records the result a second time if both cells
# run — confirm whether this cell is still needed.
Results.add_result("bertin-base", "ideology-multiclass", acc, loss, f1)