In [1]:
import numpy as np
import pandas as pd
import torch
from transformers import RobertaTokenizer, RobertaModel
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [2]:
tokenizer_roberta = RobertaTokenizer.from_pretrained("PlanTL-GOB-ES/roberta-base-bne")

In [3]:
from torch import nn

class Albert(torch.nn.Module):
    
    def __init__(self):
        super(Albert, self).__init__()
        self.max_len = 512
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.encoder = RobertaModel.from_pretrained("PlanTL-GOB-ES/roberta-base-bne").to(self.device)
        self.dropout = nn.Dropout(0.15)
        
        self.linear_gender = nn.Linear(768,768).to(self.device)
        self.activation_gender = nn.ReLU().to(self.device)
        self.dropout_gender = nn.Dropout(0.15).to(self.device)
        self.linear_gender_1 = nn.Linear(768,1).to(self.device)
        self.out_gender = nn.Sigmoid().to(self.device)
        
        self.linear_profession = nn.Linear(768,768).to(self.device)
        self.activation_profession = nn.ReLU().to(self.device)
        self.dropout_profession = nn.Dropout(0.15).to(self.device)
        self.linear_profession_1 = nn.Linear(768,3).to(self.device)
        
        self.linear_ideology = nn.Linear(768,768).to(self.device)
        self.activation_ideology = nn.ReLU().to(self.device)
        self.dropout_ideology = nn.Dropout(0.15).to(self.device)
        self.linear_ideology_1 = nn.Linear(768,1).to(self.device)
        self.out_ideology = nn.Sigmoid().to(self.device)
        
        self.linear_ideology_multiclass = nn.Linear(768,768).to(self.device)
        self.activation_ideology_multiclass = nn.ReLU().to(self.device)
        self.dropout_ideology_multiclass = nn.Dropout(0.15).to(self.device)
        self.linear_ideology_multiclass_1 = nn.Linear(768,4).to(self.device)
        
    def forward(self, x):
        input_ids, attention_mask = x.values()
        embedings = self.encoder(input_ids = input_ids.squeeze().to(self.device), attention_mask = attention_mask.squeeze().to(self.device))
        embedings = self.dropout(embedings['pooler_output']).to(self.device)
        x_gender = self.linear_gender(embedings)
        x_gender = self.activation_gender(x_gender)
        x_gender = self.dropout_gender(x_gender)
        x_gender = self.linear_gender_1(x_gender)
        y_gender = self.out_gender(x_gender)
        
        x_profession = self.linear_profession(embedings)
        x_profession = self.activation_profession(x_profession)
        x_profession = self.dropout_profession(x_profession)
        y_profession = self.linear_profession_1(x_profession)
        
        x_ideology = self.linear_ideology(embedings)
        x_ideology = self.activation_ideology(x_ideology)
        x_ideology = self.dropout_ideology(x_ideology)
        x_ideology = self.linear_ideology_1(x_ideology)
        y_ideology = self.out_ideology(x_ideology)
        
        x_ideology_multiclass = self.linear_ideology_multiclass(embedings)
        x_ideology_multiclass = self.activation_ideology_multiclass(x_ideology_multiclass)
        x_ideology_multiclass = self.dropout_ideology_multiclass(x_ideology_multiclass)
        y_ideology_multiclass = self.linear_ideology_multiclass_1(x_ideology_multiclass)
        return y_gender, y_profession, y_ideology, y_ideology_multiclass

In [9]:
from torch.utils.data import DataLoader, Dataset

class PoliticESDataset(Dataset):
    def __init__(self, tweet, gender, profession, ideology, ideology_mc, tokenizer):
        self.tweet = tweet      
        self.gender = gender
        self.profession = profession
        self.ideology = ideology
        self.ideology_mc = ideology_mc
        self.tokenizer = tokenizer
    def __len__(self):
        return len(self.tweet)
    def __getitem__(self, item):
        return {
            "tweet": self.tokenizer.encode_plus(self.tweet[item],     
                        add_special_tokens = True, 
                        max_length = 512,        
                        truncation=True,
                        padding = 'max_length',
                        return_attention_mask = True,
                        return_tensors = 'pt')
            ,
            'labels': [
                self.gender[item],
                self.profession[item],
                self.ideology[item],
                self.ideology_mc[item]
            ]
        }

def create_data_loader(df, tokenizer, batch_size = 16, shuffle=True):
    return DataLoader(
        PoliticESDataset(
            tweet=df.tweet.to_numpy(),
            gender = df.gender.to_numpy(),
            profession = df.profession.to_numpy(),
            ideology = df.ideology_binary.to_numpy(),
            ideology_mc = df.ideology_multiclass.to_numpy(),
            tokenizer = tokenizer
        ),
        batch_size = batch_size,
        shuffle=shuffle
    )

In [5]:
from tqdm import tqdm

def fit(model, data_loader, total_steps, optimizer, loss_fns):
    model = model.train()
    running_loss = 0.
    last_loss = 0.
    for i, entry in tqdm(enumerate(data_loader), total = total_steps/16):
        optimizer.zero_grad()
        y_gender, y_profession, y_ideology, y_ideology_multiclass = model(entry["tweet"])       
        loss_gender = loss_fns["gender"](y_gender.squeeze().to(device), entry["labels"][0].to(torch.float32).to(device)).to(device)
        loss_profession = loss_fns["profession"](y_profession.to(device), entry["labels"][1].to(device)).to(device)
        loss_ideology = loss_fns["ideology"](y_ideology.squeeze().to(device), entry["labels"][2].to(torch.float32).to(device)).to(device)
        loss_ideology_multiclass = loss_fns["ideology_multiclass"](y_ideology_multiclass.to(device), entry["labels"][3].to(device)).to(device)
        loss = loss_gender + loss_profession + loss_ideology + loss_ideology_multiclass
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        if i % 100 == 99:
            last_loss = running_loss / 100
            running_loss = 0.
    return last_loss

def test(model, data_loader, total_steps, loss_fns):
    correct_predictions = [0.,0.,0.,0.]
    loss = 0.
    model = model.eval()
    with torch.no_grad():
        for entry in data_loader:
            y_gender, y_profession, y_ideology, y_ideology_multiclass = model(entry["tweet"])
            loss_gender = loss_fns["gender"](y_gender.squeeze().to(device), entry["labels"][0].to(torch.float32).to(device)).to(device)
            loss_profession = loss_fns["profession"](y_profession.to(device), entry["labels"][1].to(device)).to(device)
            loss_ideology = loss_fns["ideology"](y_ideology.squeeze().to(device), entry["labels"][2].to(torch.float32).to(device)).to(device)
            loss_ideology_multiclass = loss_fns["ideology_multiclass"](y_ideology_multiclass.to(device), entry["labels"][3].to(device)).to(device)
            loss = loss_gender + loss_profession + loss_ideology + loss_ideology_multiclass
            correct_predictions[0] += torch.sum(torch.eq(entry["labels"][0].to(device), torch.where(y_gender.squeeze() > 0.5, 1, 0).to(device))).cpu().detach().item()
            correct_predictions[1] += torch.sum(torch.eq(entry["labels"][1].to(device), torch.argmax(y_profession, dim=1).to(device))).cpu().detach().item()
            correct_predictions[2] += torch.sum(torch.eq(entry["labels"][2].to(device), torch.where(y_ideology.squeeze() > 0.5, 1, 0).to(device))).cpu().detach().item()
            correct_predictions[3] += torch.sum(torch.eq(entry["labels"][3].to(device), torch.argmax(y_ideology_multiclass, dim=1).to(device))).cpu().detach().item()
    return [x/total_steps for x in correct_predictions], loss

def train_model(model, train_dataset, eval_dataset, EPOCHS, batch_size, lr):
    train_data_loader = create_data_loader(train_dataset, tokenizer_roberta, batch_size)
    eval_data_loader = create_data_loader(eval_dataset, tokenizer_roberta, batch_size)
    loss_fns = {
        "gender": nn.BCELoss(),
        "profession": nn.CrossEntropyLoss(),
        "ideology": nn.BCELoss(),
        "ideology_multiclass": nn.CrossEntropyLoss()
    }
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    for epoch in range(EPOCHS):
        train_loss = fit(model, train_data_loader, len(train_dataset), optimizer, loss_fns)
        print(f'EPOCH {epoch + 1}: Pérdida: {train_loss}')
        eval_acc, eval_loss = test(model, eval_data_loader, len(eval_dataset), loss_fns)
        print(f'Eval accuracy: {eval_acc} Eval loss: {eval_loss.item()}')

In [6]:
df_train = pd.read_csv('../../data/multilabel_encoded.csv')
df_eval = pd.read_csv('../../data/multilabel_encoded_test.csv')
model = Albert()
train_model(model, df_train, df_eval, 5, 16, 1e-5)
torch.save(model.state_dict(), './beto.bin')

Some weights of the model checkpoint at PlanTL-GOB-ES/roberta-base-bne were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.decoder.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████████████████████████████████████████████████████████████████████████| 900/900.0 [12:51<00:00,  1.17it/s]


EPOCH 1: Pérdida: 2.8176467299461363
Eval accuracy: [0.6480555555555556, 0.7680555555555556, 0.675, 0.47694444444444445] Eval loss: 3.1501407623291016


100%|██████████████████████████████████████████████████████████████████████████████| 900/900.0 [16:02<00:00,  1.07s/it]


EPOCH 2: Pérdida: 2.194765404462814
Eval accuracy: [0.6688888888888889, 0.7558333333333334, 0.7027777777777777, 0.5186111111111111] Eval loss: 3.923849582672119


100%|██████████████████████████████████████████████████████████████████████████████| 900/900.0 [17:11<00:00,  1.15s/it]


EPOCH 3: Pérdida: 1.2326801699399947
Eval accuracy: [0.6855555555555556, 0.7652777777777777, 0.6961111111111111, 0.4930555555555556] Eval loss: 4.8182477951049805


100%|██████████████████████████████████████████████████████████████████████████████| 900/900.0 [18:35<00:00,  1.24s/it]


EPOCH 4: Pérdida: 0.6337519471347332
Eval accuracy: [0.6747222222222222, 0.7605555555555555, 0.6761111111111111, 0.4888888888888889] Eval loss: 2.876265525817871


100%|██████████████████████████████████████████████████████████████████████████████| 900/900.0 [20:28<00:00,  1.37s/it]


EPOCH 5: Pérdida: 0.3257904165238142
Eval accuracy: [0.6652777777777777, 0.7536111111111111, 0.6841666666666667, 0.4961111111111111] Eval loss: 5.261568546295166


In [7]:
loss_fns = {
        "gender": nn.BCELoss(),
        "profession": nn.CrossEntropyLoss(),
        "ideology": nn.BCELoss(),
        "ideology_multiclass": nn.CrossEntropyLoss()
    }
eval_data_loader = create_data_loader(df_eval, tokenizer_roberta, 16, shuffle = False)
test(model, eval_data_loader, len(df_eval), loss_fns)

([0.6652777777777777,
  0.7536111111111111,
  0.6841666666666667,
  0.4961111111111111],
 tensor(5.5175, device='cuda:0'))

In [16]:
def test_and_save(model, data_loader, total_steps, loss_fns):
    gender = []
    profession = []
    ideology = []
    ideology_multiclass = []
    model = model.eval()
    with torch.no_grad():
        for entry in data_loader:
            y_gender, y_profession, y_ideology, y_ideology_multiclass = model(entry["tweet"])
            gender = np.concatenate((gender,torch.where(y_gender.squeeze() > 0.5, 1, 0).cpu().detach().numpy()))
            profession = np.concatenate((profession,torch.argmax(y_profession, dim=1).cpu().detach().numpy()))
            ideology = np.concatenate((ideology,torch.where(y_ideology.squeeze() > 0.5, 1, 0).cpu().detach().numpy()))
            ideology_multiclass = np.concatenate((ideology_multiclass,torch.argmax(y_ideology_multiclass, dim=1).cpu().detach().numpy()))
    return gender, profession, ideology, ideology_multiclass

In [17]:
eval_data_loader = create_data_loader(df_eval, tokenizer_roberta, 16, shuffle = False)
loss_fns = {
        "gender": nn.BCELoss(),
        "profession": nn.CrossEntropyLoss(),
        "ideology": nn.BCELoss(),
        "ideology_multiclass": nn.CrossEntropyLoss()
    }
gender, profession, ideology, ideology_multiclass = test_and_save(model, eval_data_loader, len(df_eval), loss_fns)

In [20]:
from sklearn.metrics import classification_report
for y_pred, y_true in zip([gender, profession, ideology, ideology_multiclass], ["gender","profession", "ideology_binary","ideology_multiclass"]):
    print(f"Report {y_true}")
    print(classification_report(y_pred, df_eval[y_true]))

Report gender
              precision    recall  f1-score   support

         0.0       0.52      0.56      0.54      1265
         1.0       0.75      0.72      0.74      2335

    accuracy                           0.67      3600
   macro avg       0.64      0.64      0.64      3600
weighted avg       0.67      0.67      0.67      3600

Report profession
              precision    recall  f1-score   support

         0.0       0.22      0.35      0.27       100
         1.0       0.79      0.81      0.80      1988
         2.0       0.77      0.71      0.74      1512

    accuracy                           0.75      3600
   macro avg       0.59      0.62      0.60      3600
weighted avg       0.76      0.75      0.76      3600

Report ideology_binary
              precision    recall  f1-score   support

         0.0       0.70      0.74      0.72      1965
         1.0       0.66      0.62      0.64      1635

    accuracy                           0.68      3600
   macro avg       