In [1]:
import numpy as np
import pandas as pd
import torch
from transformers import RobertaTokenizer, RobertaModel
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [2]:
from torch import nn

class Albert(torch.nn.Module):
    
    def __init__(self):
        super(Albert, self).__init__()
        self.max_len = 512
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.tokenizer_albert = RobertaTokenizer.from_pretrained("PlanTL-GOB-ES/roberta-base-bne")
        self.encoder_albert = RobertaModel.from_pretrained("PlanTL-GOB-ES/roberta-base-bne").to(self.device)
        self.dropout_albert = nn.Dropout(0.15)
        
        self.linear_gender = nn.Linear(768,384).to(self.device)
        self.activation_gender = nn.Tanh().to(self.device)
        self.dropout_gender = nn.Dropout(0.15).to(self.device)
        self.linear_gender_1 = nn.Linear(384,1).to(self.device)
        self.out_gender = nn.Sigmoid().to(self.device)
        
        self.linear_profession = nn.Linear(768,384).to(self.device)
        self.activation_profession = nn.Tanh().to(self.device)
        self.dropout_profession = nn.Dropout(0.15).to(self.device)
        self.linear_profession_1 = nn.Linear(384,3).to(self.device)
        self.out_profession = nn.LogSoftmax(dim = 1).to(self.device)
        
        self.linear_ideology = nn.Linear(768,384).to(self.device)
        self.activation_ideology = nn.Tanh().to(self.device)
        self.dropout_ideology = nn.Dropout(0.15).to(self.device)
        self.linear_ideology_1 = nn.Linear(384,1).to(self.device)
        self.out_ideology = nn.Sigmoid().to(self.device)
        
        self.linear_ideology_multiclass = nn.Linear(768,384).to(self.device)
        self.activation_ideology_multiclass = nn.Tanh().to(self.device)
        self.dropout_ideology_multiclass = nn.Dropout(0.15).to(self.device)
        self.linear_ideology_multiclass_1 = nn.Linear(384,4).to(self.device)
        self.out_ideology_multiclass = nn.LogSoftmax(dim = 1).to(self.device)
        
    def forward(self, x):
        tokenized = self.tokenizer_albert.batch_encode_plus(x,     
                        add_special_tokens = True, 
                        max_length = 512,        
                        truncation=True,
                        padding = 'max_length',
                        return_attention_mask = True,
                        return_tensors = 'pt')
        embedings = self.encoder_albert(**tokenized.to(self.device))
        embedings = self.dropout_albert(embedings['pooler_output']).to(self.device)
        x_gender = self.linear_gender(embedings)
        x_gender = self.activation_gender(x_gender)
        x_gender = self.dropout_gender(x_gender)
        x_gender = self.linear_gender_1(x_gender)
        y_gender = self.out_gender(x_gender)
        
        x_profession = self.linear_profession(embedings)
        x_profession = self.activation_profession(x_profession)
        x_profession = self.dropout_profession(x_profession)
        x_profession = self.linear_profession_1(x_profession)
        y_profession = self.out_profession(x_profession)
        
        x_ideology = self.linear_ideology(embedings)
        x_ideology = self.activation_ideology(x_ideology)
        x_ideology = self.dropout_ideology(x_ideology)
        x_ideology = self.linear_ideology_1(x_ideology)
        y_ideology = self.out_ideology(x_ideology)
        
        x_ideology_multiclass = self.linear_ideology_multiclass(embedings)
        x_ideology_multiclass = self.activation_ideology_multiclass(x_ideology_multiclass)
        x_ideology_multiclass = self.dropout_ideology_multiclass(x_ideology_multiclass)
        x_ideology_multiclass = self.linear_ideology_multiclass_1(x_ideology_multiclass)
        y_ideology_multiclass = self.out_ideology_multiclass(x_ideology_multiclass)
        return y_gender, y_profession, y_ideology, y_ideology_multiclass

In [3]:
from torch.utils.data import DataLoader, Dataset

class PoliticESDataset(Dataset):
    def __init__(self, tweet, gender, profession, ideology, ideology_mc):
        self.tweet = tweet      
        self.gender = gender
        self.profession = profession
        self.ideology = ideology
        self.ideology_mc = ideology_mc
    def __len__(self):
        return len(self.tweet)
    def __getitem__(self, item):
        return {
            'tweet': self.tweet[item],
            'labels': [
                self.gender[item],
                self.profession[item],
                self.ideology[item],
                self.ideology_mc[item]
            ]
        }

def create_data_loader(df, batch_size = 16):
    return DataLoader(
        PoliticESDataset(
            tweet=df.tweet.to_numpy(),
            gender = df.gender.to_numpy(),
            profession = df.profession.to_numpy(),
            ideology = df.ideology_binary.to_numpy(),
            ideology_mc = df.ideology_multiclass.to_numpy()
        ),
        batch_size = batch_size
    )

In [4]:
from tqdm import tqdm

def fit(model, data_loader, total_steps, optimizer, loss_fns):
    model = model.train()
    running_loss = 0.
    last_loss = 0.
    for i, entry in tqdm(enumerate(data_loader), total = total_steps/16):
        optimizer.zero_grad()
        y_gender, y_profession, y_ideology, y_ideology_multiclass = model(entry["tweet"])       
        loss_gender = loss_fns["gender"](y_gender.to(device).flatten(), entry["labels"][0].to(torch.float32).to(device)).to(device)
        loss_profession = loss_fns["profession"](y_profession.to(device), entry["labels"][1].to(device)).to(device)
        loss_ideology = loss_fns["ideology"](y_ideology.to(device).flatten(), entry["labels"][2].to(torch.float32).to(device)).to(device)
        loss_ideology_multiclass = loss_fns["ideology_multiclass"](y_ideology_multiclass.to(device), entry["labels"][3].to(device)).to(device)
        loss = loss_gender + loss_profession + loss_ideology + loss_ideology_multiclass
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        if i % 100 == 99:
            last_loss = running_loss / 100
            running_loss = 0.
    return last_loss

def test(model, data_loader, total_steps, loss_fns):
    correct_predictions = [0.,0.,0.,0.]
    loss = 0.
    model = model.eval()
    with torch.no_grad():
        for entry in data_loader:
            y_gender, y_profession, y_ideology, y_ideology_multiclass = model(entry["tweet"])
            loss_gender = loss_fns["gender"](y_gender.flatten().to(device), entry["labels"][0].to(torch.float32).to(device)).to(device)
            loss_profession = loss_fns["profession"](y_profession.to(device), entry["labels"][1].to(device)).to(device)
            loss_ideology = loss_fns["ideology"](y_ideology.flatten().to(device), entry["labels"][2].to(torch.float32).to(device)).to(device)
            loss_ideology_multiclass = loss_fns["ideology_multiclass"](y_ideology_multiclass.to(device), entry["labels"][3].to(device)).to(device)
            loss = loss_gender + loss_profession + loss_ideology + loss_ideology_multiclass
            for i, (e, y) in enumerate(zip(entry['labels'], [y_gender, y_profession, y_ideology, y_ideology_multiclass])):
                correct_predictions[i] = correct_predictions[i] + torch.sum(torch.eq(e.to(device), torch.argmax(y, dim=1).to(device))).cpu().detach().item()
    return [x/total_steps for x in correct_predictions], loss

def train_model(model, train_dataset, eval_dataset, EPOCHS, batch_size, lr):
    train_data_loader = create_data_loader(train_dataset, batch_size)
    eval_data_loader = create_data_loader(eval_dataset, batch_size)
    loss_fns = {
        "gender": nn.BCELoss(),
        "profession": nn.NLLLoss(),
        "ideology": nn.BCELoss(),
        "ideology_multiclass": nn.NLLLoss(),
    }
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    for epoch in range(EPOCHS):
        train_loss = fit(model, train_data_loader, len(train_dataset), optimizer, loss_fns)
        print(f'EPOCH {epoch + 1}: Pérdida: {train_loss}')
        eval_acc, eval_loss = test(model, eval_data_loader, len(eval_dataset), loss_fns)
        print(f'Eval loss: {eval_acc} Eval accs: {eval_loss.item()}')

In [None]:
df_train = pd.read_csv('../../data/multilabel_encoded.csv')
df_eval = pd.read_csv('../../data/multilabel_encoded_test.csv')
model = Albert()
train_model(model, df_train, df_eval, 5, 16, 3e-4)
torch.save(model.state_dict(), './albert.bin')

Some weights of the model checkpoint at PlanTL-GOB-ES/roberta-base-bne were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████████████████████████████████████████████████████████████████████████| 900/900.0 [11:45<00:00,  1.28it/s]


EPOCH 1: Pérdida: 3.6079621052742006
Eval loss: [0.37777777777777777, 0.3888888888888889, 0.5777777777777777, 0.4] Eval accs: 3.8938846588134766


100%|██████████████████████████████████████████████████████████████████████████████| 900/900.0 [11:54<00:00,  1.26it/s]


EPOCH 2: Pérdida: 3.6061617851257326
Eval loss: [0.37777777777777777, 0.5666666666666667, 0.5777777777777777, 0.4] Eval accs: 3.9865453243255615


100%|██████████████████████████████████████████████████████████████████████████████| 900/900.0 [11:56<00:00,  1.26it/s]


EPOCH 3: Pérdida: 3.6637202382087706
Eval loss: [0.37777777777777777, 0.3888888888888889, 0.5777777777777777, 0.4] Eval accs: 3.9148473739624023


 78%|█████████████████████████████████████████████████████████████                 | 705/900.0 [09:22<02:40,  1.22it/s]

In [None]:
loss_fns = {
        "gender": nn.CrossEntropyLoss,
        "profession": nn.NLLLoss(),
        "ideology": nn.CrossEntropyLoss,
        "ideology_multiclass": nn.NLLLoss(),
    }
eval_data_loader = create_data_loader(df_eval, 16)
test(model, eval_data_loader, len(df_eval), loss_fns)

In [None]:
t = torch.tensor([[2],[3]])
t.flatten()