In [1]:
import numpy as np
import pandas as pd
import torch
from transformers import RobertaTokenizer, RobertaModel, BertModel, BertTokenizer
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [2]:
from torch import nn

class Beto(torch.nn.Module):
    
    def __init__(self):
        super(Beto, self).__init__()
        self.max_len = 512
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.tokenizer_beto = BertTokenizer.from_pretrained('dccuchile/bert-base-spanish-wwm-cased')
        self.tokenizer_roberta = RobertaTokenizer.from_pretrained("PlanTL-GOB-ES/roberta-base-bne")
        self.encoder_beto = BertModel.from_pretrained('dccuchile/bert-base-spanish-wwm-cased').to(self.device)
        self.encoder_roberta = RobertaModel.from_pretrained("PlanTL-GOB-ES/roberta-base-bne").to(self.device)
        self.dropout_beto = nn.Dropout(0.15)
        self.dropout_roberta = nn.Dropout(0.15)
        
        self.linear_gender = nn.Linear(1536,1536).to(self.device)
        self.activation_gender = nn.Tanh().to(self.device)
        self.dropout_gender = nn.Dropout(0.15).to(self.device)
        self.linear_gender = nn.Linear(1536,2).to(self.device)
        self.out_gender = nn.Sigmoid().to(self.device)
        
        self.linear_profession = nn.Linear(1536,1536).to(self.device)
        self.activation_profession = nn.Tanh().to(self.device)
        self.dropout_profession = nn.Dropout(0.15).to(self.device)
        self.linear_profession = nn.Linear(1536,3).to(self.device)
        self.out_profession = nn.Sigmoid().to(self.device)
        
        self.linear_ideology = nn.Linear(1536,1536).to(self.device)
        self.activation_ideology = nn.Tanh().to(self.device)
        self.dropout_ideology = nn.Dropout(0.15).to(self.device)
        self.linear_ideology_1 = nn.Linear(1536,2).to(self.device)
        self.out_ideology = nn.Sigmoid().to(self.device)
        
        self.linear_ideology_multiclass = nn.Linear(1536,1536).to(self.device)
        self.activation_ideology_multiclass = nn.Tanh().to(self.device)
        self.dropout_ideology_multiclass = nn.Dropout(0.15).to(self.device)
        self.linear_ideology_multiclass_1 = nn.Linear(1536,4).to(self.device)
        self.out_ideology_multiclass = nn.Softmax(dim = 1).to(self.device)
        
    def forward(self, x):
        ids_beto, token_type_ids_beto, attention_masks_beto = self.tokenizer_beto(x).values()
        def pad_ids(ids):
            ids_ = ids[: (self.max_len-2)]
            ids_ = [4] + ids + [5]
            padding_len_ids = self.max_len - len(ids_)
            ids_ = ids_ + ([0] * padding_len_ids)
            return ids_
        def pad_attmask(mask):
            return mask + ([0] * (self.max_len - len(mask)))
        ids_beto = list(map(pad_ids, ids_beto))
        attention_masks_beto = list(map(pad_attmask, attention_masks_beto))
        ids_roberta, attention_masks_roberta = self.tokenizer_roberta(x).values()
        ids_roberta = list(map(lambda x: x + [0] * (self.max_len - len(x)), ids_roberta))
        attention_masks_roberta = list(map(pad_attmask, attention_masks_roberta))
        embeding_beto = self.encoder_beto(
            torch.LongTensor(ids_beto).to(self.device),
            torch.LongTensor(attention_masks_beto).to(self.device)
        )
        embeding_roberta = self.encoder_roberta(
            input_ids = torch.LongTensor(ids_roberta).to(self.device),
            attention_mask = torch.LongTensor(attention_masks_roberta).to(self.device)
        )
        embeding_beto = self.dropout_beto(embeding_beto['pooler_output'])
        embeding_roberta = self.dropout_roberta(embeding_roberta['pooler_output'])
        embeding = torch.cat((embeding_beto, embeding_roberta), dim=1)
        # embeding = embeding_beto.to(self.device) # Sustituir esta línea cuando se obtenga el otro encoder
        x_gender = self.linear_gender(embeding)
        x_gender = self.activation_gender(x_gender)
        x_gender = self.dropout_gender(x_gender)
        y_gender = self.out_gender(x_gender)
        
        x_profession = self.linear_profession(embeding)
        x_profession = self.activation_profession(x_profession)
        x_profession = self.dropout_profession(x_profession)
        y_profession = self.out_profession(x_profession)
        
        x_ideology = self.linear_ideology(embeding)
        x_ideology = self.activation_ideology(x_ideology)
        x_ideology = self.dropout_ideology(x_ideology)
        y_ideology = self.out_ideology(x_ideology)
        
        x_ideology_multiclass = self.linear_ideology_multiclass(embeding)
        x_ideology_multiclass = self.activation_ideology_multiclass(x_ideology)
        x_ideology_multiclass = self.dropout_ideology_multiclass(x_ideology)
        y_ideology_multiclass = self.out_ideology_multiclass(x_ideology_multiclass) 
        return y_gender, y_profession, y_ideology, y_ideology_multiclass
        

In [3]:
from torch.utils.data import DataLoader, Dataset

class PoliticESDataset(Dataset):
    def __init__(self, tweet, gender, profession, ideology, ideology_mc):
        self.tweet = tweet
        self.gender = gender
        self.profession = profession
        self.ideology = ideology
        self.ideology_mc = ideology_mc
    def __len__(self):
        return len(self.tweet)
    def __getitem__(self, item):
        return {
            'tweet': self.tweet[item],
            'labels': [
                self.gender[item],
                self.profession[item],
                self.ideology[item],
                self.ideology[item]
            ]
        }

def create_data_loader(df, batch_size = 16):
    return DataLoader(
        PoliticESDataset(
            tweet = df.tweet.to_numpy(),
            gender = df.gender.to_numpy(),
            profession = df.profession.to_numpy(),
            ideology = df.ideology_binary.to_numpy(),
            ideology_mc = df.ideology_multiclass.to_numpy()
        ),
        batch_size = batch_size
    )

In [4]:
from tqdm import tqdm

## CAMBIAR LOSS POR ONE HOT ENCODED.

def fit(model, data_loader, total_steps, optimizer, loss_fn):
    correct_predictions = [0.,0.,0.,0.]
    running_loss = 0.
    last_loss = 0.
    for i, entry in tqdm(enumerate(data_loader), total = total_steps/4):
        optimizer.zero_grad()
        y_gender, y_profession, y_ideology, y_ideology_multiclass = model(entry["tweet"])
        losses = []
        for index, (y, t) in enumerate(zip([y_gender, y_profession, y_ideology, y_ideology_multiclass], entry['labels'])):
            correct_predictions[index] += np.sum(torch.argmax(y, dim=1).cpu().numpy() == t.cpu().numpy())
            loss = loss_fn(y.to(device), t.to(device)).to(device)
            losses.append(loss)
        loss = sum(losses)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        if i % 100 == 99:
            last_loss = running_loss / 100
            running_loss = 0.
    return np.mean(correct_predictions) / total_steps, last_loss

def train_model(model, train_dataset, eval_dataset, EPOCHS, batch_size, lr):
    train_data_loader = create_data_loader(train_dataset, batch_size)
    eval_data_loader = create_data_loader(eval_dataset, batch_size)
    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    for epoch in range(EPOCHS):
        train_acc, train_loss = fit(model, train_data_loader, len(train_dataset), optimizer, loss_fn)
        print(f'Precisión EPOCH {epoch + 1}: {train_acc}. Pérdida: {train_loss}')

In [5]:
from sklearn.model_selection import train_test_split
model = Beto()
df = pd.read_csv("../../data/multilabel_encoded.csv")
X_train, X_val, df_train, df_val = train_test_split(
    df['tweet'],
    df[['gender', 'profession','ideology_binary', 'ideology_multiclass']],
    test_size=0.08,
    random_state = 100
)
df_val['tweet'] = X_val
df_train['tweet'] = X_train
train_model(model, df_train, df_val, EPOCHS = 8, batch_size = 4, lr=3e-5)

Some weights of the model checkpoint at dccuchile/bert-base-spanish-wwm-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at PlanTL-GOB-ES/roberta-base-bne were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.decoder.weig

Precisión EPOCH 1: 0.5690104166666666. Pérdida: 15.882005043029785


100%|████████████████████████████████████████████████████████████████████████████| 3312/3312.0 [27:25<00:00,  2.01it/s]


Precisión EPOCH 2: 0.5618017814009661. Pérdida: 15.872678308486938


100%|████████████████████████████████████████████████████████████████████████████| 3312/3312.0 [27:05<00:00,  2.04it/s]


Precisión EPOCH 3: 0.558178592995169. Pérdida: 15.888975343704224


100%|████████████████████████████████████████████████████████████████████████████| 3312/3312.0 [27:05<00:00,  2.04it/s]


Precisión EPOCH 4: 0.5667647946859904. Pérdida: 15.87449758529663


 29%|██████████████████████▎                                                      | 958/3312.0 [07:53<19:23,  2.02it/s]


KeyboardInterrupt: 

In [None]:
def test(model, df_test):
    test_data_loader = create_data_loader(df_test, batch_size=8)
    model = model.eval()
    genders = []
    professions = []
    ideology = []
    ideology_multiclass = []
    with torch.no_grad():
        for entry in test_data_loader:
            y_gender, y_profession, y_ideology, y_ideology_multiclass = model(entry["tweet"])
            genders = genders + torch.argmax(y_gender, dim=1).item()
            professions = professions + torch.argmax(y_professions, dim=1).item()
            ideology = ideology + torch.argmax(y_ideology, dim=1).item()
            ideology_multiclass = ideology_multiclass + torch.argmax(y_ideology_multiclass, dim=1).item()
    df_preds = pd.DataFrame({
        "tweet": df_test["tweet"],
        "gender": genders,
        "professions": professions,
        "ideology_binary": ideology,
        "ideology_multiclass": ideology_multiclass
    })
    df_preds.to_csv("./data/predicted.csv")
df_test = pd.read_csv("data/multilabel_encoded_test.csv")
test(model, df_test)