In [3]:
import numpy as np
import pandas as pd
import torch
from transformers import GPT2Tokenizer, GPT2Model
# Run on the first CUDA GPU when one is visible; otherwise fall back to the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

In [2]:
from torch import nn

class GPT2(torch.nn.Module):
    """Multi-task classifier: four classification heads on top of a GPT-2 style encoder.

    The encoder output is pooled to one vector per sequence, passed through
    dropout and fed to four independent heads:

    * ``gender_layer``              -- 1 output squashed by a sigmoid.
    * ``profession_layer``          -- 3 raw logits.
    * ``ideology_layer``            -- 1 output squashed by a sigmoid.
    * ``ideology_multiclass_layer`` -- 4 raw logits.

    Args:
        encoder: module exposing ``config.hidden_size`` and returning an object
            with a ``last_hidden_state`` attribute of shape
            ``(batch, seq_len, hidden_size)`` when called with ``input_ids``
            and ``attention_mask`` keyword arguments (e.g. ``GPT2Model``).
        max_len: maximum sequence length; only stored on the instance.
    """

    def __init__(self, encoder, max_len=512):
        super().__init__()
        self.max_len = max_len
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.encoder = encoder
        self.dropout = nn.Dropout(0.15)

        hidden_size = encoder.config.hidden_size
        # The heads must be attributes of the module: otherwise they are not
        # registered, their parameters are invisible to the optimizer and
        # state_dict(), and forward() cannot reach them.
        self.gender_layer = self._make_head(hidden_size, 1, sigmoid=True)
        self.profession_layer = self._make_head(hidden_size, 3)
        self.ideology_layer = self._make_head(hidden_size, 1, sigmoid=True)
        self.ideology_multiclass_layer = self._make_head(hidden_size, 4)

        # Keep encoder and heads on the same device.
        self.to(self.device)

    @staticmethod
    def _make_head(hidden_size, n_outputs, sigmoid=False):
        """Build one head: Linear -> ReLU -> Dropout -> Linear (-> Sigmoid)."""
        layers = [
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Dropout(0.15),
            nn.Linear(hidden_size, n_outputs),
        ]
        if sigmoid:
            layers.append(nn.Sigmoid())
        return nn.Sequential(*layers)

    def forward(self, x):
        """Run the encoder and the four heads.

        Args:
            x: mapping with ``"input_ids"`` and ``"attention_mask"`` tensors of
                shape ``(batch, seq_len)`` or ``(batch, 1, seq_len)``.

        Returns:
            Tuple ``(gender, profession, ideology, ideology_multiclass)`` with
            shapes ``(batch, 1)``, ``(batch, 3)``, ``(batch, 1)``, ``(batch, 4)``.
        """
        # Use the device the parameters actually live on, so the module keeps
        # working if it is moved after construction.
        device = next(self.parameters()).device

        # Look the tensors up by key instead of relying on dict ordering.
        input_ids = x["input_ids"]
        attention_mask = x["attention_mask"]

        # Flatten any singleton middle dimension. A bare .squeeze() would also
        # drop the batch dimension when the batch holds a single sample.
        input_ids = input_ids.reshape(-1, input_ids.size(-1)).to(device)
        attention_mask = attention_mask.reshape(-1, attention_mask.size(-1)).to(device)

        hidden = self.encoder(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state

        # GPT-2 is causal: position 0 attends only to itself, so it carries no
        # information about the rest of the text. Pool the last real
        # (unmasked) token instead. Multiplying the mask by the position index
        # and taking argmax yields the highest unmasked index per row.
        positions = torch.arange(attention_mask.size(1), device=device)
        last_token = (attention_mask.long() * positions).argmax(dim=1)
        rows = torch.arange(hidden.size(0), device=device)
        pooled = self.dropout(hidden[rows, last_token])

        return (
            self.gender_layer(pooled),
            self.profession_layer(pooled),
            self.ideology_layer(pooled),
            self.ideology_multiclass_layer(pooled),
        )

In [3]:
from torch.utils.data import DataLoader, Dataset

class PoliticESDataset(Dataset):
    """Dataset pairing each tweet with its four labels.

    Tokenization happens lazily in ``__getitem__``; every tweet is truncated or
    padded to ``max_len`` tokens.

    Args:
        tweet: indexable sequence of tweet texts.
        gender, profession, ideology, ideology_mc: indexable label sequences
            aligned with ``tweet``.
        tokenizer: object providing ``encode_plus`` (e.g. a Hugging Face tokenizer).
        max_len: length every encoded tweet is padded/truncated to.
    """

    def __init__(self, tweet, gender, profession, ideology, ideology_mc, tokenizer, max_len=512):
        self.tweet = tweet
        self.gender = gender
        self.profession = profession
        self.ideology = ideology
        self.ideology_mc = ideology_mc
        # __getitem__ reads self.tokenizer, so it has to be stored here.
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        # One sample per tweet.
        return len(self.tweet)

    def __getitem__(self, item):
        """Return ``{'tweet': <tokenizer output>, 'labels': [gender, profession, ideology, ideology_mc]}``."""
        encoded = self.tokenizer.encode_plus(
            self.tweet[item],
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'tweet': encoded,
            'labels': [
                self.gender[item],
                self.profession[item],
                self.ideology[item],
                self.ideology_mc[item],
            ],
        }

def create_data_loader(df, tokenizer, batch_size = 16):
    """Wrap a labelled DataFrame in a shuffling ``DataLoader``.

    Reads the ``tweet``, ``gender``, ``profession``, ``ideology_binary`` and
    ``ideology_multiclass`` columns of ``df``, converts each to a NumPy array
    and hands them to ``PoliticESDataset`` together with ``tokenizer``.
    """
    # Dataset argument name -> source column.
    columns = {
        "tweet": df.tweet,
        "gender": df.gender,
        "profession": df.profession,
        "ideology": df.ideology_binary,
        "ideology_mc": df.ideology_multiclass,
    }
    arrays = {arg: column.to_numpy() for arg, column in columns.items()}
    dataset = PoliticESDataset(tokenizer=tokenizer, **arrays)
    return DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [4]:
from tqdm import tqdm

def fit(model, data_loader, total_steps, optimizer, loss_fns):
    """Train ``model`` for one pass over ``data_loader``.

    For every batch the four task losses (``gender``, ``profession``,
    ``ideology``, ``ideology_multiclass``) are summed and back-propagated.

    Args:
        model: callable returning the four task outputs for ``entry["tweet"]``.
        data_loader: iterable of dicts with ``"tweet"`` and ``"labels"`` (a
            sequence of four label tensors).
        total_steps: kept for backward compatibility; the progress bar length
            is taken from ``len(data_loader)`` when available.
        optimizer: optimizer stepped once per batch.
        loss_fns: dict of loss callables keyed by task name.

    Returns:
        Mean loss of the most recently completed window of 100 batches, or the
        mean over all batches seen when fewer than 100 were processed.
    """
    model = model.train()

    # Number of batches for the progress bar. The previous `total_steps/16`
    # assumed a batch size of 16 and produced a float.
    try:
        n_batches = len(data_loader)
    except TypeError:
        n_batches = None

    running_loss = 0.
    window_batches = 0
    window_completed = False
    last_loss = 0.
    for entry in tqdm(data_loader, total=n_batches):
        optimizer.zero_grad()
        y_gender, y_profession, y_ideology, y_ideology_multiclass = model(entry["tweet"])

        # Put the labels on whatever device the model produced its outputs on.
        target_device = y_gender.device
        gender, profession, ideology, ideology_mc = (t.to(target_device) for t in entry["labels"])

        # squeeze(-1) removes only the trailing size-1 dimension. A bare
        # squeeze() would also remove the batch dimension of a one-sample
        # batch and make the loss reject the input/target shape mismatch.
        loss_gender = loss_fns["gender"](y_gender.squeeze(-1), gender.to(torch.float32))
        loss_profession = loss_fns["profession"](y_profession, profession)
        loss_ideology = loss_fns["ideology"](y_ideology.squeeze(-1), ideology.to(torch.float32))
        loss_ideology_multiclass = loss_fns["ideology_multiclass"](y_ideology_multiclass, ideology_mc)

        loss = loss_gender + loss_profession + loss_ideology + loss_ideology_multiclass
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        window_batches += 1
        if window_batches == 100:
            last_loss = running_loss / 100
            running_loss = 0.
            window_batches = 0
            window_completed = True

    # With fewer than 100 batches no window ever completes; report the mean of
    # what was seen instead of a misleading 0.0.
    if not window_completed and window_batches:
        last_loss = running_loss / window_batches
    return last_loss

def test(model, data_loader, total_steps, loss_fns):
    correct_predictions = [0.,0.,0.,0.]
    loss = 0.
    model = model.eval()
    with torch.no_grad():
        for entry in data_loader:
            y_gender, y_profession, y_ideology, y_ideology_multiclass = model(entry["tweet"])
            loss_gender = loss_fns["gender"](y_gender.squeeze().to(device), entry["labels"][0].to(torch.float32).to(device)).to(device)
            loss_profession = loss_fns["profession"](y_profession.to(device), entry["labels"][1].to(device)).to(device)
            loss_ideology = loss_fns["ideology"](y_ideology.squeeze().to(device), entry["labels"][2].to(torch.float32).to(device)).to(device)
            loss_ideology_multiclass = loss_fns["ideology_multiclass"](y_ideology_multiclass.to(device), entry["labels"][3].to(device)).to(device)
            loss = loss_gender + loss_profession + loss_ideology + loss_ideology_multiclass
            correct_predictions[0] += torch.sum(torch.eq(entry["labels"][0].to(device), torch.where(y_gender.squeeze() > 0.5, 1, 0).to(device))).cpu().detach().item()
            correct_predictions[1] += torch.sum(torch.eq(entry["labels"][1].to(device), torch.argmax(y_profession, dim=1).to(device))).cpu().detach().item()
            correct_predictions[2] += torch.sum(torch.eq(entry["labels"][2].to(device), torch.where(y_ideology.squeeze() > 0.5, 1, 0).to(device))).cpu().detach().item()
            correct_predictions[3] += torch.sum(torch.eq(entry["labels"][3].to(device), torch.argmax(y_ideology_multiclass, dim=1).to(device))).cpu().detach().item()
    return [x/total_steps for x in correct_predictions], loss


def train_model(model, train_dataset, eval_dataset, tokenizer, EPOCHS, batch_size, lr):
    """Train ``model`` for ``EPOCHS`` epochs, evaluating after each one.

    Args:
        model: multi-task model returning gender, profession, ideology and
            ideology-multiclass outputs.
        train_dataset: DataFrame used for training.
        eval_dataset: DataFrame used for evaluation.
        tokenizer: tokenizer handed to both data loaders.
        EPOCHS: number of passes over ``train_dataset``.
        batch_size: batch size for both data loaders.
        lr: AdamW learning rate.
    """
    # Use the tokenizer passed by the caller. The previous code read a global
    # `tokenizer_roberta` that this notebook never defines.
    train_data_loader = create_data_loader(train_dataset, tokenizer, batch_size)
    eval_data_loader = create_data_loader(eval_dataset, tokenizer, batch_size)
    # BCELoss for the two sigmoid heads, CrossEntropyLoss for the two logit heads.
    loss_fns = {
        "gender": nn.BCELoss(),
        "profession": nn.CrossEntropyLoss(),
        "ideology": nn.BCELoss(),
        "ideology_multiclass": nn.CrossEntropyLoss()
    }
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    for epoch in range(EPOCHS):
        train_loss = fit(model, train_data_loader, len(train_dataset), optimizer, loss_fns)
        print(f'EPOCH {epoch + 1}: Pérdida: {train_loss}')
        eval_acc, eval_loss = test(model, eval_data_loader, len(eval_dataset), loss_fns)
        # float() accepts both a 0-d tensor and a plain Python number.
        print(f'Eval accuracy: {eval_acc} Eval loss: {float(eval_loss)}')

In [5]:
# A tokenizer is a plain Python object, not an nn.Module: it has no .to() and
# is never moved to a device.
tokenizer = GPT2Tokenizer.from_pretrained("mrm8488/spanish-gpt2")
# padding='max_length' needs a padding token. If the checkpoint does not define
# one, reuse EOS; padded positions are excluded through the attention mask.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
encoder_gpt2 = GPT2Model.from_pretrained("mrm8488/spanish-gpt2").to(device)
df_train = pd.read_csv('../../data/multilabel_encoded.csv')
df_eval = pd.read_csv('../../data/multilabel_encoded_test.csv')
# Move the whole model (encoder and classification heads) to the device.
model = GPT2(encoder_gpt2, 512).to(device)
# NOTE(review): the recorded run of this cell ended in a CUDA out-of-memory
# error with batch size 16 and 512-token padding; its log also mentions other
# checkpoints, so restart the kernel before re-running and lower the batch size
# if the error persists.
train_model(model, df_train, df_eval, tokenizer, 5, 16, 3e-5)
torch.save(model.state_dict(), './gpt2.bin')

Some weights of the model checkpoint at PlanTL-GOB-ES/roberta-base-bne were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at datificate/gpt2-small-spanish were not used when initializing GPT2Model: ['lm_head.weight']
- This IS expected if you are initializing GPT2Model from the checkpoint of a model trained on another task or with another

OutOfMemoryError: CUDA out of memory. Tried to allocate 96.00 MiB (GPU 0; 11.00 GiB total capacity; 10.16 GiB already allocated; 0 bytes free; 10.31 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF