In [39]:
import numpy as np
import pandas as pd
import torch
from transformers import DistilBertTokenizer, GPT2Tokenizer, GPT2LMHeadModel, DistilBertModel, BloomModel, BloomTokenizerFast

In [43]:
tokenizer_beto = DistilBertTokenizer.from_pretrained('dccuchile/distilbert-base-spanish-uncased')

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/248k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/361 [00:00<?, ?B/s]

In [124]:
from torch import nn

class Beto(torch.nn.Module):
    
    def __init__(self):
        super(Beto, self).__init__()
        self.max_len = 512
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.tokenizer_beto = DistilBertTokenizer.from_pretrained('dccuchile/distilbert-base-spanish-uncased')
        #self.tokenizer_bloom = BloomTokenizerFast.from_pretrained('bigscience/bloom')
        self.encoder_beto = DistilBertModel.from_pretrained('dccuchile/distilbert-base-spanish-uncased')
        #self.encoder_bloom = BloomModel.from_pretrained('bigscience/bloom')
        self.dropout_beto = nn.Dropout(0.15)
        #self.dropout_bloom = nn.Dropout(0.15)
        
        self.linear_gender = nn.Linear(768,768)
        self.activation_gender = nn.Tanh()
        self.dropout_gender = nn.Dropout(0.15)
        self.linear_gender = nn.Linear(768,2)
        self.out_gender = nn.Sigmoid()
        
        self.linear_profession = nn.Linear(768,768)
        self.activation_profession = nn.Tanh()
        self.dropout_profession = nn.Dropout(0.15)
        self.linear_profession = nn.Linear(768,3)
        self.out_profession = nn.Sigmoid()
        
        self.linear_ideology = nn.Linear(768,768)
        self.activation_ideology = nn.Tanh()
        self.dropout_ideology = nn.Dropout(0.15)
        self.linear_ideology_1 = nn.Linear(768,2)
        self.out_ideology = nn.Sigmoid()
        
        self.linear_ideology_multiclass = nn.Linear(768,768)
        self.activation_ideology_multiclass = nn.Tanh()
        self.dropout_ideology_multiclass = nn.Dropout(0.15)
        self.linear_ideology_multiclass_1 = nn.Linear(768,4)
        self.out_ideology_multiclass = nn.Softmax()
        
    def forward(self, x):
        ids_beto, attention_masks_beto = self.tokenizer_beto(x).values()
        for ids, attention_mask in zip(ids_beto, attention_masks_beto):
            ids = ids[:self.max_len - 2]
            ids = [4] + ids + [5]
            padding_len_ids = self.max_len - len(ids)
            attention_mask = attention_mask + ([0] * (padding_len_ids - 2))
            ids = ids + ([0] * padding_len_ids)
        #ids_bloom, attention_mask_bloom = self.tokenizer_bloom(x).values()
        print(ids_beto)
        embeding_beto = self.encoder_beto(
            torch.LongTensor(ids_beto).to(self.device),
            torch.LongTensor(attention_masks_beto).to(self.device)
        )
        #embeding_bloom = encoder_bloom(
        #    torch.LongTensor(ids_bloom).to(self.device),
        #    torch.LongTensor(attention_masks_bloom).to(self.device)
        #)
        embeding_beto = self.dropout_beto(embeding_beto)
        #embeding_bloom = self.dropout_bloom(embeding_bloom)
        #embeding = torch.cat(embeding_beto, embeding_bloom)
        embeding = embeding_beto # Sustituir esta línea cuando se obtenga el otro encoder
        x_gender = self.linear_gender(embeding)
        x_gender = self.activation_gender(x_gender)
        x_gender = self.dropout_gender(x_gender)
        y_gender = self.out_gender(x_gender)
        
        x_profession = self.linear_profession(embeding)
        x_profession = self.activation_profession(x_profession)
        x_profession = self.dropout_profession(x_profession)
        y_profession = self.out_profession(x_profession)
        
        x_ideology = self.linear_ideology(embeding)
        x_ideology = self.activation_ideology(x_ideology)
        x_ideology = self.dropout_ideology(x_ideology)
        y_ideology = self.out_ideology(x_ideology)
        
        x_ideology_multiclass = self.linear_ideology_multiclass(embeding)
        x_ideology_multiclass = self.activation_ideology_multiclass(x_ideology)
        x_ideology_multiclass = self.dropout_ideology_multiclass(x_ideology)
        y_ideology_multiclass = self.out_ideology_multiclass(x_ideology_multiclass)
        
        return [y_gender, y_profession, y_ideology, y_ideology_multiclass]
        

In [114]:
from torch.utils.data import DataLoader, Dataset

class PoliticESDataset(Dataset):
    def __init__(self, tweet, gender, profession, ideology, ideology_mc):
        self.tweet = tweet
        self.gender = gender
        self.profession = profession
        self.ideology = ideology
        self.ideology_mc = ideology_mc
    def __len__(self):
        return len(self.tweet)
    def __getitem__(self, item):
        return {
            'tweet': self.tweet[item],
            'labels': [
                self.gender[item],
                self.profession[item],
                self.ideology[item],
                self.ideology[item]
            ]
        }

def create_data_loader(df, batch_size = 16):
    return DataLoader(
        PoliticESDataset(
            tweet = df.tweet.to_numpy(),
            gender = df.gender.to_numpy(),
            profession = df.profession.to_numpy(),
            ideology = df.ideology_binary.to_numpy(),
            ideology_mc = df.ideology_multiclass.to_numpy()
        ),
        batch_size = batch_size
    )

In [115]:
from tqdm import tqdm

def criterion(loss_func, outputs, targets):
    losses = 0
    for output, target in zip(output, targets):
        losses += loss_func(output, target)
    return losses

def fit(model, data_loader, total_steps, optimizer, loss_fn):
    targets = []
    predictions = []
    running_loss = 0.
    last_loss = 0.
    for i, entry in tqdm(enumerate(data_loader)):
        optimizer.zero_grad()
        output = model(entry["tweet"])
        targets.append(ouput.item())
        predictions.append(entry['labels'])
        loss = criterion(loss_fn, output, entry['labels']).to(device)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        if i % 1000 == 999:
            last_loss = running_loss / 1000
            bingoes = []
            for t,p in zip(targets, predictions):
                for i in range (3):
                    bingoes[i] += np.sum(targets[i]==predictions[i])
                    print(f' precisión etiqueta {i}: {bingoes[i]}')
            print(f' pérdida del lote: {last_loss}')
            running_loss = 0.

def train_model(model, train_dataset, eval_dataset, EPOCHS, batch_size, lr):
    train_data_loader = create_data_loader(train_dataset, batch_size)
    eval_data_loader = create_data_loader(eval_dataset, batch_size)
    loss_fn = nn.MSELoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    for epoch in range(EPOCHS):
        train_acc, train_loss = fit(model, train_data_loader, len(train_dataset), optimizer, loss_fn)
        

In [125]:
from sklearn.model_selection import train_test_split
model = Beto()
df = pd.read_csv("../../data/multilabel_encoded.csv")
X_train, X_val, df_train, df_val = train_test_split(df['tweet'], df[['gender', 'profession','ideology_binary', 'ideology_multiclass']], test_size=0.15, random_state = 100)
df_val['tweet'] = X_val
df_train['tweet'] = X_train
train_model(model, df_train, df_val, EPOCHS = 2, batch_size = 16, lr=3e-5)

Some weights of the model checkpoint at dccuchile/distilbert-base-spanish-uncased were not used when initializing DistilBertModel: ['vocab_projector.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
0it [00:00, ?it/s]

[[4, 1894, 2909, 13079, 15292, 1019, 985, 9087, 30960, 2808, 15454, 1069, 10338, 1019, 1149, 1434, 1048, 1032, 4588, 1009, 1098, 1000, 3388, 8724, 1008, 1048, 1914, 23714, 1019, 1041, 4481, 11926, 1039, 1098, 1009, 3050, 1019, 1032, 10004, 11081, 1067, 4733, 1000, 8724, 1009, 13431, 1008, 1035, 985, 9087, 30960, 5], [4, 985, 9087, 30960, 1628, 1532, 1032, 4638, 20412, 2375, 2134, 30956, 1008, 1041, 1035, 2870, 1028, 4804, 1008, 5], [4, 985, 9087, 30960, 3547, 1019, 1792, 1318, 1039, 2657, 1019, 1086, 8601, 1019, 1040, 1578, 1076, 4393, 1009, 1032, 2030, 1019, 1081, 3151, 8358, 1040, 1009, 2048, 2252, 1008, 1028, 1032, 2035, 6537, 1041, 3297, 1008, 5], [4, 3194, 1008, 985, 9087, 30960, 1057, 1086, 8177, 1012, 7386, 995, 1351, 8861, 1022, 1012, 21177, 1081, 1527, 13014, 1012, 1067, 1041, 17363, 12527, 1190, 1019, 1777, 2484, 1041, 1054, 1094, 7508, 1152, 1076, 1792, 1195, 1212, 1076, 1069, 4841, 1008, 1388, 1019, 26532, 1086, 2332, 1097, 1027, 14059, 24331, 30955, 1040, 10183, 30964, 100




ValueError: expected sequence of length 52 at dim 1 (got 20)

In [93]:
y_train

Unnamed: 0,0,gender,profession,ideology_binary,ideology_multiclass
605,"Tras varios ejercicios excelentes, @user sigue...",,,,
9166,@user Solo contra la izquierda rojiparda. Que ...,,,,
12082,"@user Juan, ellos son el pueblo, lo representa...",,,,
8569,Sra. @user se lo vuelvo a explicar: Cuando ace...,,,,
575,"@user Pues, camarada, te falta conocer mujeres...",,,,
...,...,...,...,...,...
12119,,0.0,1.0,1.0,3.0
8039,,1.0,1.0,0.0,1.0
14147,,1.0,0.0,0.0,1.0
6936,,0.0,0.0,0.0,1.0


In [61]:

train_data_loader = create_data_loader(df_train)

In [30]:
encoder_gpt2 = GPT2LMHeadModel.from_pretrained('datificate/gpt2-small-spanish')
input_ids_gpt2, at_mask_gpt2 = tokenizer_gpt2(['Hola buenas tardes', 'Hola buenas noches']).values()

In [49]:
tokenizer_beto('Hola buenas tardes').values()

dict_values([[4, 1734, 2972, 10095, 5], [1, 1, 1, 1, 1]])

In [33]:
encoder_gpt2(input_ids=torch.LongTensor(i), attention_mask=torch.FloatTensor(a))

torch.Size([4, 50257])

In [45]:
encoder_beto(torch.tensor(input_ids), torch.tensor(token_type), torch.tensor(at_mask))['pooler_output']

NameError: name 'encoder_beto' is not defined