In [1]:
print('hello world')

hello world


In [2]:
# import torch
import torch.nn as nn
class BetoBasic(nn.Module):
    def __init__(
        self, 
        tokenizer, 
        model,
    ):
        super().__init__()
        self.tokenizer = tokenizer
        self.bert = model
        
    def tokenize_data(self, example):
        return self.tokenizer(example['features'], padding = 'max_length')
        
    def forward(self, input_ids, token_type_ids, attention_mask):
        hidden = self.bert(
            input_ids = input_ids, 
            token_type_ids = token_type_ids,
            attention_mask = attention_mask
        ).last_hidden_state[:, 0, :]
        return self.output_layer(hidden)
    
    def predecir_base(self, X, batch_size = 2, progress_bar = False):
        with torch.no_grad():
            self.eval()
#             print("    Generating predictions...")
            tokens = self.tokenizer(
                X.tolist(), 
                padding = "longest", 
                truncation = True
            )
            p = next(self.parameters())
            input_ids = torch.tensor(tokens["input_ids"]).long()
            token_type_ids = torch.tensor(tokens["token_type_ids"]).long()
            attention_mask = torch.tensor(tokens["attention_mask"]).long()
            dataset = tud.TensorDataset(
                input_ids, 
                token_type_ids,
                attention_mask,
            )
            loader = tud.DataLoader(dataset, batch_size = batch_size)
            output = []
            iterator = iter(loader)
            if progress_bar:
                iterator = tqdm(iterator)
            for batch in iterator:
                i, t, a = batch
                predictions = self.forward(
                    input_ids = i.to(p.device),
                    token_type_ids = t.to(p.device),
                    attention_mask = a.to(p.device)
                )
                output.append(predictions)
            return torch.cat(output, axis = 0)

In [3]:
class BetoMTL(BetoBasic):
    def __init__(
        self, 
        tokenizer, 
        model,
    ):
        super().__init__(tokenizer, model)
        self.output_layer = nn.Linear(768, 10)
        torch.save(self.state_dict(), "clf.pt")      
        
    def predecir_proba(self, X, **kwargs):
        return self.predecir_base(X, **kwargs).sigmoid().cpu().numpy()
    
    def predecir(self, X, **kwargs):
        return self.predecir_proba(X, **kwargs) > 0.5

    def entrenar(
        self, 
        X_train, 
        Y_train,
        X_test,
        Y_test,
        epochs = 2, 
        batch_size = 2,
        learning_rate = 10**-5,
        progress_bar = True,
        refresh = True, 
        weight_decay = 0, 
        freeze_encoder = False, 
    ):
        assert isinstance(X_train, np.ndarray)
        assert isinstance(Y_train, np.ndarray)
        assert isinstance(X_test, np.ndarray)
        assert isinstance(Y_test, np.ndarray)
        print("Training model...")
        if refresh:
            self.load_state_dict(torch.load("clf.pt"))
        if freeze_encoder:
            for param in self.bert.parameters():
                param.requires_grad = False
        tokens = self.tokenizer(
            X_train.tolist(), 
            padding = "longest", 
            truncation = True
        )
        p = next(self.parameters())
        input_ids = torch.tensor(tokens["input_ids"]).long()
        token_type_ids = torch.tensor(tokens["token_type_ids"]).long()
        attention_mask = torch.tensor(tokens["attention_mask"]).long()
        label = torch.tensor(Y_train).float()
        dataset = tud.TensorDataset(
            input_ids, 
            token_type_ids,
            attention_mask,
            label
        )
        loader = tud.DataLoader(
            dataset, 
            shuffle = True, 
            batch_size = batch_size
        )
        criterion = nn.BCEWithLogitsLoss()
        optimizer = torch.optim.AdamW(self.parameters(), lr = learning_rate, weight_decay = weight_decay)
        training_steps = epochs * len(loader)
        if progress_bar:
            bar = tqdm(range(training_steps))
        self.train()
        for epoch in range(1, epochs + 1):
            losses = []
            accuracies = []
            for j, batch in enumerate(loader):
                i, t, a, l = batch
                predictions = self.forward(
                    input_ids = i.to(p.device),
                    token_type_ids = t.to(p.device),
                    attention_mask = a.to(p.device)
                )
                l = l.to(p.device)
                loss = criterion(predictions, l)
                accuracy = ((predictions > 0) == l).sum()
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()
                if progress_bar:
                    bar.update(1)
                losses.append(loss.item())
                accuracies.append(accuracy.item())
                if epoch == 1 and j == 0:
                    print(f"    first step, train loss:{loss.item():.4f}")
            total_loss = sum(losses) / len(losses)
            total_accuracy = sum(accuracies) / len(X_train) / 10
            print(f"    epoch: {epoch}, train loss:{total_loss:.4f}, train accuracy: {total_accuracy:.4f}")


In [4]:
import pandas as pd
data = pd.read_csv("../../data/task_2.csv")



labels = data.iloc[:, 1:]
labels

Unnamed: 0,xenophobia,suffering,economic,migration,culture,benefits,health,security,dehumanisation,others
0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
3812,0,0,0,0,0,0,0,0,0,0
3813,0,0,0,0,0,0,0,0,0,0
3814,0,0,0,0,0,0,0,0,0,0
3815,0,0,0,0,0,0,0,0,0,0


In [5]:
import transformers
import torch

tokenizer = transformers.AutoTokenizer.from_pretrained("../../assets/beto/tokenizer")


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [6]:
model = transformers.AutoModel.from_pretrained("../../assets/beto/model")
model.to(device)

X = data.sentence.sample(100)
Y = labels.sample(100)

Some weights of the model checkpoint at ../../assets/beto/model were not used when initializing BertModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
import importlib
import sys
sys.path.append('../../library/')
import juan
import utils

importlib.reload(juan)
importlib.reload(utils)
clf = juan.BetoMTL(tokenizer, model)
clf.to(device)

# clf.entrenar(
#     X,
#     Y,
#     epochs = 5,
# #     freeze_encoder = True,
# #     weight_decay = 1000,
# #     class_weights = "balanced"
# )

# predictions = clf.predecir_proba(X)
# # predictions = clf.predecir(X)
# predictions[:5]

results = utils.validate_MTL_juan(
    X,
    Y, 
#     X, Y,
    clf, 
#     epochs = 5,
#     freeze_encoder = True,
    progress_bar = False
#     weight_decay = 100,
#     class_weights = "balanced"
)
results

X (100,)
Y (100, 10)
Cross-validation process started...
*** fold 1 / 5
    training model...
Training model...
    first step, train loss:0.7661
    epoch: 1, train loss:0.3487, train accuracy: 0.9025
    epoch: 2, train loss:0.1730, train accuracy: 0.9587
    generating predictions on the train set...
    generating predictions on the test set...
    Total runtime: 0.06 minutes
*** fold 2 / 5
    training model...
Training model...
    first step, train loss:0.7109
    epoch: 1, train loss:0.3432, train accuracy: 0.9075
    epoch: 2, train loss:0.1719, train accuracy: 0.9587
    generating predictions on the train set...
    generating predictions on the test set...
    Total runtime: 0.10 minutes
*** fold 3 / 5
    training model...
Training model...
    first step, train loss:0.7243
    epoch: 1, train loss:0.3584, train accuracy: 0.8988
    epoch: 2, train loss:0.1807, train accuracy: 0.9575
    generating predictions on the train set...
    generating predictions on the test set.

Unnamed: 0_level_0,test_accuracy,test_f1,test_precision,test_recall,train_accuracy,train_f1,train_precision,train_recall
column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
benefits,0.98,0.0,0.0,0.0,0.98,0.0,0.0,0.0
culture,0.94,0.0,0.0,0.0,0.94,0.0,0.0,0.0
dehumanisation,0.97,0.0,0.0,0.0,0.97,0.0,0.0,0.0
economic,0.98,0.0,0.0,0.0,0.98,0.0,0.0,0.0
health,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
migration,0.92,0.0,0.0,0.0,0.92,0.0,0.0,0.0
others,0.97,0.0,0.0,0.0,0.97,0.0,0.0,0.0
security,0.89,0.0,0.0,0.0,0.89,0.0,0.0,0.0
suffering,0.96,0.0,0.0,0.0,0.96,0.0,0.0,0.0
task_1,0.52,0.0,0.0,0.0,0.22,0.0,0.0,0.0


In [8]:
results = utils.validate_MTL_juan(
    X,
    Y, 
#     X, Y,
    clf, 
#     epochs = 5,
#     freeze_encoder = True,
    progress_bar = False,
    refresh=False
#     weight_decay = 100,
#     class_weights = "balanced"
)
results

X (100,)
Y (100, 10)
Cross-validation process started...
*** fold 1 / 5
    training model...
Training model...
    first step, train loss:0.1728
    epoch: 1, train loss:0.1601, train accuracy: 0.9587
    epoch: 2, train loss:0.1299, train accuracy: 0.9600
    generating predictions on the train set...
    generating predictions on the test set...
    Total runtime: 0.04 minutes
*** fold 2 / 5
    training model...
Training model...
    first step, train loss:0.0445
    epoch: 1, train loss:0.1118, train accuracy: 0.9613
    epoch: 2, train loss:0.0807, train accuracy: 0.9800
    generating predictions on the train set...
    generating predictions on the test set...
    Total runtime: 0.09 minutes
*** fold 3 / 5
    training model...
Training model...
    first step, train loss:0.0298
    epoch: 1, train loss:0.0712, train accuracy: 0.9825
    epoch: 2, train loss:0.0585, train accuracy: 0.9887
    generating predictions on the train set...
    generating predictions on the test set.

Unnamed: 0_level_0,test_accuracy,test_f1,test_precision,test_recall,train_accuracy,train_f1,train_precision,train_recall
column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
benefits,0.99,0.2,0.2,0.2,0.995,0.733333,0.8,0.7
culture,1.0,0.6,0.6,0.6,0.9775,0.713247,0.8,0.65
dehumanisation,1.0,0.4,0.4,0.4,0.995,0.9,1.0,0.866667
economic,0.98,0.0,0.0,0.0,0.9925,0.533333,0.6,0.5
health,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
migration,0.97,0.6,0.6,0.6,0.985,0.869231,1.0,0.809524
others,0.99,0.4,0.4,0.4,0.9925,0.826667,1.0,0.733333
security,0.98,0.8,0.8,0.8,0.9875,0.923077,1.0,0.888889
suffering,0.98,0.4,0.4,0.4,0.99,0.8,0.8,0.8
task_1,0.86,0.704762,0.8,0.65,0.92,0.933333,1.0,0.9
