In [1]:
import transformers
import pandas as pd
import numpy as np
import nltk
from utils import TweetsDataset
import torch

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [2]:
tokenizer = transformers.AutoTokenizer.from_pretrained("dccuchile/bert-base-spanish-wwm-cased")
model = transformers.BertForSequenceClassification.from_pretrained("dccuchile/bert-base-spanish-wwm-cased", num_labels=4)

Some weights of the model checkpoint at dccuchile/bert-base-spanish-wwm-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dccuchi

In [3]:
df = pd.read_csv("../../data/development.csv")
df = pd.DataFrame({
    "tweets": df["tweet"],
    "labels": df["ideology_multiclass"].map({'moderate_left': 0, 'moderate_right': 1, "left": 2, "right": 3})
})
df_train, df_val = TweetsDataset.split_test_val(df, valid_size=0.15)
train_data_loader = TweetsDataset.create_data_loader(df_train, tokenizer, batch_size=8)
valid_data_loader = TweetsDataset.create_data_loader(df_val, tokenizer, batch_size=8)

In [4]:
model = model.to(device)
EPOCHS = 8
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
scheduler = transformers.get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps = 0,
    num_training_steps = len(train_data_loader)
)

In [5]:
from collections import defaultdict
from utils import Model
history = defaultdict(list)
Model.train(EPOCHS, model, train_data_loader, optimizer, device, scheduler, history, len(df_train), len(df_val), valid_data_loader)

Epoch 1/8
----------
[KTrain loss: 0.506 Accuracy: 0.437
Validation loss 0.474 Accuracy: 0.5 F1 score: 0.5

Epoch 2/8
----------
[KTrain loss: 0.434 Accuracy: 0.574
Validation loss 0.474 Accuracy: 0.5 F1 score: 0.5

Epoch 3/8
----------
[KTrain loss: 0.434 Accuracy: 0.576
Validation loss 0.474 Accuracy: 0.5 F1 score: 0.5

Epoch 4/8
----------
[KTrain loss: 0.434 Accuracy: 0.573
Validation loss 0.474 Accuracy: 0.5 F1 score: 0.5

Epoch 5/8
----------
[KTrain loss: 0.434 Accuracy: 0.57
Validation loss 0.474 Accuracy: 0.5 F1 score: 0.5

Epoch 6/8
----------
[KTrain loss: 0.435 Accuracy: 0.573
Validation loss 0.474 Accuracy: 0.5 F1 score: 0.5

Epoch 7/8
----------
[KTrain loss: 0.434 Accuracy: 0.573
Validation loss 0.474 Accuracy: 0.5 F1 score: 0.5

Epoch 8/8
----------
[KTrain loss: 0.434 Accuracy: 0.572
Validation loss 0.474 Accuracy: 0.5 F1 score: 0.5



defaultdict(list,
            {'train_acc': [0.4366830065359477,
              0.5741013071895424,
              0.575735294117647,
              0.5727124183006536,
              0.5695261437908496,
              0.5727124183006536,
              0.5726307189542483,
              0.572140522875817],
             'train_loss': [0.5056820976773119,
              0.43428898354371387,
              0.4341222838249082,
              0.43426128835265154,
              0.4342885223576446,
              0.43450173209695253,
              0.434234154049088,
              0.4343612019435253],
             'val_acc': [0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5],
             'val_loss': [0.4735356524034783,
              0.4735356524034783,
              0.4735356524034783,
              0.4735356524034783,
              0.4735356524034783,
              0.4735356524034783,
              0.4735356524034783,
              0.4735356524034783],
             'f_score': [0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5

In [6]:
torch.save(model.state_dict(), 'berto-base-cased.bin')

In [8]:
# Test
from utils import Results, Model
model = transformers.BertForSequenceClassification.from_pretrained("dccuchile/bert-base-spanish-wwm-cased", num_labels=4)
model.to(device)
model.load_state_dict(torch.load('berto-base-cased.bin'))
df_test = pd.read_csv('../../data/development_test.csv')
df_test = pd.DataFrame({
    "tweets": df_test["tweet"],
    "labels": df_test["ideology_multiclass"].map({'moderate_left': 0, 'moderate_right': 1, "left": 2, "right": 3})
})
test_data_loader = TweetsDataset.create_data_loader(df_test, tokenizer)
acc, loss, f1 = Model.test(model, test_data_loader, device, len(df_test))
Results.add_result("berto-base-cased", "ideology-multiclass", acc, loss, f1)

Some weights of the model checkpoint at dccuchile/bert-base-spanish-wwm-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dccuchi

Test loss: 0.497 Accuracy: 0.448


AttributeError: 'DataFrame' object has no attribute 'concat'

In [17]:
from datetime import date
results = pd.read_csv("../results.csv")
results = pd.concat([results,pd.DataFrame({
    "model": "berto-base-uncased",
    "class": "ideology-multiclass",
    "loss": loss,
    "f1score": f1,
    "accuracy": acc,
    "train_time": date.now()
}, index=[0])]).reset_index(drop=True)
results.reset_index()
results.to_csv("../results.csv")