In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install transformers



In [3]:
import pandas as pd
import numpy as np
import torch

from torchtext.vocab import build_vocab_from_iterator
from transformers import AutoTokenizer, AutoModelForPreTraining

# import config as cfg


In [4]:
emb_model = AutoModelForPreTraining.from_pretrained('bert-base-multilingual-cased')
tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')


In [5]:
LABELS = {'Historia': 0, 'Administracao': 1, 'Geografia': 2, 'Biologia': 3,
          'Literatura': 4, 'Artes': 5, 'Matematica': 6}

In [6]:
x_train = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/book_data/train_og.csv')
x_test = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/book_data/test.csv')

x_test.drop(['titulo', 'genero'], axis=1, inplace=True)

# x_train['alt_title'] = x_train['alt_title'].map(lambda title: \
#                                                 tokenizer.encode_plus(title,
#                                                                       add_special_tokens=True,
#                                                                       padding='longest'))
# x_test['alt_title'] = x_test['alt_title'].map(lambda title: \
#                                               tokenizer.encode_plus(title,
#                                                                     add_special_tokens=True,
#                                                                     padding='longest'))

In [7]:
np.random.seed(42)
val = np.random.uniform(0, 1, len(x_train))

In [8]:
x_val = x_train.query('@val > 0.8').copy()
x_train = x_train.query('@val <= 0.8')

In [9]:
x_val.reset_index(drop=True, inplace=True)
x_train.reset_index(drop=True, inplace=True)

In [10]:
x_train

Unnamed: 0,alt_title,label_inf,label
0,biologia organismos,1,3
1,biologia 3 genetica evolucao ecologia,0,3
2,tribos mal,-1,0
3,egipcios,-1,0
4,historia,0,0
...,...,...,...
687,cultura sociedade brasil colonia,0,0
688,marxismo,-1,0
689,atlas mundo atual,2,2
690,geometria plana conceitos basicos,2,6


In [11]:
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, dataframe: pd.DataFrame, tokenizer, max_len: int = None) -> object:

        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        textos = self.data.loc[index, 'alt_title'],
        labels = self.data.loc[index, 'label']

        encoding = tokenizer(textos,
                             max_length=self.max_len,
                             padding='max_length',
                             truncation=True,
                             return_tensors='pt')

        inputs = encoding['input_ids']
        tkn_type = encoding['token_type_ids']
        att_mask = encoding['attention_mask']

        return inputs, tkn_type, att_mask, labels


In [12]:
train_dataset = MyDataset(x_train, tokenizer=tokenizer, max_len=32)
val_dataset = MyDataset(x_val, tokenizer=tokenizer, max_len=32)
test_dataset = MyDataset(x_test, tokenizer=tokenizer, max_len=32)

del x_train, x_test, x_val

In [13]:
from transformers import BertModel, BertTokenizer

In [14]:
import torch
from torch import nn
from transformers import BertModel

class CustomBERTModel(nn.Module):
    def __init__(self, num_classes, dropout=0.5):
        super(CustomBERTModel, self).__init__()

        # Load the pre-trained BERT model
        # self.bert = BertModel.from_pretrained('neuralmind/bert-base-portuguese-cased')
        self.bert = BertModel.from_pretrained('bert-base-multilingual-cased')

        # for p in self.bert.parameters():
        #     p.requires_grad = False

        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(self.bert.config.hidden_size, num_classes)
        self.relu = nn.ReLU()

        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask, token_tp_ids):
        # Forward pass through the BERT model
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_tp_ids)

        # Extract the last hidden state of the token [CLS] for classification task
        pooled_output = outputs.pooler_output

        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        final_layer = self.relu(linear_output)

        return final_layer  # Softmax is applied outside the model during training

model = CustomBERTModel(num_classes=7)


In [15]:
from torch.optim import Adam
import torch.nn as nn
from tqdm import tqdm

def train(model, train_data, val_data, learning_rate, epochs):

    # train, val = Dataset(train_data), Dataset(val_data)

    train_dataloader = torch.utils.data.DataLoader(train_data, batch_size=2, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val_data, batch_size=2)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr= learning_rate)

    if use_cuda:

            model = model.cuda()
            criterion = criterion.cuda()

    for epoch_num in range(epochs):

            total_acc_train = 0
            total_loss_train = 0

            for train_input, type_input, attention, train_label in tqdm(train_dataloader):

                train_label = train_label.to(device)
                # mask = train_input['attention_mask'].to(device)
                # input_id = train_input['input_ids'].squeeze(1).to(device)

                output = model(train_input.squeeze(1).to(device), attention.squeeze(1).to(device), type_input.squeeze(1).to(device))

                batch_loss = criterion(output, train_label.long())
                total_loss_train += batch_loss.item()

                acc = (output.argmax(dim=1) == train_label).sum().item()
                total_acc_train += acc

                model.zero_grad()
                batch_loss.backward()
                optimizer.step()

            total_acc_val = 0
            total_loss_val = 0

            with torch.no_grad():

                for input_id, type_id, attention, label in val_dataloader:

                    val_label = label.to(device)
                    # mask = val_input['attention_mask'].to(device)
                    # input_id = val_input['input_ids'].squeeze(1).to(device)

                    output = model(input_id.squeeze(1).to(device), attention.squeeze(1).to(device), type_id.squeeze(1).to(device))

                    batch_loss = criterion(output, val_label.long())
                    total_loss_val += batch_loss.item()

                    acc = (output.argmax(dim=1) == val_label).sum().item()
                    total_acc_val += acc

            print(
                f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} \
                | Train Accuracy: {total_acc_train / len(train_data): .3f} \
                | Val Loss: {total_loss_val / len(val_data): .3f} \
                | Val Accuracy: {total_acc_val / len(val_data): .3f}')

            if (total_acc_train / len(train_data)) > 0.99:
                 break

EPOCHS = 20
# model = BertClassifier()
LR = 1e-6

train(model, train_dataset, val_dataset, LR, EPOCHS)

100%|██████████| 346/346 [00:38<00:00,  8.94it/s]


Epochs: 1 | Train Loss:  0.884                 | Train Accuracy:  0.345                 | Val Loss:  0.798                 | Val Accuracy:  0.422


100%|██████████| 346/346 [00:31<00:00, 11.13it/s]


Epochs: 2 | Train Loss:  0.725                 | Train Accuracy:  0.530                 | Val Loss:  0.674                 | Val Accuracy:  0.617


100%|██████████| 346/346 [00:30<00:00, 11.16it/s]


Epochs: 3 | Train Loss:  0.586                 | Train Accuracy:  0.707                 | Val Loss:  0.571                 | Val Accuracy:  0.728


100%|██████████| 346/346 [00:32<00:00, 10.61it/s]


Epochs: 4 | Train Loss:  0.460                 | Train Accuracy:  0.811                 | Val Loss:  0.490                 | Val Accuracy:  0.794


100%|██████████| 346/346 [00:30<00:00, 11.18it/s]


Epochs: 5 | Train Loss:  0.368                 | Train Accuracy:  0.851                 | Val Loss:  0.441                 | Val Accuracy:  0.800


100%|██████████| 346/346 [00:31<00:00, 11.13it/s]


Epochs: 6 | Train Loss:  0.298                 | Train Accuracy:  0.887                 | Val Loss:  0.390                 | Val Accuracy:  0.822


100%|██████████| 346/346 [00:31<00:00, 11.08it/s]


Epochs: 7 | Train Loss:  0.243                 | Train Accuracy:  0.905                 | Val Loss:  0.364                 | Val Accuracy:  0.828


100%|██████████| 346/346 [00:31<00:00, 11.11it/s]


Epochs: 8 | Train Loss:  0.196                 | Train Accuracy:  0.935                 | Val Loss:  0.349                 | Val Accuracy:  0.822


100%|██████████| 346/346 [00:31<00:00, 11.07it/s]


Epochs: 9 | Train Loss:  0.160                 | Train Accuracy:  0.958                 | Val Loss:  0.326                 | Val Accuracy:  0.833


100%|██████████| 346/346 [00:31<00:00, 11.08it/s]


Epochs: 10 | Train Loss:  0.128                 | Train Accuracy:  0.965                 | Val Loss:  0.330                 | Val Accuracy:  0.828


100%|██████████| 346/346 [00:31<00:00, 11.12it/s]


Epochs: 11 | Train Loss:  0.103                 | Train Accuracy:  0.983                 | Val Loss:  0.333                 | Val Accuracy:  0.822


100%|██████████| 346/346 [00:31<00:00, 11.14it/s]


Epochs: 12 | Train Loss:  0.080                 | Train Accuracy:  0.991                 | Val Loss:  0.320                 | Val Accuracy:  0.828


In [16]:
def evaluate(model, test_data):

    test_dataloader = torch.utils.data.DataLoader(test_data, batch_size=2)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    total_acc_test = 0
    with torch.no_grad():

        for input_id, type_id, attention, label in test_dataloader:

              output = model(input_id.squeeze(1).to(device),
                             attention.squeeze(1).to(device),
                             type_id.squeeze(1).to(device))
              acc = (output.argmax(dim=1) == label.to(device)).sum().item()
              total_acc_test += acc

    print(f'Test Accuracy: {total_acc_test / len(test_data): .3f}')

evaluate(model, test_dataset)


Test Accuracy:  0.887


In [17]:
df_c = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/book_data/train_final.csv')
ds_c = MyDataset(df_c, tokenizer)

In [18]:
model_c = CustomBERTModel(num_classes=7)

In [19]:
train(model_c, ds_c, val_dataset, LR, EPOCHS)

100%|██████████| 1744/1744 [06:31<00:00,  4.46it/s]


Epochs: 1 | Train Loss:  0.811                 | Train Accuracy:  0.453                 | Val Loss:  0.623                 | Val Accuracy:  0.633


100%|██████████| 1744/1744 [06:30<00:00,  4.47it/s]


Epochs: 2 | Train Loss:  0.631                 | Train Accuracy:  0.623                 | Val Loss:  0.389                 | Val Accuracy:  0.833


100%|██████████| 1744/1744 [06:30<00:00,  4.47it/s]


Epochs: 3 | Train Loss:  0.511                 | Train Accuracy:  0.701                 | Val Loss:  0.254                 | Val Accuracy:  0.906


100%|██████████| 1744/1744 [06:29<00:00,  4.47it/s]


Epochs: 4 | Train Loss:  0.415                 | Train Accuracy:  0.768                 | Val Loss:  0.173                 | Val Accuracy:  0.944


100%|██████████| 1744/1744 [06:30<00:00,  4.47it/s]


Epochs: 5 | Train Loss:  0.332                 | Train Accuracy:  0.825                 | Val Loss:  0.112                 | Val Accuracy:  0.978


100%|██████████| 1744/1744 [06:30<00:00,  4.47it/s]


Epochs: 6 | Train Loss:  0.267                 | Train Accuracy:  0.868                 | Val Loss:  0.077                 | Val Accuracy:  0.989


100%|██████████| 1744/1744 [06:30<00:00,  4.47it/s]


Epochs: 7 | Train Loss:  0.215                 | Train Accuracy:  0.900                 | Val Loss:  0.064                 | Val Accuracy:  0.989


100%|██████████| 1744/1744 [06:30<00:00,  4.47it/s]


Epochs: 8 | Train Loss:  0.175                 | Train Accuracy:  0.917                 | Val Loss:  0.040                 | Val Accuracy:  0.994


100%|██████████| 1744/1744 [06:29<00:00,  4.47it/s]


Epochs: 9 | Train Loss:  0.146                 | Train Accuracy:  0.932                 | Val Loss:  0.041                 | Val Accuracy:  0.989


100%|██████████| 1744/1744 [06:30<00:00,  4.47it/s]


Epochs: 10 | Train Loss:  0.116                 | Train Accuracy:  0.947                 | Val Loss:  0.018                 | Val Accuracy:  1.000


100%|██████████| 1744/1744 [06:30<00:00,  4.47it/s]


Epochs: 11 | Train Loss:  0.101                 | Train Accuracy:  0.953                 | Val Loss:  0.026                 | Val Accuracy:  0.994


100%|██████████| 1744/1744 [06:30<00:00,  4.47it/s]


Epochs: 12 | Train Loss:  0.084                 | Train Accuracy:  0.958                 | Val Loss:  0.012                 | Val Accuracy:  1.000


100%|██████████| 1744/1744 [06:30<00:00,  4.47it/s]


Epochs: 13 | Train Loss:  0.074                 | Train Accuracy:  0.962                 | Val Loss:  0.014                 | Val Accuracy:  0.994


100%|██████████| 1744/1744 [06:30<00:00,  4.47it/s]


Epochs: 14 | Train Loss:  0.066                 | Train Accuracy:  0.963                 | Val Loss:  0.008                 | Val Accuracy:  1.000


100%|██████████| 1744/1744 [06:30<00:00,  4.46it/s]


Epochs: 15 | Train Loss:  0.059                 | Train Accuracy:  0.970                 | Val Loss:  0.012                 | Val Accuracy:  0.994


100%|██████████| 1744/1744 [06:30<00:00,  4.46it/s]


Epochs: 16 | Train Loss:  0.055                 | Train Accuracy:  0.966                 | Val Loss:  0.008                 | Val Accuracy:  1.000


100%|██████████| 1744/1744 [06:30<00:00,  4.47it/s]


Epochs: 17 | Train Loss:  0.050                 | Train Accuracy:  0.970                 | Val Loss:  0.008                 | Val Accuracy:  0.994


100%|██████████| 1744/1744 [06:30<00:00,  4.47it/s]


Epochs: 18 | Train Loss:  0.048                 | Train Accuracy:  0.970                 | Val Loss:  0.004                 | Val Accuracy:  1.000


100%|██████████| 1744/1744 [06:30<00:00,  4.47it/s]


Epochs: 19 | Train Loss:  0.045                 | Train Accuracy:  0.971                 | Val Loss:  0.008                 | Val Accuracy:  0.994


100%|██████████| 1744/1744 [06:30<00:00,  4.47it/s]


Epochs: 20 | Train Loss:  0.041                 | Train Accuracy:  0.973                 | Val Loss:  0.004                 | Val Accuracy:  1.000


In [20]:
evaluate(model_c, test_dataset)

Test Accuracy:  0.866


In [21]:
# valores de treino e validação nas 17 epochs para alteração de palavras por contexto
# [0.398, 0.591, 0.674, 0.737, 0.793, 0.842, 0.876, 0.903, 0.923, 0.936, 0.949, 0.960, 0.966, 0.964, 0.969, 0.971, 0.976]
# [0.517 0.647 0.671 0.700 0.695 0.702 0.699 0.683 0.682 0.686 0.695 0.706 0.690 0.710 0.700 0.690 0.695]

# valores de trein oe validação em 11 epochs na base sem alteração

In [22]:
x_train = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/book_data/train_og.csv')


In [23]:
np.random.seed(42)
val = np.random.uniform(0, 1, len(x_train))

x_val = x_train.query('@val > 0.8').copy()
x_train = x_train.query('@val <= 0.8')

x_val.reset_index(drop=True, inplace=True)
x_train.reset_index(drop=True, inplace=True)

In [24]:
x_train

Unnamed: 0,alt_title,label_inf,label
0,biologia organismos,1,3
1,biologia 3 genetica evolucao ecologia,0,3
2,tribos mal,-1,0
3,egipcios,-1,0
4,historia,0,0
...,...,...,...
687,cultura sociedade brasil colonia,0,0
688,marxismo,-1,0
689,atlas mundo atual,2,2
690,geometria plana conceitos basicos,2,6


In [25]:
!pip install snorkel

Collecting snorkel
  Downloading snorkel-0.9.9-py3-none-any.whl (103 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/103.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m102.4/103.3 kB[0m [31m3.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m103.3/103.3 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting munkres>=1.0.6 (from snorkel)
  Downloading munkres-1.1.4-py2.py3-none-any.whl (7.0 kB)
Installing collected packages: munkres, snorkel
Successfully installed munkres-1.1.4 snorkel-0.9.9


In [26]:
import nltk

nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [27]:
from snorkel.augmentation import transformation_function


@transformation_function()
def random_deletion(x):
    # Tokenize the sentence
    tokens = tokenizer.tokenize(x['alt_title'])

    # If the sentence is a single word or shorter, return it as is
    if len(tokens) <= 1:
        return x

    # Randomly choose a token index to delete
    del_index = random.randrange(len(tokens))

    # Remove the chosen token
    del tokens[del_index]

    # Join the tokens back into a sentence and return
    x['random_deletion'] = ' '.join(tokens)
    return x


In [28]:
import random
import nltk
from snorkel.augmentation import transformation_function

def random_swap(text):
    # Tokenize the sentence
    tokens = nltk.word_tokenize(text)

    # If the sentence is two words or shorter, return it as is
    if len(tokens) <= 2:
        return text

    # Randomly choose an index
    index = random.randint(0, len(tokens) - 2)

    # Swap the token at the chosen index with the next token
    tokens[index], tokens[index + 1] = tokens[index + 1], tokens[index]

    # Join the tokens back into a sentence and return
    return ' '.join(tokens)

@transformation_function()
def tf_random_swap(df_row):
    # Assuming that 'text' is the column containing text
    df_row['random_swap'] = random_swap(df_row['alt_title'])
    return df_row


In [29]:
from snorkel.augmentation import ApplyOnePolicy, PandasTFApplier, ApplyAllPolicy

tf_policy = ApplyAllPolicy(n_tfs=2, n_per_original=3, keep_original=True)
tf_applier = PandasTFApplier([tf_random_swap, random_deletion], tf_policy)

x_train = tf_applier.apply(x_train)

100%|██████████| 692/692 [00:05<00:00, 135.05it/s]


In [30]:
x_train.reset_index(drop=True, inplace=True)

In [31]:
x_train['random_swap'].fillna(x_train['alt_title'], inplace=True)
x_train['random_deletion'].fillna(x_train['alt_title'], inplace=True)

In [32]:
df_rd = x_train[['random_deletion', 'label']].copy()
df_rd.rename(columns={'random_deletion': 'alt_title'}, inplace=True)

In [33]:
ds_rd = MyDataset(df_rd, tokenizer)

In [34]:
model_rd = CustomBERTModel(num_classes=7)

train(model_rd, ds_rd, val_dataset, LR, EPOCHS)

100%|██████████| 1384/1384 [05:12<00:00,  4.43it/s]


Epochs: 1 | Train Loss:  0.795                 | Train Accuracy:  0.443                 | Val Loss:  0.626                 | Val Accuracy:  0.639


100%|██████████| 1384/1384 [05:10<00:00,  4.46it/s]


Epochs: 2 | Train Loss:  0.483                 | Train Accuracy:  0.766                 | Val Loss:  0.424                 | Val Accuracy:  0.806


100%|██████████| 1384/1384 [05:09<00:00,  4.47it/s]


Epochs: 3 | Train Loss:  0.286                 | Train Accuracy:  0.883                 | Val Loss:  0.337                 | Val Accuracy:  0.839


100%|██████████| 1384/1384 [05:09<00:00,  4.47it/s]


Epochs: 4 | Train Loss:  0.172                 | Train Accuracy:  0.940                 | Val Loss:  0.303                 | Val Accuracy:  0.850


100%|██████████| 1384/1384 [05:10<00:00,  4.46it/s]


Epochs: 5 | Train Loss:  0.103                 | Train Accuracy:  0.970                 | Val Loss:  0.290                 | Val Accuracy:  0.844


100%|██████████| 1384/1384 [05:09<00:00,  4.47it/s]


Epochs: 6 | Train Loss:  0.061                 | Train Accuracy:  0.984                 | Val Loss:  0.297                 | Val Accuracy:  0.856


100%|██████████| 1384/1384 [05:09<00:00,  4.47it/s]


Epochs: 7 | Train Loss:  0.042                 | Train Accuracy:  0.990                 | Val Loss:  0.311                 | Val Accuracy:  0.822


100%|██████████| 1384/1384 [05:09<00:00,  4.47it/s]


Epochs: 8 | Train Loss:  0.025                 | Train Accuracy:  0.995                 | Val Loss:  0.308                 | Val Accuracy:  0.850


In [35]:
evaluate(model_rd, test_dataset)

Test Accuracy:  0.873


In [36]:
df_rs = x_train[['random_swap', 'label']].copy()
df_rs.rename(columns={'random_swap': 'alt_title'}, inplace=True)

In [37]:
ds_rs = MyDataset(df_rs, tokenizer)

In [38]:
model_rs = CustomBERTModel(num_classes=7)

train(model_rs, ds_rs, val_dataset, LR, EPOCHS)

100%|██████████| 1384/1384 [05:09<00:00,  4.47it/s]


Epochs: 1 | Train Loss:  0.676                 | Train Accuracy:  0.585                 | Val Loss:  0.528                 | Val Accuracy:  0.744


100%|██████████| 1384/1384 [05:09<00:00,  4.47it/s]


Epochs: 2 | Train Loss:  0.303                 | Train Accuracy:  0.872                 | Val Loss:  0.375                 | Val Accuracy:  0.828


100%|██████████| 1384/1384 [05:09<00:00,  4.47it/s]


Epochs: 3 | Train Loss:  0.128                 | Train Accuracy:  0.975                 | Val Loss:  0.327                 | Val Accuracy:  0.844


100%|██████████| 1384/1384 [05:09<00:00,  4.47it/s]


Epochs: 4 | Train Loss:  0.056                 | Train Accuracy:  0.996                 | Val Loss:  0.354                 | Val Accuracy:  0.828


In [39]:
evaluate(model_rs, test_dataset)

Test Accuracy:  0.880
