<a href="https://colab.research.google.com/github/jrhumberto/pub/blob/main/Untitled50.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import torch
from IPython.display import FileLink, display
from sklearn import preprocessing
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import BertForSequenceClassification, BertTokenizer, pipeline

In [2]:
# Control flags and hyperparameters.
MAX_LENGTH = 512  # upper bound on tokens per review; longer inputs are truncated by the tokenizer

# Fraction of the dataset assigned to each split.
TRAIN_RATIO = 0.7
VAL_RATIO = 0.2
TEST_RATIO = 0.1

BATCH_SIZE = 16

# Fixed seed so that the random train/val/test split and the DataLoader
# shuffling give the same result after "Restart & Run All".
SEED = 42
torch.manual_seed(SEED)

# CPU/GPU configuration: use the first CUDA device when one is available.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [3]:
# Report which device (CPU or GPU) was selected for processing.
device_message = f"Utilizando para processamento: {device}"
print(device_message)

Utilizando para processamento: cpu


In [None]:
# Load the IMDB pt-br reviews CSV into a DataFrame.
DATASET_CSV_PATH = "../input/imdb-ptbr/imdb-reviews-pt-br.csv"
df = pd.read_csv(DATASET_CSV_PATH)

In [None]:
# Number of rows in the loaded DataFrame.
dataset_size = len(df)
print(f"Tamanho da base: {dataset_size}")

In [None]:
# Class balance of the labels.
# The label column is 'sentiment' (the same column that is encoded to 0/1
# later in this notebook); 'sentiments' does not exist and raised a KeyError.
df['sentiment'].value_counts()

In [None]:
# Display the loaded DataFrame as the cell's output.
df

In [None]:
# Ready-made sentiment classifier from the transformers `pipeline` helper,
# used below on the English text column as a quick baseline.
# The task identifier is hyphenated ("sentiment-analysis"); the underscore
# spelling is not a registered task and makes `pipeline` raise.
bert_en = pipeline("sentiment-analysis")

In [None]:
# Sanity check: show one English review next to the pipeline's prediction for it.
test_instance = 100

sample_text_en = df['text_en'][test_instance]
sample_text_en, bert_en(sample_text_en)



In [None]:
# Tokenizer matching the Portuguese BERT checkpoint.
TOKENIZER_CHECKPOINT = 'neuralmind/bert-base-portuguese-cased'
tokenizer = BertTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)

In [None]:
# 'df_tokenized' is a dict-like object with the keys ['input_ids', 'token_type_ids', 'attention_mask']:
#   'input_ids'      -> the tokenized instances
#   'token_type_ids' -> segment mask used in sentence-pair classification tasks (discarded in this task)
#   'attention_mask' -> attention mask that tells the model which positions are [PAD] tokens
# The column is converted to a plain list of strings, which is the documented
# input type of the tokenizer's batch API, instead of relying on how a pandas
# Series happens to iterate.
df_tokenized = tokenizer.batch_encode_plus(
    df['text_pt'].tolist(),
    return_tensors='pt',
    padding=True,
    truncation=True,
    max_length=MAX_LENGTH,
)

In [None]:
# Show the shapes of the token-id tensor and of the attention-mask tensor.
input_ids_shape = df_tokenized['input_ids'].shape
attention_mask_shape = df_tokenized['attention_mask'].shape
print(input_ids_shape, attention_mask_shape)

In [None]:
# X stacks the two tokenizer outputs along a new leading axis:
#   X[0] -> input_ids
#   X[1] -> attention_mask
X = torch.stack((df_tokenized["input_ids"], df_tokenized["attention_mask"]), dim=0)

# Encode the labels as integers: 'neg' -> 0, anything else -> 1.
# The encoding is skipped when the column is already numeric, so re-running
# this cell does not turn every label into 1 (0 != 'neg' would map to 1).
if not pd.api.types.is_numeric_dtype(df['sentiment']):
    df['sentiment'] = (df['sentiment'] != 'neg').astype(int)

# Kept as a float32 tensor, as before; the training code casts it with .long().
y = torch.tensor(df['sentiment'].to_numpy(), dtype=torch.float32)

In [None]:
class TextDataset(Dataset):
    """Dataset over pre-tokenized text, held entirely on one device.

    Args:
        X: tensor whose second axis indexes the instances (in this notebook
            X[0] holds the input ids and X[1] the attention masks).
        y: 1-D tensor with one label per instance.
        target_device: device the tensors are moved to. When omitted, the
            notebook-level ``device`` is used, as before.

    Raises:
        ValueError: if ``X`` and ``y`` do not describe the same number of
            instances.
    """

    def __init__(self, X, y, target_device=None):
        if target_device is None:
            # Notebook-level default defined in the configuration cell.
            target_device = device

        if X.shape[1] != len(y):
            raise ValueError(
                f"X has {X.shape[1]} instances but y has {len(y)} labels"
            )

        self.X = X.to(target_device)
        self.y = y.to(target_device)

        self.len = len(y)

    def __len__(self):
        """Return the number of instances (required by DataLoader and random_split)."""
        return self.len

    def __getitem__(self, idx):
        """Return ``(features, label)`` for instance ``idx``.

        ``features`` is ``X[:, idx]``, i.e. every leading-axis channel of the
        selected instance stacked together.
        """
        return self.X[:, idx], self.y[idx]

In [None]:
dataset = TextDataset(X, y)

# Number of instances in each split.
# The built-in int/round replace np.int, which was removed from NumPy.
# The test split takes the remainder so the three sizes always add up to the
# dataset length; random_split rejects sizes that do not.
num_train_instances = int(round(dataset.len * TRAIN_RATIO))
num_val_instances = int(round(dataset.len * VAL_RATIO))
num_test_instances = dataset.len - num_train_instances - num_val_instances
print(f"Treino: {num_train_instances}, Val: {num_val_instances}, Teste: {num_test_instances}")

train_split, val_split, test_split = torch.utils.data.random_split(
    dataset, [num_train_instances, num_val_instances, num_test_instances]
)

train_loader = torch.utils.data.DataLoader(train_split, batch_size=BATCH_SIZE, shuffle=True)
# Validation stays shuffled because only a limited number of validation
# batches is evaluated per epoch, so each epoch sees a different subset.
val_loader = torch.utils.data.DataLoader(val_split, batch_size=BATCH_SIZE, shuffle=True)
# The test set is evaluated in full, so its order does not matter.
test_loader = torch.utils.data.DataLoader(test_split, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
# Training schedule.
epochs = 40
steps_per_epoch = 200            # training batches processed per epoch
epoch_validation_samples = 50    # validation batches evaluated per epoch

# Adam's default learning rate (1e-3) is far too high for fine-tuning every
# BERT weight; values around 2e-5 to 5e-5 are the usual range.
LEARNING_RATE = 2e-5

# Portuguese BERT with a sequence-classification head, moved to the selected device.
model = BertForSequenceClassification.from_pretrained('neuralmind/bert-base-portuguese-cased').to(device)

# Fine-tune the encoder as well as the classification head.
for param in model.base_model.parameters():
    param.requires_grad = True

# NOTE(review): the training cells use the loss returned by the model itself;
# loss_func is kept because other cells may rely on it - confirm before removing.
loss_func = torch.nn.CrossEntropyLoss()
optim = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)


def acc_calc(output, labels):
    """Return how many rows of `output` have their arg-max equal to `labels`.

    Args:
        output: 2-D tensor of per-class scores, one row per instance.
        labels: 1-D tensor with the expected class index of each instance.

    Returns:
        0-dim tensor holding the number of correct predictions.
    """
    return (labels == output.argmax(axis=1)).sum()


# Learning rate is multiplied by 0.9997 on every scheduler.step() call.
scheduler = torch.optim.lr_scheduler.ExponentialLR(optim, 0.9997)

In [None]:
# One dict of metrics per finished epoch.
epoch_metadata = []

for i in range(epochs):

    num_train_examples = 0
    num_val_examples = 0

    train_hits = 0
    val_hits = 0

    # Losses are accumulated over the whole epoch; resetting them inside the
    # batch loops would leave only the last batch's loss.
    train_running_loss = 0.0
    val_running_loss = 0.0
    num_train_batches = 0
    num_val_batches = 0

    # ------------------------------ training ------------------------------
    model.train()
    train_bar = tqdm(total=steps_per_epoch, desc="Train", unit="steps", position=0, leave=True)

    for batch_number, (features, labels) in enumerate(train_loader):

        # features is (batch, 2, seq_len): channel 0 = input ids, channel 1 = attention mask.
        input_ids, input_masks = features[:, 0, :], features[:, 1, :]

        # Index the output instead of tuple-unpacking it: this works both when
        # the model returns a plain tuple and when it returns a ModelOutput
        # (whose iteration yields field names, not tensors).
        outputs = model(input_ids, attention_mask=input_masks, labels=labels.long())
        loss, logits = outputs[0], outputs[1]

        optim.zero_grad()
        loss.backward()
        optim.step()
        scheduler.step()

        train_running_loss += loss.item()
        num_train_batches += 1

        # arg-max of the logits equals arg-max of their softmax, so the
        # softmax is not needed to count hits.
        train_hits += acc_calc(logits.detach(), labels)
        num_train_examples += features.shape[0]

        # Updating our display bar
        train_bar.update(1)

        # Stop after a fixed number of steps in the current epoch.
        if batch_number + 1 >= steps_per_epoch:
            break

    # Closed here so the bar is released even when the loader has fewer
    # batches than steps_per_epoch.
    train_bar.close()

    # ----------------------------- validation -----------------------------
    model.eval()
    val_bar = tqdm(total=epoch_validation_samples, desc="Val", unit="samples", position=0, leave=True)

    with torch.no_grad():
        for batch_number, (features, labels) in enumerate(val_loader):

            input_ids, input_masks = features[:, 0, :], features[:, 1, :]

            outputs = model(input_ids, attention_mask=input_masks, labels=labels.long())
            loss, logits = outputs[0], outputs[1]

            val_running_loss += loss.item()
            num_val_batches += 1

            val_hits += acc_calc(logits, labels)
            num_val_examples += features.shape[0]

            # Updating our display bar
            val_bar.update(1)

            # Stop after a fixed number of validation batches.
            if batch_number + 1 >= epoch_validation_samples:
                break

    val_bar.close()

    # max(..., 1) keeps the summary well-defined if a loader yielded no batch.
    train_loss = train_running_loss / max(num_train_batches, 1)
    val_loss = val_running_loss / max(num_val_batches, 1)
    train_acc = float(train_hits) / max(num_train_examples, 1)
    val_acc = float(val_hits) / max(num_val_examples, 1)

    epoch_metadata.append({
        "epoch": i + 1,
        "train_loss": train_loss,
        "train_acc": train_acc,
        "val_loss": val_loss,
        "val_acc": val_acc,
    })

    print(f"EPOCH SUMMARY - {i + 1} \t Train loss: {train_loss} \t Train Acc: {train_acc} \t Val loss: {val_loss} \t Val Acc: {val_acc}")




In [None]:
# Save the fine-tuned model of the last epoch (i is the final loop index).
# It is written under "checkpoints/" so the files end up where the download
# cell at the end of the notebook looks for them (checkpoints/epoch_39).
model.save_pretrained(f"checkpoints/epoch_{i}")

In [None]:
# Evaluate the fine-tuned model on the held-out test split.
num_test_examples = 0
num_test_batches = 0

test_hits = 0

# Accumulated over every test batch (not reset inside the loop).
test_running_loss = 0.0

model.eval()
with torch.no_grad():
    for batch_number, (features, labels) in enumerate(test_loader):

        # features is (batch, 2, seq_len): channel 0 = input ids, channel 1 = attention mask.
        input_ids, input_masks = features[:, 0, :], features[:, 1, :]

        # Index the output instead of tuple-unpacking it, so this works for
        # both tuple and ModelOutput return values.
        outputs = model(input_ids, attention_mask=input_masks, labels=labels.long())
        loss, logits = outputs[0], outputs[1]

        test_running_loss += loss.item()
        num_test_batches += 1

        softmax_predictions = torch.nn.functional.softmax(logits, dim=1)
        test_hits += acc_calc(softmax_predictions, labels)

        num_test_examples += features.shape[0]

# max(..., 1) keeps the report well-defined if the test loader is empty.
test_loss = test_running_loss / max(num_test_batches, 1)
test_acc = float(test_hits) / max(num_test_examples, 1)

print(f"Test loss: {test_loss} \t Test Acc: {test_acc}")

In [None]:
# Download links for the saved checkpoint files.
# A notebook cell only renders its last expression, so each link is passed to
# display() explicitly; otherwise the config.json link is silently dropped.
CHECKPOINT_DIR = '/kaggle/working/checkpoints/epoch_39'
display(FileLink(CHECKPOINT_DIR + '/config.json'))
display(FileLink(CHECKPOINT_DIR + '/pytorch_model.bin'))