In [1]:
import json
import torch
import torch.nn as nn
import torch.nn.functional as F
from pandas import read_parquet
from transformers import BertModel, BertTokenizerFast
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
from tqdm import tqdm
from sklearn.metrics import precision_recall_fscore_support

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_path = "data/mBERT/fine"

tokenizer = BertTokenizerFast.from_pretrained(model_path)

In [3]:
train_data = read_parquet("data/merge/train.parquet").sample(5000)
dev_data = read_parquet("data/merge/dev.parquet").sample(3000)
test_data = read_parquet("data/merge/test.parquet").sample(1000)

with open("data/merge/tags_2_idx.json", "r") as f:
    tags2idx = json.load(f)

with open("data/merge/idx_2_tags.json", "r") as f:
    idx2tags = json.load(f)

with open("data/merge/chars2idx.json", "r") as f:
    chars2idx = json.load(f)

In [4]:
sentences_train = train_data["tokens"].values.tolist()
tags_train = train_data["ner_tags"].values.tolist()

sentences_dev = dev_data["tokens"].values.tolist()
tags_dev = dev_data["ner_tags"].values.tolist()

sentences_test = test_data["tokens"].values.tolist()
tags_test = test_data["ner_tags"].values.tolist()

In [12]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bert = BertModel.from_pretrained("data/mBERT/fine").to(device)
bert.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(119547, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
 

In [6]:
def tokenize_and_align_labels(sentences, labels, max_length, label_all_tokens=True):
    tokenized_inputs = tokenizer(
        sentences, padding="max_length", max_length=min(512, max_length * 4), # some tokens will be split into smaller parts
        return_tensors="pt", truncation=True, is_split_into_words=True
    )

    tokenized_labels = []
    for i, label in enumerate(labels):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx
        tokenized_labels.append(label_ids)

    tokenized_inputs["labels"] = torch.tensor(tokenized_labels)
    return tokenized_inputs

In [13]:
class MultilingualDataset(Dataset):
    def __init__(self, sentences, labels):
        self.sentences = sentences
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.sentences[idx].tolist(), self.labels[idx].tolist()
    
def collate_fn(batch):
    sentences, labels = zip(*batch)
    B = len(labels)
    max_len = max([len(sentence) for sentence in sentences])
    tokenized_inputs = tokenize_and_align_labels(sentences, labels, max_len)
    T = tokenized_inputs["input_ids"].shape[1]
    batch_words = []
    for input_ids in tokenized_inputs["input_ids"]:
        batch_words.append(tokenizer.convert_ids_to_tokens(input_ids))
    L = max([len(x) for words in batch_words for x in words])
    batch_char_ids = torch.zeros(B, T, L)
    for i in range(B):
        for j in range(T):
            cur_word = batch_words[i][j]
            if cur_word == "[PAD]":
                break
            elif cur_word[0] == "[" and cur_word[-1] == "]":
                continue
            for k in range(len(cur_word)):
                batch_char_ids[i][j][k] = chars2idx.get(cur_word[k], chars2idx["<unk>"])
    tokenized_inputs["char_ids"] = batch_char_ids.long()
    device = bert.device
    with torch.no_grad():
        bert_outputs = bert(tokenized_inputs["input_ids"].to(device), attention_mask=tokenized_inputs["attention_mask"].to(device))
    bert_embeddings = bert_outputs["last_hidden_state"]
    return tokenized_inputs, bert_embeddings

In [8]:
class BiLSTM(nn.Module):
    def __init__(
            self, input_size,
            lstm_hidden_dim, lstm_num_layers, lstm_dropout, 
            linear_output_dim, label_size
        ):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=lstm_hidden_dim,
            num_layers=lstm_num_layers,
            batch_first=True,
            dropout=lstm_dropout,
            bidirectional=True
        )
        self.fc1 = nn.Linear(2 * lstm_hidden_dim, linear_output_dim)
        self.dropout = nn.Dropout(lstm_dropout)
        self.fc2 = nn.Linear(linear_output_dim, label_size)

    def forward(self, x):
        x, _ = self.lstm(x)
        x = self.dropout(F.elu(self.fc1(x), inplace=True))
        x = self.fc2(x)
        return x

In [14]:
train_dataset = MultilingualDataset(sentences_train, tags_train)
dev_dataset = MultilingualDataset(sentences_dev, tags_dev)
test_dataset = MultilingualDataset(sentences_test, tags_test)
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=2,
    shuffle=True,
    collate_fn=collate_fn
)
dev_loader = DataLoader(
    dataset=dev_dataset,
    batch_size=2,
    shuffle=False,
    collate_fn=collate_fn
)
lstm_model = BiLSTM(
    input_size=768,
    lstm_hidden_dim=256, lstm_num_layers=2, lstm_dropout=0.33,
    linear_output_dim=128, label_size=len(idx2tags)
)
lstm_model = lstm_model.to(device)

criterion = nn.CrossEntropyLoss(ignore_index=-100)
optimizer = Adam(lstm_model.parameters(), lr=1e-3)
epochs = 5

In [15]:
for epoch in range(epochs):
    print(f'Epoch: {epoch + 1}/{epochs}')
    train_loss = 0
    cur_total = 0
    correct = 0
    amount = 0
    lstm_model.train()
    train_loop = tqdm(train_loader, desc=f'Epoch: {epoch + 1}/{epochs}')
    for tokenized_inputs, bert_embeddings in train_loop:
        B = bert_embeddings.shape[0]
        bert_embeddings = bert_embeddings.to(device)
        labels = tokenized_inputs["labels"].to(device)
        attention_masks = tokenized_inputs["attention_mask"].to(device)

        optimizer.zero_grad()
        logits = lstm_model(bert_embeddings)
        logits_reshape = logits.view(-1, logits.shape[-1])
        labels_reshape = labels.view(-1)
        loss = criterion(logits_reshape, labels_reshape)
        loss.backward()
        optimizer.step()

        train_loss += loss.cpu().item() * B
        
        masks_reshape = attention_masks.view(-1)
        logits_non_pad = logits_reshape[masks_reshape == 1]
        labels_non_pad = labels_reshape[masks_reshape == 1]

        _, predictions = torch.max(logits_non_pad, 1)
        correct += sum(predictions == labels_non_pad)
        amount += len(labels_non_pad)

        cur_total += len(labels)
        running_loss = train_loss / cur_total
        running_acc = correct / amount
        train_loop.set_postfix(loss=running_loss, acc=running_acc.item())

    train_loss /= len(train_dataset)
    train_acc = correct / amount

    dev_loss = 0
    y_true, y_pred = [], []
    correct = 0
    amount = 0
    lstm_model.eval()
    dev_loop = tqdm(dev_loader, desc=f'Epoch: {epoch + 1}/{epochs}')
    with torch.no_grad():
        for tokenized_inputs, bert_embeddings in dev_loop:
            B = bert_embeddings.shape[0]
            bert_embeddings = bert_embeddings.to(device)
            labels = tokenized_inputs["labels"].to(device)
            attention_masks = tokenized_inputs["attention_mask"].to(device)

            logits = lstm_model(bert_embeddings)
            logits_reshape = logits.view(-1, logits.shape[-1])
            labels_reshape = labels.view(-1)
            loss = criterion(logits_reshape, labels_reshape)
            dev_loss += loss.cpu().item() * B
            
            masks_reshape = attention_masks.view(-1)
            logits_non_pad = logits_reshape[masks_reshape == 1]
            labels_non_pad = labels_reshape[masks_reshape == 1]

            _, predictions = torch.max(logits_non_pad, 1)
            correct += sum(predictions == labels_non_pad)
            amount += len(labels_non_pad)
            y_pred.extend(predictions.cpu())
            y_true.extend(labels_non_pad.cpu())

    dev_loss /= len(dev_dataset)
    val_acc = correct / amount
    val_precision, val_recall, val_f1, _ = precision_recall_fscore_support(y_true, y_pred, average='macro', zero_division=0)
    print('train_loss: {:.4f}, train_acc: {:.4f}, val_loss: {:.4f}, val_acc: {:.4f}'.format(train_loss, train_acc, dev_loss, val_acc))
    print('val_precision: {:.4f}, val_recall: {:.4f}, val_f1: {:.4f}'.format(val_precision, val_recall, val_f1))

Epoch: 1/5


Epoch: 1/5: 100%|██████████| 2500/2500 [02:06<00:00, 19.81it/s, acc=0.703, loss=0.69] 
Epoch: 1/5: 100%|██████████| 1500/1500 [00:54<00:00, 27.33it/s]


train_loss: 0.6904, train_acc: 0.7026, val_loss: 0.4887, val_acc: 0.7394
val_precision: 0.6135, val_recall: 0.6454, val_f1: 0.6218
Epoch: 2/5


Epoch: 2/5: 100%|██████████| 2500/2500 [02:18<00:00, 18.00it/s, acc=0.758, loss=0.442]
Epoch: 2/5: 100%|██████████| 1500/1500 [00:55<00:00, 27.17it/s]


train_loss: 0.4423, train_acc: 0.7578, val_loss: 0.4012, val_acc: 0.7661
val_precision: 0.6731, val_recall: 0.6636, val_f1: 0.6644
Epoch: 3/5


Epoch: 3/5: 100%|██████████| 2500/2500 [02:15<00:00, 18.51it/s, acc=0.778, loss=0.36] 
Epoch: 3/5: 100%|██████████| 1500/1500 [01:06<00:00, 22.39it/s]


train_loss: 0.3604, train_acc: 0.7785, val_loss: 0.4045, val_acc: 0.7636
val_precision: 0.6606, val_recall: 0.6600, val_f1: 0.6527
Epoch: 4/5


Epoch: 4/5: 100%|██████████| 2500/2500 [02:20<00:00, 17.77it/s, acc=0.794, loss=0.294]
Epoch: 4/5: 100%|██████████| 1500/1500 [00:55<00:00, 27.25it/s]


train_loss: 0.2945, train_acc: 0.7938, val_loss: 0.3931, val_acc: 0.7759
val_precision: 0.6743, val_recall: 0.6899, val_f1: 0.6793
Epoch: 5/5


Epoch: 5/5: 100%|██████████| 2500/2500 [02:17<00:00, 18.16it/s, acc=0.809, loss=0.245]
Epoch: 5/5: 100%|██████████| 1500/1500 [00:55<00:00, 27.13it/s]


train_loss: 0.2453, train_acc: 0.8088, val_loss: 0.3908, val_acc: 0.7721
val_precision: 0.6745, val_recall: 0.6731, val_f1: 0.6704


using torch.no_grad() for bert: training time on the demo dataset drops from 5 mins to 2 mins an epoch.

In [101]:
for epoch in range(epochs):
    print(f'Epoch: {epoch + 1}/{epochs}')
    train_loss = 0
    cur_total = 0
    correct = 0
    amount = 0
    lstm_model.train()
    train_loop = tqdm(train_loader, desc=f'Epoch: {epoch + 1}/{epochs}')
    for tokenized_inputs, bert_embeddings in train_loop:
        B = bert_embeddings.shape[0]
        bert_embeddings = bert_embeddings.to(device)
        labels = tokenized_inputs["labels"].to(device)
        attention_masks = tokenized_inputs["attention_mask"].to(device)

        optimizer.zero_grad()
        logits = lstm_model(bert_embeddings)
        logits_reshape = logits.view(-1, logits.shape[-1])
        labels_reshape = labels.view(-1)
        loss = criterion(logits_reshape, labels_reshape)
        loss.backward()
        optimizer.step()

        train_loss += loss.cpu().item() * B
        
        masks_reshape = attention_masks.view(-1)
        logits_non_pad = logits_reshape[masks_reshape == 1]
        labels_non_pad = labels_reshape[masks_reshape == 1]

        _, predictions = torch.max(logits_non_pad, 1)
        correct += sum(predictions == labels_non_pad)
        amount += len(labels_non_pad)

        cur_total += len(labels)
        running_loss = train_loss / cur_total
        running_acc = correct / amount
        train_loop.set_postfix(loss=running_loss, acc=running_acc.item())

    train_loss /= len(train_dataset)
    train_acc = correct / amount

    dev_loss = 0
    y_true, y_pred = [], []
    correct = 0
    amount = 0
    lstm_model.eval()
    dev_loop = tqdm(dev_loader, desc=f'Epoch: {epoch + 1}/{epochs}')
    with torch.no_grad():
        for tokenized_inputs, bert_embeddings in dev_loop:
            B = bert_embeddings.shape[0]
            bert_embeddings = bert_embeddings.to(device)
            labels = tokenized_inputs["labels"].to(device)
            attention_masks = tokenized_inputs["attention_mask"].to(device)

            logits = lstm_model(bert_embeddings)
            logits_reshape = logits.view(-1, logits.shape[-1])
            labels_reshape = labels.view(-1)
            loss = criterion(logits_reshape, labels_reshape)
            dev_loss += loss.cpu().item() * B
            
            masks_reshape = attention_masks.view(-1)
            logits_non_pad = logits_reshape[masks_reshape == 1]
            labels_non_pad = labels_reshape[masks_reshape == 1]

            _, predictions = torch.max(logits_non_pad, 1)
            correct += sum(predictions == labels_non_pad)
            amount += len(labels_non_pad)
            y_pred.extend(predictions.cpu())
            y_true.extend(labels_non_pad.cpu())

    dev_loss /= len(dev_dataset)
    val_acc = correct / amount
    val_precision, val_recall, val_f1, _ = precision_recall_fscore_support(y_true, y_pred, average='macro', zero_division=0)
    print('train_loss: {:.4f}, train_acc: {:.4f}, val_loss: {:.4f}, val_acc: {:.4f}'.format(train_loss, train_acc, dev_loss, val_acc))
    print('val_precision: {:.4f}, val_recall: {:.4f}, val_f1: {:.4f}'.format(val_precision, val_recall, val_f1))

Epoch: 1/5


Epoch: 1/5: 100%|██████████| 2500/2500 [07:30<00:00,  5.55it/s, acc=0.696, loss=0.699]
Epoch: 1/5: 100%|██████████| 1500/1500 [01:07<00:00, 22.20it/s]


train_loss: 0.6988, train_acc: 0.6964, val_loss: 0.4551, val_acc: 0.7523
val_precision: 0.6587, val_recall: 0.6345, val_f1: 0.6416
Epoch: 2/5


Epoch: 2/5: 100%|██████████| 2500/2500 [05:01<00:00,  8.29it/s, acc=0.751, loss=0.466]
Epoch: 2/5: 100%|██████████| 1500/1500 [00:59<00:00, 25.11it/s]


train_loss: 0.4662, train_acc: 0.7507, val_loss: 0.3793, val_acc: 0.7694
val_precision: 0.6658, val_recall: 0.6905, val_f1: 0.6759
Epoch: 3/5


Epoch: 3/5: 100%|██████████| 2500/2500 [05:12<00:00,  8.00it/s, acc=0.773, loss=0.376]
Epoch: 3/5: 100%|██████████| 1500/1500 [00:59<00:00, 25.33it/s]


train_loss: 0.3757, train_acc: 0.7733, val_loss: 0.3780, val_acc: 0.7728
val_precision: 0.6682, val_recall: 0.6918, val_f1: 0.6776
Epoch: 4/5


Epoch: 4/5: 100%|██████████| 2500/2500 [05:14<00:00,  7.94it/s, acc=0.786, loss=0.311]
Epoch: 4/5: 100%|██████████| 1500/1500 [00:58<00:00, 25.58it/s]


train_loss: 0.3111, train_acc: 0.7864, val_loss: 0.3612, val_acc: 0.7806
val_precision: 0.6880, val_recall: 0.6999, val_f1: 0.6913
Epoch: 5/5


Epoch: 5/5: 100%|██████████| 2500/2500 [05:24<00:00,  7.71it/s, acc=0.802, loss=0.254]
Epoch: 5/5: 100%|██████████| 1500/1500 [01:01<00:00, 24.59it/s]


train_loss: 0.2535, train_acc: 0.8021, val_loss: 0.3685, val_acc: 0.7833
val_precision: 0.6797, val_recall: 0.7121, val_f1: 0.6938


In [15]:
class BiLSTM_CharCNN(nn.Module):
    def __init__(
            self, input_size, char_size, char_embedding_dim, char_final_dim,
            lstm_hidden_dim, lstm_num_layers, lstm_dropout, 
            linear_output_dim, label_size
        ):
        super().__init__()
        self.char_embedding = nn.Embedding(char_size, char_embedding_dim)
        self.char_cnn = nn.Sequential(
            nn.Conv1d(char_embedding_dim, 256, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool1d(2),
            nn.Conv1d(256, char_final_dim, kernel_size=3, padding=1),
        )

        self.lstm = nn.LSTM(
            input_size=input_size + char_final_dim,
            hidden_size=lstm_hidden_dim,
            num_layers=lstm_num_layers,
            batch_first=True,
            dropout=lstm_dropout,
            bidirectional=True
        )
        self.fc1 = nn.Linear(2 * lstm_hidden_dim, linear_output_dim)
        self.dropout = nn.Dropout(lstm_dropout)
        self.fc2 = nn.Linear(linear_output_dim, label_size)

    def forward(self, x, char_x):
        char_x = self.char_embedding(char_x)
        B, T, L, D = char_x.shape
        char_x = char_x.view(B * T, L, D).permute(0, 2, 1)
        char_x, _ = torch.max(F.relu(self.char_cnn(char_x), inplace=True), dim=-1)
        char_x = char_x.view(B, T, -1)  # [B, T, D_char]

        x = torch.concat((x, char_x), dim=-1)

        x, _ = self.lstm(x)
        x = self.dropout(F.elu(self.fc1(x), inplace=True))
        x = self.fc2(x)
        return x

In [21]:
train_dataset = MultilingualDataset(sentences_train, tags_train)
dev_dataset = MultilingualDataset(sentences_dev, tags_dev)
test_dataset = MultilingualDataset(sentences_test, tags_test)
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=2,
    shuffle=True,
    collate_fn=collate_fn
)
dev_loader = DataLoader(
    dataset=dev_dataset,
    batch_size=2,
    shuffle=False,
    collate_fn=collate_fn
)

lstm_cnn_model = BiLSTM_CharCNN(
    input_size=768, char_size=len(chars2idx), char_embedding_dim=100, char_final_dim=128,
    lstm_hidden_dim=256, lstm_num_layers=2, lstm_dropout=0.33,
    linear_output_dim=128, label_size=len(idx2tags)
)
lstm_cnn_model = lstm_cnn_model.to(device)

criterion = nn.CrossEntropyLoss(ignore_index=-100)
optimizer = Adam(lstm_cnn_model.parameters(), lr=1e-3)
epochs = 5

In [23]:
for epoch in range(epochs):
    print(f'Epoch: {epoch + 1}/{epochs}')
    train_loss = 0
    cur_total = 0
    correct = 0
    amount = 0
    lstm_cnn_model.train()
    train_loop = tqdm(train_loader, desc=f'Epoch: {epoch + 1}/{epochs}')
    for tokenized_inputs, bert_embeddings in train_loop:
        B = bert_embeddings.shape[0]
        bert_embeddings = bert_embeddings.to(device)
        labels = tokenized_inputs["labels"].to(device)
        attention_masks = tokenized_inputs["attention_mask"].to(device)
        char_ids = tokenized_inputs["char_ids"].to(device)

        optimizer.zero_grad()
        logits = lstm_cnn_model(bert_embeddings, char_ids)
        logits_reshape = logits.view(-1, logits.shape[-1])
        labels_reshape = labels.view(-1)
        loss = criterion(logits_reshape, labels_reshape)
        loss.backward()
        optimizer.step()

        train_loss += loss.cpu().item() * B
        
        masks_reshape = attention_masks.view(-1)
        logits_non_pad = logits_reshape[masks_reshape == 1]
        labels_non_pad = labels_reshape[masks_reshape == 1]

        _, predictions = torch.max(logits_non_pad, 1)
        correct += sum(predictions == labels_non_pad)
        amount += len(labels_non_pad)

        cur_total += len(labels)
        running_loss = train_loss / cur_total
        running_acc = correct / amount
        train_loop.set_postfix(loss=running_loss, acc=running_acc.item())

    train_loss /= len(train_dataset)
    train_acc = correct / amount

    dev_loss = 0
    y_true, y_pred = [], []
    correct = 0
    amount = 0
    lstm_cnn_model.eval()
    dev_loop = tqdm(dev_loader, desc=f'Epoch: {epoch + 1}/{epochs}')
    with torch.no_grad():
        for tokenized_inputs, bert_embeddings in dev_loop:
            B = bert_embeddings.shape[0]
            bert_embeddings = bert_embeddings.to(device)
            labels = tokenized_inputs["labels"].to(device)
            attention_masks = tokenized_inputs["attention_mask"].to(device)
            char_ids = tokenized_inputs["char_ids"].to(device)

            logits = lstm_cnn_model(bert_embeddings, char_ids)
            logits_reshape = logits.view(-1, logits.shape[-1])
            labels_reshape = labels.view(-1)
            loss = criterion(logits_reshape, labels_reshape)
            dev_loss += loss.cpu().item() * B
            
            masks_reshape = attention_masks.view(-1)
            logits_non_pad = logits_reshape[masks_reshape == 1]
            labels_non_pad = labels_reshape[masks_reshape == 1]

            _, predictions = torch.max(logits_non_pad, 1)
            correct += sum(predictions == labels_non_pad)
            amount += len(labels_non_pad)
            y_pred.extend(predictions.cpu())
            y_true.extend(labels_non_pad.cpu())

    dev_loss /= len(dev_dataset)
    val_acc = correct / amount
    val_precision, val_recall, val_f1, _ = precision_recall_fscore_support(y_true, y_pred, average='macro', zero_division=0)
    print('train_loss: {:.4f}, train_acc: {:.4f}, val_loss: {:.4f}, val_acc: {:.4f}'.format(train_loss, train_acc, dev_loss, val_acc))
    print('val_precision: {:.4f}, val_recall: {:.4f}, val_f1: {:.4f}'.format(val_precision, val_recall, val_f1))

Epoch: 1/5


Epoch: 1/5: 100%|██████████| 2500/2500 [06:26<00:00,  6.46it/s, acc=0.708, loss=0.657]
Epoch: 1/5: 100%|██████████| 1500/1500 [01:01<00:00, 24.48it/s]


train_loss: 0.6569, train_acc: 0.7083, val_loss: 0.4452, val_acc: 0.7604
val_precision: 0.6589, val_recall: 0.6460, val_f1: 0.6469
Epoch: 2/5


Epoch: 2/5: 100%|██████████| 2500/2500 [05:49<00:00,  7.16it/s, acc=0.761, loss=0.446]
Epoch: 2/5: 100%|██████████| 1500/1500 [01:01<00:00, 24.39it/s]


train_loss: 0.4457, train_acc: 0.7606, val_loss: 0.3844, val_acc: 0.7771
val_precision: 0.6818, val_recall: 0.6629, val_f1: 0.6686
Epoch: 3/5


Epoch: 3/5: 100%|██████████| 2500/2500 [05:59<00:00,  6.96it/s, acc=0.777, loss=0.367]
Epoch: 3/5: 100%|██████████| 1500/1500 [01:03<00:00, 23.79it/s]


train_loss: 0.3669, train_acc: 0.7771, val_loss: 0.3926, val_acc: 0.7711
val_precision: 0.6642, val_recall: 0.6913, val_f1: 0.6717
Epoch: 4/5


Epoch: 4/5: 100%|██████████| 2500/2500 [06:02<00:00,  6.89it/s, acc=0.793, loss=0.307]
Epoch: 4/5: 100%|██████████| 1500/1500 [01:12<00:00, 20.62it/s]


train_loss: 0.3068, train_acc: 0.7927, val_loss: 0.3579, val_acc: 0.7867
val_precision: 0.6898, val_recall: 0.7031, val_f1: 0.6931
Epoch: 5/5


Epoch: 5/5: 100%|██████████| 2500/2500 [05:59<00:00,  6.95it/s, acc=0.804, loss=0.261]
Epoch: 5/5: 100%|██████████| 1500/1500 [01:03<00:00, 23.76it/s]


train_loss: 0.2608, train_acc: 0.8045, val_loss: 0.3831, val_acc: 0.7668
val_precision: 0.6500, val_recall: 0.7083, val_f1: 0.6746
