# Delivery 02

# Structure:
+ Read data
+ Spell checker
+ HMM
+ Id features
+ Structured Perceptron (Notebook: Structured_Perceptron_Validation)
+ BERT

In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
%matplotlib inline

import matplotlib
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm, trange

import editdistance
import itertools
import re


In [2]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

pd.set_option('display.max_rows', 25)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', 50)

In [3]:
# Forward-fill: every missing value is replaced by the previous non-NaN value in
# its column (presumably this propagates the "Sentence #" id to every row of a
# sentence -- verify against the raw CSV).
# `.ffill()` is used because `fillna(method="ffill")` is deprecated in recent pandas.
data = pd.read_csv("../data/kaggle_ner/ner_dataset.csv",
                   encoding="latin1").ffill()

In [4]:
data.head()


Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O


In [5]:
print("Available TAGS:")
print(set(data["Tag"]))

Available TAGS:
{'B-org', 'I-org', 'B-nat', 'O', 'B-eve', 'B-geo', 'I-eve', 'I-gpe', 'B-per', 'B-art', 'I-per', 'I-tim', 'I-geo', 'I-art', 'B-tim', 'I-nat', 'B-gpe'}


In [6]:
# Print a small table with the first three words observed for every tag.
header_row = "{:10s} | {:20s}".format("TAG", "Examples")
print(header_row)
print("-"*60)
for tag in set(data["Tag"]):
    # First three words (in file order) that carry this tag.
    first_words = data.loc[data["Tag"] == tag, "Word"].iloc[:3].to_numpy()
    print("{:10s} | {:20s} {:20s} {:20s}".format(tag, *first_words))

TAG        | Examples            
------------------------------------------------------------
B-org      | Labor                International        IAEA                
I-org      | Party                Atomic               Energy              
B-nat      | H5N1                 H5N1                 Jing                
O          | Thousands            of                   demonstrators       
B-eve      | 2012                 Games                Games               
B-geo      | London               Iraq                 Hyde                
I-eve      | Summer               Olympics             Olympic             
I-gpe      | States               Korea                Binh                
B-per      | Bush                 President            Thomas              
B-art      | Nuclear              Saltillo             Pentastar           
I-per      | Mahmoud              Ahmadinejad          Horbach             
I-tim      | 8                    1                    2             

In [7]:
n_sentences = len(data['Sentence #'].unique())

In [8]:
n_sentences

47959

In [9]:
# Build one list of words (X_txt) and one list of tags (Y_txt) per sentence.
# Using groupby + apply(list) instead of an explicit Python loop reduced the
# computation from 2h+ to about 4s.
# NOTE(review): groupby sorts its keys by default and the keys are strings such as
# "Sentence: 1", so sentences presumably come out in lexicographic rather than
# numeric order. X_txt and Y_txt stay aligned either way -- confirm whether the
# original file order matters for the train/test split.
%time X_txt = list(data.groupby("Sentence #")['Word'].apply(list))
%time Y_txt = list(data.groupby("Sentence #")['Tag'].apply(list))

Wall time: 4.63 s
Wall time: 4.63 s


In [10]:
i = 0
xy = ["{}/{}".format(x,y) for x,y in zip(X_txt[i],Y_txt[i])]
" ".join(xy)

'Thousands/O of/O demonstrators/O have/O marched/O through/O London/B-geo to/O protest/O the/O war/O in/O Iraq/B-geo and/O demand/O the/O withdrawal/O of/O British/B-gpe troops/O from/O that/O country/O ./O'

## Word to pos and tag to pos

In [11]:
def build_word_to_pos(X):
    """Build the word <-> integer-id lookup tables.

    Parameters
    ----------
    X : DataFrame with a 'Word' column.

    Returns
    -------
    (word_to_pos, pos_to_word) : two dicts. Ids are assigned in order of first
    appearance of each distinct word in X['Word'].
    """
    vocabulary = X['Word'].unique()
    word_to_pos = dict(zip(vocabulary, range(len(vocabulary))))
    pos_to_word = dict(enumerate(vocabulary))
    return word_to_pos, pos_to_word
            
def build_tag_to_pos(Y):
    """Build the tag <-> integer-id lookup tables.

    Parameters
    ----------
    Y : DataFrame with a 'Tag' column.

    Returns
    -------
    (tag_to_pos, pos_to_tag) : two dicts. Ids are assigned in order of first
    appearance of each distinct tag in Y['Tag'].
    """
    tagset = Y['Tag'].unique()
    tag_to_pos = dict(zip(tagset, range(len(tagset))))
    pos_to_tag = dict(enumerate(tagset))
    return tag_to_pos, pos_to_tag

In [12]:
word_to_pos, pos_to_word = build_word_to_pos(data)
tag_to_pos, pos_to_tag  = build_tag_to_pos(data)

len(word_to_pos), len(tag_to_pos)

(35178, 17)

In [13]:
print(list(word_to_pos.items())[:10])
print(tag_to_pos)

[('Thousands', 0), ('of', 1), ('demonstrators', 2), ('have', 3), ('marched', 4), ('through', 5), ('London', 6), ('to', 7), ('protest', 8), ('the', 9)]
{'O': 0, 'B-geo': 1, 'B-gpe': 2, 'B-per': 3, 'I-geo': 4, 'B-org': 5, 'I-org': 6, 'B-tim': 7, 'B-art': 8, 'I-art': 9, 'I-per': 10, 'I-gpe': 11, 'I-tim': 12, 'B-nat': 13, 'B-eve': 14, 'I-eve': 15, 'I-nat': 16}


In [14]:
X = [[word_to_pos[w] for w in s] for s in X_txt]
Y = [[tag_to_pos[t] for t in s] for s in Y_txt]

In [15]:
n_split = 35971
X_train_txt, X_test_txt, Y_train_txt, Y_test_txt = X_txt[:n_split], X_txt[n_split:], Y_txt[:n_split], Y_txt[n_split:]

# HMM 

In [32]:
import scipy
import numpy as np

# From .py
from HiddenMarkovModel import *

import os,sys,inspect
import skseq

In [33]:
import skseq
import skseq.sequences
import skseq.readers

from skseq.sequences import sequence
from skseq.sequences import sequence_list
from skseq.sequences import label_dictionary

In [34]:
n_split = 35971
X_train_txt, X_test_txt, Y_train_txt, Y_test_txt = X_txt[:n_split], X_txt[n_split:], Y_txt[:n_split], Y_txt[n_split:]

In [18]:
hmm = HMM(word_to_pos, tag_to_pos)

In [19]:
hmm.fit(X_train_txt, Y_train_txt)

  return {"emission":   np.log(probs["emission"]),
  "transition": np.log(probs["transition"]),
  "final":      np.log(probs["final"]),
  "initial":    np.log(probs["initial"])}


In [24]:
import pickle
# # open a file, where you want to store the data
# file = open('HMM.pkl', 'wb')

# # dump information to that file
# pickle.dump(hmm, file)

# hmm = pickle.load(open( "HMM.pkl", "rb" ))

In [26]:
def evaluate_hmm():
    """Run the fitted `hmm` over the test sentences and split them by outcome.

    Reads the globals `hmm`, `X_test_txt` and `Y_test_txt`. The running token
    accuracy is shown in the progress-bar description.

    Returns
    -------
    (correct, mstks) : two lists of DataFrames, one per sentence, each with three
    rows (words, predicted tags, gold tags). `correct` holds sentences with no
    tagging error, `mstks` those with at least one.
    """
    n_tokens, n_errors = 0, 0
    correct, mstks = [], []

    tbar = tqdm(X_test_txt)
    for idx, words in enumerate(tbar):
        predicted = hmm.predict_labels(words)
        gold = Y_test_txt[idx]

        # Number of positions where prediction and gold tag disagree.
        n_wrong = sum(p != g for p, g in zip(predicted, gold))
        n_errors += n_wrong
        n_tokens += len(gold)

        sentence_frame = pd.DataFrame([words, predicted, gold])
        if n_wrong != 0:
            mstks.append(sentence_frame)
        else:
            correct.append(sentence_frame)
        tbar.set_description("Accuracy: {:6.4f}".format(1 - n_errors / n_tokens))
    return correct, mstks

In [27]:
correct, mstks = evaluate_hmm()

HBox(children=(FloatProgress(value=0.0, max=11988.0), HTML(value='')))






In [None]:
correct[0]

In [None]:
mstks[0]

# Structured perceptron

In [16]:
from skseq.sequences.sequence_list import SequenceList
from skseq.sequences.sequence import Sequence
from skseq.sequences.label_dictionary import LabelDictionary

# Label dictionaries over every distinct word / tag in the dataset.
x_dict = LabelDictionary(label_names=data['Word'].unique())
y_dict = LabelDictionary(label_names=data['Tag'].unique())


def _build_sequence_list(sentences, tag_sequences):
    """Create a SequenceList and add one sequence per (words, tags) pair."""
    seq_list = SequenceList(x_dict, y_dict)
    for words, tags in zip(sentences, tag_sequences):
        seq_list.add_sequence(words, tags, x_dict, y_dict)
    return seq_list


train_seq_list = _build_sequence_list(X_train_txt, Y_train_txt)
test_seq_list = _build_sequence_list(X_test_txt, Y_test_txt)

In [17]:
# Build the feature mapper from the training sequences.
# NOTE(review): `skseq.sequences.id_feature` is reached by attribute access and is
# not imported explicitly in this notebook -- presumably another skseq import
# loads it; confirm on a fresh kernel.
feature_mapper = skseq.sequences.id_feature.IDFeatures(train_seq_list)
feature_mapper.build_features()
# Alternative feature set, currently disabled:
# from skseq.sequences import extended_feature
# feature_mapper = extended_feature.ExtendedFeatures(train_seq_list)
# feature_mapper.build_features()

# Report the sizes of the structures produced by build_features().
print('Feature dict length:', len(feature_mapper.feature_dict))
print('Feature list length:', len(feature_mapper.feature_list))

import skseq.sequences.structured_perceptron_validation as spc
# sp = spc.StructuredPerceptron(corpus.word_dict, corpus.tag_dict, feature_mapper)
# Perceptron used by the training / prediction helpers below (non-averaged).
sp = spc.StructuredPerceptronValidation(x_dict, y_dict, feature_mapper, averaged=False)


def train_perceptron(load_no_fit = True, num_epochs = 50, epochs_before_stopping = 5, dir_to_params = './'):
    if not load_no_fit:
        print('Training for %i epochs with early stopping after %i epochs of no improvement' % (num_epochs, epochs_before_stopping))
#         %time sp.fit(dummy_seq_list, val_seq_list, num_epochs, epochs_before_stopping, dir_to_params)
        %time sp.fit(train_seq_list, test_seq_list, num_epochs, epochs_before_stopping, dir_to_params)
    else:
        sp.load_model(dir_to_params)
        
        
import pandas as pd
from IPython.core import display as ICD


Feature dict length: 68502
Feature list length: 35971


In [None]:
def evaluate_perceptron():
    """Decode the train and test corpora with `sp` and print both accuracies.

    Reads the globals `sp`, `train_seq_list` and `test_seq_list`; scoring is
    delegated to `evaluate_corpus`.
    NOTE(review): `evaluate_corpus` is not defined in the visible part of this
    notebook -- presumably it comes from one of the star/skseq imports; confirm.
    """
    corpora = (train_seq_list, test_seq_list)

    # Decode both corpora first, then score them (train, then test).
    decoded = [sp.viterbi_decode_corpus(corpus) for corpus in corpora]
    accuracies = tuple(evaluate_corpus(corpus.seq_list, prediction)
                       for corpus, prediction in zip(corpora, decoded))

    print("SP -  Accuracy Train: %.3f Test: %.3f"%accuracies)
    
def predict_text_tags(seq, nice_format=False, change_word=False, sensibility=2):
    """Tag a sentence with the global perceptron `sp` and display the result.

    Parameters
    ----------
    seq : str or list of str
        The sentence; a string is split on whitespace.
    nice_format : bool
        When True, show a styled two-row DataFrame (words / tags) and skip
        spelling correction. When False, print "word/TAG" pairs with non-'O'
        tags coloured via ANSI escapes.
    change_word : bool
        In the plain-text mode, print the corrected spelling instead of the
        word as typed.
    sensibility : int
        Maximum edit distance at which an out-of-vocabulary word is replaced by
        its closest training word before re-tagging.

    Notes
    -----
    Reads the globals `sp`, `X_train_txt`, `ICD` and calls `edit_ditance_word`.
    The input list is copied, so the caller's list is never modified.
    """
    assert isinstance(seq, str) or isinstance(seq, list), "The input must be a sentence (string format or a list of words)"

    # Private working copy: corrections are written into it position by position.
    words = seq.split() if isinstance(seq, str) else list(seq)

    num2lab = {v:k for k,v in sp.state_labels.items()}
    if nice_format:
        grp = pd.DataFrame([words, [num2lab[w] for w in sp.predict_tags_given_words(words)]],
                           index=["Words", "Tags"], columns=["W_{:02d}".format(i) for i in range(len(words))])
        grp = grp.style.applymap(lambda x: 'color: blue' if x != 'O' and x in sp.state_labels else 'color: black')
        ICD.display(grp)
        return

    # A set makes each membership test O(1) instead of scanning every training token.
    vocabulary = set(itertools.chain.from_iterable(X_train_txt))

    res = ""
    initial_tags = sp.predict_tags_given_words(words)
    for pos, (o, w) in enumerate(zip(words, initial_tags)):
        if o not in vocabulary:
            correct_o, dist = edit_ditance_word(o, X_train_txt)
            if dist <= sensibility:
                # Use the current position directly: looking the corrected word up
                # with list.index() would return an earlier identical word's tag.
                words[pos] = correct_o
                w = sp.predict_tags_given_words(words)[pos]
                if change_word:
                    o = correct_o

        ft = " {}/{}" if num2lab[w]=='O' else " {}/\x1b[34m{}\x1b[0m"
        res += ft.format(o, num2lab[w])
    print(res)
    
    
def predict_batch_text_tags(batch, nice_format=False, change_word=False, sensibility=2):
    """Call `predict_text_tags` on every sentence of `batch`.

    The batch must not mix string sentences with list-of-words sentences; an
    AssertionError is raised if it contains at least one of each. The remaining
    arguments are forwarded unchanged to `predict_text_tags`.
    """
    n_strings = sum(1 for sentence in batch if isinstance(sentence, str))
    n_lists = sum(1 for sentence in batch if isinstance(sentence, list))
    assert not (n_strings and n_lists), "The inputs must be sentences (string format or lists of words)"

    for sentence in batch:
        predict_text_tags(sentence, nice_format, change_word, sensibility)
        
        
def word_in_corpus(phrase):
    """Replace out-of-vocabulary words of `phrase` by a one-edit-away training word.

    Parameters
    ----------
    phrase : list of str
        Modified in place and also returned.

    Each word absent from the training vocabulary (global `X_train_txt`) is
    printed; if its closest training word is at edit distance exactly 1, that
    word is written back into `phrase`.
    """
    # Set membership avoids scanning the whole token list for every word.
    vocabulary = set(itertools.chain.from_iterable(X_train_txt)) # Use train words
    for i, word in enumerate(phrase):
        if word not in vocabulary:
            print(word)
            # edit_ditance_word needs the corpus as its second argument.
            palabra, dist = edit_ditance_word(word, X_train_txt)
            if dist==1:
                phrase[i] = palabra
    return phrase


def edit_ditance_word(mistake, X_train_txt):
    """Find the corpus word with the smallest edit distance to `mistake`.

    Parameters
    ----------
    mistake : str
        The (possibly misspelled) word, e.g. "Barchelona".
    X_train_txt : iterable of iterables of str
        The sentences whose words form the candidate vocabulary.

    Returns
    -------
    (word, distance) : the closest word and its edit distance. Ties go to the
    word that appears first in the corpus.

    Raises
    ------
    ValueError : if the corpus contains no words.
    """
    # dict.fromkeys drops duplicates while keeping first-occurrence order, so each
    # distinct word is scored once and ties resolve as in a full token scan.
    vocabulary = list(dict.fromkeys(w for seq in X_train_txt for w in seq))
    if not vocabulary:
        raise ValueError("X_train_txt contains no words to compare against")

    distances = [editdistance.eval(mistake, word) for word in vocabulary]
    best = int(np.argmin(distances))
    return vocabulary[best], distances[best]

In [None]:
num_epochs = 100
epochs_before_stopping = 5
train_perceptron(load_no_fit=False, num_epochs = num_epochs, epochs_before_stopping = epochs_before_stopping)

# BERT
Bidirectional Encoder Representations from Transformers

In [61]:
class SentenceGetter(object):
    """Group a token-level DataFrame into sentences.

    `data` must have the columns "Sentence #", "Word", "POS" and "Tag". After
    construction:

    * `grouped` is the result of grouping by "Sentence #", with one list of
      (word, pos, tag) tuples per sentence;
    * `sentences` is the same content as a plain Python list.
    """

    def __init__(self, data):
        self.n_sent = 1      # number used to build the next "Sentence: N" key
        self.data = data
        self.empty = False   # not read anywhere in this class

        def agg_func(s):
            # One (word, pos, tag) tuple per row of the sentence group.
            return list(zip(s["Word"].values.tolist(),
                            s["POS"].values.tolist(),
                            s["Tag"].values.tolist()))

        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        self.sentences = [s for s in self.grouped]

    def get_next(self):
        """Return the sentence keyed "Sentence: <n_sent>" and advance the counter.

        Returns None, without advancing, when no sentence has that key.
        """
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
        except KeyError:
            # Only a missing key means "no more sentences"; other errors propagate.
            return None
        self.n_sent += 1
        return s

In [62]:
getter = SentenceGetter(data)

In [63]:
sentences = [[word[0] for word in sentence] for sentence in getter.sentences]
print(sentences[0])

['Thousands', 'of', 'demonstrators', 'have', 'marched', 'through', 'London', 'to', 'protest', 'the', 'war', 'in', 'Iraq', 'and', 'demand', 'the', 'withdrawal', 'of', 'British', 'troops', 'from', 'that', 'country', '.']


In [64]:
labels = [[s[2] for s in sentence] for sentence in getter.sentences]
print(labels[0])

['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O', 'O']


In [65]:
# Sort the distinct tags so the tag -> index mapping is the same on every run:
# iterating a set of strings has no guaranteed order across interpreter sessions,
# which would silently change the label ids between kernels.
tag_values = sorted(set(data["Tag"].values))
# Extra label used for padded positions; it gets the last index.
tag_values.append("PAD")

tag2idx = {t: i for i, t in enumerate(tag_values)}

### The tag2idx dict is saved so it can be used in the reproduce_results notebook

In [58]:
import pickle
# Write the mapping through a context manager so the file handle is flushed and
# closed as soon as the dump finishes.
with open("bert_tag2idx.pkl", "wb") as tag2idx_file:
    pickle.dump(tag2idx, tag2idx_file)

In [66]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertConfig

from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [67]:
MAX_LEN = 75
bs = 32

### If there is a GPU available, use it

In [68]:
# GPU
# Preferred CUDA device index; it is only used when that device actually exists.
gpu_id = 3

n_gpu = torch.cuda.device_count()
if torch.cuda.is_available():
    # Fall back to the first GPU when the preferred index is out of range.
    if gpu_id >= n_gpu:
        gpu_id = 0
    device = torch.device("cuda:{}".format(gpu_id))
    device_name = torch.cuda.get_device_name(gpu_id)
else:
    # No CUDA: querying a GPU name would raise, so report the CPU instead.
    device = torch.device("cpu")
    device_name = "cpu"
device_name

'GeForce RTX 2080 Ti'

In [69]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)

In [70]:
def tokenize_and_preserve_labels(sentence, text_labels):
    """Tokenize a sentence word by word, repeating each label per sub-token.

    Every word is passed to the global `tokenizer`; its label is copied once
    for each piece the tokenizer returns, so tokens and labels stay aligned.

    Returns
    -------
    (tokens, labels) : two lists of equal length.
    """
    pieces, piece_labels = [], []

    for token, tag in zip(sentence, text_labels):
        sub_tokens = tokenizer.tokenize(token)
        pieces += sub_tokens
        # One copy of the word's label per sub-token.
        piece_labels += [tag] * len(sub_tokens)

    return pieces, piece_labels


In [71]:
tokenized_texts_and_labels = [tokenize_and_preserve_labels(sent, labs) for sent, labs in tqdm(zip(sentences, labels), total=len(sentences))]

HBox(children=(FloatProgress(value=0.0, max=47959.0), HTML(value='')))




In [72]:
tokenized_texts = [token_label_pair[0] for token_label_pair in tokenized_texts_and_labels]
labels = [token_label_pair[1] for token_label_pair in tokenized_texts_and_labels]

In [73]:
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                          maxlen=MAX_LEN, dtype="long", value=0.0,
                          truncating="post", padding="post")


In [74]:
tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in labels],
                     maxlen=MAX_LEN, value=tag2idx["PAD"], padding="post",
                     dtype="long", truncating="post")

In [75]:
attention_masks = [[float(i != 0.0) for i in ii] for ii in input_ids]

### We use the sentences up to 35970 for training and the others for testing

In [76]:
n_split = 35971

tr_inputs, val_inputs, tr_tags, val_tags = input_ids[:n_split], input_ids[n_split:], tags[:n_split], tags[n_split:]
tr_masks, val_masks = attention_masks[:n_split], attention_masks[n_split:]

In [77]:
tr_inputs = torch.tensor(tr_inputs)
val_inputs = torch.tensor(val_inputs)

tr_tags = torch.tensor(tr_tags)
val_tags = torch.tensor(val_tags)

tr_masks = torch.tensor(tr_masks)
val_masks = torch.tensor(val_masks)

In [78]:
train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=bs)

valid_data = TensorDataset(val_inputs, val_masks, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=bs)


In [79]:
import transformers
from transformers import BertForTokenClassification, AdamW

In [80]:
# Load a pretrained model 
model = BertForTokenClassification.from_pretrained(
    "bert-base-cased",
    num_labels=len(tag2idx),
    output_attentions = False,
    output_hidden_states = False,
#     state_dict = torch.load("BERT_ep_4_acc_0.9611.pt")
)
model.to(device=device)

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

In [81]:
# True: fine-tune every parameter of the model; False: train only the classifier head.
FULL_FINETUNING = True
if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    # Parameters excluded from weight decay: biases and LayerNorm weights
    # (the modules are named `LayerNorm` in the model printed above).
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        # The per-group key AdamW reads is `weight_decay`; an unrecognised key
        # leaves the group on the optimizer's default decay.
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
else:
    param_optimizer = list(model.classifier.named_parameters())
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]

optimizer = AdamW(
    optimizer_grouped_parameters,
    lr=3e-5,
    eps=1e-8
)

In [82]:
from transformers import get_linear_schedule_with_warmup

epochs = 5
max_grad_norm = 1.0

# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

In [83]:
# !pip3 install seqeval

In [84]:
from seqeval.metrics import f1_score, accuracy_score

### Train and save the model at each epoch so we can retrieve the best one for the test set

On a GPU this code runs in about 3 min/epoch.
On a CPU it takes over 30 min/epoch.

In [86]:
## Store the average loss after each epoch so we can plot them.
loss_values, validation_loss_values = [], []

# Offset added to the epoch number in checkpoint file names
# (presumably non-zero when resuming from a saved checkpoint).
pretrained_epoch = 0

# NOTE(review): `optimizer` and `scheduler` are created in earlier cells and the
# scheduler is sized for `epochs` full passes. Re-running this cell without
# re-creating them continues the already-decayed schedule -- confirm before re-running.
for ep_num in tqdm(range(epochs), desc="Epoch"):
    # ========================================
    #               Training
    # ========================================
    # Perform one full pass over the training set.

    # Put the model into training mode.
    model.train()
    # Reset the total loss for this epoch.
    total_loss = 0

    # Training loop
    for step, batch in tqdm(enumerate(train_dataloader), leave=False, total=len(train_dataloader), desc="Training"):
        # add batch to gpu
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        # Always clear any previously calculated gradients before performing a backward pass.
        model.zero_grad()
        # forward pass
        # Because `labels` is provided, the first output is the loss.
        outputs = model(b_input_ids, token_type_ids=None,
                        attention_mask=b_input_mask, labels=b_labels)
        # get the loss
        loss = outputs[0]
        # Perform a backward pass to calculate the gradients.
        loss.backward()
        # track train loss
        total_loss += loss.item()
        # Clip the norm of the gradient
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        # update parameters
        optimizer.step()
        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over the training data.
    avg_train_loss = total_loss / len(train_dataloader)
    print("Average train loss: {}".format(avg_train_loss))

    # Store the loss value for plotting the learning curve.
    loss_values.append(avg_train_loss)


    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.

    # Put the model into evaluation mode
    model.eval()
    # Reset the validation loss for this epoch.
    eval_loss = 0
    predictions , true_labels = [], []

    for batch in tqdm(valid_dataloader, desc="Validation", leave=False):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        # Telling the model not to compute or store gradients,
        # saving memory and speeding up validation
        with torch.no_grad():
            # Labels are provided here as well, so outputs[0] is the loss
            # and outputs[1] holds the logits.
            outputs = model(b_input_ids, token_type_ids=None,
                            attention_mask=b_input_mask, labels=b_labels)
        # Move logits and labels to CPU
        logits = outputs[1].detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Accumulate the loss and the per-token predictions for this batch.
        eval_loss += outputs[0].mean().item()
        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        true_labels.extend(label_ids)

    eval_loss = eval_loss / len(valid_dataloader)
    validation_loss_values.append(eval_loss)
    print("Validation loss: {}".format(eval_loss))

    # Build one tag list per sentence, dropping positions whose gold label is "PAD".
    # seqeval's entity-level F1 expects lists of lists (one inner list per sequence),
    # passed as (y_true, y_pred).
    valid_tags = [[tag_values[l_i] for l_i in l if tag_values[l_i] != "PAD"]
                  for l in true_labels]
    pred_tags = [[tag_values[p_i] for p_i, l_i in zip(p, l) if tag_values[l_i] != "PAD"]
                 for p, l in zip(predictions, true_labels)]

    val_accuracy = accuracy_score(valid_tags, pred_tags)
    val_f1 = f1_score(valid_tags, pred_tags)
    print("Validation Accuracy: {}".format(val_accuracy))
    print("Validation F1-Score: {}".format(val_f1))
    print()
    # Checkpoint after every epoch; the accuracy is embedded in the file name.
    torch.save(model.state_dict(), 'BERT_ep_{}_acc_{:6.4f}.pt'.format(ep_num+pretrained_epoch, val_accuracy))

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=5.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Training', max=1125.0, style=ProgressStyle(description_wi…

Average train loss: 0.012632345520839508


HBox(children=(FloatProgress(value=0.0, description='Validation', max=375.0, style=ProgressStyle(description_w…

Validation loss: 0.22116888908545176
Validation Accuracy: 0.9609117368259436
Validation F1-Score: 0.8257091128545565



HBox(children=(FloatProgress(value=0.0, description='Training', max=1125.0, style=ProgressStyle(description_wi…

Average train loss: 0.012428074112265474


HBox(children=(FloatProgress(value=0.0, description='Validation', max=375.0, style=ProgressStyle(description_w…

Validation loss: 0.22116888908545176
Validation Accuracy: 0.9609117368259436
Validation F1-Score: 0.8257091128545565



HBox(children=(FloatProgress(value=0.0, description='Training', max=1125.0, style=ProgressStyle(description_wi…

Average train loss: 0.012548546395885448


HBox(children=(FloatProgress(value=0.0, description='Validation', max=375.0, style=ProgressStyle(description_w…

Validation loss: 0.22116888908545176
Validation Accuracy: 0.9609117368259436
Validation F1-Score: 0.8257091128545565



HBox(children=(FloatProgress(value=0.0, description='Training', max=1125.0, style=ProgressStyle(description_wi…




KeyboardInterrupt: 