# A quick evaluation on ESC and CTB

In [None]:
import torch
import torch.nn as nn
from datasets import load_from_disk
from model import Causal_Model
from transformers import AutoTokenizer, DataCollatorWithPadding
from torch.utils.data import DataLoader
from tqdm import tqdm
from utils import compute_metrics

In [None]:
# Hugging Face checkpoint shared by the tokenizer and the model below.
checkpoint = 'google-bert/bert-large-uncased'

# Extra special tokens registered with the tokenizer
# (presumably the markers around the two events in the tagged sentences).
special_tokens_dict = {
    'additional_special_tokens': ['<e1>', '</e1>', '<e2>', '</e2>'],
}

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenizer.add_special_tokens(special_tokens_dict)

def tokenize_function_mask(examples):
    """Tokenize the ``event_masked_sentence`` field of ``examples``.

    Used as the callback for batched ``Dataset.map``; relies on the
    module-level ``tokenizer`` and enables truncation.
    """
    masked_sentences = examples["event_masked_sentence"]
    encoded = tokenizer(masked_sentences, truncation=True)
    return encoded

def tokenize_function_tag(examples):
    """Tokenize the ``event_tagged_sentence`` field of ``examples``.

    Used as the callback for batched ``Dataset.map``; relies on the
    module-level ``tokenizer`` and enables truncation.
    """
    tagged_sentences = examples["event_tagged_sentence"]
    encoded = tokenizer(tagged_sentences, truncation=True)
    return encoded

## ESC evaluation demo

In [None]:
# Load the saved ESC test fold and build two tokenized views of it:
# one from the event-masked sentences, one from the event-tagged sentences.
test_fold = load_from_disk('dataset/ESC/ESC_test_fold4')

# Raw columns dropped after tokenization; the remaining columns are
# returned as torch tensors.
raw_columns = ['sentence', 'event_tagged_sentence', 'event_masked_sentence', 'e1', 'e2']

masked_test_fold = (
    test_fold
    .map(tokenize_function_mask, batched=True, batch_size=32)
    .remove_columns(raw_columns)
)
masked_test_fold.set_format("torch")

tagged_test_fold = (
    test_fold
    .map(tokenize_function_tag, batched=True, batch_size=32)
    .remove_columns(raw_columns)
)
tagged_test_fold.set_format("torch")

print(f"test len: {len(masked_test_fold)}")

In [None]:
# Batch size shared by both evaluation loaders.
test_btz = 20

# Collator that pads each batch using the tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Neither loader shuffles, so iterating them in lockstep yields the
# masked and tagged encodings of the same examples.
dataloader_mask_test = DataLoader(
    masked_test_fold,
    batch_size=test_btz,
    shuffle=False,
    collate_fn=data_collator,
)
dataloader_tag_test = DataLoader(
    tagged_test_fold,
    batch_size=test_btz,
    shuffle=False,
    collate_fn=data_collator,
)

# Only the masked loader is wrapped, so a single progress bar is shown.
dataloader_mask_test = tqdm(dataloader_mask_test, dynamic_ncols=True)

In [None]:
# Use the GPU when one is present; otherwise fall back to CPU so the
# demo still runs on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Hyperparameters must match the ones the checkpoint was trained with.
model = Causal_Model(
    bert_path=checkpoint,
    d_model=1024,
    num_heads=16,
    dropout_rate=0.5,
    device=device,
    visualize=False,
)

# map_location remaps tensors saved from a GPU onto the selected device,
# which is required to load the checkpoint on a CPU-only machine.
state_dict = torch.load(
    './model_checkpoints/ESC/best_model_fold4.pt',
    map_location=device,
)
model.load_state_dict(state_dict)
model = model.to(device)
criterion = nn.CrossEntropyLoss()

In [None]:
# Evaluate the loaded checkpoint on the ESC test fold: collect predictions
# and gold labels over all batches, then report precision / recall / F1
# and the mean of the per-batch losses.
model.eval()
mean_loss_test = 0.0
predicted_all_test = []
gold_all_test = []
with torch.no_grad():
    # Both loaders iterate the same fold without shuffling, so zip pairs
    # the masked and tagged encodings of the same examples.
    batch_pairs = zip(dataloader_mask_test, dataloader_tag_test)
    for iteration, (mask_data, tag_data) in enumerate(batch_pairs):
        mask_data, tag_data = mask_data.to(device), tag_data.to(device)
        # Already on `device` after the batch-level move above.
        labels = tag_data['labels']

        # The labels are removed from both batches before the forward pass.
        del mask_data['labels']
        del tag_data['labels']

        outputs = model(mask_data, tag_data).squeeze(1)
        loss = criterion(outputs, labels)

        # Running mean over batch losses (each batch weighted equally).
        mean_loss_test = (mean_loss_test * iteration + loss.detach()) / (iteration + 1)

        predicted = torch.argmax(outputs, dim=-1)
        predicted_all_test.extend(predicted.cpu().numpy())
        gold_all_test.extend(labels.cpu().numpy())

precision_t, recall_t, f1_score_t = compute_metrics(gold_all_test, predicted_all_test)
# float() accepts both a 0-dim tensor and the initial 0.0, so the report
# does not fail with AttributeError when the loop never ran.
print(f"[test ESC fold 4] p:{precision_t*100:.2f} r:{recall_t*100:.2f} F1:{f1_score_t*100:.2f} loss:{float(mean_loss_test):.4f}")

## CTB evaluation demo

In [None]:
# Load the saved CTB test fold and build two tokenized views of it:
# one from the event-masked sentences, one from the event-tagged sentences.
test_fold = load_from_disk('dataset/CTB/CTB_test_fold2')

# Raw columns dropped after tokenization; the remaining columns are
# returned as torch tensors.
raw_columns = ['sentence', 'event_tagged_sentence', 'event_masked_sentence', 'e1', 'e2']

masked_test_fold = (
    test_fold
    .map(tokenize_function_mask, batched=True, batch_size=32)
    .remove_columns(raw_columns)
)
masked_test_fold.set_format("torch")

tagged_test_fold = (
    test_fold
    .map(tokenize_function_tag, batched=True, batch_size=32)
    .remove_columns(raw_columns)
)
tagged_test_fold.set_format("torch")

print(f"test len: {len(masked_test_fold)}")

In [None]:
# Batch size shared by both evaluation loaders.
test_btz = 20

# Collator that pads each batch using the tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Neither loader shuffles, so iterating them in lockstep yields the
# masked and tagged encodings of the same examples.
dataloader_mask_test = DataLoader(
    masked_test_fold,
    batch_size=test_btz,
    shuffle=False,
    collate_fn=data_collator,
)
dataloader_tag_test = DataLoader(
    tagged_test_fold,
    batch_size=test_btz,
    shuffle=False,
    collate_fn=data_collator,
)

# Only the masked loader is wrapped, so a single progress bar is shown.
dataloader_mask_test = tqdm(dataloader_mask_test, dynamic_ncols=True)

In [None]:
# Use the GPU when one is present; otherwise fall back to CPU so the
# demo still runs on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Hyperparameters must match the ones the checkpoint was trained with.
model = Causal_Model(
    bert_path=checkpoint,
    d_model=1024,
    num_heads=16,
    dropout_rate=0.5,
    device=device,
    visualize=False,
)

# map_location remaps tensors saved from a GPU onto the selected device,
# which is required to load the checkpoint on a CPU-only machine.
state_dict = torch.load(
    './model_checkpoints/CTB/best_model_fold2.pt',
    map_location=device,
)
model.load_state_dict(state_dict)
model = model.to(device)
criterion = nn.CrossEntropyLoss()

In [None]:
# Evaluate the loaded checkpoint on the CTB test fold: collect predictions
# and gold labels over all batches, then report precision / recall / F1
# and the mean of the per-batch losses.
model.eval()
mean_loss_test = 0.0
predicted_all_test = []
gold_all_test = []
with torch.no_grad():
    # Both loaders iterate the same fold without shuffling, so zip pairs
    # the masked and tagged encodings of the same examples.
    batch_pairs = zip(dataloader_mask_test, dataloader_tag_test)
    for iteration, (mask_data, tag_data) in enumerate(batch_pairs):
        mask_data, tag_data = mask_data.to(device), tag_data.to(device)
        # Already on `device` after the batch-level move above.
        labels = tag_data['labels']

        # The labels are removed from both batches before the forward pass.
        del mask_data['labels']
        del tag_data['labels']

        outputs = model(mask_data, tag_data).squeeze(1)
        loss = criterion(outputs, labels)

        # Running mean over batch losses (each batch weighted equally).
        mean_loss_test = (mean_loss_test * iteration + loss.detach()) / (iteration + 1)

        predicted = torch.argmax(outputs, dim=-1)
        predicted_all_test.extend(predicted.cpu().numpy())
        gold_all_test.extend(labels.cpu().numpy())

precision_t, recall_t, f1_score_t = compute_metrics(gold_all_test, predicted_all_test)
# float() accepts both a 0-dim tensor and the initial 0.0, so the report
# does not fail with AttributeError when the loop never ran.
print(f"[test CTB fold 2] p:{precision_t*100:.2f} r:{recall_t*100:.2f} F1:{f1_score_t*100:.2f} loss:{float(mean_loss_test):.4f}")