# Model 03a

Evidence retrieval using a Siamese BERT classification model.
This is similar to Model 01, however, it only uses official pre-trained models from hugging face.

Ref:
- [Hugging face pre-trained models](https://huggingface.co/transformers/v3.3.1/pretrained_models.html)
- [Hugging face guide to fine-tuning](https://huggingface.co/transformers/v3.3.1/custom_datasets.html)
- [Hugging face guide to fine-tuning easy](https://huggingface.co/docs/transformers/training)
- [SO Guide](https://stackoverflow.com/a/64156912)

## Setup

### Working Directory

In [1]:
# Change the working directory to project root
from pathlib import Path
import os
ROOT_DIR = Path.cwd()
while not ROOT_DIR.joinpath("src").exists():
    ROOT_DIR = ROOT_DIR.parent
os.chdir(ROOT_DIR)

### File paths

In [2]:
MODEL_PATH = ROOT_DIR.joinpath("./result/models/*")
DATA_PATH = ROOT_DIR.joinpath("./data/*")
NER_PATH = ROOT_DIR.joinpath("./result/ner/*")

### Dependencies

In [3]:
# Imports and dependencies
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn import Module, TripletMarginWithDistanceLoss
from transformers import BertModel, BertTokenizer
from torch.optim import Adam
from torch.optim.lr_scheduler import LinearLR
from torcheval.metrics import BinaryAccuracy, BinaryF1Score

from src.torch_utils import get_torch_device
import json
from dataclasses import dataclass
from typing import List, Union, Tuple
from tqdm import tqdm
import random
import numpy as np
from datetime import datetime
from math import exp

TORCH_DEVICE = get_torch_device()

  from .autonotebook import tqdm as notebook_tqdm


Torch device is 'mps'


## Dataset

In [4]:
@dataclass
class ClaimEvidencePair:
    claim_id:str
    evidence_id:str
    label:int = 0

In [5]:
@dataclass
class ClaimEvidenceTriple:
    claim_id:str
    pos_evidence_id:str
    neg_evidence_id:str

In [6]:
class SiameseEvalDataset(Dataset):
    def __init__(
        self,
        dev_claims_path:Path,
        evidence_path:Path,
        device = None,
        verbose:bool=True
    ) -> None:
        super(SiameseEvalDataset, self).__init__()
        self.verbose = verbose
        self.device = device
        
        # Load claims data from json
        with open(dev_claims_path, mode="r") as f:
            self.claims = (json.load(fp=f))

        # Load evidence library
        self.evidence = dict()
        with open(evidence_path, mode="r") as f:
            self.evidence.update(json.load(fp=f))
        
        # Get a list of all evidences within the dev set
        self.related_evidences = sorted({
            evidence_id
            for claim in self.claims.values()
            for evidence_id in claim["evidences"]
        })
        
        # Generate the data
        self.data = self.__generate_data()
        return
        
    def __generate_data(self):
        data = []
        for claim_id, claim in tqdm(
            iterable=self.claims.items(),
            desc="claims",
            disable=not self.verbose
        ):
            evidence_ids = claim["evidences"]
            
            # Get the positives
            for evidence_id in evidence_ids:
                data.append(ClaimEvidencePair(
                    claim_id=claim_id,
                    evidence_id=evidence_id,
                    label=1
                ))
            
            # Get some negatives
            n_neg = 0
            for rel_evidence_id in self.related_evidences:
                if n_neg >= 10:
                    break
                if rel_evidence_id in evidence_ids:
                    continue
                data.append(ClaimEvidencePair(
                    claim_id=claim_id,
                    evidence_id=rel_evidence_id,
                    label=-1
                ))
                n_neg += 1
        return data
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx) -> Tuple[Union[str, torch.Tensor]]:
        # Fetch the required data rows
        data = self.data[idx]
        
        # Get the label
        label = torch.tensor(data.label, device=self.device)
        
        # Get text ids
        claim_id = data.claim_id
        evidence_id = data.evidence_id
        
        # Get text
        claim_text = self.claims[claim_id]["claim_text"]
        evidence_text = self.evidence[evidence_id]

        return (claim_text, evidence_text, label)

In [7]:
class SiameseTripletDataset(Dataset):
    
    def __init__(
        self,
        claims_paths:List[Path],
        claims_shortlist_paths:List[Path],
        evidence_path:Path,
        evidence_shortlists:List[Path] = None,
        device = None,
        n_neg_shortlist:int = 10,
        n_neg_general:int = 10,
        verbose:bool=True
    ) -> None:
        super(SiameseTripletDataset, self).__init__()
        self.verbose = verbose
        self.device = device
        self.n_neg_shortlist = n_neg_shortlist
        self.n_neg_general = n_neg_general

        # Load claims data from json, this is a list as we could use
        # multiple json files in the same dataset
        self.claims = dict()
        for json_file in claims_paths:
            with open(json_file, mode="r") as f:
                self.claims.update(json.load(fp=f))
                # print(f"loaded claims: {json_file}")
        
        # Load the pre-retrieved shortlist of evidences by claim
        self.claims_shortlist = dict()
        for json_file in claims_shortlist_paths:
            with open(json_file, mode="r") as f:
                self.claims_shortlist.update(json.load(fp=f))
                # print(f"loaded claims_shortlist: {json_file}")
        
        # Load evidence library
        self.evidence = dict()
        with open(evidence_path, mode="r") as f:
            self.evidence.update(json.load(fp=f))
            # print(f"loaded evidences: {json_file}")
        
        # Load the evidence shortlists if available
        # Reduce the overall evidence list to the shortlist
        if evidence_shortlists is not None:
            self.evidence_shortlist = set()
            for json_file in evidence_shortlists:
                with open(json_file, mode="r") as f:
                    self.evidence_shortlist.update(json.load(fp=f))
                    # print(f"loaded evidence shortlist: {json_file}")
        
        # print(f"n_evidences: {len(self.evidence)}")
        
        # Generate the data
        self.data = self.__generate_data()
        return

    def __generate_data(self):
        print("Generate siamese dataset")
        
        data = []
        for claim_id, claim in tqdm(
            iterable=self.claims.items(),
            desc="claims",
            disable=not self.verbose
        ):
            pos_evidence_ids = set(claim["evidences"])
            shortlist_neg_evidence_ids = \
                set(self.claims_shortlist.get(claim_id, [])) \
                .difference(pos_evidence_ids)
            general_neg_evidence_ids = self.evidence_shortlist
            
            # For each positive evidence
            for pos_evidence_id in pos_evidence_ids:
                
                # Add i number of negatives from shortlisted evidences
                for i in range(self.n_neg_shortlist):
                    if len(shortlist_neg_evidence_ids) > 0:
                        neg_evidence_ids = random.sample(
                            population=shortlist_neg_evidence_ids,
                            k=min(
                                self.n_neg_shortlist,
                                len(shortlist_neg_evidence_ids)
                            )
                        )
                        
                        for neg_evidence_id in neg_evidence_ids:
                            data.append(ClaimEvidenceTriple(
                                claim_id=claim_id,
                                pos_evidence_id=pos_evidence_id,
                                neg_evidence_id=neg_evidence_id
                            ))
                
                # Add j number of negatives from general evidences
                for j in range(self.n_neg_general):
                    neg_evidence_ids = random.sample(
                        population=general_neg_evidence_ids,
                        k=min(self.n_neg_general, len(general_neg_evidence_ids))
                    )
                    
                    for neg_evidence_id in neg_evidence_ids:
                        data.append(ClaimEvidenceTriple(
                            claim_id=claim_id,
                            pos_evidence_id=pos_evidence_id,
                            neg_evidence_id=neg_evidence_id
                        ))
            continue
        
        print(f"Generated data n={len(data)}")
        
        return data

    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx) -> Tuple[str]:
        # Fetch the required data rows
        data = self.data[idx]
        
        # Get text
        anchor_text = self.claims[data.claim_id]["claim_text"]
        pos_evidence_text = self.evidence[data.pos_evidence_id]
        neg_evidence_text = self.evidence[data.neg_evidence_id]

        return (anchor_text, pos_evidence_text, neg_evidence_text)

In [8]:
# WE WILL GENERATE THE DATASET PER EPOCH SO TO RANDOMISE THE NEGATIVE SAMPLES

# train_data = SiameseTripletDataset(
#     claims_paths=[DATA_PATH.with_name("train-claims.json")],
#     claims_shortlist_paths=[NER_PATH.with_name("train_claim_evidence_retrieved.json")],
#     evidence_shortlists=[NER_PATH.with_name("shortlist_train_claim_evidence_retrieved.json")],
#     evidence_path=DATA_PATH.with_name("evidence.json"),
#     device=TORCH_DEVICE,
#     n_neg_shortlist=2,
#     n_neg_general=1
# )

In [9]:
# for i in range(50):
#     anchor, pos, neg = train_data[i]
#     print(anchor[:50], pos[:50], neg[:50])

## Build model

In [10]:
class SiameseTripletEmbedderBert(Module):
    
    def __init__(
            self,
            pretrained_name:str,
            device,
            **kwargs
        ) -> None:
        super(SiameseTripletEmbedderBert, self).__init__(**kwargs)
        self.device = device
        
        # Use a pretrained tokenizer
        self.tokenizer = BertTokenizer.from_pretrained(pretrained_name)
        
        # Use a pretrained model
        self.bert = BertModel.from_pretrained(pretrained_name)
        self.bert.to(device=device)
        return
        
    def forward(self, anchor_texts, pos_texts, neg_texts=None) -> Tuple[torch.Tensor]:
        
        # Run the tokenizer
        t_kwargs = {
            "return_tensors": "pt",
            "padding": True,
            "truncation": True,
            "max_length": 100,
            "add_special_tokens":True
        }
        anchor_x = self.tokenizer(anchor_texts, **t_kwargs)
        pos_x = self.tokenizer(pos_texts, **t_kwargs)
        if neg_texts:
            neg_x = self.tokenizer(neg_texts, **t_kwargs)
        
        anchor_x = anchor_x["input_ids"].to(device=self.device)
        pos_x = pos_x["input_ids"].to(device=self.device)
        if neg_texts:
            neg_x = neg_x["input_ids"].to(device=self.device)
        
        # Run Bert
        anchor_x = self.bert(anchor_x, return_dict=True).pooler_output
        pos_x = self.bert(pos_x, return_dict=True).pooler_output
        if neg_texts:
            neg_x = self.bert(neg_x, return_dict=True).pooler_output
        # dim=768
        
        if neg_texts:
            return anchor_x, pos_x, neg_x
        else:
            return anchor_x, pos_x

## Training and evaluation loop

In [11]:
model = SiameseTripletEmbedderBert(
    pretrained_name="bert-base-cased",
    device=TORCH_DEVICE
)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [12]:
loss_fn = TripletMarginWithDistanceLoss(
    # distance_function=torch.cdist, # Euclidean distance, not implemented for MPS
    # Default is Pairwise Distance
    margin=2
)
optimizer = Adam(
    params=model.parameters(),
    lr=0.0000002
) #! Hyperparams

In [13]:
run_time = datetime.now().strftime('%Y_%m_%d_%H_%M')
MODEL_NAME = f"model_03a_bert_base_triplet_{run_time}.pth"
N_EPOCHS = 10
BATCH_SIZE = 64


In [14]:
dev_data = SiameseEvalDataset(
    dev_claims_path=DATA_PATH.with_name("dev-claims.json"),
    evidence_path=DATA_PATH.with_name("evidence.json"),
    device=TORCH_DEVICE
)

dev_dataloader = DataLoader(
    dataset=dev_data,
    shuffle=False,
    batch_size=BATCH_SIZE
)

claims: 100%|██████████| 154/154 [00:00<00:00, 195201.82it/s]


In [15]:
import warnings
warnings.filterwarnings('ignore')

In [16]:
# Run evaluation before training to establish baseline
model.eval()

dev_batches = tqdm(dev_dataloader, desc="dev batches")
epoch_pos_cos_sim = []
epoch_neg_cos_sim = []
for batch in dev_batches:
    claim_texts, evidence_texts, labels = batch
    
    # Forward
    claim_emb, evidence_emb = model(claim_texts, evidence_texts)
    
    # Cosine similarity
    cos_sim = torch.cosine_similarity(claim_emb, evidence_emb)
    labelled_cos_sim = cos_sim * labels
    pos_cos_sim = labelled_cos_sim[torch.where(labelled_cos_sim > 0)]
    neg_cos_sim = labelled_cos_sim[torch.where(labelled_cos_sim < 0)]
    
    batch_pos_cos_sim = torch.mean(pos_cos_sim).cpu().item()
    batch_neg_cos_sim = torch.mean(neg_cos_sim).cpu().item() * -1
    
    epoch_pos_cos_sim.append(batch_pos_cos_sim)
    epoch_neg_cos_sim.append(batch_neg_cos_sim)
    
    dev_batches.postfix = f"pos cos_sim: {batch_pos_cos_sim:.3f}" + \
        f" neg cos_sim: {batch_neg_cos_sim:.3f}"
    
    continue

dev batches: 100%|██████████| 32/32 [00:09<00:00,  3.55it/s, pos cos_sim: 0.972 neg cos_sim: 0.868]


In [17]:
print(f"Average cos sim (pos, neg): {np.mean(epoch_pos_cos_sim):3f}, {np.mean(epoch_neg_cos_sim):3f}")

Average cos sim (pos, neg): 0.863419, 0.838396


In [18]:
metric_accuracy = BinaryAccuracy()
metric_f1 = BinaryF1Score()
metric_recall = BinaryF1Score()

scheduler = LinearLR(
    optimizer=optimizer,
    start_factor=0.1,
    end_factor=1,
    total_iters=BATCH_SIZE/10,
    verbose=True
)
best_epoch_loss = 999
for epoch in range(N_EPOCHS):
    
    print(f"Epoch: {epoch} of {N_EPOCHS}\n")
    
    # Run training
    model.train()
    
    train_data = SiameseTripletDataset(
        claims_paths=[DATA_PATH.with_name("train-claims.json")],
        claims_shortlist_paths=[NER_PATH.with_name("train_claim_evidence_retrieved.json")],
        evidence_shortlists=[NER_PATH.with_name("shortlist_train_claim_evidence_retrieved.json")],
        evidence_path=DATA_PATH.with_name("evidence.json"),
        device=TORCH_DEVICE,
        n_neg_shortlist=5,
        n_neg_general=1
    )
    
    train_dataloader = DataLoader(
        dataset=train_data,
        shuffle=True,
        batch_size=BATCH_SIZE
    )
    
    train_batches = tqdm(train_dataloader, desc="train batches")
    running_losses = []
    for batch in train_batches:
        anchor, positive, negative = batch
        
        # Reset optimizer
        optimizer.zero_grad()
        
        # Forward + loss
        anchor_emb, pos_emb, neg_emb = model(anchor, positive, negative)
        loss = loss_fn(anchor=anchor_emb, positive=pos_emb, negative=neg_emb)
        
        # Backward + optimiser
        loss.backward()
        optimizer.step()
        
        # Update running loss
        batch_loss = loss.item() * len(batch)
        running_losses.append(batch_loss)
        
        train_batches.postfix = f"loss: {batch_loss:.3f}"
        
        continue
    
    scheduler.step()
    
    epoch_loss = np.average(running_losses)
    print(f"Average epoch loss: {epoch_loss}")
    
    # Save model
    if epoch_loss <= best_epoch_loss:
        best_epoch_loss = epoch_loss
        torch.save(model, MODEL_PATH.with_name(MODEL_NAME))
        print(f"Saved model to: {MODEL_PATH.with_name(MODEL_NAME)}")
    
    # Evaluate every 5 epochs
    # if epoch % 5 != 0:
    #     continue
    
    # Run evaluation before training to establish baseline

    model.eval()

    dev_batches = tqdm(dev_dataloader, desc="dev batches")
    epoch_pos_cos_sim = []
    epoch_neg_cos_sim = []
    for batch in dev_batches:
        claim_texts, evidence_texts, labels = batch

        # Forward
        claim_emb, evidence_emb = model(claim_texts, evidence_texts)

        # Cosine similarity
        cos_sim = torch.cosine_similarity(claim_emb, evidence_emb)
        labelled_cos_sim = cos_sim * labels
        pos_cos_sim = labelled_cos_sim[torch.where(labelled_cos_sim > 0)]
        neg_cos_sim = labelled_cos_sim[torch.where(labelled_cos_sim < 0)]

        batch_pos_cos_sim = torch.mean(pos_cos_sim).cpu().item()
        batch_neg_cos_sim = torch.mean(neg_cos_sim).cpu().item() * -1

        epoch_pos_cos_sim.append(batch_pos_cos_sim)
        epoch_neg_cos_sim.append(batch_neg_cos_sim)

        dev_batches.postfix = f"pos cos_sim: {batch_pos_cos_sim:.3f}" + \
            f" neg cos_sim: {batch_neg_cos_sim:.3f}"

        continue
    
    print(f"Average cos sim (pos, neg): {np.mean(epoch_pos_cos_sim):3f}, {np.mean(epoch_neg_cos_sim):3f}")

print("Done!")

Adjusting learning rate of group 0 to 2.0000e-08.
Epoch: 0 of 10

Generate siamese dataset


claims: 100%|██████████| 1228/1228 [00:23<00:00, 51.32it/s]


Generated data n=102402


train batches: 100%|██████████| 1601/1601 [35:45<00:00,  1.34s/it, loss: 4.260] 


Adjusting learning rate of group 0 to 4.8125e-08.
Average epoch loss: 7.312220301500043
Saved model to: /Users/johnsonzhou/git/comp90042-project/result/models/model_03a_bert_base_triplet_2023_05_04_07_08.pth


dev batches: 100%|██████████| 32/32 [00:07<00:00,  4.16it/s, pos cos_sim: 0.981 neg cos_sim: 0.968]


Average cos sim (pos, neg): 0.961867, 0.960008
Epoch: 1 of 10

Generate siamese dataset


claims: 100%|██████████| 1228/1228 [00:23<00:00, 52.26it/s]


Generated data n=102402


train batches: 100%|██████████| 1601/1601 [35:53<00:00,  1.35s/it, loss: 0.366]


Adjusting learning rate of group 0 to 7.6250e-08.
Average epoch loss: 5.077474236488342
Saved model to: /Users/johnsonzhou/git/comp90042-project/result/models/model_03a_bert_base_triplet_2023_05_04_07_08.pth


dev batches: 100%|██████████| 32/32 [00:07<00:00,  4.13it/s, pos cos_sim: 0.982 neg cos_sim: 0.976]


Average cos sim (pos, neg): 0.976924, 0.970769
Epoch: 2 of 10

Generate siamese dataset


claims: 100%|██████████| 1228/1228 [00:23<00:00, 52.32it/s]


Generated data n=102402


train batches: 100%|██████████| 1601/1601 [36:00<00:00,  1.35s/it, loss: 5.545]


Adjusting learning rate of group 0 to 1.0437e-07.
Average epoch loss: 3.1247903037115905
Saved model to: /Users/johnsonzhou/git/comp90042-project/result/models/model_03a_bert_base_triplet_2023_05_04_07_08.pth


dev batches: 100%|██████████| 32/32 [00:07<00:00,  4.05it/s, pos cos_sim: 0.975 neg cos_sim: 0.966]


Average cos sim (pos, neg): 0.975957, 0.961432
Epoch: 3 of 10

Generate siamese dataset


claims: 100%|██████████| 1228/1228 [00:25<00:00, 49.05it/s]


Generated data n=102402


train batches: 100%|██████████| 1601/1601 [35:24<00:00,  1.33s/it, loss: 1.703]


Adjusting learning rate of group 0 to 1.3250e-07.
Average epoch loss: 2.304878095233537
Saved model to: /Users/johnsonzhou/git/comp90042-project/result/models/model_03a_bert_base_triplet_2023_05_04_07_08.pth


dev batches: 100%|██████████| 32/32 [00:07<00:00,  4.11it/s, pos cos_sim: 0.971 neg cos_sim: 0.956]


Average cos sim (pos, neg): 0.973609, 0.950006
Epoch: 4 of 10

Generate siamese dataset


claims: 100%|██████████| 1228/1228 [00:25<00:00, 48.64it/s]


Generated data n=102402


train batches: 100%|██████████| 1601/1601 [35:32<00:00,  1.33s/it, loss: 2.753]


Adjusting learning rate of group 0 to 1.6062e-07.
Average epoch loss: 1.9718570544766605
Saved model to: /Users/johnsonzhou/git/comp90042-project/result/models/model_03a_bert_base_triplet_2023_05_04_07_08.pth


dev batches: 100%|██████████| 32/32 [00:08<00:00,  3.93it/s, pos cos_sim: 0.968 neg cos_sim: 0.952]


Average cos sim (pos, neg): 0.972987, 0.945812
Epoch: 5 of 10

Generate siamese dataset


claims: 100%|██████████| 1228/1228 [00:28<00:00, 42.64it/s]


Generated data n=102402


train batches: 100%|██████████| 1601/1601 [36:25<00:00,  1.37s/it, loss: 0.000]


Adjusting learning rate of group 0 to 1.8875e-07.
Average epoch loss: 1.655956586363314
Saved model to: /Users/johnsonzhou/git/comp90042-project/result/models/model_03a_bert_base_triplet_2023_05_04_07_08.pth


dev batches: 100%|██████████| 32/32 [00:07<00:00,  4.06it/s, pos cos_sim: 0.965 neg cos_sim: 0.947]


Average cos sim (pos, neg): 0.971302, 0.941882
Epoch: 6 of 10

Generate siamese dataset


claims: 100%|██████████| 1228/1228 [00:24<00:00, 49.38it/s]


Generated data n=102402


train batches: 100%|██████████| 1601/1601 [35:39<00:00,  1.34s/it, loss: 3.492]


Adjusting learning rate of group 0 to 1.8875e-07.
Average epoch loss: 1.4308717301409741
Saved model to: /Users/johnsonzhou/git/comp90042-project/result/models/model_03a_bert_base_triplet_2023_05_04_07_08.pth


dev batches: 100%|██████████| 32/32 [00:07<00:00,  4.12it/s, pos cos_sim: 0.958 neg cos_sim: 0.940]


Average cos sim (pos, neg): 0.967719, 0.934870
Epoch: 7 of 10

Generate siamese dataset


claims: 100%|██████████| 1228/1228 [00:25<00:00, 48.29it/s]


Generated data n=102402


train batches: 100%|██████████| 1601/1601 [35:33<00:00,  1.33s/it, loss: 0.000]


Adjusting learning rate of group 0 to 1.8875e-07.
Average epoch loss: 1.2294666864987762
Saved model to: /Users/johnsonzhou/git/comp90042-project/result/models/model_03a_bert_base_triplet_2023_05_04_07_08.pth


dev batches: 100%|██████████| 32/32 [00:07<00:00,  4.03it/s, pos cos_sim: 0.951 neg cos_sim: 0.928]


Average cos sim (pos, neg): 0.963640, 0.924704
Epoch: 8 of 10

Generate siamese dataset


claims: 100%|██████████| 1228/1228 [00:25<00:00, 48.49it/s]


Generated data n=102402


train batches: 100%|██████████| 1601/1601 [35:39<00:00,  1.34s/it, loss: 3.047]


Adjusting learning rate of group 0 to 1.8875e-07.
Average epoch loss: 1.0845261827427697
Saved model to: /Users/johnsonzhou/git/comp90042-project/result/models/model_03a_bert_base_triplet_2023_05_04_07_08.pth


dev batches: 100%|██████████| 32/32 [00:07<00:00,  4.21it/s, pos cos_sim: 0.947 neg cos_sim: 0.920]


Average cos sim (pos, neg): 0.960711, 0.916577
Epoch: 9 of 10

Generate siamese dataset


claims: 100%|██████████| 1228/1228 [00:25<00:00, 48.74it/s]


Generated data n=102402


train batches: 100%|██████████| 1601/1601 [35:27<00:00,  1.33s/it, loss: 0.000]


Adjusting learning rate of group 0 to 1.8875e-07.
Average epoch loss: 0.9524464424255116
Saved model to: /Users/johnsonzhou/git/comp90042-project/result/models/model_03a_bert_base_triplet_2023_05_04_07_08.pth


dev batches: 100%|██████████| 32/32 [00:07<00:00,  4.22it/s, pos cos_sim: 0.940 neg cos_sim: 0.910]

Average cos sim (pos, neg): 0.957843, 0.909097
Done!





In [19]:
@dataclass
class SentenceSimilarity:
    label:int
    score:float
    claim_text:str
    evidence_text:str

In [20]:
# Run evaluation before training to establish baseline
model.eval()

dev_batches = tqdm(dev_dataloader, desc="dev batches")
epoch_pos_cos_sim = []
epoch_neg_cos_sim = []
sentence_rankings = []
sentence_rankings_dot = []
for batch in dev_batches:
    claim_texts, evidence_texts, labels = batch
    
    # Forward
    claim_emb, evidence_emb = model(claim_texts, evidence_texts)
    
    # Cosine similarity
    cos_sim = torch.cosine_similarity(claim_emb, evidence_emb)
    labelled_cos_sim = cos_sim * labels
    pos_cos_sim = labelled_cos_sim[torch.where(labelled_cos_sim > 0)]
    neg_cos_sim = labelled_cos_sim[torch.where(labelled_cos_sim < 0)]
    
    batch_pos_cos_sim = torch.mean(pos_cos_sim).cpu().item()
    batch_neg_cos_sim = torch.mean(neg_cos_sim).cpu().item() * -1
    
    epoch_pos_cos_sim.append(batch_pos_cos_sim)
    epoch_neg_cos_sim.append(batch_neg_cos_sim)
    
    for claim_text, evidence_text, label, score in \
        zip(
            claim_texts, evidence_texts, 
            labels.detach().cpu().numpy(), cos_sim.detach().cpu().numpy()
        ):
        sentence_rankings.append(SentenceSimilarity(
            label=label,
            score=score,
            claim_text=claim_text,
            evidence_text=evidence_text
        ))
    
    dev_batches.postfix = f"pos cos_sim: {batch_pos_cos_sim:.3f}" + \
        f" neg cos_sim: {batch_neg_cos_sim:.3f}"
    
    continue

dev batches: 100%|██████████| 32/32 [00:07<00:00,  4.32it/s, pos cos_sim: 0.940 neg cos_sim: 0.910]


In [21]:
print(f"Average cos sim (pos, neg): {np.mean(epoch_pos_cos_sim):3f}, {np.mean(epoch_neg_cos_sim):3f}")

Average cos sim (pos, neg): 0.957843, 0.909097


In [22]:
sentence_rankings[:50]

[SentenceSimilarity(label=1, score=0.9862933, claim_text='[South Australia] has the most expensive electricity in the world.', evidence_text='[citation needed] South Australia has the highest retail price for electricity in the country.'),
 SentenceSimilarity(label=1, score=0.9728399, claim_text='[South Australia] has the most expensive electricity in the world.', evidence_text='"South Australia has the highest power prices in the world".'),
 SentenceSimilarity(label=-1, score=0.88018227, claim_text='[South Australia] has the most expensive electricity in the world.', evidence_text='"How the oceans absorb carbon dioxide is critical for predicting climate change".'),
 SentenceSimilarity(label=-1, score=0.91277826, claim_text='[South Australia] has the most expensive electricity in the world.', evidence_text='Families are forced into increasing poverty, some facing a daily struggle to pay their rent and put food on their table.'),
 SentenceSimilarity(label=-1, score=0.94074315, claim_tex

In [23]:
pos_sentence_rankings = sorted([
    s for s in sentence_rankings
    if s.label == 1
], key=lambda s: s.score)

neg_sentence_rankings = sorted([
    s for s in sentence_rankings
    if s.label == -1
], key=lambda s: s.score, reverse=True)

In [24]:
pos_sentence_rankings[:50]

[SentenceSimilarity(label=1, score=0.7421593, claim_text='The unlikely scenarios are now, all of a sudden, becoming more probable than they once were thought to be,’ says Sweet.”', evidence_text='The reverse possibility, that a small group broke off and wandered from India into Western Asia is readily dismissed as an improbably long migration, again without the least bit of evidence.'),
 SentenceSimilarity(label=1, score=0.8128866, claim_text='The unlikely scenarios are now, all of a sudden, becoming more probable than they once were thought to be,’ says Sweet.”', evidence_text="In recent decades, a few specialists have continued to support this interpretation, and Peter Schrijver has said that 'to a large extent, it is linguistics that is responsible for thinking in terms of drastic scenarios' about demographic change in late Roman Britain."),
 SentenceSimilarity(label=1, score=0.8138266, claim_text='Satellite measurements of infrared spectra over the past 40 years observe less energy

In [25]:
neg_sentence_rankings[:50]

[SentenceSimilarity(label=-1, score=0.9880421, claim_text='Last year’s warmth was manifested across the planet, from the warm tropical ocean waters off the coast of northeastern Australia, where the Great Barrier Reef experienced its worst coral bleaching event on record and large scale coral death, to the Arctic, where sea ice hit regular monthly record lows and overall temperatures were also the warmest on record, at least from January through September 2016.', evidence_text='The change was attributed to increasingly cold winters in the Arctic stratosphere at an altitude of approximately 20\xa0km (12\xa0mi), a change associated with global warming in a relationship that is still under investigation.'),
 SentenceSimilarity(label=-1, score=0.9875697, claim_text='Once natural influences, in particular the impact of El Niño and La Niña, are removed from the recent termperature record, there is no evidence of a significant change in the human contribution to climate change.', evidence_tex