## Measuring how many sentences are in the knowledge base

In [5]:
import pickle

filename = "sentence_similarity/data/sentence_section_pairs.pkl"

with open(filename, 'rb') as f:
    kb = pickle.load(f)

print(len(kb))

16004


## Timing how long cosine similarities take

In [9]:
# generate a random vector with the 300 dimensions
import numpy as np

# calculate the cosine similarity 16000 times and measure how long it takes
from sklearn.metrics.pairwise import cosine_similarity
import time

start = time.time()
for i in range(16000):
    cosine_similarity(np.random.rand(1, 10000), np.random.rand(1, 10000))
end = time.time()
print(end - start)

3.7546026706695557


## Inspecting training data labels are correct for reranker

In [10]:
import torch
import pandas as pd
from datasets import load_dataset
from transformers import (
    RobertaTokenizerFast,
    RobertaForSequenceClassification,
    TrainingArguments,
    Trainer,
    AutoConfig,
)
from huggingface_hub import HfFolder

model_id = "climatebert/distilroberta-base-climate-f"


# relace the value with your model: ex <hugging-face-user>/<model-name>
repository_id = "iestynmullinor/climatebert-rerank-fever"

training_data_path = "iestynmullinor/fever_reranker_training"

dataset = load_dataset(training_data_path)

train_data = dataset["train"]
test_data = dataset["test"]
dev_data = dataset["validation"]

tokenizer = RobertaTokenizerFast.from_pretrained(model_id)

def tokenize(batch):
    claims = list(map(str, batch["claim"]))
    evidences = list(map(str, batch["evidence"]))
    tokenized_inputs = tokenizer(claims, evidences, padding=True, truncation=True, max_length=256)
    return tokenized_inputs

train_data= train_data.map(tokenize, batched=True, batch_size=len(train_data))
test_data = test_data.map(tokenize, batched=True, batch_size=len(test_data))
dev_data = test_data.map(tokenize, batched=True, batch_size=len(dev_data))

  from .autonotebook import tqdm as notebook_tqdm
Downloading readme: 100%|██████████| 37.0/37.0 [00:00<00:00, 84.1kB/s]
Repo card metadata block was not found. Setting CardData to empty.
Downloading data: 100%|██████████| 77.6M/77.6M [00:16<00:00, 4.76MB/s]
Downloading data: 100%|██████████| 6.89M/6.89M [00:02<00:00, 2.42MB/s]
Downloading data: 100%|██████████| 6.77M/6.77M [00:01<00:00, 4.89MB/s]
Downloading data files: 100%|██████████| 3/3 [00:20<00:00,  6.85s/it]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 1897.88it/s]
Generating train split: 208346 examples [00:00, 383847.84 examples/s]
Generating validation split: 19998 examples [00:00, 402607.77 examples/s]
Generating test split: 19998 examples [00:00, 402042.34 examples/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Map: 100%|██████████| 208346/208346 [00:29<00:00, 6996.79 examples/s]
Map: 100%|██████████| 19998/19998 [00:01<00:00, 13324.00 e

In [13]:
print(train_data.column_names)
print(train_data[0])
print(tokenizer.convert_ids_to_tokens(train_data[0]["input_ids"]))

['claim', 'evidence', 'label', 'input_ids', 'attention_mask']
{'claim': 'Nikolaj Coster-Waldau worked with the Fox Broadcasting Company.', 'evidence': 'The Fox Broadcasting Company ( often shortened to Fox and stylized as FOX ) is an American English language commercial broadcast television network that is owned by the Fox Entertainment Group subsidiary of 21st Century Fox . Nikolaj Coster-Waldau . He then played Detective John Amsterdam in the short-lived Fox television series New Amsterdam ( 2008 ) , as well as appearing as Frank Pike in the 2009 Fox television film Virtuality , originally intended as a pilot . He became widely known to a broad audience for his current role as Ser Jaime Lannister , in the HBO series Game of Thrones .', 'label': 0, 'input_ids': [0, 38334, 1168, 1176, 230, 13991, 12, 771, 5618, 1180, 1006, 19, 5, 2063, 13610, 1260, 4, 2, 2, 133, 2063, 13610, 1260, 36, 747, 30288, 7, 2063, 8, 15240, 1538, 25, 7481, 4839, 16, 41, 470, 2370, 2777, 1861, 2308, 2384, 1546, 

## Undersampling data for reranker

In [9]:
import pandas as pd
from datasets import load_dataset
import datasets
from transformers import (
    RobertaTokenizerFast,
    
)
from sklearn.utils import shuffle
from imblearn.under_sampling import RandomUnderSampler

UNDERSAMPLE=True

model_id = "climatebert/distilroberta-base-climate-f"


# relace the value with your model: ex <hugging-face-user>/<model-name>
repository_id = "iestynmullinor/climatebert-rerank-fever"

training_data_path = "iestynmullinor/fever_reranker_training"

dataset = load_dataset(training_data_path)

train_data = dataset["train"]
test_data = dataset["test"]
dev_data = dataset["validation"]

tokenizer = RobertaTokenizerFast.from_pretrained(model_id)

def tokenize(batch):
    claims = list(map(str, batch["claim"]))
    evidences = list(map(str, batch["evidence"]))
    tokenized_inputs = tokenizer(claims, evidences, padding=True, truncation=True, max_length=256)
    return tokenized_inputs

print(train_data.column_names)

# remove all examples where the evidence is None

train_data = train_data.filter(lambda example: example["evidence"] is not None)

if UNDERSAMPLE:
    # Convert the DataFrame to a pandas DataFrame
    train_data_pd = pd.DataFrame(train_data)

    # Count the number of instances in each class
    class_counts = train_data_pd['label'].value_counts()

    # Find the number of instances in the minority class
    minority_class_count = class_counts.min()

    # Perform undersampling
    undersampled_data = pd.concat(
        [train_data_pd[train_data_pd['label'] == label].sample(n=minority_class_count, random_state=42) for label in class_counts.index]
    )

    # Shuffle the undersampled data
    undersampled_data = shuffle(undersampled_data, random_state=42)

    # Convert the undersampled data back to a Hugging Face dataset
    train_data = datasets.Dataset.from_dict(undersampled_data.to_dict('list'))

    print(train_data.column_names)
    print(train_data[0])
    print(train_data[1])
    print(train_data[2])

    train_data_df = pd.DataFrame(train_data)

    # print the number of samples for class 0 and class 1
    print("Number of samples for class 0: ", len(train_data_df[train_data_df["label"] == 0]))
    print("Number of samples for class 1: ", len(train_data_df[train_data_df["label"] == 1]))



Repo card metadata block was not found. Setting CardData to empty.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


['claim', 'evidence', 'label']


Filter: 100%|██████████| 208346/208346 [00:00<00:00, 509407.68 examples/s]


['claim', 'evidence', 'label']
{'claim': 'Singapore lies 137 km or one degree north of the equator.', 'evidence': "Singapore . It lies one degree ( 137 km ) north of the equator , at the southern tip of peninsular Malaysia , with Indonesia 's Riau Islands to the south .", 'label': 0}
{'claim': 'Louis C.K. took a hiatus in 2016.', 'evidence': 'Louis C.K. . During an extended Louie hiatus , C.K. created and starred in his web series Horace and Pete in 2016 , and voiced the lead role in the animated film The Secret Life of Pets the same year .', 'label': 1}
{'claim': 'Hugh Jackman plays Wolverine.', 'evidence': 'Hugh Jackman . He is known for his long-running role as Wolverine in the X-Men film series , as well as for his lead roles in films such as the romantic-comedy fantasy Kate & Leopold ( 2001 ) , the action-horror film Van Helsing ( 2004 ) , the magic-themed drama The Prestige ( 2006 ) , the epic fantasy drama The Fountain ( 2006 ) , the epic historical romantic drama Australia ( 20