Author: Mingang Hao 1326642 mhhao1@student.unimelb.edu.au

Installation & Data loading
---------------

In [1]:
import json
import string
import re
import os
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
import pandas as pd
import torch

def load_data(file_path):
    with open(file_path) as f:
        data = json.load(f)
    return data

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    # Remove punctuation
    text = re.sub('[^a-zA-Z0-9]', ' ', text)
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stopwords and lemmatize
    tokens = [lemmatizer.lemmatize(token.lower()) for token in tokens if token.lower() not in stop_words]
    # Join tokens back into a string
    text = ' '.join(tokens)
    return text

path = "project-data"
train_data = load_data(os.path.join(path,'train-claims.json'))
dev_data = load_data(os.path.join(path,'dev-claims.json'))
test_data = load_data(os.path.join(path,'test-claims-unlabelled.json'))
evidence_data = load_data(os.path.join(path,'evidence.json'))



  from .autonotebook import tqdm as notebook_tqdm


Generating Prediction on Dev and Test Set
-------

SVC

In [17]:
# Preprocess the data
for claim_id, claim_data in train_data.items():
    train_data[claim_id]['claim_text'] = preprocess(claim_data['claim_text'])
    for i, evidence_id in enumerate(claim_data['evidences']):
        train_data[claim_id]['evidences'][i] = preprocess(evidence_data[evidence_id])

for claim_id, claim_data in dev_data.items():
    dev_data[claim_id]['claim_text'] = preprocess(claim_data['claim_text'])
    for i, evidence_id in enumerate(claim_data['evidences']):
        dev_data[claim_id]['evidences'][i] = preprocess(evidence_data[evidence_id])

for claim_id, claim_data in test_data.items():
    test_data[claim_id]['claim_text'] = preprocess(claim_data['claim_text'])

for ev_id, ev_data in evidence_data.items():
    evidence_data[ev_id]= preprocess(ev_data)

vectorizer = TfidfVectorizer()

def retrieve_evidence(claim_text):
    claim_vector = vectorizer.transform([claim_text])
    sim_scores = cosine_similarity(tfidf_matrix, claim_vector).flatten()
    top_doc_indices = sim_scores.argsort()[::-1][:5] # retrieve top 10 evidence passages
    return [doc_ids[i] for i in top_doc_indices]
def concat_claim_with_evidence(claim_text, evidence_texts):
    return claim_text + ' ' + ' '.join(evidence_texts)

corpus = []
doc_ids = []
for evidence_id, evidence_text in evidence_data.items():
    corpus.append(evidence_text)
    doc_ids.append(evidence_id)

tfidf_matrix = vectorizer.fit_transform(corpus)


X_train = []
for claim_id, claim_data in train_data.items():
    claim_text = claim_data['claim_text']
    evidence_texts = [evidence_id for evidence_id in claim_data['evidences']]
    X_train.append(concat_claim_with_evidence(claim_text, evidence_texts))

y_train = [train_data[claim_id]['claim_label'] for claim_id in train_data]

model = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('svc', SVC(kernel='rbf'))
])

model.fit(X_train, y_train)
# Make predictions for the dev data
dev_predictions = {}
final_evidences = []
for claim_id, claim_data in dev_data.items():
    claim_text = claim_data['claim_text']
    indices = retrieve_evidence(claim_text)
    evidence_texts = [evidence_data[evidence_id] for evidence_id in indices]
    concatenated_text = concat_claim_with_evidence(claim_text, evidence_texts)
    y_pred = model.predict([concatenated_text])[0]
    for id in indices:
        ev_text = evidence_data[id]
        concatenated_text = concat_claim_with_evidence(claim_text, ev_text)
        if model.predict([concatenated_text])[0] == y_pred:
            final_evidences.append(id)
    dev_predictions[claim_id] = {
        "claim_text": claim_text,
        "claim_label": y_pred,
        "evidences": final_evidences
    }

# Make predictions for the test data
test_predictions = {}
for claim_id, claim_data in test_data.items():
    claim_text = claim_data['claim_text']
    indices = retrieve_evidence(claim_text)
    evidence_texts = [evidence_data[evidence_id] for evidence_id in indices]
    concatenated_text = concat_claim_with_evidence(claim_text, evidence_texts)
    y_pred = model.predict([concatenated_text])[0]
    
    test_predictions[claim_id] = {
        "claim_text": claim_text,
        "claim_label": y_pred,
        "evidences": indices
    }

In [16]:
with open(os.path.join(path,'test_predictions_svm.json'), 'w') as f:
    json.dump(test_predictions, f)
with open(os.path.join(path,'dev_predictions_svm.json'), 'w') as f:
    json.dump(dev_predictions, f)

Gensim WMD

In [106]:
lemmatizer = WordNetLemmatizer()
path = "project-data"
train_data = load_data(os.path.join(path,'train-claims.json'))
dev_data = load_data(os.path.join(path,'dev-claims.json'))
test_data = load_data(os.path.join(path,'test-claims-unlabelled.json'))
evidence_data = load_data(os.path.join(path,'evidence.json'))
def preprocess(sentence):
    sentence = re.sub('[^a-zA-Z0-9]', ' ', sentence)
    return  ' '.join([lemmatizer.lemmatize(w) for w in sentence.lower().split() if w not in stop_words])
# Preprocess the data
for claim_id, claim_data in train_data.items():
    train_data[claim_id]['claim_text'] = preprocess(claim_data['claim_text'])
    for i, evidence_id in enumerate(claim_data['evidences']):
        train_data[claim_id]['evidences'][i] = preprocess(evidence_data[evidence_id])

for claim_id, claim_data in dev_data.items():
    dev_data[claim_id]['claim_text'] = preprocess(claim_data['claim_text'])
    for i, evidence_id in enumerate(claim_data['evidences']):
        dev_data[claim_id]['evidences'][i] = preprocess(evidence_data[evidence_id])

for claim_id, claim_data in test_data.items():
    test_data[claim_id]['claim_text'] = preprocess(claim_data['claim_text'])


import gensim.downloader as api
wmd_model = api.load('word2vec-google-news-300')

vectorizer = TfidfVectorizer()
corpus = []
doc_ids = []
for evidence_id, evidence_text in evidence_data.items():
    corpus.append(preprocess(evidence_text))
    doc_ids.append(evidence_id)

tfidf_matrix = vectorizer.fit_transform(corpus)

In [None]:
def retrieve_evidence(claim_text):
    claim_vector = vectorizer.transform([claim_text])
    sim_scores = cosine_similarity(tfidf_matrix, claim_vector).flatten()    
    top_doc_indices = sim_scores.argsort()[::-1][:100] # retrieve top 100 evidence passages
    ev_text = [corpus[i] for i in top_doc_indices]    
    wmd_distances = []
    for ev in ev_text:
        wmd_distances.append(wmd_model.wmdistance(claim_text.split(),ev.split()))
    wmd_indices_sorted = np.array(wmd_distances).argsort()[:3]  # Smaller distances are better
    top_wmd_indices = [top_doc_indices[i] for i in wmd_indices_sorted]
    #print(top_wmd_indices)
    return [doc_ids[i] for i in top_wmd_indices]


def concat_claim_with_evidence(claim_text, evidence_texts):
    return preprocess(claim_text + ' ' + ' '.join(evidence_texts))


X_train = []
for claim_id, claim_data in train_data.items():
    claim_text = claim_data['claim_text']
    evidence_texts = [evidence_id for evidence_id in claim_data['evidences']]
    X_train.append(concat_claim_with_evidence(claim_text, evidence_texts))

y_train = [train_data[claim_id]['claim_label'] for claim_id in train_data]

model = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('svc', SVC(kernel='rbf'))
])

model.fit(X_train, y_train)
# Make predictions for the dev data
for claim_id, claim_data in dev_data.items():
    claim_text = claim_data['claim_text']
    indices = retrieve_evidence(claim_text)
    evidence_texts = [evidence_data[evidence_id] for evidence_id in indices]
    concatenated_text = concat_claim_with_evidence(claim_text, evidence_texts)
    y_pred = model.predict([concatenated_text])[0]
    dev_predictions[claim_id] = {
        "claim_text": claim_text,
        "claim_label": y_pred,
        "evidences": indices
    }

# Make predictions for the test data
test_predictions = {}
for claim_id, claim_data in test_data.items():
    claim_text = claim_data['claim_text']
    indices = retrieve_evidence(claim_text)
    evidence_texts = [evidence_data[evidence_id] for evidence_id in indices]
    concatenated_text = concat_claim_with_evidence(claim_text, evidence_texts)
    y_pred = model.predict([concatenated_text])[0]
    
    test_predictions[claim_id] = {
        "claim_text": claim_text,
        "claim_label": y_pred,
        "evidences": indices
    }

In [103]:
with open(os.path.join(path,'test_predictions_wmd.json'), 'w') as f:
    json.dump(test_predictions, f)
with open(os.path.join(path,'dev_predictions_wmd.json'), 'w') as f:
    json.dump(dev_predictions, f)

lstm

In [14]:
import os
import json
import re
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from keras.preprocessing.text import Tokenizer
from keras.layers import Dense, LSTM, Embedding
from keras.models import Sequential
from sklearn.preprocessing import LabelEncoder
from keras_preprocessing.sequence import pad_sequences

path = "project-data"
train_data = json.load(open(os.path.join(path, "train-claims.json")))
dev_data = json.load(open(os.path.join(path, "dev-claims.json")))
test_data = json.load(open(os.path.join(path, "test-claims-unlabelled.json")))
evidence_data = json.load(open(os.path.join(path, "evidence.json")))
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    # Remove punctuation
    text = re.sub('[^a-zA-Z0-9]', ' ', text)
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stopwords and lemmatize
    tokens = [lemmatizer.lemmatize(token.lower()) for token in tokens if token.lower() not in stop_words]
    # Join tokens back into a string
    text = ' '.join(tokens)
    return text
def retrieve_evidence(claim_text):
    claim_vector = vectorizer.transform([claim_text])
    sim_scores = cosine_similarity(tfidf_matrix, claim_vector).flatten()
    top_doc_indices = sim_scores.argsort()[::-1][:5] # retrieve top 10 evidence passages
    return [doc_ids[i] for i in top_doc_indices]

train_claims = []
train_labels = []

for claim_id, claim_data in train_data.items():
    claim_text = preprocess(claim_data['claim_text'])
    evidence_ids = retrieve_evidence(claim_text)
    evidence_texts = [preprocess(evidence_data[evidence_id]) for evidence_id in evidence_ids]
    concatenated_text = concat_claim_with_evidence(claim_text, evidence_texts)

    train_claims.append(concatenated_text)
    train_labels.append(claim_data['claim_label'])


dev_claims = []
dev_labels = []
for claim_id, claim_data in dev_data.items():
    dev_claims.append(preprocess(claim_data['claim_text']))    
    dev_labels.append(claim_data['claim_label'])

test_claims = []
for claim_id, claim_data in test_data.items():
    test_claims.append(preprocess(claim_data['claim_text']))

tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_claims)
train_sequences = tokenizer.texts_to_sequences(train_claims)
dev_sequences = tokenizer.texts_to_sequences(dev_claims)
test_sequences = tokenizer.texts_to_sequences(test_claims)

# Find the maximum sequence length
max_length = max(max([len(seq) for seq in train_sequences]),
                 max([len(seq) for seq in dev_sequences]),
                 max([len(seq) for seq in test_sequences]))

# Pad your sequences so that they all have the same length
train_sequences = pad_sequences(train_sequences, maxlen=max_length)
dev_sequences = pad_sequences(dev_sequences, maxlen=max_length)
test_sequences = pad_sequences(test_sequences, maxlen=max_length)


# Encode the labels
label_encoder = LabelEncoder()
label_encoder.fit(train_labels)
train_labels_encoded = label_encoder.transform(train_labels)
dev_labels_encoded = label_encoder.transform(dev_labels)
num_classes = len(label_encoder.classes_)

# Define the model
max_length = max([len(seq) for seq in np.concatenate((train_sequences, dev_sequences, test_sequences))])
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=64, input_length=max_length))
model.add(LSTM(64))
model.add(Dense(num_classes, activation='softmax'))
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(train_sequences, train_labels_encoded, epochs=10, validation_data=(dev_sequences, dev_labels_encoded))

# Make predictions
# Make predictions
dev_predictions = {}
for claim_id, claim_data in dev_data.items():
    claim_text = preprocess(claim_data['claim_text'])
    evidence_ids = retrieve_evidence(claim_text)
    evidence_texts = [preprocess(evidence_data[evidence_id]) for evidence_id in evidence_ids]
    concatenated_text = concat_claim_with_evidence(claim_text, evidence_texts)

    # Prepare the concatenated text for prediction
    sequence = tokenizer.texts_to_sequences([concatenated_text])
    sequence = pad_sequences(sequence, maxlen=max_length)

    # Make a prediction
    prediction = model.predict(sequence)
    y_pred_label_encoded = np.argmax(prediction, axis=1)[0]
    y_pred_label = label_encoder.inverse_transform([y_pred_label_encoded])[0]
    
    dev_predictions[claim_id] = {
        "claim_text": claim_text,
        "claim_label": y_pred_label,
        "evidences": evidence_ids
    }
test_predictions = {}
for claim_id, claim_data in test_data.items():
    claim_text = preprocess(claim_data['claim_text'])
    evidence_ids = retrieve_evidence(claim_text)
    evidence_texts = [preprocess(evidence_data[evidence_id]) for evidence_id in evidence_ids]
    concatenated_text = concat_claim_with_evidence(claim_text, evidence_texts)

    # Prepare the concatenated text for prediction
    sequence = tokenizer.texts_to_sequences([concatenated_text])
    sequence = pad_sequences(sequence, maxlen=max_length)

    # Make a prediction
    prediction = model.predict(sequence)
    y_pred_label_encoded = np.argmax(prediction, axis=1)[0]
    y_pred_label = label_encoder.inverse_transform([y_pred_label_encoded])[0]
    
    test_predictions[claim_id] = {
        "claim_text": claim_text,
        "claim_label": y_pred_label,
        "evidences": evidence_ids
    }    

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [16]:
with open(os.path.join(path,'test_predictions_lstm.json'), 'w') as f:
    json.dump(test_predictions, f)
with open(os.path.join(path,'dev_predictions_lstm.json'), 'w') as f:
    json.dump(dev_predictions, f)

word2vec+cosine sim

In [7]:
import gensim.downloader as api
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

word_vectors = api.load("word2vec-google-news-300") # Load pre-trained word vectors

def get_word_embeddings(text):
    embeddings = []
    for word in text.split():
        if word in word_vectors:
            embeddings.append(word_vectors[word])
    if len(embeddings) == 0:
        return None
    else:
        return np.mean(embeddings, axis=0)

def retrieve_evidence(claim_text, evidence_data):
    sim_scores = []
    for evidence_id, evidence_text in evidence_data.items():
        claim_emb = get_word_embeddings(claim_text)
        evidence_emb = get_word_embeddings(evidence_text)
        if claim_emb is None or evidence_emb is None:
            sim_scores.append(0)
        else:
            sim_scores.append(cosine_similarity([claim_emb], [evidence_emb])[0][0])
    top_doc_indices = np.argsort(sim_scores)[::-1][:5] # retrieve top 5 evidence passages
    return [list(evidence_data.keys())[i] for i in top_doc_indices]

X_train = [train_data[claim_id]['claim_text'] for claim_id in train_data]
y_train = [train_data[claim_id]['claim_label'] for claim_id in train_data]

model = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('svc', SVC(kernel='rbf'))
])

model.fit(X_train, y_train)
dev_predictions = {}
for claim_id, claim_data in dev_data.items():
    claim_text = claim_data['claim_text']
    evidences = retrieve_evidence(claim_text, evidence_data)
    evidence_texts = [evidence_data[eid] for eid in evidences]
    evidence_text = ' '.join(evidence_texts)
    y_pred = model.predict([claim_text])[0]
    
    dev_predictions[claim_id] = {
        "claim_text": claim_text,
        "claim_label": y_pred,
        "evidences": evidences
    }
test_predictions = {}
for claim_id, claim_data in test_data.items():
    claim_text = claim_data['claim_text']
    evidences = retrieve_evidence(claim_text, evidence_data)
    evidence_texts = [evidence_data[eid] for eid in evidences]
    evidence_text = ' '.join(evidence_texts)
    y_pred = model.predict([claim_text])[0]
    
    test_predictions[claim_id] = {
        "claim_text": claim_text,
        "claim_label": y_pred,
        "evidences": evidences
    }





In [None]:
with open(os.path.join(path,'test_predictions_w2v.json'), 'w') as f:
    json.dump(test_predictions, f)
with open(os.path.join(path,'dev_predictions_w2v.json'), 'w') as f:
    json.dump(dev_predictions, f)

--------------------------------------------


Code block onwards are illustration of my experiment, not all of them will generate test_prediction in correct format. 
--------------------

Pandas preprocessing

In [5]:
# create a list of IDs and a list of evidence data
ids = list(evidence_data.keys())
evidence_list = list(evidence_data.values())

# create a pandas dataframe with two columns
evidence_df = pd.DataFrame({'id': ids, 'evidence': evidence_list})

train_df = pd.DataFrame.from_dict(train_data).transpose()
dev_df = pd.DataFrame.from_dict(dev_data).transpose()
test_df = pd.DataFrame.from_dict(test_data).transpose()
# Define the label-to-integer mapping
label_map = {'SUPPORTS': 0, 'REFUTES': 1, 'NOT_ENOUGH_INFO': 2, 'DISPUTED': 3}

# Convert the labels to integers
train_df['claim_label'] = train_df['claim_label'].replace(label_map)
dev_df['claim_label'] = dev_df['claim_label'].replace(label_map)
# create a list of IDs and a list of evidence data
ids = list(evidence_data.keys())
evidence_list = list(evidence_data.values())

# create a pandas dataframe with two columns
evidence_df = pd.DataFrame({'id': ids, 'evidence': evidence_list})

train_df = pd.DataFrame.from_dict(train_data).transpose()
dev_df = pd.DataFrame.from_dict(dev_data).transpose()
test_df = pd.DataFrame.from_dict(test_data).transpose()
# Define the label-to-integer mapping
label_map = {'SUPPORTS': 0, 'REFUTES': 1, 'NOT_ENOUGH_INFO': 2, 'DISPUTED': 3}

# Convert the labels to integers
train_df['claim_label'] = train_df['claim_label'].replace(label_map)
dev_df['claim_label'] = dev_df['claim_label'].replace(label_map)


Fine-tune the BERT model on the training set using a fact-checking objective:

In [None]:
data_list = [{"id": k, **v} for k, v in train_data.items()]
train_sp, temp_data = train_test_split(data_list, test_size=0.3, random_state=42)
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)
bert_name = "bert-base-uncased"
num_labels = 4  # Number of labels (e.g., SUPPORTS, REFUTES, NOT RELATED)

class FactCheckingDataset(Dataset):
    def __init__(self, claims_data, evidence_data, tokenizer):
        self.claims_data = claims_data
        self.evidence_data = evidence_data
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.claims_data)

    def __getitem__(self, idx):
        claim = self.claims_data[idx]
        claim_text = claim["claim_text"]
        evidence_texts = [self.evidence_data[evidence_id] for evidence_id in claim["evidences"]]
        concatenated_text = claim_text + " " + " ".join(evidence_texts)
        label = label_map[claim["claim_label"]]
        encoding = self.tokenizer(concatenated_text, truncation=True, padding="max_length", max_length=512, return_tensors='pt')
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(label),
        }


# Initialize the BERT model and tokenizer
model = BertForSequenceClassification.from_pretrained(bert_name, num_labels=4) 
tokenizer = BertTokenizer.from_pretrained(bert_name)
label_map = {"SUPPORTS": 0, "REFUTES": 1, "NOT_ENOUGH_INFO": 2, "DISPUTED": 3}

# Convert the training and validation data to Dataset format
train_dataset = FactCheckingDataset(train_sp, evidence_data, tokenizer)
val_dataset = FactCheckingDataset(val_data, evidence_data, tokenizer)

train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=4)

# Set the model to use the GPU, if available
device = 'cuda'
model.to(device)

from datasets import load_metric

def compute_metrics(eval_pred):
    metric = load_metric("accuracy")
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    evaluation_strategy="steps",
    eval_steps=500,
    save_steps=500,
    seed=42,
    learning_rate=1e-5,
)

# Create a Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

# Save the fine-tuned model
output_dir = "finetuned_bert"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
    

Finetuned Bert, Top 10 Cosine, SVM

In [2]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from transformers import BertModel, BertTokenizer
from sklearn.metrics.pairwise import cosine_similarity

import torch

# Load fine-tuned BERT model and tokenizer
bert_model = BertModel.from_pretrained('finetuned_bert')
bert_tokenizer = BertTokenizer.from_pretrained('finetuned_bert')

# Function to get BERT embeddings
def get_bert_embeddings(texts, batch_size=4):
    embeddings = []
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i+batch_size]
        inputs = bert_tokenizer(batch_texts, padding=True, truncation=True, max_length=512, return_tensors='pt')
        outputs = bert_model(**inputs)
        batch_embeddings = outputs.last_hidden_state[:, 0, :].detach().numpy()  # We take the embeddings from the [CLS] token
        embeddings.append(batch_embeddings)
    return np.concatenate(embeddings)

def retrieve_evidence(claim_text):
    claim_embedding = get_bert_embeddings([claim_text])
    sim_scores = cosine_similarity(evidence_embeddings, claim_embedding)
    top_doc_indices = sim_scores.flatten().argsort()[::-1][:5]  # retrieve top 5 evidence passages
    return [list(evidence_data.keys())[i] for i in top_doc_indices]



evidence_texts = list(evidence_data.values())
evidence_embeddings = get_bert_embeddings(evidence_texts)


X_train = [train_data[claim_id]['claim_text'] for claim_id in train_data]
y_train = [train_data[claim_id]['claim_label'] for claim_id in train_data]

# Get BERT embeddings for the train data
X_train = get_bert_embeddings(X_train)

# Train the model with BERT embeddings
model = make_pipeline(StandardScaler(), SVC(kernel='rbf'))
model.fit(X_train, y_train)

# Make predictions for the dev data
dev_predictions = {}
for claim_id, claim_data in dev_data.items():
    claim_text = claim_data['claim_text']
    evidences = retrieve_evidence(claim_text)
    claim_embedding = get_bert_embeddings([claim_text])
    y_pred = model.predict(claim_embedding)[0]
    
    dev_predictions[claim_id] = {
        "claim_text": claim_text,
        "claim_label": y_pred,
        "evidences": evidences
    }

# Make predictions for the test data
test_predictions = {}
for claim_id, claim_data in test_data.items():
    claim_text = claim_data['claim_text']
    evidences = retrieve_evidence(claim_text)
    claim_embedding = get_bert_embeddings([claim_text])
    y_pred = model.predict(claim_embedding)[0]
    
    test_predictions[claim_id] = {
        "claim_text": claim_text,
        "claim_label": y_pred,
        "evidences": evidences
    }  


Some weights of the model checkpoint at finetuned_bert were not used when initializing BertModel: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [40]:
num_epochs = 3

for epoch in range(num_epochs):
    train_loss = train(model, train_dataloader, optimizer, scheduler, device)
    dev_loss = evaluate(model, dev_dataloader, device)
    print(f"Epoch: {epoch + 1}, Train Loss: {train_loss:.4f}, Dev Loss: {dev_loss:.4f}")


In [None]:
def generate_evidence_embeddings(corpus, batch_size=32):
    evidence_embeddings = []
    
    for i in range(0, len(corpus), batch_size):
        batch = corpus[i:i + batch_size]
        batch_embeddings = [get_claim_embedding(evidence_text) for evidence_text in batch]
        evidence_embeddings.extend(batch_embeddings)
    
    return evidence_embeddings

evidence_embeddings = generate_evidence_embeddings(corpus)


Set inputs for Grid Search
**Please replace the  fine_tuned_model_path with the fine-tuned transformer

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.layers import Input,concatenate
from sentence_transformers import SentenceTransformer,models
# Create the input layers
claim_input = Input(shape=(768,))
evidence_input = Input(shape=(768,))

#change this name to the fine-tuned transformer
fine_tuned_model_path = "finetuned_bert"

# Load your fine-tuned model as a Sentence Transformer model
word_embedding_model = models.Transformer(fine_tuned_model_path)

# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                               pooling_mode_mean_tokens=True,
                               pooling_mode_cls_token=False,
                               pooling_mode_max_tokens=True)

# Create the Sentence Transformer model
device = 'cuda'
sentence_transformer = SentenceTransformer(modules=[word_embedding_model, pooling_model], device=device)

# Now you can use the model to encode sentences to vectors
evidence_embeddings = sentence_transformer.encode(evidence_data)
# Concatenate the claim and evidence input layers
combined_input = concatenate([claim_input, evidence_input])
claim_embeddings_train = sentence_transformer.encode(train_df['claim_text'].tolist())
evidence_embeddings_train = []

for e in train_df['evidences']:
    evidence_contents = evidence_df[evidence_df['id'].isin(e)]['evidence'].tolist()
    evidence_content_embeddings = sentence_transformer.encode(evidence_contents)
    evidence_embeddings_train.append(evidence_content_embeddings)

# Make sure both lists are NumPy arrays
claim_embeddings_train = np.array(claim_embeddings_train)

# Concatenate claim and evidence embeddings in training data
evidence_embeddings_train_mean = [np.mean(evidence_emb, axis=0) for evidence_emb in evidence_embeddings_train]
X_train = np.array([np.hstack((claim_emb, evidence_emb)) for claim_emb, evidence_emb in zip(claim_embeddings_train, evidence_embeddings_train_mean)])

# Preprocess the concatenated embeddings using StandardScaler
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
y_train = train_df['claim_label']


Grid Search LR

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Prepare dev set embeddings
claim_embeddings_dev = sentence_transformer.encode(dev_df['claim_text'].tolist())
evidence_embeddings_dev = []

for e in dev_df['evidences']:
    evidence_contents = evidence_df[evidence_df['id'].isin(e)]['evidence'].tolist()
    evidence_content_embeddings = sentence_transformer.encode(evidence_contents)
    evidence_embeddings_dev.append(evidence_content_embeddings)

# Concatenate claim and evidence embeddings in your dev data
evidence_embeddings_dev_mean = [np.mean(evidence_emb, axis=0) for evidence_emb in evidence_embeddings_dev]
X_dev = np.array([np.hstack((claim_emb, evidence_emb)) for claim_emb, evidence_emb in zip(claim_embeddings_dev, evidence_embeddings_dev_mean)])

# Preprocess the concatenated embeddings using StandardScaler
X_dev = scaler.transform(X_dev)
# Calculate the confusion matrix
y_dev_true = dev_df['claim_label']
# Tune hyperparameters using grid search
from sklearn.model_selection import GridSearchCV

param_grid = {
    'C': [0.01, 0.1, 1, 10],
    'max_iter': [500, 1000, 2000, 4000],
}

logreg_cv = LogisticRegression(multi_class="ovr", solver='lbfgs')
grid_search = GridSearchCV(logreg_cv, param_grid, scoring='accuracy', cv=5)
try:
    grid_search.fit(X_train, y_train)
    # Find the best hyperparameters
    best_params = grid_search.best_params_
    print("Best hyperparameters:", best_params)

    # Train the model using the best hyperparameters
    best_logreg = LogisticRegression(**best_params, multi_class="ovr", solver='lbfgs')    
except Exception as e:
    print("Error during grid search:", e)
    best_logreg = logreg_cv
    # Predict the labels for the dev set using the best model
best_logreg.fit(X_train, y_train)
y_dev_pred_best = best_logreg.predict(X_dev)

# Calculate the accuracy for the best model
accuracy_best = accuracy_score(y_dev_true, y_dev_pred_best)
print("Accuracy on the dev set using the best model: {:.4f}".format(accuracy_best))



Grid Search SVC

In [None]:
from sklearn.svm import SVC

# Train an SVM model
svm = SVC(kernel='linear', decision_function_shape='ovr')
svm.fit(X_train, y_train)

# Predict the labels for the dev set
y_dev_pred_svm = svm.predict(X_dev)

# Calculate the accuracy
accuracy_svm = accuracy_score(y_dev_true, y_dev_pred_svm)
print("Accuracy on the dev set (SVM): {:.2f}".format(accuracy_svm))

# Tune hyperparameters using grid search or other optimization techniques
param_grid_svm = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf'],
    'decision_function_shape': ['ovr']
}

svm_cv = SVC()
grid_search_svm = GridSearchCV(svm_cv, param_grid_svm, scoring='accuracy', cv=5)
grid_search_svm.fit(X_train, y_train)

# Find the best hyperparameters
best_params_svm = grid_search_svm.best_params_
print("Best hyperparameters (SVM):", best_params_svm)

# Train the model using the best hyperparameters
best_svm = SVC(**best_params_svm)
best_svm.fit(X_train, y_train)

# Predict the labels for the dev set using the best model
y_dev_pred_best_svm = best_svm.predict(X_dev)

# Calculate the accuracy for the best model
accuracy_best_svm = accuracy_score(y_dev_true, y_dev_pred_best_svm)
print("Accuracy on the dev set using the best model (SVM): {:.2f}".format(accuracy_best_svm))


Grid Search RF

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

# Train a random forest model
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Predict the labels for the dev set
y_dev_pred_rf = rf.predict(X_dev)

# Calculate the accuracy
accuracy_rf = accuracy_score(y_dev_true, y_dev_pred_rf)
print("Accuracy on the dev set (Random Forest): {:.2f}".format(accuracy_rf))

# Tune hyperparameters using grid search
param_grid_rf = {
    'n_estimators': [10, 50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

rf_cv = RandomForestClassifier(random_state=42)
grid_search_rf = GridSearchCV(rf_cv, param_grid_rf, scoring='accuracy', cv=5)
grid_search_rf.fit(X_train, y_train)

# Find the best hyperparameters
best_params_rf = grid_search_rf.best_params_
print("Best hyperparameters (Random Forest):", best_params_rf)

# Train the model using the best hyperparameters
best_rf = RandomForestClassifier(**best_params_rf, random_state=42)
best_rf.fit(X_train, y_train)

# Predict the labels for the dev set using the best model
y_dev_pred_best_rf = best_rf.predict(X_dev)

# Calculate the accuracy for the best model
accuracy_best_rf = accuracy_score(y_dev_true, y_dev_pred_best_rf)
print("Accuracy on the dev set using the best model (Random Forest): {:.2f}".format(accuracy_best_rf))


Grid Search RNN

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout
from scikeras.wrappers import KerasClassifier
if tf.config.list_physical_devices('GPU'):
    print("GPU is available.")
    device = "/GPU:0"
else:
    print("GPU is not available.")
    device = "/CPU:0"
def create_rnn_model(lstm_units=50, dropout_rate=0.2):
    model = Sequential()
    model.add(LSTM(lstm_units, input_shape=(2, 768), activation='relu'))
    model.add(Dropout(dropout_rate))
    model.add(Dense(4, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model
X_train_rnn = X_train.reshape(X_train.shape[0], 2, 768)
y_train_onehot = pd.get_dummies(train_df['claim_label']).to_numpy()
from sklearn.model_selection import GridSearchCV

rnn_model = KerasClassifier(build_fn=create_rnn_model, epochs=10, batch_size=32, verbose=0)

param_grid_lstm = {
    'lstm_units': [50, 100, 150],
    'dropout_rate': [0.1, 0.2, 0.3],
    'epochs': [10, 20]
}

grid_search = GridSearchCV(estimator=rnn_model, param_grid=param_grid_lstm, cv=3, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_rnn, y_train_onehot)
best_params = grid_search.best_params_
print("Best hyperparameters (RNN):", best_params)
from sklearn.metrics import accuracy_score

# Preprocess the dev set
X_dev_rnn = X_dev.reshape(X_dev.shape[0], 2, 768)
y_dev_true = dev_df['claim_label']


# Train the model with the best hyperparameters
best_rnn_model = create_rnn_model(lstm_units=best_params['lstm_units'], dropout_rate=best_params['dropout_rate'])
with tf.device(device):
    best_rnn_model.fit(X_train_rnn, y_train_onehot, epochs=best_params['epochs'], batch_size=32, verbose=0)

# Predict the labels for the dev set
y_dev_pred_proba = best_rnn_model.predict(X_dev_rnn)
y_dev_pred_rnn = np.argmax(y_dev_pred_proba, axis=1)


# Calculate the accuracy
accuracy_rnn = accuracy_score(y_dev_true, y_dev_pred_rnn)
print("Accuracy on the dev set (RNN): {:.2f}".format(accuracy_rnn))


Grid Search LSTM

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from keras.models import Sequential
from keras.layers import Dense, SimpleRNN, Dropout

# Define the RNN model
def create_rnn_model(lstm_units=50, dropout_rate=0.2):
    model = Sequential()
    model.add(SimpleRNN(lstm_units, input_shape=(2, 768), activation='relu'))
    model.add(Dropout(dropout_rate))
    model.add(Dense(4, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

rnn_model = KerasClassifier(build_fn=create_rnn_model, epochs=10, batch_size=32, verbose=0)

# Define hyperparameters to search over
param_grid_rnn = {
    'batch_size': [16, 32, 64],
    'epochs': [10, 20, 30],
    'dropout_rate': [0.2, 0.3, 0.5]
}

# Use grid search to find the best hyperparameters
grid_search_rnn = GridSearchCV(estimator=model, param_grid=param_grid_rnn, scoring='accuracy', cv=5)
grid_search_rnn.fit(X_train, y_train)

# Find the best hyperparameters
best_params_rnn = grid_search_rnn.best_params_
print("Best hyperparameters (RNN):", best_params_rnn)

# Train the RNN model using the best hyperparameters
model = Sequential()
model.add(LSTM(64, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dropout(best_params_rnn['dropout_rate']))
model.add(Dense(4, activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, batch_size=best_params_rnn['batch_size'], epochs=best_params_rnn['epochs'])

# Predict the labels for the dev set using the best model
y_dev_pred_rnn = model.predict_classes(X_dev)

# Calculate the accuracy for the best model
accuracy_best_rnn = accuracy_score(y_dev_true, y_dev_pred_rnn)
print("Accuracy on the dev set using the best model (RNN): {:.2f}".format(accuracy_best_rnn))


Better X_train

In [None]:

import re
import nltk
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.svm import SVC
from transformers import RobertaTokenizer, RobertaModel

nltk.download('omw-1.4')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
path = "project-data"
train_data = load_data(os.path.join(path,'train-claims.json'))
dev_data = load_data(os.path.join(path,'dev-claims.json'))
test_data = load_data(os.path.join(path,'test-claims-unlabelled.json'))
evidence_data = load_data(os.path.join(path,'evidence.json'))
def preprocess(text):
    # Remove punctuation
    text = re.sub('[^a-zA-Z0-9]', ' ', text)
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stopwords and lemmatize
    tokens = [lemmatizer.lemmatize(token.lower()) for token in tokens if token.lower() not in stop_words]
    # Join tokens back into a string
    text = ' '.join(tokens)
    return text

for claim_id, claim_data in train_data.items():
    train_data[claim_id]['claim_text'] = preprocess(claim_data['claim_text'])
    for i, evidence_id in enumerate(claim_data['evidences']):
        train_data[claim_id]['evidences'][i] = preprocess(evidence_data[evidence_id])

for claim_id, claim_data in dev_data.items():
    dev_data[claim_id]['claim_text'] = preprocess(claim_data['claim_text'])
    for i, evidence_id in enumerate(claim_data['evidences']):
        dev_data[claim_id]['evidences'][i] = preprocess(evidence_data[evidence_id])

for claim_id, claim_data in test_data.items():
    test_data[claim_id]['claim_text'] = preprocess(claim_data['claim_text'])

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
roberta = RobertaModel.from_pretrained("roberta-base")
device = 'cuda' if torch.cuda.is_available() else 'cpu'
roberta.to(device)
# Create the corpus and doc_ids list from evidence_data ([100000,200000] records)
corpus = []
doc_ids = []
for i, (evidence_id, evidence_text) in enumerate(evidence_data.items()):
    corpus.append(preprocess(evidence_text))
    doc_ids.append(evidence_id)
    
def get_claim_embedding(claim_text):
    inputs = tokenizer(claim_text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    inputs = {key: tensor.to(device) for key, tensor in inputs.items()}
    with torch.no_grad():
        outputs = roberta(**inputs)
    return outputs[1].cpu().numpy()

def generate_evidence_embeddings(corpus, batch_size=32):
    evidence_embeddings = []
    
    for i in range(0, len(corpus), batch_size):
        batch = corpus[i:i + batch_size]
        batch_embeddings = [get_claim_embedding(evidence_text) for evidence_text in batch]
        evidence_embeddings.extend(batch_embeddings)
    
    return evidence_embeddings
# Generate evidence embeddings
evidence_embeddings = generate_evidence_embeddings(corpus)

np.save('corpus',corpus)
np.save('doc_ids',doc_ids)
np.save('evidence_embeddings_all',evidence_embeddings)


In [None]:
evidence_embeddings_all = np.load('evidence_embeddings_all.npy')
evidence_embeddings_all= np.vstack(evidence_embeddings_all)

In [None]:
from joblib import dump, load
k = 20
train_data = load_data(os.path.join(path,'train-claims.json'))
dev_data = load_data(os.path.join(path,'dev-claims.json'))
test_data = load_data(os.path.join(path,'test-claims-unlabelled.json'))
evidence_data = load_data(os.path.join(path,'evidence.json'))

for claim_id, claim_data in train_data.items():
    train_data[claim_id]['claim_text'] = preprocess(claim_data['claim_text'])
    for i, evidence_id in enumerate(claim_data['evidences']):
        train_data[claim_id]['evidences'][i] = re.sub("[^0-9]", "", evidence_id)
for claim_id, claim_data in dev_data.items():
    dev_data[claim_id]['claim_text'] = preprocess(claim_data['claim_text'])
    for i, evidence_id in enumerate(claim_data['evidences']):
        dev_data[claim_id]['evidences'][i] = re.sub("[^0-9]", "", evidence_id)

for claim_id, claim_data in test_data.items():
    test_data[claim_id]['claim_text'] = re.sub("[^0-9]", "", evidence_id)

def train_svm_classifier(train_data, evidence_embeddings_all, k=15):
    X_train = []
    y_train = []

    for claim_id in train_data:
        claim_emb = get_claim_embedding(preprocess(train_data[claim_id]['claim_text'])).reshape(-1)
        claim_label = train_data[claim_id]['claim_label']

        # Calculate cosine similarity between the claim and all evidence embeddings
        cosine_sims = cosine_similarity([claim_emb], evidence_embeddings_all)[0]
        
        # Select the top-k evidence embeddings
        top_k_indices = np.argsort(cosine_sims)[-k:]
        top_k_evidence_embeddings = evidence_embeddings_all[top_k_indices]

        # Concatenate the claim embedding with the top-k evidence embeddings
        concatenated_embeddings = np.hstack((np.tile(claim_emb, (k, 1)), top_k_evidence_embeddings))

        X_train.extend(concatenated_embeddings)
        y_train.extend([claim_label] * k)

    svm_clf = SVC(kernel='rbf')
    svm_clf.fit(X_train, y_train)
    return svm_clf



svm_clf = train_svm_classifier(train_data, evidence_embeddings_all)
dump(svm_clf, 'svm_model.joblib')

print(dev_predictions)

Load trained SVC and make prediction

In [None]:
svm_clf = load('svm_model.joblib')
label_map = {'SUPPORTS': 0, 'REFUTES': 1, 'NOT_ENOUGH_INFO': 2, 'DISPUTED': 3}
inv_label_map = {v: key for key, v in label_map.items()}
k = 20
def predict_labels(dev_data, svm_clf, evidence_embeddings_all):
    X_dev = [get_claim_embedding(dev_data[claim_id]['claim_text']).reshape(-1) for claim_id in dev_data]
    selected_evidence_indices_dev = []
    for claim_emb in X_dev:
        cosine_sims = cosine_similarity([claim_emb], evidence_embeddings_all)[0]
        top_k_indices = np.argsort(cosine_sims)[-15:]
        selected_evidence_indices_dev.append(top_k_indices)
    dev_predictions = {}

    for i, claim_id in enumerate(dev_data):
        claim_data = dev_data[claim_id]
        claim_text = claim_data['claim_text']

        top_k_evidence_indices = selected_evidence_indices_dev[i]
        top_k_evidence_embeddings = evidence_embeddings_all[top_k_evidence_indices]

        concatenated_embeddings = [np.hstack((X_dev[i], evidence_emb)) for evidence_emb in top_k_evidence_embeddings]

        y_pred_top_k = svm_clf.predict(concatenated_embeddings)
        top_k_similarity_scores = cosine_similarity([X_dev[i]], top_k_evidence_embeddings)[0]

        weighted_vote = {}
        for label, similarity_score in zip(y_pred_top_k, top_k_similarity_scores):
            if label in weighted_vote:
                weighted_vote[label] += similarity_score 
            else:
                weighted_vote[label] = similarity_score 
        
        y_pred_final = max(weighted_vote, key=weighted_vote.get)

        filtered_evidences = [top_k_evidence_indices[j] for j, label in enumerate(y_pred_top_k) if label == y_pred_final]

        dev_predictions[claim_id] = {
            "claim_text": claim_text,
            "claim_label": y_pred_final,
            "evidences": filtered_evidences
        }

    return dev_predictions    
dev_predictions = predict_labels(dev_data, svm_clf, evidence_embeddings_all)

Siamese

In [60]:
# Import the necessary libraries
from transformers import AutoModel, AutoTokenizer
from torch.utils.data import Dataset, DataLoader
from sentence_transformers import InputExample, losses
from torch.optim import Adam
from torch.optim.lr_scheduler import StepLR
import torch
import torch.nn as nn
# Load the pre-trained BERT model and tokenizer
bert_name = "bert-base-uncased"
model = AutoModel.from_pretrained(bert_name)
device = 'cuda'
model.to(device)
tokenizer = AutoTokenizer.from_pretrained(bert_name)
path = "project-data"
def load_data(file_path):
    with open(file_path) as f:
        data = json.load(f)
    return data
train_data = load_data(os.path.join(path, 'train-claims.json'))
dev_data = load_data(os.path.join(path, 'dev-claims.json'))
evidence_data = load_data(os.path.join(path, 'evidence.json'))
# Preprocess the data
for claim_id, claim_data in train_data.items():
    train_data[claim_id]['claim_text'] = preprocess(claim_data['claim_text'])
    for i, evidence_id in enumerate(claim_data['evidences']):
        train_data[claim_id]['evidences'][i] = preprocess(evidence_data[evidence_id])

for claim_id, claim_data in dev_data.items():
    dev_data[claim_id]['claim_text'] = preprocess(claim_data['claim_text'])
    for i, evidence_id in enumerate(claim_data['evidences']):
        dev_data[claim_id]['evidences'][i] = preprocess(evidence_data[evidence_id])


for ev_id, ev_data in evidence_data.items():
    evidence_data[ev_id]= preprocess(ev_data)

data_list = [{"id": k, **v} for k, v in train_data.items()]
train_sp, temp_data = train_test_split(data_list, test_size=0.3, random_state=42)
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)
import random
class SiameseDataset(Dataset):
    def __init__(self, claims_data, evidence_data, tokenizer):
        self.claims_data = claims_data
        self.evidence_data = evidence_data
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.claims_data) * 2  # Generating a pair of similar and dissimilar sentences for each claim

    def __getitem__(self, idx):
        claim_id = list(self.claims_data.keys())[idx // 2]
        claim_text = self.claims_data[claim_id]["claim_text"]
        evidence_ids = self.claims_data[claim_id]['evidences']

        if idx % 2 == 0:
            # For even indices, return a similar claim-evidence pair
            evidence_text = self.evidence_data[f'evidence-{random.choice(evidence_ids)}']
            label = 1.0
        else:
            # For odd indices, return a dissimilar claim-evidence pair
            random_evidence_id = random.choice(list(self.evidence_data.keys()))
            while f'evidence-{random_evidence_id}' in evidence_ids:
                random_evidence_id = random.choice(list(self.evidence_data.keys()))
            evidence_text = self.evidence_data[random_evidence_id]
            label = 0.0

        claim_encoding = self.tokenizer(claim_text, return_tensors="pt", truncation=True, padding="max_length", max_length=512)
        evidence_encoding = self.tokenizer(evidence_text, return_tensors="pt", truncation=True, padding="max_length", max_length=512)

        return {
            'claim_input_ids': claim_encoding['input_ids'].squeeze(0),
            'claim_attention_mask': claim_encoding['attention_mask'].squeeze(0),
            'evidence_input_ids': evidence_encoding['input_ids'].squeeze(0),
            'evidence_attention_mask': evidence_encoding['attention_mask'].squeeze(0),
            'label': torch.tensor(label)
        }

# Load the pre-trained BERT model and tokenizer
bert_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(bert_name)

# Create the dataset
train_dataset = SiameseDataset(train_sp, evidence_data, tokenizer)
val_dataset = SiameseDataset(val_data, evidence_data, tokenizer)
# Create a dataloader for the training data
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=32)
val_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=32)

# Create a loss function for the siamese network
train_loss = losses.CosineSimilarityLoss(model=model)

# Define the loss function, optimizer and scheduler
criterion = nn.CosineEmbeddingLoss()
optimizer = Adam(model.parameters(), lr=1e-5)
scheduler = StepLR(optimizer, step_size=2, gamma=0.1)  # Decrease learning rate by 0.1 every 2 epochs

best_val_loss = float('inf')

# Training loop
for epoch in range(10):  # Number of epochs
    model.train()
    total_train_loss = 0

    for batch in train_dataloader:
        optimizer.zero_grad()

        # Get the inputs and labels
        claim_input_ids = batch['claim_input_ids'].to(device)
        claim_attention_mask = batch['claim_attention_mask'].to(device)
        evidence_input_ids = batch['evidence_input_ids'].to(device)
        evidence_attention_mask = batch['evidence_attention_mask'].to(device)
        labels = batch['label'].to(device).unsqueeze(-1)

        # Forward pass
        claim_outputs = model(claim_input_ids, attention_mask=claim_attention_mask).last_hidden_state[:,0,:]
        evidence_outputs = model(evidence_input_ids, attention_mask=evidence_attention_mask).last_hidden_state[:,0,:]
        loss = criterion(claim_outputs, evidence_outputs, labels)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()

    avg_train_loss = total_train_loss / len(train_dataloader)
    print(f"Epoch {epoch+1}/{10}, Train Loss: {avg_train_loss}")

    # Validation
    model.eval()
    total_val_loss = 0

    with torch.no_grad():
        for batch in val_dataloader:
            claim_input_ids = batch['claim_input_ids'].to(device)
            claim_attention_mask = batch['claim_attention_mask'].to(device)
            evidence_input_ids = batch['evidence_input_ids'].to(device)
            evidence_attention_mask = batch['evidence_attention_mask'].to(device)
            labels = batch['label'].to(device).unsqueeze(-1)

            claim_outputs = model(claim_input_ids, attention_mask=claim_attention_mask).last_hidden_state[:,0,:]
            evidence_outputs = model(evidence_input_ids, attention_mask=evidence_attention_mask).last_hidden_state[:,0,:]
            loss = criterion(claim_outputs, evidence_outputs, labels)

            total_val_loss += loss.item()

    avg_val_loss = total_val_loss / len(val_dataloader)
    print(f"Epoch {epoch+1}/{10}, Val Loss: {avg_val_loss}")

    # Checkpoint the model if it has the best validation loss so far
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        torch.save(model.state_dict(), 'best_model.pth')

    # Step the scheduler
    scheduler.step()



Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


KeyError: 2