### **First-Stage Sentence Retrieval Using SBERT**

In [3]:
from sentence_transformers import SentenceTransformer
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity


In [29]:
import json

# Load datasets
path = 'data/'


def _read_json(filename):
    """Parse the JSON file `filename` found under `path` and return the decoded object."""
    # Be explicit about the encoding: without it, open() falls back to the
    # platform's locale encoding, which can mis-decode non-ASCII text.
    with open(path + filename, "r", encoding="utf-8") as f:
        return json.load(f)


# Claims: the DataFrame built from the decoded mapping is transposed so that
# the top-level JSON keys end up as the row index and the per-claim fields
# (e.g. the `claim_text` / `evidences` columns read by later cells) as columns.
train_claims = _read_json("train-claims.json")
train_df = pd.DataFrame(train_claims).transpose()

dev_claims = _read_json("dev-claims.json")
dev_df = pd.DataFrame(dev_claims).transpose()

# Evidence: a flat mapping turned into a two-column frame, one row per
# (key, value) item, in the mapping's iteration order.
evidence = _read_json("evidence.json")
evidence_df = pd.DataFrame(list(evidence.items()), columns=["key", "value"])

In [None]:
# Load the sentence-embedding model used to encode both evidence and claims.
# The checkpoint name is kept in one place so it is easy to swap.
MODEL_NAME = 'all-MiniLM-L6-v2'
model = SentenceTransformer(MODEL_NAME)

In [12]:
# Encode every evidence value into an embedding; rows follow the row order of
# `evidence_df`, which later cells rely on for positional lookups.
# No device is forced here: encode() then runs on the device the model was
# loaded on, so the cell also works on machines without CUDA instead of failing.
embeddings = model.encode(evidence_df['value'].tolist(), show_progress_bar=True)

Batches:   0%|          | 0/37776 [00:00<?, ?it/s]

In [30]:
# Encode the training claims; rows follow the row order of `train_df`.
# No device is forced here: encode() then runs on the device the model was
# loaded on, so the cell also works on machines without CUDA instead of failing.
claim_embeddings = model.encode(train_df['claim_text'].tolist(), show_progress_bar=True)

Batches:   0%|          | 0/39 [00:00<?, ?it/s]

In [31]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity: entry [i, j] scores claim i against evidence j
# (rows come from `claim_embeddings`, columns from `embeddings`).
similarity_matrix = cosine_similarity(X=claim_embeddings, Y=embeddings)

In [None]:
import numpy as np
# Number of evidence candidates kept per claim by this first retrieval stage.
top_k = 10000


def _top_k_desc(scores, k):
    """Return, for each row of `scores`, the column indices of its `k` largest
    values ordered from highest to lowest score.

    Parameters
    ----------
    scores : 2-D array-like of scores.
    k : number of columns to keep per row; clamped to the number of columns.

    Returns
    -------
    Integer array with one row per row of `scores` and min(k, n_cols) columns.

    Notes
    -----
    Selecting with argpartition and sorting only the kept candidates avoids
    fully sorting every row, and working row by row avoids allocating a
    negated copy and a full-size index matrix. The order among exactly equal
    scores is arbitrary.
    """
    scores = np.asarray(scores)
    n_rows, n_cols = scores.shape
    k = max(0, min(k, n_cols))
    result = np.empty((n_rows, k), dtype=np.intp)
    if k == 0:
        return result
    cut = n_cols - k
    for r in range(n_rows):
        row = scores[r]
        # Everything from position `cut` onwards belongs to the k largest (unordered).
        candidates = np.argpartition(row, cut)[cut:]
        # Order just those k candidates by descending score.
        result[r] = candidates[np.argsort(-row[candidates], kind="stable")]
    return result


# Indices of the top_k highest-scoring evidence columns for each claim, best first.
top_indices = _top_k_desc(similarity_matrix, top_k)

In [47]:
# Store, for each claim, the evidence keys (not the texts) of its top-ranked
# candidates, by mapping the positional indices in `top_indices` back to the
# "key" column of `evidence_df`.
# NOTE(review): the column name is a plain string without an `f` prefix, so
# `top_k` is not interpolated and the column is literally called
# "top_{top_k}_evidence". The evaluation cell reads this exact key, so both
# places must be changed together if the name is ever fixed.
evidence_key_column = evidence_df["key"]
train_df["top_{top_k}_evidence"] = [
    evidence_key_column.iloc[row_positions].tolist()
    for row_positions in top_indices
]

In [53]:
# Count how many gold-standard evidence ids appear among the retrieved
# candidates of each claim, and accumulate the totals over all training claims.
total_matches = 0  # gold ids found among the retrieved candidates
n = 0              # gold ids seen in total

# The column key is the literal string "top_{top_k}_evidence" (no interpolation),
# matching the name under which the candidates were stored.
for evidence_list, retrieved_list in zip(
    train_df["evidences"], train_df["top_{top_k}_evidence"]
):
    # A set makes each membership test constant-time instead of scanning the
    # whole candidate list once per gold id.
    retrieved = set(retrieved_list)
    k = sum(1 for ev in evidence_list if ev in retrieved)
    print(f"Found {k} matches for gold standard {len(evidence_list)}")
    n += len(evidence_list)
    total_matches += k

print(f"Total matches: {total_matches}/{n}")

Found 3 matches for gold standard 3
Found 2 matches for gold standard 2
Found 2 matches for gold standard 2
Found 5 matches for gold standard 5
Found 3 matches for gold standard 5
Found 4 matches for gold standard 5
Found 2 matches for gold standard 2
Found 1 matches for gold standard 3
Found 5 matches for gold standard 5
Found 1 matches for gold standard 1
Found 1 matches for gold standard 1
Found 5 matches for gold standard 5
Found 5 matches for gold standard 5
Found 5 matches for gold standard 5
Found 1 matches for gold standard 1
Found 1 matches for gold standard 1
Found 3 matches for gold standard 3
Found 5 matches for gold standard 5
Found 1 matches for gold standard 1
Found 4 matches for gold standard 4
Found 2 matches for gold standard 2
Found 2 matches for gold standard 2
Found 4 matches for gold standard 4
Found 3 matches for gold standard 3
Found 3 matches for gold standard 4
Found 2 matches for gold standard 2
Found 1 matches for gold standard 1
Found 5 matches for gold sta