In [1]:
# Mount Google Drive into the Colab runtime at /content/drive.
# NOTE(review): the later cells read from the relative "data" directory, not
# from a path under this mount point — confirm this cell is still required.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
import json
import os
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
from tqdm import tqdm

# Load data
data_dir = "data"
train_claims_file = os.path.join(data_dir, "train-claims.json")
evidence_file = os.path.join(data_dir, "evidence.json")

# Load training claims and the evidence corpus.
# The files are decoded explicitly as UTF-8: without `encoding`, open() falls
# back to the platform locale's encoding (e.g. cp1252 on Windows), which can
# garble or reject non-ASCII characters in the JSON text.
with open(train_claims_file, 'r', encoding='utf-8') as f:
    train_claims = json.load(f)
with open(evidence_file, 'r', encoding='utf-8') as f:
    evidence = json.load(f)

# Text preprocessing: fetch the NLTK resources used below (stopword list and
# the punkt tokenizer data), then cache the English stopwords as a set.
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

english_stopwords = stopwords.words('english')
stop_words = set(english_stopwords)

def text_tokenizer(text):
    """Lowercase ``text``, split it with ``word_tokenize`` and return the
    tokens that are not in the module-level ``stop_words`` set."""
    return [
        word
        for word in word_tokenize(text.lower())
        if word not in stop_words
    ]

# Initialize TF-IDF
print("Training TF-IDF vectorizer...")
tfidf_vectorizer = TfidfVectorizer(
    max_features=5000,
    tokenizer=text_tokenizer,
    # A custom tokenizer replaces the default regex tokenization, so the
    # default token_pattern would be unused; setting it to None avoids the
    # "token_pattern will not be used" warning emitted on fit.
    token_pattern=None,
)

# Prepare evidence data: two parallel lists, so that row i of the TF-IDF
# matrix corresponds to evidence_ids[i].
evidence_ids = list(evidence.keys())
evidence_texts = [evidence[eid] for eid in evidence_ids]

# Fit the vocabulary / IDF weights on the evidence corpus and vectorize it
print("Transforming evidence texts...")
evidence_vectors = tfidf_vectorizer.fit_transform(evidence_texts)

# Find hard negatives for each claim: the evidence passages most similar to
# the claim (TF-IDF cosine similarity) that are NOT among its labelled
# evidences.
print("Generating hard negatives...")
claim_to_hard_negatives = {}
batch_size = 100  # Claims vectorized together per progress-bar step
initial_pool_size = 100  # Minimum number of top-ranked evidence passages to inspect
top_k = 100  # Maximum number of hard negatives saved per claim

# Materialize the claims once; rebuilding this list for every batch is
# loop-invariant work.
claim_items = list(train_claims.items())

for i in tqdm(range(0, len(claim_items), batch_size)):
    batch_claims = claim_items[i:i+batch_size]

    # Vectorize the whole batch in one call; row j belongs to batch_claims[j].
    batch_vectors = tfidf_vectorizer.transform(
        [claim['claim_text'] for _, claim in batch_claims]
    )

    for row, (claim_id, claim) in enumerate(batch_claims):
        # Slice (rather than index) so the claim stays a 1 x n_features matrix
        claim_vector = batch_vectors[row:row + 1]

        # Calculate similarities with all evidence
        similarities = cosine_similarity(claim_vector, evidence_vectors)[0]

        # Positive (gold) evidence ids must never be returned as negatives.
        positive_ids = set(claim['evidences'])

        # Widen the candidate pool by the number of positives: at most that
        # many candidates can be filtered out, so top_k negatives remain
        # reachable even when positives rank inside the pool.
        pool_size = max(initial_pool_size, top_k + len(positive_ids))
        top_indices = np.argsort(-similarities)[:pool_size]

        # Keep the most similar non-positive evidence, in ranking order
        hard_negatives = []
        for idx in top_indices:
            evidence_id = evidence_ids[idx]
            if evidence_id in positive_ids:
                continue
            hard_negatives.append({
                'evidence_id': evidence_id,
                'similarity': float(similarities[idx])
            })
            if len(hard_negatives) >= top_k:
                break

        claim_to_hard_negatives[claim_id] = hard_negatives

# Write the claim -> hard-negatives mapping to a single JSON file
output_file = os.path.join(data_dir, "hard_negatives.json")
with open(output_file, mode='w') as out_handle:
    json.dump(claim_to_hard_negatives, out_handle, indent=4)

print(f"Hard negatives saved to: {output_file}")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Rasindu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Rasindu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Rasindu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Training TF-IDF vectorizer...
Transforming evidence texts...




Generating hard negatives...


100%|██████████| 13/13 [05:19<00:00, 24.59s/it]


Hard negatives saved to: data\hard_negatives.json
