In [7]:
#%% Setup Environment
%pip install transformers[torch] faiss-gpu datasets sentence-transformers accelerate -q
import torch
import faiss
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          T5ForConditionalGeneration, AutoConfig,
                          TrainingArguments, Trainer, pipeline)
from sentence_transformers import SentenceTransformer
from datasets import Dataset
import evaluate

# Prefer the GPU when PyTorch can see one; otherwise run everything on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

ERROR: Could not find a version that satisfies the requirement faiss-gpu (from versions: none)
ERROR: No matching distribution found for faiss-gpu


Note: you may need to restart the kernel to use updated packages.


ModuleNotFoundError: No module named 'datasets'

In [None]:
#%% Data Preparation
def preprocess_data(csv_path="recovery-news-data.csv", test_size=0.2, random_state=42):
    """Load the news dataset, derive a 3-tier trust label and split it.

    The ``trust_tier`` column is computed from ``news_guard_score`` and
    ``mbfc_level``:

    * 2 (high)   - score >= 75 and level == 'High'
    * 1 (medium) - otherwise, score >= 50 or level in {'Mixed', 'Mostly Factual'}
    * 0 (low)    - everything else

    Missing values compare as False, so a row with a missing score or level
    can only reach a tier through the other column.

    Args:
        csv_path: Path of the CSV file to read.
        test_size: Fraction of rows placed in the validation split.
        random_state: Seed for the stratified split, so that re-running the
            notebook reproduces the same train/validation partition.

    Returns:
        Tuple ``(train_df, val_df, df)`` where ``df`` is the full frame with
        the added ``trust_tier`` column.
    """
    df = pd.read_csv(csv_path)

    score = df['news_guard_score']
    level = df['mbfc_level']

    # np.select takes the first matching condition, mirroring the original
    # if / elif / else ordering.
    is_high = (score >= 75) & (level == 'High')
    is_medium = (score >= 50) | level.isin(['Mixed', 'Mostly Factual'])
    df['trust_tier'] = np.select([is_high, is_medium], [2, 1], default=0)

    # Stratify so each split keeps the overall tier proportions.
    train_df, val_df = train_test_split(
        df,
        test_size=test_size,
        stratify=df['trust_tier'],
        random_state=random_state,
    )
    return train_df, val_df, df

# Run preprocessing once at cell execution. Later cells read train_df / val_df
# for classifier training and full_df for indexing and trust lookups.
train_df, val_df, full_df = preprocess_data()

In [None]:
#%% Trustworthiness Classifier
class TrustModelTrainer:
    """Fine-tune ``bert-base-uncased`` as a 3-class trust-tier classifier.

    The tokenizer, config and model are loaded in ``__init__`` and the model
    is moved to the module-level ``device``. ``train`` runs a Hugging Face
    ``Trainer`` for four epochs and returns the fine-tuned model.
    """

    def __init__(self):
        self.model_name = "bert-base-uncased"
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        # Three output labels, one per trust tier (0, 1, 2).
        self.config = AutoConfig.from_pretrained(self.model_name, num_labels=3)
        self.model = AutoModelForSequenceClassification.from_pretrained(
            self.model_name, config=self.config).to(device)

    def _prepare_dataset(self, texts, labels):
        """Tokenize ``texts`` and pair them with ``labels`` in a ``Dataset``.

        Sequences are truncated to 256 tokens and padded to the longest
        sequence in ``texts``.

        Args:
            texts: List of input strings.
            labels: List of integer class labels aligned with ``texts``.

        Returns:
            A ``Dataset`` with ``input_ids``, ``attention_mask`` and ``labels``.
        """
        encodings = self.tokenizer(texts, truncation=True, padding=True, max_length=256)
        return Dataset.from_dict({
            'input_ids': encodings['input_ids'],
            'attention_mask': encodings['attention_mask'],
            'labels': labels
        })

    def train(self, train_texts, train_labels, val_texts, val_labels):
        """Fine-tune the classifier and return the trained model.

        Args:
            train_texts: Training input strings.
            train_labels: Integer labels aligned with ``train_texts``.
            val_texts: Validation input strings, evaluated after every epoch.
            val_labels: Integer labels aligned with ``val_texts``.

        Returns:
            The fine-tuned model (the same object as ``self.model``).
        """
        # Convert to HuggingFace datasets
        train_dataset = self._prepare_dataset(train_texts, train_labels)
        val_dataset = self._prepare_dataset(val_texts, val_labels)

        # fp16 mixed precision is only supported on accelerators; requesting it
        # on a CPU-only machine is rejected by TrainingArguments.
        use_fp16 = torch.cuda.is_available()

        # The per-epoch evaluation option was renamed from
        # ``evaluation_strategy`` to ``eval_strategy``. Use whichever name the
        # installed transformers release declares so both old and new work.
        declared_fields = getattr(TrainingArguments, "__dataclass_fields__", {})
        if "eval_strategy" in declared_fields:
            strategy_arg = "eval_strategy"
        else:
            strategy_arg = "evaluation_strategy"

        # Training arguments
        training_args = TrainingArguments(
            output_dir='./results',
            num_train_epochs=4,
            per_device_train_batch_size=32,
            per_device_eval_batch_size=64,
            fp16=use_fp16,
            save_strategy="epoch",
            logging_dir='./logs',
            report_to="none",
            # 32 samples x 2 accumulation steps = effective batch of 64 per device.
            gradient_accumulation_steps=2,
            learning_rate=2e-5,
            weight_decay=0.01,
            **{strategy_arg: "epoch"},
        )

        # Metrics
        f1_metric = evaluate.load("f1")

        def compute_metrics(eval_pred):
            """Return the weighted F1 of the arg-max predictions."""
            logits, labels = eval_pred
            predictions = np.argmax(logits, axis=-1)
            return f1_metric.compute(
                predictions=predictions, references=labels, average="weighted")

        # Trainer
        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            compute_metrics=compute_metrics,
        )

        # Train
        trainer.train()
        return self.model

# Initialize and train
trust_trainer = TrustModelTrainer()
# Inputs are the article bodies; targets are the derived trust tiers.
trust_model = trust_trainer.train(
    train_texts=train_df['body_text'].tolist(),
    train_labels=train_df['trust_tier'].tolist(),
    val_texts=val_df['body_text'].tolist(),
    val_labels=val_df['trust_tier'].tolist(),
)

In [None]:
#%% FAISS Retriever
class FAISSRetriever:
    """Dense retriever over a FAISS inner-product index with trust re-ranking.

    Documents are embedded with a SentenceTransformer model. ``search``
    over-fetches candidates from the index and re-orders them by a weighted
    blend of embedding similarity and a per-document trust score.
    """

    def __init__(self, model_name="sentence-transformers/all-mpnet-base-v2"):
        self.encoder = SentenceTransformer(model_name).to(device)
        self.index = None
        self.metadata = []
        # Set by build_index(); initialised here so search() can detect a
        # retriever that has not been built yet.
        self.trust_scores = None

    def _encode(self, texts):
        """Embed ``texts`` as a float32 array of unit-length row vectors.

        Normalising makes the inner product a cosine similarity, so it is
        bounded and sits on a scale comparable with the 0-1 trust score it is
        blended with in ``search``.
        """
        vectors = self.encoder.encode(
            texts,
            convert_to_numpy=True,
            normalize_embeddings=True,
            show_progress_bar=False,
        )
        return np.ascontiguousarray(vectors, dtype=np.float32)

    def build_index(self, documents, trust_scores, batch_size=512):
        """Embed ``documents`` and (re)build the FAISS index.

        Args:
            documents: Sequence of document strings.
            trust_scores: Per-document trust tiers (0-2), aligned with
                ``documents`` by position.
            batch_size: Number of documents embedded per encoder call.

        Raises:
            ValueError: If ``documents`` is empty or its length differs from
                ``trust_scores``.
        """
        documents = list(documents)
        trust_scores = np.asarray(trust_scores)
        if len(documents) == 0:
            raise ValueError("Cannot build an index from an empty document list")
        if len(trust_scores) != len(documents):
            raise ValueError(
                f"Got {len(documents)} documents but {len(trust_scores)} trust scores")

        # Convert documents to embeddings
        chunks = [
            self._encode(documents[start:start + batch_size])
            for start in tqdm(range(0, len(documents), batch_size))
        ]
        embeddings = np.ascontiguousarray(np.concatenate(chunks), dtype=np.float32)

        # Build FAISS index; publish it only once it is fully populated.
        index = faiss.IndexFlatIP(embeddings.shape[1])
        index.add(embeddings)
        self.index = index
        self.trust_scores = trust_scores

    def search(self, query, k=10, trust_weight=0.4):
        """Return up to ``k`` documents ranked by similarity and trust.

        Args:
            query: Query string.
            k: Maximum number of results to return.
            trust_weight: Weight in [0, 1] given to the normalised trust
                score; the remainder goes to embedding similarity.

        Returns:
            List of ``(document_position, combined_score)`` tuples, best
            first. Positions refer to the order of ``documents`` passed to
            ``build_index``.

        Raises:
            RuntimeError: If ``build_index`` has not been called.
        """
        if self.index is None or self.trust_scores is None:
            raise RuntimeError("build_index() must be called before search()")

        n_indexed = len(self.trust_scores)
        if k <= 0 or n_indexed == 0:
            return []

        # Over-fetch 3x so trust re-ranking has candidates to promote, but
        # never ask for more vectors than the index holds.
        n_candidates = min(k * 3, n_indexed)
        query_embed = self._encode([query])

        # Search with FAISS
        distances, indices = self.index.search(query_embed, n_candidates)

        # Rerank with trust scores
        results = []
        for idx, score in zip(indices[0], distances[0]):
            # FAISS pads missing neighbours with label -1; indexing
            # trust_scores with it would silently pick the last document.
            if idx < 0:
                continue
            trust_score = float(self.trust_scores[idx]) / 2  # Normalize 0-2 to 0-1
            combined_score = (1 - trust_weight) * float(score) + trust_weight * trust_score
            results.append((int(idx), combined_score))

        # Return top-k sorted
        results.sort(key=lambda pair: pair[1], reverse=True)
        return results[:k]

# Build index
retriever = FAISSRetriever()
# Index every article body, keeping its trust tier aligned by position.
retriever.build_index(
    documents=full_df['body_text'].tolist(),
    trust_scores=full_df['trust_tier'].values,
)

In [None]:
#%% Generator Component
class TrustAwareGenerator:
    """Generate an answer to a query from retrieved documents with T5."""

    def __init__(self, model_name="t5-base"):
        self.model = T5ForConditionalGeneration.from_pretrained(model_name).to(device)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        # These read module-level objects created by the classifier cell, so
        # that cell must have run before a generator is constructed.
        self.trust_model = trust_model
        self.trust_tokenizer = trust_trainer.tokenizer

    def generate(self, query, context_docs, max_length=200):
        """Generate an answer for ``query`` conditioned on ``context_docs``.

        Args:
            query: The user question.
            context_docs: List of document strings; only the first three are
                used as context.
            max_length: Maximum length passed to ``model.generate``.

        Returns:
            The decoded text of the best beam, without special tokens.
        """
        # Prepare context
        context = "\n".join(context_docs[:3])

        # Prepare input; the prompt is truncated to 512 tokens, so later
        # context may be cut off.
        input_text = f"answer: {query} context: {context}"
        encoded = self.tokenizer(
            input_text, return_tensors="pt", truncation=True, max_length=512
        ).to(self.model.device)  # follow the model rather than a global device

        # Generate with deterministic beam search. ``temperature`` is not
        # passed because it only applies when sampling is enabled; without
        # ``do_sample=True`` it has no effect on the output.
        outputs = self.model.generate(
            input_ids=encoded["input_ids"],
            attention_mask=encoded["attention_mask"],
            max_length=max_length,
            num_beams=4,
            early_stopping=True,
            repetition_penalty=2.5,
        )

        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)

# Initialize generator
# Construction reads the module-level `trust_model` and `trust_trainer`, so the
# classifier cell must have been executed first.
generator = TrustAwareGenerator()

In [None]:
#%% Evaluation Metrics
def evaluate_system(val_queries, k=5, trust_threshold=1, max_queries=100):
    """Report how many retrieved documents come from trusted sources.

    A document counts as relevant when its ``trust_tier`` in ``full_df`` is
    at least ``trust_threshold``; relevance does not depend on the query.

    * Precision@k = trusted documents retrieved / documents retrieved
    * Recall@k    = trusted documents retrieved / trusted documents in corpus

    Args:
        val_queries: Sequence of query strings.
        k: Number of documents retrieved per query.
        trust_threshold: Minimum trust tier for a document to be relevant.
        max_queries: Only the first ``max_queries`` queries are evaluated.

    Returns:
        Dict with the mean ``precision`` and ``recall`` over the evaluated
        queries (NaN when there were no queries). The same values are
        printed.
    """
    # Positional relevance mask; retriever positions index full_df by row order.
    is_trusted = (full_df['trust_tier'] >= trust_threshold).to_numpy()
    total_trusted = int(is_trusted.sum())

    results = []
    for query in tqdm(val_queries[:max_queries]):
        retrieved = retriever.search(query, k=k)
        doc_ids = [idx for idx, _ in retrieved]

        # Calculate metrics, guarding against empty retrievals and a corpus
        # with no trusted documents.
        hits = int(is_trusted[doc_ids].sum()) if doc_ids else 0
        precision = hits / len(doc_ids) if doc_ids else 0.0
        recall = hits / total_trusted if total_trusted else 0.0

        results.append({"precision": precision, "recall": recall})

    if results:
        avg_precision = float(np.mean([r["precision"] for r in results]))
        avg_recall = float(np.mean([r["recall"] for r in results]))
    else:
        # No queries: the averages are undefined rather than zero.
        avg_precision = float("nan")
        avg_recall = float("nan")

    print(f"Precision@{k}: {avg_precision:.2f}, Recall@{k}: {avg_recall:.2f}")
    return {"precision": avg_precision, "recall": avg_recall}

# Example evaluation on two hand-written queries
evaluate_system(val_queries=["COVID vaccine efficacy", "Climate change impacts"])

In [None]:
#%% End-to-End Pipeline
def full_pipeline(query, trust_threshold=1):
    """Answer ``query`` from retrieved documents that pass a trust filter.

    Retrieves documents for the query, keeps those whose ``trust_tier`` is at
    least ``trust_threshold`` and hands them to the generator. Returns the
    string "No trustworthy sources found" when nothing passes the filter.
    """
    # Positions of the retrieved documents, best first.
    hit_positions = [position for position, _score in retriever.search(query)]
    hits = full_df.iloc[hit_positions]

    # Keep only sufficiently trusted documents, preserving retrieval order.
    trusted_texts = []
    for text, tier in zip(hits['body_text'].tolist(), hits['trust_tier']):
        if tier >= trust_threshold:
            trusted_texts.append(text)

    if not trusted_texts:
        return "No trustworthy sources found"

    return generator.generate(query, trusted_texts)

# Example usage: run one question through retrieval, filtering and generation
result = full_pipeline(query="What are the long-term effects of COVID-19 vaccines?")
print("Generated Answer:", result)