In [1]:
#%% 1. Accelerated Environment Setup
%pip install transformers faiss-cpu sentence-transformers pandas sklearn


Collecting sklearn
  Using cached sklearn-0.0.post12.tar.gz (2.6 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'error'
Note: you may need to restart the kernel to use updated packages.


  error: subprocess-exited-with-error
  
  × Getting requirements to build wheel did not run successfully.
  │ exit code: 1
  ╰─> [15 lines of output]
      The 'sklearn' PyPI package is deprecated, use 'scikit-learn'
      rather than 'sklearn' for pip commands.
      
      Here is how to fix this error in the main use cases:
      - use 'pip install scikit-learn' rather than 'pip install sklearn'
      - replace 'sklearn' by 'scikit-learn' in your pip requirements files
        (requirements.txt, setup.py, setup.cfg, Pipfile, etc ...)
      - if the 'sklearn' package is used by one of your dependencies,
        it would be great if you take some time to track which package uses
        'sklearn' instead of 'scikit-learn' and report it to their issue tracker
      - as a last resort, set the environment variable
        SKLEARN_ALLOW_DEPRECATED_SKLEARN_PACKAGE_INSTALL=True to avoid this error
      
      More information is available at
      https://github.com/scikit-learn/sklearn-

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the news dataset from the CSV file in the working directory
df = pd.read_csv(filepath_or_buffer="recovery-news-data.csv")

# Convert reliability to 3-tier trust score
def get_trust_tier(row):
    """Map one row's reliability ratings to a trust tier.

    Args:
        row: Mapping-like record exposing 'news_guard_score' and 'mbfc_level'.

    Returns:
        2 (high trust) when the score is at least 75 and the level is 'High';
        otherwise 1 (medium trust) when the score is at least 50 or the level
        is 'Mixed'; otherwise 0 (low trust).
    """
    # High trust needs both signals to agree.
    if row['news_guard_score'] >= 75 and row['mbfc_level'] == 'High':
        return 2

    # Either signal alone is enough for the medium tier.
    is_medium = row['news_guard_score'] >= 50 or row['mbfc_level'] == 'Mixed'
    return 1 if is_medium else 0

# Label every row with its trust tier (row-wise apply of get_trust_tier).
df['trust_tier'] = df.apply(get_trust_tier, axis=1)

# Split data: 80/20, stratified on the trust tier. The seed is fixed so that
# re-running the notebook reproduces the same train/validation partition.
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df['trust_tier'],
    random_state=42,
)

In [4]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
from sklearn.metrics import accuracy_score

class TrustDataset(torch.utils.data.Dataset):
    """Torch dataset pairing tokenizer encodings with their labels.

    Args:
        encodings: Mapping of field name to an indexable collection of
            per-sample values.
        labels: Indexable collection of per-sample labels; its length is the
            dataset length.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of samples (one per label)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample `idx` as a dict of tensors, including a 'labels' entry."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

class TrustworthinessModel:
    """BERT sequence classifier with a 3-label head used to predict trust tiers.

    Loads the 'bert-base-uncased' tokenizer and model and moves the model to
    CUDA when available, otherwise to CPU.
    """

    def __init__(self):
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.model = BertForSequenceClassification.from_pretrained(
            'bert-base-uncased', num_labels=3)
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model.to(self.device)

    def train(self, train_texts, train_labels, val_texts, val_labels):
        """Fine-tune the classifier and evaluate on the validation set each epoch.

        Args:
            train_texts: List of training texts.
            train_labels: List of integer labels aligned with `train_texts`.
            val_texts: List of validation texts.
            val_labels: List of integer labels aligned with `val_texts`.

        Checkpoints are written under './results' and logs under './logs'.
        """
        # Tokenize texts (truncated to 512 tokens, padded to a common length)
        train_encodings = self.tokenizer(train_texts, truncation=True, padding=True, max_length=512)
        val_encodings = self.tokenizer(val_texts, truncation=True, padding=True, max_length=512)

        # Create datasets
        train_dataset = TrustDataset(train_encodings, train_labels)
        val_dataset = TrustDataset(val_encodings, val_labels)

        # Training arguments
        # NOTE(review): newer transformers releases appear to rename
        # `evaluation_strategy` to `eval_strategy` - confirm against the
        # installed version before changing it.
        training_args = TrainingArguments(
            output_dir='./results',
            num_train_epochs=3,
            per_device_train_batch_size=8,
            per_device_eval_batch_size=16,
            warmup_steps=500,
            weight_decay=0.01,
            evaluation_strategy='epoch',
            logging_dir='./logs',
        )

        def compute_accuracy(p):
            """Accuracy of the arg-max class against the reference labels."""
            return {'accuracy': accuracy_score(p.label_ids, p.predictions.argmax(-1))}

        # Trainer
        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            compute_metrics=compute_accuracy,
        )

        trainer.train()

    def predict(self, text):
        """Return the predicted class index for a single text.

        Args:
            text: One input string (truncated to 512 tokens).

        Returns:
            The arg-max class index as a Python int.
        """
        inputs = self.tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
        inputs = {k: v.to(self.device) for k, v in inputs.items()}
        # Switch to inference mode: `no_grad` alone leaves dropout active, which
        # would make predictions random after the model has been in train mode.
        self.model.eval()
        with torch.no_grad():
            outputs = self.model(**inputs)
        return torch.argmax(outputs.logits, dim=1).cpu().item()

# Build the classifier, then fine-tune it on the article bodies and trust tiers
trust_model = TrustworthinessModel()
trust_model.train(
    train_texts=train_df['body_text'].tolist(),
    train_labels=train_df['trust_tier'].tolist(),
    val_texts=val_df['body_text'].tolist(),
    val_labels=val_df['trust_tier'].tolist(),
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

class TrustAwareRetriever:
    """Dense retriever that reranks nearest neighbours by similarity and trust.

    Documents are embedded with the 'all-mpnet-base-v2' sentence encoder and
    stored in a flat L2 Faiss index. Queries over-fetch candidates and rerank
    them with a weighted blend of cosine similarity and a per-document trust
    score.
    """

    def __init__(self):
        self.encoder = SentenceTransformer('all-mpnet-base-v2')
        # 768 must match the encoder's embedding size.
        self.index = faiss.IndexFlatL2(768)
        self.metadata = []
        # Defined up front so query() on an unbuilt retriever returns []
        # instead of raising AttributeError.
        self.trust_scores = np.empty(0, dtype=float)

    def build_index(self, documents, trust_scores):
        """(Re)build the index from `documents` and their trust scores.

        Args:
            documents: List of document texts.
            trust_scores: One score per document, aligned with `documents`;
                expected on the same 0-1 scale as the similarity term.

        Raises:
            ValueError: If `documents` and `trust_scores` differ in length.
        """
        trust_scores = np.asarray(trust_scores, dtype=float)
        if len(documents) != len(trust_scores):
            raise ValueError(
                f"documents ({len(documents)}) and trust_scores "
                f"({len(trust_scores)}) must have the same length"
            )

        # Start from an empty index so a second call cannot leave old vectors
        # paired with the new trust scores.
        self.index.reset()
        self.trust_scores = trust_scores
        if len(documents) == 0:
            return

        # Unit-length embeddings make squared L2 distance a direct function of
        # cosine similarity (see query()). Faiss expects float32 input.
        embeddings = self.encoder.encode(documents, normalize_embeddings=True)
        self.index.add(np.asarray(embeddings, dtype=np.float32))

    def query(self, query_text, k=10, trust_weight=0.3):
        """Return up to `k` (document_index, combined_score) pairs, best first.

        Args:
            query_text: The search query.
            k: Maximum number of results to return.
            trust_weight: Weight of the trust score in the blend; similarity
                gets `1 - trust_weight`.

        Returns:
            List of `(int index, float score)` tuples sorted by descending
            combined score. Empty when the index holds no documents.
        """
        # Over-fetch 3x so trust can promote documents beyond the raw top-k,
        # but never ask for more candidates than the index holds.
        n_candidates = min(k * 3, self.index.ntotal)
        if n_candidates <= 0:
            return []

        query_embed = self.encoder.encode([query_text], normalize_embeddings=True)
        query_embed = np.asarray(query_embed, dtype=np.float32)

        distances, indices = self.index.search(query_embed, n_candidates)

        results = []
        for doc_idx, sq_dist in zip(indices[0], distances[0]):
            # Faiss pads short result lists with label -1; indexing with it
            # would silently pick up the last document's trust score.
            if doc_idx < 0:
                continue
            # IndexFlatL2 reports squared L2 distance; for unit vectors
            # cosine similarity = 1 - d^2 / 2, which stays within [-1, 1].
            similarity = 1.0 - sq_dist / 2.0
            combined_score = (
                (1 - trust_weight) * similarity
                + trust_weight * self.trust_scores[doc_idx]
            )
            results.append((int(doc_idx), float(combined_score)))

        # Sort and return top-k
        results.sort(key=lambda pair: pair[1], reverse=True)
        return results[:k]

# Index every article body; trust tiers 0/1/2 are halved to land in the 0-1 range
retriever = TrustAwareRetriever()
retriever.build_index(
    documents=df['body_text'].tolist(),
    trust_scores=df['trust_tier'].values / 2,
)


In [None]:
from transformers import pipeline

class TrustAwareGenerator:
    """Answer generator that only conditions on documents passing a trust filter.

    Args:
        classifier: Object exposing `predict(text) -> int` used to score
            documents. Defaults to the notebook-level `trust_model`.
    """

    def __init__(self, classifier=None):
        self.generator = pipeline('text2text-generation', model='t5-small')
        # Fall back to the notebook-level model so existing no-argument
        # construction keeps working.
        self.trust_model = classifier if classifier is not None else trust_model

    def generate(self, query, retrieved_docs, trust_threshold=1, max_context_docs=3):
        """Generate an answer to `query` from trusted retrieved documents.

        Args:
            query: The user question.
            retrieved_docs: Candidate document texts, best first.
            trust_threshold: Minimum predicted trust tier a document needs to
                be used as context.
            max_context_docs: Maximum number of trusted documents joined into
                the context.

        Returns:
            Whatever the underlying generation pipeline returns for the prompt.
        """
        # Filter by trust, stopping as soon as the context is full so the
        # classifier is not run on documents that would be discarded anyway.
        trusted_docs = []
        for doc in retrieved_docs:
            if len(trusted_docs) >= max_context_docs:
                break
            if self.trust_model.predict(doc) >= trust_threshold:
                trusted_docs.append(doc)

        # Create context
        context = " | ".join(trusted_docs)

        # NOTE(review): the prompt uses an 'answer:' prefix; confirm this is
        # the intended task format for the chosen model.
        # Generate with trust cues. `truncation=True` clips the prompt to the
        # model's input limit; full article bodies can easily exceed it.
        return self.generator(
            f"answer: {query} context: {context}",
            max_length=200,
            num_beams=4,
            repetition_penalty=2.5,
            truncation=True,
        )


In [None]:
def rag_pipeline(query):
    """Retrieve documents for `query` and generate a trust-filtered answer.

    Args:
        query: The user question.

    Returns:
        The generation output produced by `TrustAwareGenerator.generate`.
    """
    # Retrieve
    retrieved = retriever.query(query)
    body_texts = df['body_text']
    documents = [body_texts.iloc[doc_id] for doc_id, _score in retrieved]

    # Generate. The generator is built once and cached on the function, since
    # constructing it reloads the generation model. Re-running this cell
    # creates a fresh function object and therefore a fresh generator.
    generator = getattr(rag_pipeline, '_generator', None)
    if generator is None:
        generator = rag_pipeline._generator = TrustAwareGenerator()
    return generator.generate(query, documents)

# Example: run one question through the pipeline and show the first generation
result = rag_pipeline(query="What are the health impacts of COVID-19 vaccines?")
print(result[0]['generated_text'])