In [1]:
from transformers import RobertaTokenizer, RobertaModel
import torch

# Load RoBERTa model and tokenizer
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
roberta   = RobertaModel.from_pretrained("roberta-base")

# 1) Freeze all parameters
for param in roberta.parameters():
    param.requires_grad = False

# 2) Unfreeze last 2 encoder layers
for layer in roberta.encoder.layer[-2:]:
    for param in layer.parameters():
        param.requires_grad = True

# 3) Unfreeze the pooler if present (gives you a [CLS]–to–sentence vector head)
if hasattr(roberta, "pooler"):
    for param in roberta.pooler.parameters():
        param.requires_grad = True

# Encode a sentence via the [CLS] token
def encode_text(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    outputs = roberta(**inputs)
    return outputs.last_hidden_state[:, 0]  # [CLS] 

# Quick smoke test
text_emb = encode_text("Test sentence for fine-tuning.")

print("Text embedding shape:", text_emb.shape)  # → [1, 768]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Text embedding shape: torch.Size([1, 768])


In [2]:
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch

# Load CLIP model and processor
clip_model     = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# 1) Freeze all vision parameters
for param in clip_model.vision_model.parameters():
    param.requires_grad = False

# 2) Unfreeze last 2 vision encoder layers
for layer in clip_model.vision_model.encoder.layers[-2:]:
    for param in layer.parameters():
        param.requires_grad = True

# 3) Unfreeze the visual projection head
for param in clip_model.visual_projection.parameters():
    param.requires_grad = True

# Encode an image (gradients enabled on last layers)
def encode_image(image_path):
    image = Image.open(image_path).convert("RGB")
    inputs = clip_processor(images=image, return_tensors="pt")
    image_emb = clip_model.get_image_features(**inputs)
    return image_emb  # [1, 512]

# Quick smoke test
image_emb = encode_image("/Users/kevinpatel/UWF/Donald.jpeg")
print("Image embedding shape:", image_emb.shape)



Image embedding shape: torch.Size([1, 512])


In [3]:
import spacy
import torch.nn as nn
nlp = spacy.load("en_core_web_sm")

# Extract entities
def extract_entities(text):
    doc = nlp(text)
    return [ent.text for ent in doc.ents if ent.label_ in ["PERSON", "ORG", "GPE"]]


#Test
text = "Apple Inc. was founded by Steve Jobs and is headquartered in Cupertino."
print(extract_entities(text))

['Apple Inc.', 'Steve Jobs', 'Cupertino']


In [4]:
from SPARQLWrapper import SPARQLWrapper, JSON

# Step 1: Get QID
def get_entity_qid(entity_label):
    endpoint_url = "https://query.wikidata.org/sparql"
    query = f"""
    SELECT ?item WHERE {{
      ?item rdfs:label "{entity_label}"@en.
    }} LIMIT 1
    """
    sparql = SPARQLWrapper(endpoint_url)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    sparql.addCustomHttpHeader("User-Agent", "KITE/1.0 (kevin@yourdomain.com)")
    try:
        results = sparql.query().convert()
        bindings = results["results"]["bindings"]
        if bindings:
            entity_url = bindings[0]["item"]["value"]
            return entity_url.split("/")[-1]
        else:
            return None
    except Exception as e:
        print("Error fetching QID:", e)
        return None

# Step 2: Query facts with QID
def get_wikidata_facts_by_qid(qid):
    endpoint_url = "https://query.wikidata.org/sparql"
    query = f"""
    SELECT ?propertyLabel ?valueLabel WHERE {{
      wd:{qid} ?prop ?value.
      ?property wikibase:directClaim ?prop.
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
    }} LIMIT 20
    """
    sparql = SPARQLWrapper(endpoint_url)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    sparql.addCustomHttpHeader("User-Agent", "KITE/1.0 (kevin@yourdomain.com)")
    try:
        results = sparql.query().convert()
        facts = []
        for result in results["results"]["bindings"]:
            prop = result["propertyLabel"]["value"]
            val = result["valueLabel"]["value"]
            facts.append((prop, val))
        return facts
    except Exception as e:
        print("Error fetching facts:", e)
        return []

print("Done!");

Done!


In [5]:
# Testing usage
entity_name = "Donald Trump"

qid = get_entity_qid(entity_name)
if qid:
    print(f"\nWikidata QID for '{entity_name}': {qid}")
    facts = get_wikidata_facts_by_qid(qid)
    print(f"\nTop {len(facts)} facts about {entity_name}:")
    for prop, val in facts:
        print(f"  {prop}: {val}")
else:
    print(f"No Wikidata QID found for '{entity_name}'")



Wikidata QID for 'Donald Trump': Q22686

Top 20 facts about Donald Trump:
  member of political party: Republican Party
  native language: English
  occupation: actor
  occupation: writer
  occupation: businessperson
  occupation: politician
  occupation: entrepreneur
  occupation: merchant
  occupation: chief executive officer
  occupation: investor
  occupation: television producer
  occupation: business magnate
  occupation: film producer
  occupation: real estate entrepreneur
  occupation: game show host
  employer: The Trump Organization
  signature: http://commons.wikimedia.org/wiki/Special:FilePath/Donald%20Trump%20%28Presidential%20signature%29.svg
  movement: conservatism
  movement: isolationism
  movement: right-wing populism


In [6]:
# Convert facts into a graph: nodes and edges
def build_knowledge_graph(entity_name, facts):
    nodes = set()
    edges = []

    # Add the main entity as a node
    nodes.add(entity_name)

    for prop, val in facts:
        nodes.add(val)
        edges.append((entity_name, val))  # edge: entity → value

    node_list = list(nodes)
    node_index = {node: idx for idx, node in enumerate(node_list)}

    # Convert edges to index format for PyTorch Geometric
    edge_index = torch.tensor([[node_index[src], node_index[dst]] for src, dst in edges], dtype=torch.long).T

    return node_list, edge_index

In [7]:
entity = "Barack Obama"
qid = get_entity_qid(entity)
facts = get_wikidata_facts_by_qid(qid)

node_list, edge_index = build_knowledge_graph(entity, facts)

print("Nodes:", node_list)
print("Edge Index:\n", edge_index)


Nodes: ['Kenya', 'human', 'Stanley Ann Dunham', 'male', 'Columbia University', 'Kapiolani Medical Center for Women and Children', 'Barack Obama', 'Harvard Law School', 'President of the United States', 'Michelle Obama', 'President-elect of the United States', 'Malia Obama', 'member of the State Senate of Illinois', 'Harvard University', 'United States', 'Barack Obama Sr.', 'Sasha Obama', 'http://commons.wikimedia.org/wiki/Special:FilePath/President%20Barack%20Obama.jpg', 'United States senator', 'http://commons.wikimedia.org/wiki/Special:FilePath/2016%20State%20of%20the%20Union%20Address%20%E2%80%93%20Barack%20Obama%20Presidential%20Library.webm', 'family of Barack Obama']
Edge Index:
 tensor([[ 6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,
          6,  6],
        [19, 17,  5,  3, 15,  2,  9, 14,  0,  1,  8, 10, 18, 12, 11, 16, 20, 13,
          4,  7]])


In [8]:
def encode_nodes_with_roberta(node_list):
    embeddings = []
    for node in node_list:
        emb = encode_text(node)
        embeddings.append(emb)
    return torch.vstack(embeddings)  # Shape: [num_nodes, 768]

In [9]:
node_features = encode_nodes_with_roberta(node_list)
print("Node Embedding Shape:", node_features.shape)

Node Embedding Shape: torch.Size([21, 768])


In [10]:
pip install torch-scatter torch-sparse torch-cluster torch-spline-conv -f https://data.pyg.org/whl/torch-2.1.0+cpu.html


Looking in links: https://data.pyg.org/whl/torch-2.1.0+cpu.html
[0mNote: you may need to restart the kernel to use updated packages.


In [11]:
#Test to see previous worked
from torch_geometric.data import Data
print("PyG is ready!")


PyG is ready!


In [12]:
pip install torch-geometric


[0mNote: you may need to restart the kernel to use updated packages.


In [13]:
import torch
import torch.nn as nn
from torch_geometric.nn import GATConv
from torch_geometric.data import Data
from torch.nn import TransformerEncoder, TransformerEncoderLayer

# ——— Knowledge Graph GAT ———
class KnowledgeGraphGAT(nn.Module):
    def __init__(self, in_channels: int = 768, out_channels: int = 256):
        super(KnowledgeGraphGAT, self).__init__()
        self.gat1 = GATConv(in_channels, 256, heads=2, concat=True)
        self.gat2 = GATConv(512, out_channels, heads=1, concat=False)

    def forward(self, x, edge_index):
        x = self.gat1(x, edge_index)
        x = torch.relu(x)
        x = self.gat2(x, edge_index)
        return x

# ——— Cross-Modal Transformer Fusion ———
class CrossModalTransformer(nn.Module):
    def __init__(
        self,
        text_dim:  int = 768,
        img_dim:   int = 512,
        kg_dim:    int = 256,
        token_dim: int = 512,
        nhead:     int = 8,
        layers:    int = 1,
    ):
        super().__init__()
        # Project each modality to the common token_dim
        self.proj_t = nn.Linear(text_dim,  token_dim)
        self.proj_i = nn.Linear(img_dim,   token_dim)
        self.proj_k = nn.Linear(kg_dim,    token_dim)

        # Transformer over the 3 tokens
        encoder_layer = TransformerEncoderLayer(
            d_model=token_dim,
            nhead=nhead,
            dim_feedforward=token_dim * 4,
            dropout=0.1,
            activation="relu",
        )
        self.transformer = TransformerEncoder(encoder_layer, num_layers=layers)

        # Project transformer output back to 1536-dim fusion size
        self.out_proj = nn.Linear(token_dim, text_dim + img_dim + kg_dim)

    def forward(self, text_emb, image_emb, kg_emb):
        # text_emb: [B,768], image_emb: [B,512], kg_emb: [B,256]
        t_tok = self.proj_t(text_emb)    # [B, token_dim]
        i_tok = self.proj_i(image_emb)   # [B, token_dim]
        k_tok = self.proj_k(kg_emb)      # [B, token_dim]

        # Stack into [seq_len=3, batch, token_dim] for transformer
        tokens = torch.stack([t_tok, i_tok, k_tok], dim=1)  # [B,3,token_dim]
        tokens = tokens.permute(1, 0, 2)                    # [3,B,token_dim]
        out    = self.transformer(tokens)                   # [3,B,token_dim]
        out    = out.permute(1, 0, 2)                       # [B,3,token_dim]

        # Mean-pool and project back to fusion dimension
        fused_token = out.mean(dim=1)                       # [B, token_dim]
        return self.out_proj(fused_token)                   # [B,1536]

# Instantiate both modules
gat_encoder   = KnowledgeGraphGAT()
fusion_xform  = CrossModalTransformer()



In [14]:
#fuse knowledge representation with the other two modalities: text and image
# Prepare PyG data object
graph_data = Data(x=node_features, edge_index=edge_index)


# Run the GAT (in eval mode, no gradients needed for now)
with torch.no_grad():
    knowledge_emb = gat_encoder(graph_data.x, graph_data.edge_index)

# Mean-pool the node embeddings to create a single graph-level vector
knowledge_pooled = knowledge_emb.mean(dim=0, keepdim=True)  # Shape: [1, 256]

print("Pooled Knowledge Embedding Shape:", knowledge_pooled.shape)

#knowledge graph has been encoded


Pooled Knowledge Embedding Shape: torch.Size([1, 256])


In [15]:
# ─── Fusion via Cross-Modal Transformer ───
fused = fusion_xform(text_emb, image_emb, knowledge_pooled)  # [B, 1536]
print("Fused Embedding Shape:", fused.shape)

Fused Embedding Shape: torch.Size([1, 1536])


In [16]:
import torch
import torch.nn as nn

# ─── Tri-modal Classifier with Per-Modality Confidence Heads ───
class NewsClassifier(nn.Module):
    def __init__(self):
        super(NewsClassifier, self).__init__()
        # — Classification head (1536 → 512 → 128 → 1) —
        self.fc1      = nn.Linear(1536, 512)
        self.relu1    = nn.ReLU()
        self.dropout1 = nn.Dropout(0.3)

        self.fc2      = nn.Linear(512, 128)
        self.relu2    = nn.ReLU()
        self.dropout2 = nn.Dropout(0.3)

        self.fc3      = nn.Linear(128, 1)   # raw logit output

        # — Confidence heads for each modality —
        self.text_conf_head  = nn.Linear(768, 1)
        self.image_conf_head = nn.Linear(512, 1)
        self.kg_conf_head    = nn.Linear(256, 1)

    def forward(self, fused):
        # — Main logit prediction —
        x = self.relu1(self.fc1(fused))
        x = self.dropout1(x)
        x = self.relu2(self.fc2(x))
        x = self.dropout2(x)
        logit = self.fc3(x)

        # — Slice fused vector back into modalities —
        # fused shape: [batch, 1536] = [768 (text) | 512 (image) | 256 (kg)]
        text_feat  = fused[:, :768]
        image_feat = fused[:, 768:1280]   # 768 + 512 = 1280
        kg_feat    = fused[:, 1280:]      # remaining 256 dims

        # — Compute per-modality confidences (0–1 range) —
        text_conf  = torch.sigmoid(self.text_conf_head(text_feat))
        image_conf = torch.sigmoid(self.image_conf_head(image_feat))
        kg_conf    = torch.sigmoid(self.kg_conf_head(kg_feat))

        return logit, {
            "text_conf":  text_conf,
            "image_conf": image_conf,
            "kg_conf":    kg_conf
        }

# ─── Smoke-test ───
classifier = NewsClassifier()
classifier.eval()
with torch.no_grad():
    logit, confs = classifier(fused)  # simply pass fused
    print("Smoke-test logit shape:", logit.shape)   # should be [1, 1]
    print("Smoke-test confidences:", 
          {k: v.item() for k, v in confs.items()})

Smoke-test logit shape: torch.Size([1, 1])
Smoke-test confidences: {'text_conf': 0.46819454431533813, 'image_conf': 0.5450189709663391, 'kg_conf': 0.40811699628829956}


In [17]:
import torch
import torch.nn as nn

# 0) (re-)instantiate your model if needed
classifier = NewsClassifier()     # ← make sure this matches your setup
classifier.train()

# 1) Define a dummy/fresh label
label = torch.tensor([[1.0]])     # shape [1,1]; move to GPU if you use one

# 2) Make sure `fused` exists:
#    if you haven’t just run the fusion cell, recreate it:
# fused = torch.cat([text_emb, image_emb, knowledge_pooled], dim=1)

# 3) Forward + compute BCEWithLogits loss
criterion = nn.BCEWithLogitsLoss()
logits, _ = classifier(fused)
# unpack the (logit, confs) tuple
loss      = criterion(logits, label)
print(f"Loss value: {loss.item():.4f}")

# 4) Backprop test
loss.backward()
print("✅ Backpropagation successful — model is ready for training.")

Loss value: 0.7346
✅ Backpropagation successful — model is ready for training.


In [18]:
from torch.utils.data import Dataset
import torch
from torch_geometric.data import Data

# In-memory caches for entities and facts
qid_cache = {}
facts_cache = {}

class KITEDataset(Dataset):
    def __init__(self, data_rows):
        self.data = data_rows

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        entry = self.data[idx]

        # --- Encode text ---
        text_emb = encode_text(entry["text"])  # [1, 768]

        # --- Encode image ---
        image_emb = encode_image(entry["image_path"])  # [1, 512]

        # --- Entity Extraction ---
        entities = extract_entities(entry["text"])
        qid = None
        facts = []

        if entities:
            # Sanitize entity label
            raw_entity = entities[0]
            clean_entity = raw_entity.replace("\n", " ").strip()

            # --- QID lookup with caching ---
            if clean_entity in qid_cache:
                qid = qid_cache[clean_entity]
            else:
                try:
                    qid = get_entity_qid(clean_entity)
                    qid_cache[clean_entity] = qid
                except Exception:
                    qid = None

            # --- Fact lookup with caching ---
            if qid:
                if qid in facts_cache:
                    facts = facts_cache[qid]
                else:
                    try:
                        facts = get_wikidata_facts_by_qid(qid)
                        facts_cache[qid] = facts
                    except Exception:
                        facts = []

        # --- Build Knowledge Graph (if facts exist) ---
        if facts:
            try:
                node_list, edge_index = build_knowledge_graph(clean_entity, facts)
                node_features = encode_nodes_with_roberta(node_list)
                graph_data = Data(x=node_features, edge_index=edge_index)
                gat_encoder = KnowledgeGraphGAT()
                with torch.no_grad():
                    knowledge_emb = gat_encoder(graph_data.x, graph_data.edge_index)
                knowledge_pooled = knowledge_emb.mean(dim=0, keepdim=True)  # [1, 256]
            except Exception:
                knowledge_pooled = torch.zeros(1, 256)
        else:
            knowledge_pooled = torch.zeros(1, 256)

        # --- Fuse Embeddings ---
        fused = torch.cat([text_emb, image_emb, knowledge_pooled], dim=1)  # [1, 1536]
        label = torch.tensor([entry["label"]], dtype=torch.float32)

        return fused.squeeze(0), label


In [19]:
# Example input row for validation
sample_data = [
    {
        "text": "Joe Biden visited Ukraine to show support during the crisis.",
        "image_path": "/Users/kevinpatel/UWF/BidenUkraine.jpeg",  # Replace with a real path
        "label": 1  # Real news
    }
]

# Create dataset object
test_dataset = KITEDataset(sample_data)

# Fetch one sample and check shapes
sample_fused, sample_label = test_dataset[0]
print("Fused shape:", sample_fused.shape)   # Should be [1536]
print("Label:", sample_label.item())        # Should be 1 or 0


Fused shape: torch.Size([1536])
Label: 1.0


In [20]:
from torch.utils.data import DataLoader

# Set batch size (adjust later for full training)
batch_size = 1

# Wrap your dataset
loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

# Grab a batch and test
for batch_fused, batch_label in loader:
    print("Batch Fused Shape:", batch_fused.shape)   # Should be [B, 1536]
    print("Batch Labels:", batch_label)              # Should be [B, 1] or [B]
    break  # Just show one batch for now


Batch Fused Shape: torch.Size([1, 1536])
Batch Labels: tensor([[1.]])


In [21]:
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup
import torch
import torch.nn as nn

# ─── Unfreeze additional transformer layers (as before) ───
for layer in roberta.encoder.layer[-4:]:
    for param in layer.parameters():
        param.requires_grad = True
if hasattr(roberta, "pooler"):
    for param in roberta.pooler.parameters():
        param.requires_grad = True

for layer in clip_model.vision_model.encoder.layers[-4:]:
    for param in layer.parameters():
        param.requires_grad = True
for param in clip_model.visual_projection.parameters():
    param.requires_grad = True

# ─── Setup with differential LRs & BCEWithLogitsLoss ───
epochs     = 1              # keep at 1 for quick validation
encoder_lr = 5e-6
head_lr    = 3e-4
optimizer = AdamW([
    {
        "params": list(roberta.encoder.layer[-4:].parameters()) +
                  (list(roberta.pooler.parameters()) if hasattr(roberta, "pooler") else []),
        "lr": encoder_lr, "weight_decay": 1e-4
    },
    {
        "params": list(clip_model.vision_model.encoder.layers[-4:].parameters()) +
                  list(clip_model.visual_projection.parameters()),
        "lr": encoder_lr, "weight_decay": 1e-4
    },
    {
        "params": classifier.parameters(), "lr": head_lr, "weight_decay": 1e-3
    }
])
criterion = nn.BCEWithLogitsLoss()

# ─── Scheduler Setup (5% warmup) ───
total_steps  = epochs * len(loader)
warmup_steps = int(0.05 * total_steps)
scheduler    = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps
)
# ─── Training Loop with Scheduler & Label Smoothing ───
smooth = 0.1  # label smoothing factor

classifier.train()
for epoch in range(epochs):
    epoch_loss = 0.0

    for batch_fused, batch_label in loader:
        optimizer.zero_grad()
        # only pass in fused; confidences dict will be empty here
        logits, _ = classifier(batch_fused)  
        # label smoothing: real→0.9, fake→0.1
        smoothed = batch_label * (1 - smooth) + 0.5 * smooth
        loss = criterion(logits, smoothed)
        loss.backward()
        optimizer.step()
        scheduler.step()
        epoch_loss += loss.item()

    print(f"[Epoch {epoch+1}] Loss: {epoch_loss:.4f}")

[Epoch 1] Loss: 0.7834


In [22]:
!pip install pandas
import pandas as pd

[0m

In [23]:
import pandas as pd

# Load real and fake datasets using your full paths
real_df = pd.read_csv("/Users/kevinpatel/Git/FakeNewsNet/dataset/gossipcop_real.csv")
fake_df = pd.read_csv("/Users/kevinpatel/Git/FakeNewsNet/dataset/gossipcop_fake.csv")

# Peek at the structure
print("Real sample:")
print(real_df.head(2))

print("\nFake sample:")
print(fake_df.head(2))


Real sample:
                 id                                           news_url  \
0  gossipcop-882573  https://www.brides.com/story/teen-mom-jenelle-...   
1  gossipcop-875924  https://www.dailymail.co.uk/tvshowbiz/article-...   

                                               title  \
0  Teen Mom Star Jenelle Evans' Wedding Dress Is ...   
1  Kylie Jenner refusing to discuss Tyga on Life ...   

                                           tweet_ids  
0  912371411146149888\t912371528343408641\t912372...  
1  901989917546426369\t901989992074969089\t901990...  

Fake sample:
                     id                                           news_url  \
0  gossipcop-2493749932  www.dailymail.co.uk/tvshowbiz/article-5874213/...   
1  gossipcop-4580247171  hollywoodlife.com/2018/05/05/paris-jackson-car...   

                                               title  \
0  Did Miley Cyrus and Liam Hemsworth secretly ge...   
1  Paris Jackson & Cara Delevingne Enjoy Night Ou...   

            

In [24]:
import os
import json
from PIL import Image
import fnmatch

valid_exts = [".jpg", ".jpeg", ".png", ".JPG"]

def is_valid_image(path):
    try:
        Image.open(path).verify()
        return True
    except:
        return False

def collect_article_samples(root_dir, label):
    samples = []
    for article_folder in os.listdir(root_dir):
        folder_path = os.path.join(root_dir, article_folder)
        content_path = os.path.join(folder_path, "news content.json")

        if not os.path.isfile(content_path):
            continue

        try:
            with open(content_path, "r") as f:
                content = json.load(f)

            text = content.get("text") or content.get("title")
            if not text or len(text) < 50:
                continue

            # 🖼️ Prioritize top images first
            top_image = None
            fallback_image = None

            for fname in os.listdir(folder_path):
                img_path = os.path.join(folder_path, fname)
                if not os.path.isfile(img_path):
                    continue

                ext_match = any(fname.lower().endswith(ext) for ext in valid_exts)
                if not ext_match:
                    continue

                if fnmatch.fnmatch(fname.lower(), "top*") and is_valid_image(img_path):
                    top_image = img_path
                    break  # Stop if top image is valid
                elif is_valid_image(img_path) and fallback_image is None:
                    fallback_image = img_path

            # Final decision
            selected_img = top_image or fallback_image
            if selected_img:
                samples.append({
                    "text": text,
                    "image_path": selected_img,
                    "label": label
                })

        except Exception as e:
            print(f"⚠️ Error reading {content_path}: {e}")

    return samples


In [25]:
from sklearn.model_selection import train_test_split
from pathlib import Path
from time import time

# --- Folder paths ---
base_dir = Path("/Users/kevinpatel/Git/FakeNewsNet/code/fakenewsnet_dataset")
real_gossipcop    = base_dir / "gossipcop/real"
fake_gossipcop    = base_dir / "gossipcop/fake"
real_politifact   = base_dir / "politifact/real"
fake_politifact   = base_dir / "politifact/fake"

# --- Collect samples ---
kite_data = []

print("🔍 Loading articles from GossipCop (real)...")
start = time()
gc_real = collect_article_samples(str(real_gossipcop), label=1)
print(f"  → {len(gc_real)} samples loaded [{time() - start:.2f}s]")
kite_data += gc_real

print("🔍 Loading articles from GossipCop (fake)...")
start = time()
gc_fake = collect_article_samples(str(fake_gossipcop), label=0)
print(f"  → {len(gc_fake)} samples loaded [{time() - start:.2f}s]")
kite_data += gc_fake

print("🔍 Loading articles from PolitiFact (real)...")
start = time()
pf_real = collect_article_samples(str(real_politifact), label=1)
print(f"  → {len(pf_real)} samples loaded [{time() - start:.2f}s]")
kite_data += pf_real

print("🔍 Loading articles from PolitiFact (fake)...")
start = time()
pf_fake = collect_article_samples(str(fake_politifact), label=0)
print(f"  → {len(pf_fake)} samples loaded [{time() - start:.2f}s]")
kite_data += pf_fake

print(f"\n📦 Total usable samples: {len(kite_data)}")

# --- Full-data split per source ---

# GossipCop split
gc_samples = gc_real + gc_fake
gc_labels  = [row["label"] for row in gc_samples]
gc_train, gc_test = train_test_split(
    gc_samples,
    test_size=0.2,
    random_state=42,
    stratify=gc_labels
)
print(f"🔧 GossipCop train/test: {len(gc_train)}/{len(gc_test)} samples")

# PolitiFact split
pf_samples = pf_real + pf_fake
pf_labels  = [row["label"] for row in pf_samples]
pf_train, pf_test = train_test_split(
    pf_samples,
    test_size=0.2,
    random_state=42,
    stratify=pf_labels
)
print(f"🔧 PolitiFact train/test: {len(pf_train)}/{len(pf_test)} samples")

# --- Build KITE datasets & DataLoaders for each ---

# GossipCop
gc_train_ds = KITEDataset(gc_train)
gc_test_ds  = KITEDataset(gc_test)
gc_train_loader = DataLoader(gc_train_ds, batch_size=32, shuffle=True)
gc_test_loader  = DataLoader(gc_test_ds,  batch_size=32)
print("🚀 GossipCop DataLoaders ready")

# PolitiFact
pf_train_ds = KITEDataset(pf_train)
pf_test_ds  = KITEDataset(pf_test)
pf_train_loader = DataLoader(pf_train_ds, batch_size=32, shuffle=True)
pf_test_loader  = DataLoader(pf_test_ds,  batch_size=32)
print("🚀 PolitiFact DataLoaders ready")

🔍 Loading articles from GossipCop (real)...




  → 4610 samples loaded [11.03s]
🔍 Loading articles from GossipCop (fake)...
  → 3838 samples loaded [7.51s]
🔍 Loading articles from PolitiFact (real)...
  → 169 samples loaded [0.19s]
🔍 Loading articles from PolitiFact (fake)...
  → 156 samples loaded [0.33s]

📦 Total usable samples: 8773
🔧 GossipCop train/test: 6758/1690 samples
🔧 PolitiFact train/test: 260/65 samples
🚀 GossipCop DataLoaders ready
🚀 PolitiFact DataLoaders ready


In [27]:
from transformers import AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    precision_recall_curve,
)
import torch
import torch.nn as nn
import torch.nn.functional as F

# ─── Focal Loss Definition ───
class FocalLoss(nn.Module):
    def __init__(self, gamma=2.0, alpha=0.25, reduction='mean'):
        super().__init__()
        self.gamma = gamma
        self.alpha = alpha
        self.reduction = reduction

    def forward(self, logits, targets):
        bce = F.binary_cross_entropy_with_logits(logits, targets, reduction='none')
        p_t = torch.exp(-bce)
        loss = self.alpha * (1 - p_t) ** self.gamma * bce
        return loss.mean() if self.reduction == 'mean' else loss.sum()

# ─── Training + Eval on Both Splits with Focal Loss + Threshold Tuning ───
epochs, patience = 10, 3
results = {}

for name, tr_loader, te_loader in [
    ("PolitiFact", pf_train_loader, pf_test_loader),
    ("GossipCop",  gc_train_loader, gc_test_loader),
]:
    print(f"\n=== Running on {name} split ===")

    # 1) Re-init model, optimizer, scheduler, loss
    classifier = NewsClassifier()
    optimizer  = AdamW([
        {"params": roberta.encoder.layer[-4:].parameters(),           "lr": 5e-6, "weight_decay": 1e-4},
        {"params": roberta.pooler.parameters(),                        "lr": 5e-6, "weight_decay": 1e-4},
        {"params": clip_model.vision_model.encoder.layers[-4:].parameters(), 
         "lr": 5e-6, "weight_decay": 1e-4},
        {"params": clip_model.visual_projection.parameters(),          "lr": 5e-6, "weight_decay": 1e-4},
        {"params": classifier.parameters(),                           "lr": 3e-4, "weight_decay": 1e-3},
    ])
    criterion = FocalLoss(gamma=2.0, alpha=0.25)
    total_steps = epochs * len(tr_loader)
    scheduler   = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(0.1 * total_steps),
        num_training_steps=total_steps
    )

    best_val_acc, no_improve = 0.0, 0
    best_thr_for_split = 0.5

    for epoch in range(1, epochs + 1):
        print(f"\n— Epoch {epoch}/{epochs} —")
        # ——— Training ———
        classifier.train()
        running_loss = 0.0
        for i, (x, y) in enumerate(tr_loader, start=1):
            optimizer.zero_grad()
            logits, _ = classifier(x)                  # unpack tuple here
            loss      = criterion(logits, y)
            loss.backward()
            optimizer.step()
            scheduler.step()
            running_loss += loss.item()
            if i % 100 == 0 or i == len(tr_loader):
                print(f"  [Train] Batch {i}/{len(tr_loader)} – Loss: {loss:.4f}")
        print(f"  Avg Train Loss: {running_loss/len(tr_loader):.4f}")

        # ——— Validation & Threshold Sweep ———
        classifier.eval()
        all_probs, all_y = [], []
        with torch.no_grad():
            for x, y in te_loader:
                logits, _ = classifier(x)             # unpack tuple here
                probs      = torch.sigmoid(logits).view(-1).tolist()
                labels     = y.view(-1).tolist()
                all_probs.extend(probs)
                all_y.extend(labels)

        # compute precision/recall curve & pick threshold maxing F1
        precisions, recalls, ths = precision_recall_curve(all_y, all_probs)
        f1_scores = 2 * precisions * recalls / (precisions + recalls + 1e-8)
        best_idx  = f1_scores.argmax()
        best_thr  = ths[best_idx]
        best_thr_for_split = best_thr

        # compute early-stop metric (accuracy) at this threshold
        val_preds = [1 if p > best_thr else 0 for p in all_probs]
        val_acc   = accuracy_score(all_y, val_preds)

        if val_acc > best_val_acc:
            best_val_acc, no_improve = val_acc, 0
            torch.save(classifier.state_dict(), f"best_{name}.pt")
        else:
            no_improve += 1
            print(f"    → No improvement for {no_improve}/{patience} epochs")
            if no_improve >= patience:
                print(f"    Early stopping at epoch {epoch}")
                break

    # ——— Final Eval @ chosen threshold + collect confidences ———
    classifier.load_state_dict(torch.load(f"best_{name}.pt"))
    classifier.eval()

    final_probs, final_y = [], []
    text_confs, image_confs, kg_confs = [], [], []

    with torch.no_grad():
        for x, y in te_loader:
            # 1) get model outputs
            logit, confs = classifier(x)

            # 2) probabilities and true labels
            probs  = torch.sigmoid(logit).view(-1).tolist()
            labels = y.view(-1).tolist()

            final_probs.extend(probs)
            final_y.extend(labels)

            # 3) per-modality confidences
            text_confs.extend(  confs['text_conf'].view(-1).tolist() )
            image_confs.extend( confs['image_conf'].view(-1).tolist() )
            kg_confs.extend(    confs['kg_conf'].view(-1).tolist() )

    # apply your best threshold
    final_preds = [1 if p > best_thr_for_split else 0 for p in final_probs]

    # compute guarded averages for confidences
    avg_txt = sum(text_confs)  / len(text_confs)  if text_confs  else 0.0
    avg_img = sum(image_confs) / len(image_confs) if image_confs else 0.0
    avg_kg  = sum(kg_confs)    / len(kg_confs)    if kg_confs    else 0.0

    # store metrics + these averages
    results[name] = {
        "accuracy":       accuracy_score(final_y, final_preds),
        "f1":             f1_score(final_y, final_preds),
        "precision":      precision_score(final_y, final_preds),
        "recall":         recall_score(final_y, final_preds),
        "threshold":      best_thr_for_split,
        "avg_text_conf":  avg_txt,
        "avg_image_conf": avg_img,
        "avg_kg_conf":    avg_kg,
    }
    

for name, m in results.items():
        print(f"\n📊 Final {name} Results:")
        print(f"✅ Accuracy:  {m['accuracy']:.4f}")
        print(f"🎯 F1 Score:  {m['f1']:.4f}")
        print(f"🔍 Precision: {m['precision']:.4f}")
        print(f"📥 Recall:    {m['recall']:.4f}")
        print()
        print(f"📊 Final {name} Results:")
        print(f"📖 Avg Text Confidence:  {m['avg_text_conf']:.4f}")
        print(f"🖼️  Avg Image Confidence: {m['avg_image_conf']:.4f}")
        print(f"📚 Avg KG Confidence:    {m['avg_kg_conf']:.4f}")



=== Running on PolitiFact split ===

— Epoch 1/10 —




  [Train] Batch 9/9 – Loss: 0.0408
  Avg Train Loss: 0.0430

— Epoch 2/10 —




  [Train] Batch 9/9 – Loss: 0.0287
  Avg Train Loss: 0.0386

— Epoch 3/10 —




  [Train] Batch 9/9 – Loss: 0.0326
  Avg Train Loss: 0.0341

— Epoch 4/10 —




  [Train] Batch 9/9 – Loss: 0.0177
  Avg Train Loss: 0.0258
    → No improvement for 1/3 epochs

— Epoch 5/10 —




  [Train] Batch 9/9 – Loss: 0.0139
  Avg Train Loss: 0.0187
    → No improvement for 2/3 epochs

— Epoch 6/10 —




  [Train] Batch 9/9 – Loss: 0.0061
  Avg Train Loss: 0.0122
    → No improvement for 3/3 epochs
    Early stopping at epoch 6

=== Running on GossipCop split ===

— Epoch 1/10 —




Error fetching QID: QueryBadFormed: A bad request has been sent to the endpoint: probably the SPARQL query is badly formed. 

Response:
b'SPARQL-QUERY: queryStr=\n    SELECT ?item WHERE {\n      ?item rdfs:label "Milena Markovna "Mila"@en.\n    } LIMIT 1\n    \njava.util.concurrent.ExecutionException: org.openrdf.query.MalformedQueryException: Lexical error at line 3, column 46.  Encountered: "\\"" (34), after : "Mila"\n\tat java.util.concurrent.FutureTask.report(FutureTask.java:122)\n\tat java.util.concurrent.FutureTask.get(FutureTask.java:206)\n\tat com.bigdata.rdf.sail.webapp.BigdataServlet.submitApiTask(BigdataServlet.java:292)\n\tat com.bigdata.rdf.sail.webapp.QueryServlet.doSparqlQuery(QueryServlet.java:678)\n\tat com.bigdata.rdf.sail.webapp.QueryServlet.doGet(QueryServlet.java:290)\n\tat com.bigdata.rdf.sail.webapp.RESTServlet.doGet(RESTServlet.java:240)\n\tat com.bigdata.rdf.sail.webapp.MultiTenancyServlet.doGet(MultiTenancyServlet.java:273)\n\tat javax.servlet.http.HttpServlet



  [Train] Batch 100/212 – Loss: 0.0180
  [Train] Batch 200/212 – Loss: 0.0286
  [Train] Batch 212/212 – Loss: 0.0285
  Avg Train Loss: 0.0287

— Epoch 3/10 —




  [Train] Batch 100/212 – Loss: 0.0295
  [Train] Batch 200/212 – Loss: 0.0196
  [Train] Batch 212/212 – Loss: 0.0266
  Avg Train Loss: 0.0208

— Epoch 4/10 —




  [Train] Batch 100/212 – Loss: 0.0049
  [Train] Batch 200/212 – Loss: 0.0136
  [Train] Batch 212/212 – Loss: 0.0051
  Avg Train Loss: 0.0119
    → No improvement for 1/3 epochs

— Epoch 5/10 —




  [Train] Batch 100/212 – Loss: 0.0080
  [Train] Batch 200/212 – Loss: 0.0148
  [Train] Batch 212/212 – Loss: 0.0010
  Avg Train Loss: 0.0062
    → No improvement for 2/3 epochs

— Epoch 6/10 —




  [Train] Batch 100/212 – Loss: 0.0020
  [Train] Batch 200/212 – Loss: 0.0007
  [Train] Batch 212/212 – Loss: 0.0102
  Avg Train Loss: 0.0039
    → No improvement for 3/3 epochs
    Early stopping at epoch 6





📊 Final PolitiFact Results:
✅ Accuracy:             0.8681
🎯 F1 Score:             0.8894
🔍 Precision:            0.9345
📥 Recall:               0.8788

📖 Avg Text Confidence:  0.5630
🖼️ Avg Image Confidence: 0.5058
📚 Avg KG Confidence:    0.4931

📊 Final GossipCop Results:
✅ Accuracy:             0.8851
🎯 F1 Score:             0.9297
🔍 Precision:            0.9265
📥 Recall:               0.9592

📖 Avg Text Confidence:  0.4319
🖼️ Avg Image Confidence: 0.4414
📚 Avg KG Confidence:    0.4933
