# In [None]:
from urllib.parse import urlparse
import pandas as pd
import numpy as np
import re
import spacy
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, roc_auc_score
from transformers import RobertaTokenizer, RobertaModel
from torch.optim import AdamW
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import DataLoader, Dataset
import torch
import torch.nn as nn
from sklearn.utils.class_weight import compute_class_weight

# Locations of the two PolitiFact CSV exports (one file per class).
data_real_path = '/content/politifact_real.csv'
data_fake_path = '/content/politifact_fake.csv'

try:
    real_news = pd.read_csv(data_real_path)
    fake_news = pd.read_csv(data_fake_path)
except FileNotFoundError:
    print("Error: CSV files not found!")
    # Bare exit() is the interactive-shell helper and reports status 0, which
    # hides the failure from callers; terminate with a non-zero status instead.
    raise SystemExit(1) from None

# Binary target: 1 = real news, 0 = fake news.
real_news['label'] = 1
fake_news['label'] = 0

# Stack both classes into one frame and drop rows without a headline, since
# every downstream feature is derived from the title.
dataset = pd.concat([real_news, fake_news], ignore_index=True)
dataset = dataset.dropna(subset=['title'])

def extract_domain(url):
    """Return the network location (host) of ``url``.

    ``'unknown_source'`` is returned when the URL has no host component
    (including scheme-less strings such as ``'example.com/page'``, which
    ``urlparse`` treats as a path) or when the value cannot be parsed at all,
    e.g. a missing value such as NaN.
    """
    try:
        domain = urlparse(url).netloc
    except (AttributeError, TypeError, ValueError):
        # ValueError: malformed host such as an unclosed IPv6 bracket.
        # AttributeError / TypeError: non-string input (NaN, numbers, ...).
        # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
        return 'unknown_source'
    return domain if domain else 'unknown_source'

# Tag every article with the host name parsed from its URL.
dataset['source_domain'] = dataset['news_url'].map(extract_domain)

# Ordered (pattern, replacement) pairs applied by clean_text.
_CLEANING_STEPS = (
    (re.compile(r'\[.*?\]'), ''),       # bracketed tags
    (re.compile(r'http\S+'), ''),       # URLs
    (re.compile(r'[^a-zA-Z\s]'), ''),   # everything except letters/whitespace
    (re.compile(r'\s+'), ' '),          # collapse whitespace runs
)


def clean_text(text):
    """Normalize a headline to lower-case letters separated by single spaces.

    The value is converted with ``str()`` first, then bracketed spans, URLs
    and all non-letter characters are removed, whitespace runs are collapsed
    and the result is stripped.
    """
    cleaned = str(text).lower()
    for pattern, replacement in _CLEANING_STEPS:
        cleaned = pattern.sub(replacement, cleaned)
    return cleaned.strip()

# Normalized headline text used by the tokenizer and the jargon counter.
dataset['cleaned_title'] = dataset['title'].map(clean_text)

# Let spaCy use a GPU when it can, then load the small English pipeline that
# extract_named_entities relies on.
spacy.prefer_gpu()
nlp = spacy.load('en_core_web_sm')


def extract_named_entities(text):
    """Return how many named entities the spaCy pipeline finds in ``text``."""
    return len(nlp(text).ents)

# Count entities on the raw headline rather than on `cleaned_title`: the
# cleaned text is lower-cased and stripped of punctuation and digits, which
# removes the casing cues the statistical NER model depends on and makes it
# miss most names. astype(str) guards against non-string title cells.
dataset['ner_count'] = dataset['title'].astype(str).apply(extract_named_entities)

# Lower-case vocabulary of political terms matched against cleaned headlines.
political_jargon = {
    "election", "senate", "congress", "campaign", "policy", "vote",
    "legislation", "republican", "democrat", "veto", "incumbent", "mandate",
    "bill", "quorum", "gridlock", "jingoism", "kleptocracy", "lobby",
    "caucus", "ballot", "nominee", "primary", "referendum", "coalition",
    "slush", "stump", "purge", "pivot", "surrogate", "rider", "markup",
    "cloture", "hopper", "proxy", "spin", "dogma", "frontrunner", "turnout",
    "platform", "populism", "cronyism",
}


def identify_political_jargon(text):
    """Return the number of distinct jargon terms present in ``text``.

    The text is split on whitespace and matched exactly (case-sensitively)
    against ``political_jargon``; repeated words are counted once.
    """
    distinct_tokens = set(text.split())
    return len(political_jargon & distinct_tokens)

# Number of distinct political-jargon terms in each cleaned headline.
dataset['jargon_count'] = dataset['cleaned_title'].map(identify_political_jargon)

class NewsDataset(Dataset):
    """Dataset pairing tokenizer encodings with labels and two scalar features.

    ``encodings`` is a mapping of name -> tensor indexed by sample along the
    first dimension; ``labels``, ``ner_features`` and ``jargon_features`` are
    indexable sequences with one entry per sample.
    """

    def __init__(self, encodings, labels, ner_features, jargon_features):
        self.encodings, self.labels = encodings, labels
        self.ner_features, self.jargon_features = ner_features, jargon_features

    def __len__(self):
        # The label sequence defines the number of samples.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return one sample as a dict of tensors.

        Contains a detached copy of every encoding row plus ``labels``
        (long), ``ner`` (float) and ``jargon`` (float).
        """
        sample = {}
        for name, tensor in self.encodings.items():
            sample[name] = tensor[idx].clone().detach()
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        sample['ner'] = torch.tensor(self.ner_features[idx], dtype=torch.float)
        sample['jargon'] = torch.tensor(self.jargon_features[idx], dtype=torch.float)
        return sample

class CustomRobertaClassifier(nn.Module):
    """RoBERTa encoder with a linear head over pooled text plus extra features.

    The pooled encoder output is concatenated with the NER and jargon
    features and projected to ``num_labels`` logits.

    Args:
        model_name: Checkpoint passed to ``RobertaModel.from_pretrained``.
        num_labels: Number of output classes.
        ner_dim: Width of the NER feature per sample.
        jargon_dim: Width of the jargon feature per sample.
    """

    def __init__(self, model_name='roberta-base', num_labels=2, ner_dim=1, jargon_dim=1):
        super().__init__()
        self.roberta = RobertaModel.from_pretrained(model_name)
        # Size the head from the loaded encoder rather than assuming 768, so
        # checkpoints with a different hidden width (e.g. roberta-large) work.
        hidden_size = self.roberta.config.hidden_size
        self.fc = nn.Linear(hidden_size + ner_dim + jargon_dim, num_labels)

    def forward(self, input_ids, attention_mask, ner, jargon, labels=None):
        """Return ``{'logits': ...}`` and, when ``labels`` is given, ``'loss'``.

        ``ner`` and ``jargon`` may be 1-D (one scalar per sample) or 2-D
        ``(batch, dim)``; both are reshaped to ``(batch, -1)`` before being
        concatenated with the pooled encoder output.
        """
        outputs = self.roberta(input_ids, attention_mask=attention_mask)
        # Index 1 of the encoder output is used as the pooled sentence vector.
        pooled_output = outputs[1]
        batch = pooled_output.size(0)
        combined = torch.cat(
            (pooled_output, ner.reshape(batch, -1), jargon.reshape(batch, -1)),
            dim=1,
        )
        logits = self.fc(combined)
        if labels is None:
            return {'logits': logits}
        # `class_weights` is a module-level tensor set by the training script
        # before the first forward pass with labels. It is moved to the
        # logits' device so the loss does not depend on a global `device`.
        loss_fn = nn.CrossEntropyLoss(weight=class_weights.to(logits.device))
        return {'loss': loss_fn(logits, labels), 'logits': logits}

# Hyper-parameter grid explored by the cross-validated search below, and the
# running record of the best configuration found so far.
batch_sizes, learning_rates = [16, 32], [1e-5, 2e-5]
best_accuracy, best_params = 0, {}

# Shuffled 5-fold split with a fixed seed so folds are reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Model inputs: cleaned headline text, binary label and the two count features.
X, y = dataset['cleaned_title'], dataset['label']
ner_features, jargon_features = dataset['ner_count'], dataset['jargon_count']

def _batch_to_device(batch, device):
    """Return (input_ids, attention_mask, ner, jargon, labels) moved to ``device``."""
    keys = ('input_ids', 'attention_mask', 'ner', 'jargon', 'labels')
    return tuple(batch[key].to(device) for key in keys)


def _tokenize(tokenizer, texts):
    """Tokenize a Series of titles into fixed-length (128 token) PyTorch tensors."""
    return tokenizer(texts.tolist(), max_length=128, padding='max_length',
                     truncation=True, return_tensors='pt')


# The tokenizer and the target device do not depend on the fold or on the
# hyper-parameters, so they are created once instead of once per fold.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# NOTE(review): configurations are ranked by accuracy on the same folds that
# are reported as test results, so the final "best accuracy" is optimistic.
for bs in batch_sizes:
    for lr in learning_rates:
        fold_accuracies = []
        print(f"Batch Size: {bs}, Learning Rate: {lr}")

        for fold, (train_idx, test_idx) in enumerate(kf.split(X)):
            print(f"Fold {fold + 1}")

            X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
            y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
            ner_train, ner_test = ner_features.iloc[train_idx], ner_features.iloc[test_idx]
            jargon_train, jargon_test = jargon_features.iloc[train_idx], jargon_features.iloc[test_idx]

            # Split all four arrays in one call so the shuffled train/validation
            # partition is identical for text, labels and both features.
            # Slicing the features by position (as before) paired every sample
            # with another sample's NER/jargon counts.
            (X_train, X_val,
             y_train, y_val,
             ner_train, ner_val,
             jargon_train, jargon_val) = train_test_split(
                X_train, y_train, ner_train, jargon_train,
                test_size=0.1, random_state=42)

            X_train_tokens = _tokenize(tokenizer, X_train)
            X_val_tokens = _tokenize(tokenizer, X_val)
            X_test_tokens = _tokenize(tokenizer, X_test)

            train_dataset = NewsDataset(X_train_tokens, y_train.tolist(), ner_train.tolist(), jargon_train.tolist())
            val_dataset = NewsDataset(X_val_tokens, y_val.tolist(), ner_val.tolist(), jargon_val.tolist())
            test_dataset = NewsDataset(X_test_tokens, y_test.tolist(), ner_test.tolist(), jargon_test.tolist())

            train_loader = DataLoader(train_dataset, batch_size=bs, shuffle=True)
            val_loader = DataLoader(val_dataset, batch_size=bs)
            test_loader = DataLoader(test_dataset, batch_size=bs)

            # A fresh model per fold so no weights leak between folds.
            model = CustomRobertaClassifier()
            model.to(device)

            # Module-level `class_weights` is read by the classifier's loss.
            class_weights = torch.tensor(compute_class_weight('balanced', classes=np.array([0, 1]), y=y_train), dtype=torch.float)
            optimizer = AdamW(model.parameters(), lr=lr)
            # Halve the learning rate when validation loss stops improving.
            scheduler = ReduceLROnPlateau(optimizer, 'min', patience=1, factor=0.5)

            epochs = 10
            best_val_loss = float('inf')
            best_state = None  # CPU copy of the weights with the lowest val loss
            patience = 2
            patience_counter = 0

            for epoch in range(epochs):
                model.train()
                total_train_loss = 0
                for batch in train_loader:
                    optimizer.zero_grad()
                    input_ids, attention_mask, ner, jargon, labels = _batch_to_device(batch, device)

                    outputs = model(input_ids, attention_mask, ner, jargon, labels)
                    loss = outputs['loss']
                    total_train_loss += loss.item()
                    loss.backward()
                    optimizer.step()

                model.eval()
                total_val_loss = 0
                with torch.no_grad():
                    for batch in val_loader:
                        input_ids, attention_mask, ner, jargon, labels = _batch_to_device(batch, device)
                        outputs = model(input_ids, attention_mask, ner, jargon, labels)
                        total_val_loss += outputs['loss'].item()

                avg_train_loss = total_train_loss / len(train_loader)
                avg_val_loss = total_val_loss / len(val_loader)
                print(f"Epoch {epoch + 1}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

                scheduler.step(avg_val_loss)

                if avg_val_loss < best_val_loss:
                    best_val_loss = avg_val_loss
                    patience_counter = 0
                    # Snapshot the best weights (on CPU to spare GPU memory) so
                    # the test evaluation does not use a later, worse epoch.
                    best_state = {name: tensor.detach().cpu().clone()
                                  for name, tensor in model.state_dict().items()}
                else:
                    patience_counter += 1
                    if patience_counter >= patience:
                        print("Stopping early")
                        break

            # Evaluate the checkpoint with the lowest validation loss.
            if best_state is not None:
                model.load_state_dict(best_state)

            model.eval()
            predictions = []
            positive_scores = []  # P(label == 1), needed for a meaningful ROC-AUC
            true_labels = []
            with torch.no_grad():
                for batch in test_loader:
                    input_ids, attention_mask, ner, jargon, labels = _batch_to_device(batch, device)
                    outputs = model(input_ids, attention_mask, ner, jargon)
                    logits = outputs['logits']
                    preds = torch.argmax(logits, dim=1)
                    predictions.extend(preds.cpu().tolist())
                    positive_scores.extend(torch.softmax(logits, dim=1)[:, 1].cpu().tolist())
                    true_labels.extend(labels.cpu().tolist())

            accuracy = accuracy_score(true_labels, predictions)
            print(f"Fold Accuracy: {accuracy * 100:.2f}%")
            print(classification_report(true_labels, predictions))
            print(confusion_matrix(true_labels, predictions))
            # ROC-AUC is a ranking metric: score it on class-1 probabilities,
            # not on the thresholded 0/1 predictions.
            roc_auc = roc_auc_score(true_labels, positive_scores)
            print(f"ROC-AUC: {roc_auc:.4f}")
            fold_accuracies.append(accuracy)

        avg_accuracy = np.mean(fold_accuracies)
        print(f"Avg Accuracy for BS {bs}, LR {lr}: {avg_accuracy * 100:.2f}%")
        if avg_accuracy > best_accuracy:
            best_accuracy = avg_accuracy
            best_params = {'batch_size': bs, 'learning_rate': lr}

print(f"Best Params: {best_params}, Best Accuracy: {best_accuracy * 100:.2f}%")