<a href="https://colab.research.google.com/github/lazy-wav/Data-Analysis/blob/main/Distil_Text%2BHeadline_1ep_10K_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import files
uploaded = files.upload()

Saving True.csv to True.csv


In [2]:
uploaded = files.upload()

Saving Fake.csv to Fake.csv


In [3]:
!pip install transformers datasets scikit-learn
!pip install tqdm



In [4]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split

Loading Datasets

In [5]:
true_df = pd.read_csv('True.csv')
fake_df = pd.read_csv('Fake.csv')
true_df["label"] = 0
fake_df["label"] = 1

Concatenating the datasets

In [6]:
df = pd.concat([true_df, fake_df], ignore_index=True)
df = df[["title", "text", "label"]].dropna().sample(frac=1, random_state=42)

Cleaning

In [7]:
def clean(text):
    text = text.lower()
    text = re.sub(r"http\S+|www.\S+", "", text)
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()

df["title"] = df["title"].apply(clean)
df["text"] = df["text"].apply(clean)

In [8]:
print(df['label'].value_counts())

label
1    23481
0    21417
Name: count, dtype: int64


Splitting dataset into training and testing (70-30)

In [9]:
df = df.groupby('label').apply(lambda x: x.sample(n=10000, random_state=42)).reset_index(drop=True)

  df = df.groupby('label').apply(lambda x: x.sample(n=10000, random_state=42)).reset_index(drop=True)


In [10]:
train_df, test_df = train_test_split(df, test_size=0.3, random_state=42, stratify=df["label"])

Tokenizing

In [11]:
from transformers import DistilBertTokenizer
from torch.utils.data import Dataset, DataLoader
import torch

In [12]:
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [13]:
class HeadlineBodyDataset(Dataset):
    def __init__(self, df, max_len=64):
        self.df = df
        self.max_len = max_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        title = tokenizer(row["title"], padding='max_length', truncation=True, max_length=self.max_len, return_tensors="pt")
        text = tokenizer(row["text"], padding='max_length', truncation=True, max_length=self.max_len, return_tensors="pt")

        return {
            'headline_input_ids': title["input_ids"].squeeze(),
            'headline_attention_mask': title["attention_mask"].squeeze(),
            'body_input_ids': text["input_ids"].squeeze(),
            'body_attention_mask': text["attention_mask"].squeeze(),
            'label': torch.tensor(row["label"], dtype=torch.float)
        }

In [14]:
train_data = HeadlineBodyDataset(train_df)
test_data = HeadlineBodyDataset(test_df)

train_loader = DataLoader(train_data, batch_size=8, shuffle=True)
test_loader = DataLoader(test_data, batch_size=8)

Model

In [15]:
import torch.nn as nn
import torch.nn.functional as F
from transformers import DistilBertModel

In [18]:
class CrossAttention(nn.Module):
    def __init__(self, hidden_size=768):
        super().__init__()
        self.q = nn.Linear(hidden_size, hidden_size)
        self.k = nn.Linear(hidden_size, hidden_size)
        self.v = nn.Linear(hidden_size, hidden_size)
        self.scale = hidden_size ** -0.5

    def forward(self, Q, K, V):
        q = self.q(Q)
        k = self.k(K)
        v = self.v(V)
        attn_weights = torch.matmul(q, k.transpose(-2, -1)) * self.scale
        attn_probs = F.softmax(attn_weights, dim=-1)
        return torch.matmul(attn_probs, v)

class CrossAttentionClassifier(nn.Module):
    def __init__(self):
        super().__init__()
        self.encoder = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.attn = CrossAttention()
        self.pool = nn.AdaptiveAvgPool1d(1)
        self.classifier = nn.Sequential(
            nn.Linear(768, 128),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(128, 1),
            nn.Sigmoid()
        )

    def forward(self, h_input_ids, h_attention_mask, b_input_ids, b_attention_mask):
        h_out = self.encoder(h_input_ids, h_attention_mask).last_hidden_state
        b_out = self.encoder(b_input_ids, b_attention_mask).last_hidden_state

        attn_output = self.attn(h_out, b_out, b_out)  # headline as query
        pooled = attn_output.mean(dim=1)
        return self.classifier(pooled)

Training

In [20]:
import time
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CrossAttentionClassifier().to(device)
loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)

total_start = time.time()

for epoch in range(1):
    start_time = time.time()
    model.train()
    total_loss = 0
    batch_times = []

    print(f"\nEpoch {epoch+1} started...")
    progress_bar = tqdm(train_loader, desc="Training", leave=False)

    for i, batch in enumerate(progress_bar):
        batch_start = time.time()

        optimizer.zero_grad()
        outputs = model(
            batch["headline_input_ids"].to(device),
            batch["headline_attention_mask"].to(device),
            batch["body_input_ids"].to(device),
            batch["body_attention_mask"].to(device)
        )
        labels = batch["label"].unsqueeze(1).to(device)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        batch_time = time.time() - batch_start
        batch_times.append(batch_time)

        # Estimate time left
        avg_batch_time = sum(batch_times) / len(batch_times)
        batches_left = len(train_loader) - (i + 1)
        eta = avg_batch_time * batches_left

        progress_bar.set_postfix({
            "Loss": f"{loss.item():.4f}",
            "ETA": f"{eta:.1f}s"
        })

        if (i + 1) % 100 == 0:
            torch.save(model.state_dict(), f"checkpoint_epoch{epoch+1}_batch{i+1}.pth")

    epoch_time = time.time() - start_time
    print(f"Epoch {epoch+1} completed | Loss: {total_loss:.4f} | Time: {epoch_time:.2f} sec")

print(f"\n Total training time: {time.time() - total_start:.2f} seconds")


Epoch 1 started...


                                                                                      

Epoch 1 completed | Loss: 48.1918 | Time: 9567.64 sec

 Total training time: 9567.65 seconds




In [21]:
from sklearn.metrics import classification_report

model.eval()
y_true, y_pred = [], []

with torch.no_grad():
    for batch in test_loader:
        outputs = model(
            batch["headline_input_ids"].to(device),
            batch["headline_attention_mask"].to(device),
            batch["body_input_ids"].to(device),
            batch["body_attention_mask"].to(device)
        )
        preds = (outputs > 0.5).long().cpu().numpy()
        y_pred.extend(preds)
        y_true.extend(batch["label"].long().cpu().numpy())

print(classification_report(y_true, y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.9987    0.9973    0.9980      3000
           1     0.9973    0.9987    0.9980      3000

    accuracy                         0.9980      6000
   macro avg     0.9980    0.9980    0.9980      6000
weighted avg     0.9980    0.9980    0.9980      6000



In [22]:
import os
import torch

save_dir = "/content/cross_attention_model"
os.makedirs(save_dir, exist_ok=True)

model_path = os.path.join(save_dir, "cross_attn_distilbert_fnd.pth")
torch.save(model.state_dict(), model_path)

print(f"Model weights saved at: {model_path}")

Model weights saved at: /content/cross_attention_model/cross_attn_distilbert_fnd.pth


In [23]:
tokenizer.save_pretrained(save_dir)
print(f"Tokenizer files saved at: {save_dir}")

import shutil

zip_path = shutil.make_archive(save_dir, 'zip', save_dir)
print(f" Zipped model path: {zip_path}")

from google.colab import files
files.download(zip_path)

Tokenizer files saved at: /content/cross_attention_model
 Zipped model path: /content/cross_attention_model.zip


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>