In [40]:
import pandas as pd
true_data_df=pd.read_csv("./news/true.csv")
fake_data_df=pd.read_csv("./news/fake.csv")
true_data_df.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [41]:
fake_data_df.shape, true_data_df.shape

((23481, 4), (21417, 4))

In [None]:
fake_df=fake_data_df.sample(n=5000, random_state=42)
real_df=true_data_df.sample(n=5000, random_state=42)

In [42]:
true_data_df["label"]=1
fake_data_df["label"]=0
data_df=pd.concat([true_data_df, fake_data_df], axis=0)
data_df.shape

(44898, 5)

In [43]:
data_df.head()

Unnamed: 0,title,text,subject,date,label
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",1
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",1
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",1
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",1
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",1


In [44]:
import re
import unicodedata

def clean_text(text):
    text = unicodedata.normalize("NFKD", text)
    text = text.lower()
    text = re.sub(r"http\S+|www\S+", "", text)
    text = re.sub(r"<.*?>", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

In [45]:
data_df["combined_text"] = "[TITLE] " + data_df["title"] + " [TEXT] " + data_df["text"]

In [46]:
data_df["combined_text"] = data_df["combined_text"].apply(clean_text)

In [47]:
classes = data_df["label"].values
num_classes = len(set(classes))
print(f"Number of classes: {num_classes}")

Number of classes: 2


In [48]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
data_df["label_enc"] = label_encoder.fit_transform(data_df["label"])

In [49]:
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(data_df, test_size=0.2, stratify=data_df["label"], random_state=42)
print(f"Train shape: {train_df.shape}, Test shape: {test_df.shape}")

Train shape: (35918, 8), Test shape: (8980, 8)


In [50]:
train_df.head()

Unnamed: 0,title,text,subject,date,label,combined_text,subject_enc,label_enc
14164,SocGen says no wrongdoing in handling of Natio...,PARIS (Reuters) - French bank Societe Generale...,worldnews,"November 22, 2017",1,[title] socgen says no wrongdoing in handling ...,7,1
6909,North Carolina governor concedes election to D...,"WINSTON-SALEM, N.C. (Reuters) - North Carolina...",politicsNews,"December 5, 2016",1,[title] north carolina governor concedes elect...,6,1
18191,TRUMP FEVER! W. VA Dem Senator Says He Won’t V...,Civil political discourse took a beating in We...,left-news,"Aug 7, 2017",0,[title] trump fever! w. va dem senator says he...,4,0
1903,New York vows to sue Trump over immigrant chil...,(Reuters) - New York and Washington state on M...,politicsNews,"September 4, 2017",1,[title] new york vows to sue trump over immigr...,6,1
9141,Orlando killer expressed support for multiple ...,"ORLANDO, Fla. (Reuters) - Orlando nightclub ki...",politicsNews,"June 12, 2016",1,[title] orlando killer expressed support for m...,6,1


In [51]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize(texts):
    return tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors="pt"
    )

In [None]:
from torch.utils.data import Dataset, DataLoader
import torch

class FakeNewsDataset(Dataset):
    def __init__(self, df):
        self.texts = df["combined_text"].tolist()
        self.labels = df["label_enc"].tolist()

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Return raw data - tokenization will happen in collate_fn
        return {
            "text": self.texts[idx],
            "label": self.labels[idx]
        }

def collate_fn(batch):
    """Tokenize all texts in batch together for uniform padding"""
    texts = [item["text"] for item in batch]
    labels = [item["label"] for item in batch]
    
    encoded = tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors="pt"
    )

    return {
        "input_ids": encoded["input_ids"],
        "label": torch.tensor(labels, dtype=torch.long)
    }


In [53]:
tokenize("hi my name is isfar")

{'input_ids': tensor([[  101,  7632,  2026,  2171,  2003,  2003, 14971,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])}

In [54]:
tokenizer.convert_ids_to_tokens(tokenize("hi my name is isfar")["input_ids"][0])

['[CLS]', 'hi', 'my', 'name', 'is', 'is', '##far', '[SEP]']

In [55]:
train_dataloader = DataLoader(FakeNewsDataset(train_df), batch_size=8, shuffle=True, collate_fn=collate_fn)
test_dataloader = DataLoader(FakeNewsDataset(test_df), batch_size=8, shuffle=False, collate_fn=collate_fn)

In [56]:
next(iter(train_dataloader))

{'input_ids': tensor([[ 101, 1031, 2516,  ..., 2166, 1999,  102],
         [ 101, 1031, 2516,  ..., 1998, 1996,  102],
         [ 101, 1031, 2516,  ..., 1012, 2027,  102],
         ...,
         [ 101, 1031, 2516,  ...,    0,    0,    0],
         [ 101, 1031, 2516,  ...,    0,    0,    0],
         [ 101, 1031, 2516,  ...,    0,    0,    0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'subject': tensor([6, 7, 5, 7, 7, 7, 7, 6]),
 'label': tensor([1, 1, 0, 1, 1, 1, 1, 1])}

In [None]:
import torch.nn as nn


class FakeNewsClassifier(nn.Module):
    def __init__(self, num_classes, vocab_size=30522, embed_dim=256, hidden_dim=256, num_layers=1, rnn_type="gru", bidirectional=True):
        super().__init__()
        self.bidirectional = bidirectional
        dir_mult = 2 if bidirectional else 1

        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=tokenizer.pad_token_id)
        if rnn_type == "lstm":
            self.rnn = nn.LSTM(embed_dim, hidden_dim, num_layers=num_layers, batch_first=True, bidirectional=bidirectional, dropout=0.5)
        elif rnn_type == "gru":
            self.rnn = nn.GRU(embed_dim, hidden_dim, num_layers=num_layers, batch_first=True, bidirectional=bidirectional, dropout=0.5)  # add dropout=0.5
        else:
            self.rnn = nn.RNN(embed_dim, hidden_dim, num_layers=num_layers, batch_first=True, bidirectional=bidirectional)
        self.classifier = nn.Linear(hidden_dim * dir_mult, num_classes)

    def forward(self, input_ids):
        # Compute lengths from non-pad tokens so we don't rely on an external attention_mask
        lengths = (input_ids != tokenizer.pad_token_id).sum(dim=1)

        # Embed and pack for variable-length handling
        embedded = self.embedding(input_ids)
        packed = nn.utils.rnn.pack_padded_sequence(embedded, lengths.cpu(), batch_first=True, enforce_sorted=False)
        packed_out, hidden = self.rnn(packed)

        # Grab last hidden state(s)
        if isinstance(hidden, tuple):  # LSTM returns (h, c)
            hidden = hidden[0]

        if self.bidirectional:
            hidden_fwd = hidden[-2]
            hidden_bwd = hidden[-1]
            h_cat = torch.cat([hidden_fwd, hidden_bwd], dim=1)
        else:
            h_cat = hidden[-1]

        logits = self.classifier(h_cat)
        return logits   

### How the RNN forward works (step-by-step)
- **Inputs:** `input_ids` is a batch of token ids already padded; pad token id = tokenizer.pad_token_id.
- **Lengths:** We compute `lengths = (input_ids != pad_id).sum(dim=1)` so each sample knows how many real tokens it has.
- **Embed:** `embedded = embedding(input_ids)` turns token ids into vectors `[batch, seq_len, embed_dim]`.
- **Pack:** `pack_padded_sequence(embedded, lengths, batch_first=True, enforce_sorted=False)` compresses sequences so the RNN ignores padded tail positions.
- **RNN:** We run a bidirectional GRU/LSTM/RNN; it returns hidden states. For bi-RNN, the last forward and last backward hidden states summarize the sequence.
- **Concat:** `hidden_fwd` and `hidden_bwd` are concatenated → `[batch, hidden_dim*2]`.
- **Classify:** The classifier projects this concatenated vector to logits for the classes.

**Tiny example**
- Suppose a batch has 2 sequences after padding (pad_id = 0):
  - Sample A ids: `[101, 10, 20, 30, 102, 0, 0]` (length 5)
  - Sample B ids: `[101, 11, 12, 102, 0, 0, 0]` (length 4)
- Lengths = `[5, 4]`
- Packed input skips the padded zeros for each sample when feeding the RNN.
- Bi-RNN returns last forward + last backward states per sample; we concatenate them and feed to the classifier to get the logits.

Key point: We don’t need an external attention mask because we derive lengths directly from `input_ids` using the pad token id.

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = FakeNewsClassifier(
    num_classes=num_classes,
    vocab_size=tokenizer.vocab_size,
    embed_dim=256,
    hidden_dim=256,
    num_layers=1,
    rnn_type="gru",  # options: "gru", "lstm", "rnn"
    bidirectional=True  # set to False for a unidirectional RNN
).to(device)


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [None]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-3, weight_decay=1e-4)
EPOCHS = 5

In [None]:
from tqdm import tqdm

for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    
    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{EPOCHS}"):
        optimizer.zero_grad()
        
        input_ids = batch["input_ids"].to(device)
        labels = batch["label"].to(device)
        
        outputs = model(input_ids)
        loss = criterion(outputs, labels)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_dataloader)

    # Evaluation phase
    model.eval()
    eval_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in test_dataloader:
            input_ids = batch["input_ids"].to(device)
            labels = batch["label"].to(device)

            outputs = model(input_ids)
            loss = criterion(outputs, labels)
            eval_loss += loss.item()

            preds = outputs.argmax(dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

    avg_eval_loss = eval_loss / len(test_dataloader)
    eval_acc = correct / total if total > 0 else 0.0

    print(f"Epoch {epoch+1} | Train Loss: {avg_loss:.4f} | Val Loss: {avg_eval_loss:.4f} | Val Acc: {eval_acc:.4f}")


In [None]:
torch.save(model.state_dict(), "fake_news_rnn_model.pth")

In [None]:
def predict(texts):
    model.eval()
    encoded = tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors="pt"
    )
    input_ids = encoded["input_ids"].to(device)

    with torch.no_grad():
        outputs = model(input_ids)
        probs = nn.Softmax(dim=1)(outputs)
        preds = probs.argmax(dim=1).cpu().numpy()

    return label_encoder.inverse_transform(preds)

In [None]:
# Synthetic examples: two real-sounding headlines and one fake-sounding headline
synthetic_texts = [
    "Breaking: International coalition reaches historic climate agreement to cut emissions by 2030.",
    "Scientists confirm successful replication of fusion experiment setting new record for energy gain.",
    "Exclusive: Secret society of moon-dwelling lizard people plans to take over world banks next week."
]

print(predict(synthetic_texts))