# Transformer – Fake News

In [2]:
# IMPORTS

import pandas as pd
import numpy as np
import model_utils 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [3]:
# NORMALIZE AND CONCAT ALL DATASETS

normalized_dfs = [
    # Misinfo Dataset
    model_utils.normalize_misinfo_dataset("datasets/misinfo-dataset/DataSet_Misinfo_FAKE.csv"),
    model_utils.normalize_misinfo_dataset("datasets/misinfo-dataset/DataSet_Misinfo_TRUE.csv"),
    model_utils.normalize_misinfo_dataset("datasets/misinfo-dataset/EXTRA_RussianPropagandaSubset.csv"),
    # Fake News Net Dataset
    model_utils.normalize_fakenewsnet_dataset("datasets/fakenewsnet-dataset/gossipcop_fake.csv"),
    model_utils.normalize_fakenewsnet_dataset("datasets/fakenewsnet-dataset/gossipcop_real.csv"),
    model_utils.normalize_fakenewsnet_dataset("datasets/fakenewsnet-dataset/politifact_fake.csv"),
    model_utils.normalize_fakenewsnet_dataset("datasets/fakenewsnet-dataset/politifact_real.csv"),
    # Liar Dataset
    model_utils.normalize_liar_dataset("datasets/liar-dataset/train.tsv")
]

df = pd.concat(normalized_dfs, ignore_index=True)
df = df.dropna(subset=["text"]).reset_index(drop=True)

In [8]:
normalized_dfs = [
    # Misinfo Dataset
    #model_utils.normalize_misinfo_dataset("datasets/misinfo-dataset/DataSet_Misinfo_FAKE.csv"),
    #model_utils.normalize_misinfo_dataset("datasets/misinfo-dataset/DataSet_Misinfo_TRUE.csv"),
    model_utils.normalize_misinfo_dataset("datasets/misinfo-dataset/EXTRA_RussianPropagandaSubset.csv"),
    # Fake News Net Dataset
    #model_utils.normalize_fakenewsnet_dataset("datasets/fakenewsnet-dataset/gossipcop_fake.csv"),
    #model_utils.normalize_fakenewsnet_dataset("datasets/fakenewsnet-dataset/gossipcop_real.csv"),
    #model_utils.normalize_fakenewsnet_dataset("datasets/fakenewsnet-dataset/politifact_fake.csv"),
    #model_utils.normalize_fakenewsnet_dataset("datasets/fakenewsnet-dataset/politifact_real.csv"),
    # Liar Dataset
    #model_utils.normalize_liar_dataset("datasets/liar-dataset/train.tsv")
]
df = pd.concat(normalized_dfs, ignore_index=True)
df = df.dropna(subset=["text"]).reset_index(drop=True)

In [11]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import math
from tqdm import tqdm

device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

texts = df["text"].tolist()
words = " ".join(texts).split()

vocab = sorted(set(words))
word2idx = {w: i for i, w in enumerate(vocab)}
idx2word = {i: w for w, i in word2idx.items()}
encoded = [word2idx[w] for w in words]
subset = encoded[:10000]

class WordDataset(Dataset):
    def __init__(self, data, seq_len):
        self.data = data
        self.seq_len = seq_len

    def __len__(self):
        return len(self.data) - self.seq_len

    def __getitem__(self, idx):
        x = torch.tensor(self.data[idx : idx + self.seq_len])
        y = torch.tensor(self.data[idx + 1 : idx + self.seq_len + 1])
        return x, y

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.pe = pe.unsqueeze(0)

    def forward(self, x):
        return x + self.pe[:, :x.size(1)].to(x.device)

class TransformerLM(nn.Module):
    def __init__(self, vocab_size, d_model=128, nhead=4, num_layers=2):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, d_model)
        self.pos = PositionalEncoding(d_model)
        decoder_layer = nn.TransformerDecoderLayer(d_model=d_model, nhead=nhead, batch_first=True)
        self.decoder = nn.TransformerDecoder(decoder_layer, num_layers=num_layers)
        self.fc = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        x = self.embed(x)
        x = self.pos(x)
        tgt_mask = torch.triu(torch.full((x.size(1), x.size(1)), float('-inf')), diagonal=1).to(x.device)
        out = self.decoder(x, x, tgt_mask=tgt_mask)
        return self.fc(out)

seq_len = 12
batch_size = 64
epochs = 20

dataset = WordDataset(subset, seq_len)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

model = TransformerLM(len(vocab)).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

for epoch in range(epochs):
    total_loss = 0
    loop = tqdm(dataloader, desc=f"Epoch {epoch+1}")
    for x_batch, y_batch in loop:
        x_batch = x_batch.to(device)
        y_batch = y_batch.to(device)

        logits = model(x_batch)
        loss = criterion(logits.view(-1, len(vocab)), y_batch.view(-1))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        loop.set_postfix(loss=loss.item())

    print(f"Epoch {epoch+1} complete. Avg loss: {total_loss/len(dataloader):.4f}")


Epoch 1: 100%|██████████| 157/157 [00:02<00:00, 57.49it/s, loss=2.04]


Epoch 1 complete. Avg loss: 5.6129


Epoch 2: 100%|██████████| 157/157 [00:02<00:00, 60.02it/s, loss=0.821]


Epoch 2 complete. Avg loss: 1.9418


Epoch 3: 100%|██████████| 157/157 [00:02<00:00, 59.43it/s, loss=0.27] 


Epoch 3 complete. Avg loss: 0.4969


Epoch 4: 100%|██████████| 157/157 [00:02<00:00, 59.64it/s, loss=0.202]


Epoch 4 complete. Avg loss: 0.2017


Epoch 5: 100%|██████████| 157/157 [00:02<00:00, 59.44it/s, loss=0.104] 


Epoch 5 complete. Avg loss: 0.1191


Epoch 6: 100%|██████████| 157/157 [00:02<00:00, 59.37it/s, loss=0.0505]


Epoch 6 complete. Avg loss: 0.0789


Epoch 7: 100%|██████████| 157/157 [00:02<00:00, 59.12it/s, loss=0.0309]


Epoch 7 complete. Avg loss: 0.0570


Epoch 8: 100%|██████████| 157/157 [00:02<00:00, 58.89it/s, loss=0.0356]


Epoch 8 complete. Avg loss: 0.0437


Epoch 9: 100%|██████████| 157/157 [00:02<00:00, 59.26it/s, loss=0.0379]


Epoch 9 complete. Avg loss: 0.0328


Epoch 10: 100%|██████████| 157/157 [00:02<00:00, 59.02it/s, loss=0.0241]


Epoch 10 complete. Avg loss: 0.0357


Epoch 11: 100%|██████████| 157/157 [00:02<00:00, 59.73it/s, loss=0.0138]


Epoch 11 complete. Avg loss: 0.0235


Epoch 12: 100%|██████████| 157/157 [00:02<00:00, 57.88it/s, loss=0.0122]


Epoch 12 complete. Avg loss: 0.0203


Epoch 13: 100%|██████████| 157/157 [00:02<00:00, 58.01it/s, loss=0.0371]


Epoch 13 complete. Avg loss: 0.0181


Epoch 14: 100%|██████████| 157/157 [00:02<00:00, 59.11it/s, loss=0.0112]


Epoch 14 complete. Avg loss: 0.0239


Epoch 15: 100%|██████████| 157/157 [00:02<00:00, 58.44it/s, loss=0.0668]


Epoch 15 complete. Avg loss: 0.0190


Epoch 16: 100%|██████████| 157/157 [00:02<00:00, 59.25it/s, loss=0.00681]


Epoch 16 complete. Avg loss: 0.0217


Epoch 17: 100%|██████████| 157/157 [00:02<00:00, 58.39it/s, loss=0.0213] 


Epoch 17 complete. Avg loss: 0.0170


Epoch 18: 100%|██████████| 157/157 [00:02<00:00, 58.62it/s, loss=0.0146] 


Epoch 18 complete. Avg loss: 0.0183


Epoch 19: 100%|██████████| 157/157 [00:02<00:00, 58.80it/s, loss=0.0219] 


Epoch 19 complete. Avg loss: 0.0162


Epoch 20: 100%|██████████| 157/157 [00:02<00:00, 58.30it/s, loss=0.0444]

Epoch 20 complete. Avg loss: 0.0217





In [12]:
def generate(model, seed_words, word2idx, idx2word, max_new_tokens=30):
    model.eval()
    tokens = [word2idx[w] for w in seed_words]
    input_ids = torch.tensor(tokens).unsqueeze(0).to(device)

    for _ in range(max_new_tokens):
        tgt = input_ids[:, -seq_len:]
        with torch.no_grad():
            logits = model(tgt)
        probs = torch.softmax(logits[0, -1], dim=0)
        next_token = torch.multinomial(probs, num_samples=1).item()
        input_ids = torch.cat([input_ids, torch.tensor([[next_token]]).to(device)], dim=1)

    return " ".join([idx2word[i] for i in input_ids[0].tolist()])


In [18]:
seeds = [
    ["the", "city", "was"],
    ["in", "the", "middle"],
    ["he", "looked", "at"],
    ["she", "had", "never"],
    ["after", "all", "this"],
    ["when", "they", "arrived"],
    ["we", "decided", "to"],
    ["it", "was", "clear"],
    ["this", "is", "not"],
    ["nothing", "could", "have"]
]



for i, seed in enumerate(seeds):
    result = generate(model, seed, word2idx, idx2word, max_new_tokens=50)
    words = result.split()
    line = " ".join(words[:30])
    print(f"{i+1:02d}: {line}")



01: the city was shot down by a Buk anti-aircraft system belonging to Kursk's 53rd Russian anti-aircraft brigade. The Russian electoral system its Defence Agency is to maintain the narrative that
02: in the middle the party’s loss followed through, namely history. Europe either do not think tanks and Korea. Constantinople that struggles to combat positions against Russia". &nbsp; &nbsp; &nbsp; &nbsp;
03: he looked at several occasions in 80 years ago are also used in the current Western front position against Russia. Ondřej Kolář, who brought up the idea of removing the
04: she had never that never that that that that that Soviet repressions against Lithuanian partisans can be treated as genocide, the ECHR is essentially rewriting the rule of law to
05: after all this after this declaration, the Union that after that after this after this declaration, the [local] authorities that after this declaration, the [local] authorities will make the correct
06: when they arrived they want to in