# LSTM

In [2]:
# IMPORTS

import pandas as pd
import numpy as np
import model_utils 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [3]:
# NORMALIZE AND CONCAT ALL DATASETS

# Each normalizer from model_utils is paired with the files it is applied to.
# The listing order is the order in which the resulting frames are concatenated.
dataset_sources = [
    # Misinfo Dataset
    (model_utils.normalize_misinfo_dataset, [
        "datasets/misinfo-dataset/DataSet_Misinfo_FAKE.csv",
        "datasets/misinfo-dataset/DataSet_Misinfo_TRUE.csv",
        "datasets/misinfo-dataset/EXTRA_RussianPropagandaSubset.csv",
    ]),
    # Fake News Net Dataset
    (model_utils.normalize_fakenewsnet_dataset, [
        "datasets/fakenewsnet-dataset/gossipcop_fake.csv",
        "datasets/fakenewsnet-dataset/gossipcop_real.csv",
        "datasets/fakenewsnet-dataset/politifact_fake.csv",
        "datasets/fakenewsnet-dataset/politifact_real.csv",
    ]),
    # Liar Dataset
    (model_utils.normalize_liar_dataset, [
        "datasets/liar-dataset/train.tsv",
    ]),
]

normalized_dfs = [
    normalize(path)
    for normalize, paths in dataset_sources
    for path in paths
]

# Stack every normalized frame into one, then discard rows that have no text
# and renumber the index so it is contiguous again.
df = (
    pd.concat(normalized_dfs, ignore_index=True)
    .dropna(subset=["text"])
    .reset_index(drop=True)
)

In [5]:
# Sanity check: show the text of the sample at row position 100.
df["text"].iloc[100]

'Former Vice President Joe Biden was asked on Monday by Matt Lauer on NBC s  Today  to name something specific that Donald Trump has been  doing well. Well, that seems like a trick question since Trump has passed no major legislation and reaches across the aisle only to take shots at Democrats in his Twitter timeline during his morning rage-tweets, so Biden struggled to find something, anything, that Trump has done well since taking office. I think there s a number of things he s doing well. But even the things he s doing well, it s how he does them,  Biden said. It s more the tone of this administration that bothers me,  he continued. With all due respect, you haven t come up with one thing you think he s doing well,  Lauer said. Well, I think he married very well,  Biden joked.Although, Biden didn t mention which of Trump s three marriages he s speaking of. Trump s first marriage to Ivana ended after he had an affair with Marla Maples. Trump went on to marry Maples, then they divorce

In [6]:
from collections import Counter
from sklearn.model_selection import train_test_split

# Split data: hold out 20% of the rows as the test set.
# The fixed random_state makes the split the same on every run.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

# Tokenize (simple split)
def tokenize(text):
    """Lowercase ``text`` and split it on runs of whitespace.

    Returns a list of tokens; an empty or whitespace-only string gives ``[]``.
    Punctuation is not separated from the words it is attached to.
    """
    lowered = text.lower()
    return lowered.split()

# Build vocabulary
# Token frequencies are counted over the training split only.
counter = Counter(
    token
    for text in train_df["text"]
    for token in tokenize(text)
)

vocab_size = 10000
# Ids 0 and 1 are reserved for PAD and UNK, so only vocab_size - 2 words get an id.
most_common = counter.most_common(vocab_size - 2)
word2idx = {"<PAD>": 0, "<UNK>": 1}
# The most frequent words receive ids 2, 3, ... in frequency order.
word2idx.update(
    (word, position)
    for position, (word, _freq) in enumerate(most_common, start=2)
)

def encode(text, max_len=300):
    """Convert ``text`` into a list of vocabulary ids.

    The text is tokenized with ``tokenize``, cut to its first ``max_len``
    tokens, and each token is looked up in ``word2idx``. Tokens missing from
    the vocabulary map to the ``<UNK>`` id.

    Args:
        text: The string to encode.
        max_len: Maximum number of tokens kept (default 300). ``None`` keeps
            every token.

    Returns:
        A non-empty list of integer ids. Text that yields no tokens (empty or
        whitespace-only) is encoded as a single ``<UNK>`` id.
    """
    unk_idx = word2idx["<UNK>"]
    token_ids = [word2idx.get(w, unk_idx) for w in tokenize(text)[:max_len]]
    # A zero-length sequence cannot be packed by pack_padded_sequence, and
    # torch.tensor([]) would be a float tensor rather than an id tensor, so
    # never hand back an empty list.
    return token_ids or [unk_idx]


In [8]:
import torch.nn as nn

class LSTMClassifier(nn.Module):
    """LSTM text classifier that emits one raw logit per input sequence.

    Token ids are embedded, run through an LSTM as a packed batch (so padded
    positions are skipped), and the final hidden state of the last LSTM layer
    is projected to a single value by a linear layer.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        hidden_dim: Size of the LSTM hidden state.
        pad_idx: Token id used for padding; passed to the embedding as
            ``padding_idx``.
    """

    def __init__(self, vocab_size, embedding_dim=100, hidden_dim=64, pad_idx=0):
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
            padding_idx=pad_idx,
        )
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            batch_first=True,
        )
        # A single output unit: the logit for binary classification.
        self.fc = nn.Linear(in_features=hidden_dim, out_features=1)

    def forward(self, x, lengths):
        """Return one logit per sequence.

        Args:
            x: Batch-first tensor of padded token ids.
            lengths: True (unpadded) length of each sequence in ``x``; it is
                moved to the CPU before packing.

        Returns:
            A 1-D tensor with one logit per batch element.
        """
        token_vectors = self.embedding(x)
        # Packing lets the LSTM stop at each sequence's real length;
        # enforce_sorted=False means the batch need not be sorted by length.
        packed_batch = nn.utils.rnn.pack_padded_sequence(
            token_vectors,
            lengths.cpu(),
            batch_first=True,
            enforce_sorted=False,
        )
        _, (final_hidden, _) = self.lstm(packed_batch)
        last_layer_state = final_hidden[-1]
        return self.fc(last_layer_state).squeeze(1)


In [10]:
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score
from tqdm import tqdm

# Number of full passes over the training loader made by the training loop.
EPOCHS = 3

class Dataset(Dataset):
    """In-memory dataset of encoded texts paired with float labels.

    Every text is passed through ``encode`` once, at construction time, and
    stored as a tensor of token ids, so item access is a plain lookup.

    Note: this class reuses the name ``Dataset``; after this definition the
    name refers to this subclass rather than the torch base class.
    """

    def __init__(self, texts, labels):
        encoded_texts = []
        for raw_text in texts:
            encoded_texts.append(torch.tensor(encode(raw_text)))
        self.data = encoded_texts
        # Labels are stored as floats.
        self.labels = torch.tensor(labels).float()

    def __len__(self):
        """Number of samples, taken from the label tensor."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the ``(token_ids, label)`` pair stored at ``idx``."""
        sample = self.data[idx]
        target = self.labels[idx]
        return sample, target

def collate_fn(batch):
    """Assemble ``(token_ids, label)`` samples into batch tensors.

    Args:
        batch: Sequence of ``(token_ids, label)`` pairs, where ``token_ids``
            is a 1-D tensor.

    Returns:
        A tuple ``(padded, lengths, labels)``: the sequences right-padded with
        0 into one batch-first tensor, the original length of each sequence,
        and the labels gathered into a single tensor.
    """
    sequences, targets = zip(*batch)
    # Record true lengths before padding so the model can ignore the padding.
    seq_lengths = torch.tensor(list(map(len, sequences)))
    padded_batch = nn.utils.rnn.pad_sequence(
        sequences,
        batch_first=True,
        padding_value=0,
    )
    label_tensor = torch.tensor(targets)
    return padded_batch, seq_lengths, label_tensor

# Seed torch so weight initialisation and batch shuffling are repeatable
# (same seed value as the train/test split).
torch.manual_seed(42)

# Encode both splits once up front; only the training loader shuffles.
train_dataset = Dataset(train_df["text"].tolist(), train_df["label"].tolist())
test_dataset = Dataset(test_df["text"].tolist(), test_df["label"].tolist())
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=32, collate_fn=collate_fn)

# Pick the best available accelerator. Each backend is selected only after its
# OWN availability check (previously "mps" was requested when CUDA was
# available, so Apple GPUs were never used and CUDA machines asked for MPS).
if torch.cuda.is_available():
    device = torch.device("cuda")
elif getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
    # The getattr guard covers torch builds that have no MPS backend module.
    device = torch.device("mps")
else:
    device = torch.device("cpu")

model = LSTMClassifier(vocab_size=len(word2idx), pad_idx=word2idx["<PAD>"]).to(device)
# The model outputs raw logits, so the loss applies the sigmoid itself.
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Train for EPOCHS passes over train_loader, reporting the running mean loss
# in the progress bar and the epoch mean loss after each pass.
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}")
    # Count batches explicitly: tqdm only synchronises `pbar.n` when the bar is
    # redrawn (and never when it is disabled), so `pbar.n + 1` is not a
    # reliable count of the batches processed so far.
    for batch_num, (x, lengths, y) in enumerate(pbar, start=1):
        x, lengths, y = x.to(device), lengths.to(device), y.to(device)
        optimizer.zero_grad()
        logits = model(x, lengths)
        loss = criterion(logits, y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        # Running mean of the per-batch loss for this epoch.
        pbar.set_postfix(loss=total_loss / batch_num)
    print(f"Epoch {epoch+1}: Loss = {total_loss/len(train_loader):.4f}")

Epoch 1/3: 100%|██████████| 2985/2985 [07:08<00:00,  6.97it/s, loss=0.581]


Epoch 1: Loss = 0.5812


Epoch 2/3: 100%|██████████| 2985/2985 [08:22<00:00,  5.94it/s, loss=0.442]


Epoch 2: Loss = 0.4416


Epoch 3/3: 100%|██████████| 2985/2985 [08:17<00:00,  6.00it/s, loss=0.329]

Epoch 3: Loss = 0.3294





In [12]:
# Score the held-out split: collect hard 0/1 predictions and integer targets
# batch by batch, then compute accuracy over all of them.
model.eval()
all_preds, all_targets = [], []
with torch.no_grad():
    for x, lengths, y in test_loader:
        x = x.to(device)
        lengths = lengths.to(device)
        # Logits -> probabilities -> class 1 when the probability exceeds 0.5.
        probs = torch.sigmoid(model(x, lengths))
        preds = probs.gt(0.5).int().cpu().tolist()
        all_preds += preds
        all_targets += y.int().tolist()

acc = accuracy_score(all_targets, all_preds)
print(f"Test Accuracy: {acc:.4f}")


Test Accuracy: 0.8762
