# Recurrent Neural Network

In [None]:
# IMPORTS

import pandas as pd
import numpy as np
import model_utils 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

In [3]:
# NORMALIZE AND CONCAT ALL DATASETS

# Source files grouped by the model_utils loader that handles their layout.
misinfo_paths = [
    "datasets/misinfo-dataset/DataSet_Misinfo_FAKE.csv",
    "datasets/misinfo-dataset/DataSet_Misinfo_TRUE.csv",
    "datasets/misinfo-dataset/EXTRA_RussianPropagandaSubset.csv",
]
fakenewsnet_paths = [
    "datasets/fakenewsnet-dataset/gossipcop_fake.csv",
    "datasets/fakenewsnet-dataset/gossipcop_real.csv",
    "datasets/fakenewsnet-dataset/politifact_fake.csv",
    "datasets/fakenewsnet-dataset/politifact_real.csv",
]
liar_paths = [
    "datasets/liar-dataset/train.tsv",
]

# Load every file with its matching normalizer, keeping the order
# Misinfo -> Fake News Net -> Liar.
normalized_dfs = []
for path in misinfo_paths:
    normalized_dfs.append(model_utils.normalize_misinfo_dataset(path))
for path in fakenewsnet_paths:
    normalized_dfs.append(model_utils.normalize_fakenewsnet_dataset(path))
for path in liar_paths:
    normalized_dfs.append(model_utils.normalize_liar_dataset(path))

# Stack everything into one frame with a fresh index, then drop rows that
# have no text and renumber the remaining rows.
combined = pd.concat(normalized_dfs, ignore_index=True)
df = combined.dropna(subset=["text"]).reset_index(drop=True)

### RNN Model

In [None]:
# Pick the best available accelerator: CUDA first, then Apple's MPS backend,
# and fall back to the CPU when neither is present.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

texts = df["text"].tolist()
labels = df["label"].tolist()
# Whitespace-tokenize every document; this yields the same tokens as joining
# all texts and splitting once, without building one giant string first.
words = [w for t in texts for w in t.split()]

# Reserve dedicated ids for padding and unknown words so that neither collides
# with a real word. They are stored inside `vocab`, so len(vocab) still gives
# the correct embedding-table size.
PAD_TOKEN = "<pad>"
UNK_TOKEN = "<unk>"
vocab = [PAD_TOKEN, UNK_TOKEN] + sorted(set(words) - {PAD_TOKEN, UNK_TOKEN})
word2idx = {w: i for i, w in enumerate(vocab)}
PAD_IDX = word2idx[PAD_TOKEN]
UNK_IDX = word2idx[UNK_TOKEN]

def encode(text, seq_len):
    """Convert `text` into a list of exactly `seq_len` vocabulary ids.

    The text is split on whitespace and truncated to the first `seq_len`
    tokens. Words missing from `word2idx` map to the unknown-word id, and
    short texts are right-padded with the padding id.

    Args:
        text: The string to encode.
        seq_len: Length of the returned id list.

    Returns:
        A list of `seq_len` integer ids.
    """
    tokens = text.split()[:seq_len]
    ids = [word2idx.get(w, UNK_IDX) for w in tokens]
    return ids + [PAD_IDX] * (seq_len - len(ids))

class ClassificationDataset(Dataset):
    """Dataset pairing fixed-length encoded texts with integer class labels."""

    def __init__(self, texts, labels, seq_len):
        """Encode every text up front and store the labels as a long tensor.

        Args:
            texts: Iterable of strings, each passed to `encode`.
            labels: Labels convertible by `torch.tensor`; cast to int64.
            seq_len: Sequence length forwarded to `encode`.
        """
        encoded_rows = []
        for document in texts:
            token_ids = encode(document, seq_len)
            encoded_rows.append(torch.tensor(token_ids))
        self.data = encoded_rows
        self.labels = torch.tensor(labels).long()

    def __len__(self):
        """Return the number of examples (one per label)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the `(encoded_text, label)` pair stored at position `idx`."""
        sample = self.data[idx]
        target = self.labels[idx]
        return sample, target

class RNNClassifier(nn.Module):
    """Embedding -> single-layer ReLU RNN -> linear head producing 2 logits."""

    def __init__(self, vocab_size, embed_size, hidden_size):
        """Build the layers.

        Args:
            vocab_size: Number of rows in the embedding table.
            embed_size: Embedding dimension, also the RNN input size.
            hidden_size: Size of the RNN hidden state fed to the linear head.
        """
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.RNN(
            embed_size,
            hidden_size,
            batch_first=True,
            nonlinearity="relu",
        )
        self.fc = nn.Linear(hidden_size, 2)

    def forward(self, x):
        """Return class logits for a batch of token-id sequences.

        Args:
            x: Integer tensor of token ids; the RNN is batch-first, so the
                expected layout is (batch, seq_len).

        Returns:
            Logits whose last dimension has size 2.
        """
        embedded = self.embed(x)
        # Only the final hidden state is used; per-step outputs are discarded.
        _, final_hidden = self.rnn(embedded)
        # Drop the leading num_layers axis (size 1 for this single-layer RNN).
        summary = final_hidden.squeeze(0)
        return self.fc(summary)

Using device: mps


Epoch 1: 100%|██████████| 1866/1866 [00:59<00:00, 31.57it/s, acc=0.793, loss=0.42] 


Epoch 1 Complete. Avg Loss: 0.4302, Accuracy: 0.7926


Epoch 2: 100%|██████████| 1866/1866 [00:57<00:00, 32.17it/s, acc=0.882, loss=0.346] 


Epoch 2 Complete. Avg Loss: 0.2690, Accuracy: 0.8821


Epoch 3: 100%|██████████| 1866/1866 [00:57<00:00, 32.56it/s, acc=0.914, loss=0.124] 


Epoch 3 Complete. Avg Loss: 0.1987, Accuracy: 0.9139


Epoch 4: 100%|██████████| 1866/1866 [00:56<00:00, 32.81it/s, acc=0.932, loss=0.205] 


Epoch 4 Complete. Avg Loss: 0.1617, Accuracy: 0.9323


Epoch 5: 100%|██████████| 1866/1866 [00:56<00:00, 32.80it/s, acc=0.947, loss=0.0814]


Epoch 5 Complete. Avg Loss: 0.1276, Accuracy: 0.9472


Epoch 6: 100%|██████████| 1866/1866 [00:57<00:00, 32.34it/s, acc=0.957, loss=0.128]  

Epoch 6 Complete. Avg Loss: 0.1041, Accuracy: 0.9571





#### Instantiate and Train Model

In [None]:
# Hyperparameters
seq_len = 20
batch_size = 64
embed_size = 128
hidden_size = 256
epochs = 6

# Instantiate 
dataset = ClassificationDataset(texts, labels, seq_len)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

model = RNNClassifier(len(vocab), embed_size, hidden_size).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.003)

# Training loop
for epoch in range(epochs):
    total_loss = 0
    correct = 0
    total = 0
    loop = tqdm(dataloader, desc=f"Epoch {epoch+1}")
    for x_batch, y_batch in loop:
        x_batch = x_batch.to(device)
        y_batch = y_batch.to(device)

        # predict logits
        logits = model(x_batch)
        loss = criterion(logits, y_batch)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        preds = torch.argmax(logits, dim=1)
        correct += (preds == y_batch).sum().item()
        total += y_batch.size(0)

        loop.set_postfix(loss=loss.item(), acc=correct / total)

    print(f"Epoch {epoch+1} Complete. Avg Loss: {total_loss / len(dataloader):.3f}")


#### Evaluate Model

In [None]:
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for x_batch, y_batch in dataloader:
        x_batch = x_batch.to(device)
        y_batch = y_batch.to(device)
        logits = model(x_batch)
        preds = torch.argmax(logits, dim=1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(y_batch.cpu().numpy())

# Compute metrics
accuracy = accuracy_score(all_labels, all_preds)
precision = precision_score(all_labels, all_preds, average="binary")
recall = recall_score(all_labels, all_preds, average="binary")
f1 = f1_score(all_labels, all_preds, average="binary")

# Print results
print(f"Accuracy : {accuracy:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall   : {recall:.3f}")
print(f"F1 Score : {f1:.3f}")



Accuracy : 0.9705
Precision: 0.9659
Recall   : 0.9737
F1 Score : 0.9698
