# Recurrent Neural Network

In [1]:
# IMPORTS

import pandas as pd
import numpy as np
import model_utils 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
# NORMALIZE AND CONCAT ALL DATASETS

# Each source corpus is passed through its dataset-specific normalizer from
# model_utils; the resulting frames are stacked in the order listed here.
normalized_dfs = [
    # Misinfo Dataset
    *map(
        model_utils.normalize_misinfo_dataset,
        (
            "datasets/misinfo-dataset/DataSet_Misinfo_FAKE.csv",
            "datasets/misinfo-dataset/DataSet_Misinfo_TRUE.csv",
            "datasets/misinfo-dataset/EXTRA_RussianPropagandaSubset.csv",
        ),
    ),
    # Fake News Net Dataset
    *map(
        model_utils.normalize_fakenewsnet_dataset,
        (
            "datasets/fakenewsnet-dataset/gossipcop_fake.csv",
            "datasets/fakenewsnet-dataset/gossipcop_real.csv",
            "datasets/fakenewsnet-dataset/politifact_fake.csv",
            "datasets/fakenewsnet-dataset/politifact_real.csv",
        ),
    ),
    # Liar Dataset
    model_utils.normalize_liar_dataset("datasets/liar-dataset/train.tsv"),
]

# Stack everything into one frame, then discard rows with no text and
# renumber the index so positions and labels coincide.
df = (
    pd.concat(normalized_dfs, ignore_index=True)
    .dropna(subset=["text"])
    .reset_index(drop=True)
)

In [17]:
# Spot-check the text of a single record, selected by position.
df["text"].iloc[119385]

'There are a larger number of shark attacks in Florida than there are cases of voter fraud.'

### RNN Model

In [12]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

# Use Apple's Metal backend when it is available, otherwise fall back to CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Flatten every document into one whitespace-tokenized word stream.
texts = df["text"].tolist()
words = " ".join(texts).split()

# Vocabulary: each distinct token gets an id equal to its rank in sorted order.
vocab = sorted(set(words))
word2idx = dict(zip(vocab, range(len(vocab))))
idx2word = dict(enumerate(vocab))

# Encode the full stream, then keep only the first 10k tokens for training.
encoded = list(map(word2idx.__getitem__, words))
subset = encoded[:10_000]

class WordDataset(Dataset):
    """Sliding-window next-token dataset over a flat sequence of token ids.

    Item ``i`` is the pair ``(x, y)`` where ``x`` holds the ``seq_len`` ids
    starting at position ``i`` and ``y`` holds the same window shifted one
    position to the right, i.e. the next-token target for every input step.

    Args:
        data: Sliceable sequence of token ids (a list of ints in this
            notebook); it is stored by reference, not copied.
        seq_len: Number of tokens in each input/target window.
    """

    def __init__(self, data, seq_len):
        self.data = data
        self.seq_len = seq_len

    def __len__(self):
        # Every window needs one extra token for its shifted target, hence
        # len(data) - seq_len windows. Clamp at zero because len() raises
        # ValueError when __len__ returns a negative number.
        return max(0, len(self.data) - self.seq_len)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` tensors for window ``idx``.

        Negative indices count from the end.

        Raises:
            IndexError: if ``idx`` is outside ``[-len(self), len(self))``.
                Raising here (rather than returning truncated slices) also
                lets plain ``for x, y in dataset`` iteration terminate.
        """
        n = len(self)
        if idx < 0:
            idx += n
        if not 0 <= idx < n:
            raise IndexError(f"index out of range for WordDataset of length {n}")
        x = torch.tensor(self.data[idx : idx + self.seq_len])
        y = torch.tensor(self.data[idx + 1 : idx + self.seq_len + 1])
        return x, y

class RNN(nn.Module):
    """Word-level language model: embedding -> ReLU RNN -> vocabulary logits.

    Args:
        vocab_size: Number of distinct token ids (embedding rows and output
            logits per step).
        embed_size: Dimension of each token embedding.
        hidden_size: Dimension of the recurrent hidden state.
    """

    def __init__(self, vocab_size, embed_size, hidden_size):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.RNN(embed_size, hidden_size, batch_first=True, nonlinearity="relu")
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, hidden):
        """Run one pass over a batch of token-id sequences.

        Args:
            x: Integer tensor of token ids; the RNN is built with
                ``batch_first=True``, so the layout is (batch, seq).
            hidden: Initial hidden state, e.g. from :meth:`init_hidden`.

        Returns:
            ``(out, hidden)`` where ``out`` holds per-step logits over the
            vocabulary and ``hidden`` is the final recurrent state.
        """
        x = self.embed(x)
        out, hidden = self.rnn(x, hidden)
        out = self.fc(out)
        return out, hidden

    def init_hidden(self, batch_size):
        """Return an all-zero initial hidden state for ``batch_size`` sequences.

        The size, device and dtype are taken from this module itself rather
        than from notebook-level globals, so the result always matches the
        model it is fed into.
        """
        weight = self.fc.weight
        return torch.zeros(
            self.rnn.num_layers,
            batch_size,
            self.rnn.hidden_size,
            device=weight.device,
            dtype=weight.dtype,
        )

# Hyperparameters
seq_len, batch_size = 10, 64
embed_size, hidden_size = 128, 256
epochs = 10

# Data pipeline: shuffled mini-batches of sliding windows over the token subset.
dataset = WordDataset(data=subset, seq_len=seq_len)
dataloader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True)

# Model, loss and optimizer.
model = RNN(vocab_size=len(vocab), embed_size=embed_size, hidden_size=hidden_size)
model = model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.003)

# Train for a fixed number of epochs, reporting the mean batch loss per epoch.
for epoch in range(epochs):
    total_loss = 0
    loop = tqdm(dataloader, desc=f"Epoch {epoch+1}")
    for x_batch, y_batch in loop:
        x_batch, y_batch = x_batch.to(device), y_batch.to(device)

        # Clear stale gradients before this step's forward/backward pass.
        optimizer.zero_grad()

        # Every batch starts from a fresh zero hidden state.
        hidden = model.init_hidden(x_batch.size(0))
        out, hidden = model(x_batch, hidden)

        # Collapse (batch, seq) into one axis so each step is one sample.
        loss = criterion(out.flatten(0, 1), y_batch.flatten())

        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        loop.set_postfix(loss=loss.item())

    avg_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch+1} Complete. Avg Loss: {avg_loss:.4f}")


Using device: mps


Epoch 1: 100%|██████████| 157/157 [00:49<00:00,  3.19it/s, loss=1.8] 


Epoch 1 Complete. Avg Loss: 4.8003


Epoch 2: 100%|██████████| 157/157 [00:49<00:00,  3.16it/s, loss=0.329]


Epoch 2 Complete. Avg Loss: 0.8238


Epoch 3: 100%|██████████| 157/157 [00:52<00:00,  3.01it/s, loss=0.339]


Epoch 3 Complete. Avg Loss: 0.4549


Epoch 4: 100%|██████████| 157/157 [00:54<00:00,  2.88it/s, loss=0.388]


Epoch 4 Complete. Avg Loss: 0.4069


Epoch 5: 100%|██████████| 157/157 [00:52<00:00,  2.99it/s, loss=0.556]


Epoch 5 Complete. Avg Loss: 0.3892


Epoch 6: 100%|██████████| 157/157 [00:51<00:00,  3.04it/s, loss=0.487]


Epoch 6 Complete. Avg Loss: 0.3818


Epoch 7: 100%|██████████| 157/157 [00:50<00:00,  3.10it/s, loss=0.731]


Epoch 7 Complete. Avg Loss: 0.3743


Epoch 8: 100%|██████████| 157/157 [00:50<00:00,  3.09it/s, loss=0.402]


Epoch 8 Complete. Avg Loss: 0.3699


Epoch 9: 100%|██████████| 157/157 [00:49<00:00,  3.15it/s, loss=0.371]


Epoch 9 Complete. Avg Loss: 0.3639


Epoch 10: 100%|██████████| 157/157 [00:49<00:00,  3.18it/s, loss=0.279]

Epoch 10 Complete. Avg Loss: 0.3556





In [13]:
def generate_text(model, seed_words, word2idx, idx2word, seq_len=10, max_new_tokens=20):
    """Sample a continuation of ``seed_words`` from a next-word model.

    At every step the most recent ``seq_len`` tokens (fewer while the text is
    still shorter than that) are fed through the model starting from a fresh
    zero hidden state, and the next token is sampled from the softmax of the
    logits at the last position. Starting from zeros for each window means no
    token is pushed through the recurrence more than once per step.

    Args:
        model: Module exposing ``init_hidden(batch_size)`` and
            ``model(tokens, hidden) -> (logits, hidden)``.
        seed_words: Non-empty sequence of words to start from.
        word2idx: Mapping from word to token id.
        idx2word: Mapping from token id to word.
        seq_len: Maximum number of context tokens fed to the model per step.
        max_new_tokens: Number of tokens to sample.

    Returns:
        The seed words followed by the sampled words, joined by single spaces.

    Raises:
        ValueError: if ``seed_words`` is empty.
        KeyError: if a seed word inside the initial context window is not in
            ``word2idx``.
    """
    if not seed_words:
        raise ValueError("seed_words must contain at least one word")

    context_words = seed_words[-seq_len:]
    unknown = [w for w in context_words if w not in word2idx]
    if unknown:
        raise KeyError(f"seed words not in vocabulary: {unknown}")

    # list(...) copies the seed and also accepts tuples or other sequences.
    generated = list(seed_words)
    input_seq = [word2idx[w] for w in context_words]

    # Remember the caller's train/eval mode so sampling does not leak a mode
    # change into code that runs afterwards.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for _step in range(max_new_tokens):
                hidden = model.init_hidden(1)
                # Place the input wherever the model put its hidden state.
                input_tensor = torch.tensor(
                    input_seq, dtype=torch.long, device=hidden.device
                ).unsqueeze(0)
                output, _hidden = model(input_tensor, hidden)

                probs = torch.softmax(output[0, -1], dim=0)
                next_idx = torch.multinomial(probs, num_samples=1).item()
                generated.append(idx2word[next_idx])

                # Append first, then trim: this lets a short seed grow into a
                # full seq_len window instead of staying at the seed's length.
                input_seq = (input_seq + [next_idx])[-seq_len:]
    finally:
        model.train(was_training)

    return " ".join(generated)


In [14]:
# Prompt the trained model with a short seed and sample 50 more words.
seed = ["the", "city", "was"]
generated_text = generate_text(
    model=model,
    seed_words=seed,
    word2idx=word2idx,
    idx2word=idx2word,
    seq_len=seq_len,
    max_new_tokens=50,
)
print(generated_text)

the city was called out, the reference to Obama and golf: Unlike Obama, we are working to fix the golf course. However, the people s noses, because they were full of hope and expectation because of the child more than the America Despite great will is set to release this ad, called Thank
