In [31]:
pip install transformers datasets torch pandas



In [2]:
import pandas as pd

df = pd.read_csv("/content/dataset_facebook-posts-scraper_2025-02-03_20-21-21-366.csv")

In [3]:
df = df.rename(columns={"text": "post_text", "link": "post_link"})

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import time
import numpy as np

In [5]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [30]:
tokenized_dataset = tokenized_dataset.filter(lambda x: x["label"] is not None)

Filter:   0%|          | 0/196 [00:00<?, ? examples/s]

In [6]:
from datasets import Dataset

dataset = Dataset.from_pandas(df)

In [7]:
dataset = dataset.filter(lambda x: isinstance(x["post_text"], str))

Filter:   0%|          | 0/200 [00:00<?, ? examples/s]

In [8]:
def tokenize_data(examples):
    text = examples.get("post_text", "")
    return tokenizer(text, padding="max_length", truncation=True, max_length=128)

In [9]:
tokenized_dataset = dataset.map(tokenize_data, batched=True)

Map:   0%|          | 0/196 [00:00<?, ? examples/s]

In [10]:
def assign_label(url):
    if "HumansOfNewYork" in url:
        return 1  # American English
    elif "HumansOfLondon" in url:
        return 0  # British English
    else:
        return None

tokenized_dataset = tokenized_dataset.add_column("label", [assign_label(url) for url in tokenized_dataset["facebookUrl"]])


In [11]:
tokenized_dataset.set_format(type="torch", columns=['input_ids', 'attention_mask', 'label'])

In [12]:
train_test_split = tokenized_dataset.train_test_split(test_size=0.2)
train_dataset = train_test_split["train"]
test_dataset = train_test_split["test"]

In [13]:
from torch.utils.data import DataLoader

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [14]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

In [15]:
class TextClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, model_type="LSTM"):
        super(TextClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.model_type = model_type

        if model_type == "RNN":
            self.rnn = nn.RNN(embed_dim, hidden_dim, batch_first=True)
        elif model_type == "LSTM":
            self.rnn = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        elif model_type == "GRU":
            self.rnn = nn.GRU(embed_dim, hidden_dim, batch_first=True)
        else:
            raise ValueError("model_type must be 'RNN', 'LSTM', or 'GRU'")

        self.fc = nn.Linear(hidden_dim, output_dim)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, input_ids, attention_mask):
        embedded = self.embedding(input_ids)
        if self.model_type == "LSTM":
            output, (hidden, _) = self.rnn(embedded)
        else:
            output, hidden = self.rnn(embedded)

        hidden = hidden[-1]  # Take the last hidden state
        out = self.fc(hidden)
        return self.softmax(out)

In [16]:
vocab_size = 30522
embed_dim = 128
hidden_dim = 256
output_dim = 2  #American vs British English
model_type = "LSTM"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = TextClassifier(vocab_size, embed_dim, hidden_dim, output_dim, model_type).to(device)


In [17]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [20]:
class VanillaRNNCell(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(VanillaRNNCell, self).__init__()
        self.hidden_size = hidden_size
        self.Wx = nn.Linear(input_size, hidden_size)
        self.Wh = nn.Linear(hidden_size, hidden_size)
        self.tanh = nn.Tanh()

    def forward(self, x, h):
        h_next = self.tanh(self.Wx(x) + self.Wh(h))
        return h_next

class GRUCell(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(GRUCell, self).__init__()
        self.hidden_size = hidden_size
        self.Wz = nn.Linear(input_size, hidden_size)
        self.Uz = nn.Linear(hidden_size, hidden_size)
        self.Wr = nn.Linear(input_size, hidden_size)
        self.Ur = nn.Linear(hidden_size, hidden_size)
        self.Wh = nn.Linear(input_size, hidden_size)
        self.Uh = nn.Linear(hidden_size, hidden_size)

    def forward(self, x, h):
        z = torch.sigmoid(self.Wz(x) + self.Uz(h))
        r = torch.sigmoid(self.Wr(x) + self.Ur(h))
        h_tilde = torch.tanh(self.Wh(x) + self.Uh(r * h))
        h_next = (1 - z) * h + z * h_tilde
        return h_next

class LSTMCell(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(LSTMCell, self).__init__()
        self.hidden_size = hidden_size
        self.Wf = nn.Linear(input_size, hidden_size)
        self.Uf = nn.Linear(hidden_size, hidden_size)
        self.Wi = nn.Linear(input_size, hidden_size)
        self.Ui = nn.Linear(hidden_size, hidden_size)
        self.Wo = nn.Linear(input_size, hidden_size)
        self.Uo = nn.Linear(hidden_size, hidden_size)
        self.Wc = nn.Linear(input_size, hidden_size)
        self.Uc = nn.Linear(hidden_size, hidden_size)

    def forward(self, x, h, c):
        f = torch.sigmoid(self.Wf(x) + self.Uf(h))
        i = torch.sigmoid(self.Wi(x) + self.Ui(h))
        o = torch.sigmoid(self.Wo(x) + self.Uo(h))
        c_tilde = torch.tanh(self.Wc(x) + self.Uc(h))
        c_next = f * c + i * c_tilde
        h_next = o * torch.tanh(c_next)
        return h_next, c_next


In [21]:
class PyTorchRNN(nn.Module):
    def __init__(self, rnn_type, input_size, hidden_size, output_size, vocab_size):
        super(PyTorchRNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, input_size)  # Embedding layer
        if rnn_type == 'vanilla':
            self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        elif rnn_type == 'gru':
            self.rnn = nn.GRU(input_size, hidden_size, batch_first=True)
        elif rnn_type == 'lstm':
            self.rnn = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = self.embedding(x)  # Convert input_ids to embeddings
        _, h = self.rnn(x)
        if isinstance(h, tuple):  # for LSTM
            h = h[0]
        return self.fc(h[-1])

In [22]:
# Define parameters
vocab_size = tokenizer.vocab_size  # Get the vocabulary size from the tokenizer
input_size = 128  # Embedding dimension size
hidden_size = 128
output_size = 2  # For binary classification
rnn_type = 'lstm'  # Define the RNN type you want to use

# Initialize model with all required parameters
model = PyTorchRNN(rnn_type=rnn_type, input_size=input_size, hidden_size=hidden_size, output_size=output_size, vocab_size=vocab_size)

In [23]:
class Attention(nn.Module):
    def __init__(self, hidden_size):
        super(Attention, self).__init__()
        self.W1 = nn.Linear(hidden_size, hidden_size)
        self.W2 = nn.Linear(hidden_size, hidden_size)
        self.v = nn.Linear(hidden_size, 1, bias=False)

    def forward(self, hidden, encoder_outputs):
        # hidden: [batch_size, hidden_size]
        # encoder_outputs: [batch_size, sequence_len, hidden_size]
        sequence_len = encoder_outputs.shape[1]
        hidden = hidden.unsqueeze(1).repeat(1, sequence_len, 1)

        energy = torch.tanh(self.W1(encoder_outputs) + self.W2(hidden))
        attention = self.v(energy).squeeze(2)  # [batch_size, sequence_len]
        attention_weights = torch.softmax(attention, dim=1)

        # Apply attention weights to encoder outputs to get context vector
        context = torch.bmm(attention_weights.unsqueeze(1), encoder_outputs).squeeze(1)
        return context, attention_weights


class SimpleRecurrentNetworkWithAttention(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, cell_type='RNN'):
        super(SimpleRecurrentNetworkWithAttention, self).__init__()

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.attention = Attention(hidden_size * 2)  # Use hidden_size * 2 for bidirectional LSTM

        if cell_type == 'LSTM':
            self.rnn = nn.LSTM(hidden_size, hidden_size, batch_first=True, bidirectional=True)
        elif cell_type == 'GRU':
            self.rnn = nn.GRU(hidden_size, hidden_size, batch_first=True, bidirectional=True)
        else:
            self.rnn = nn.RNN(hidden_size, hidden_size, batch_first=True, bidirectional=True)

        self.fc = nn.Linear(hidden_size * 2, output_size)  # hidden_size * 2 for bidirectional

    def forward(self, inputs):
        embedded = self.embedding(inputs)
        rnn_output, hidden = self.rnn(embedded)

        if isinstance(hidden, tuple):  # LSTM returns (hidden, cell_state)
            hidden = hidden[0]

        # Since it's bidirectional, get the last layer's forward and backward hidden states
        hidden = torch.cat((hidden[-2], hidden[-1]), dim=1)  # Concatenate forward and backward hidden states

        # Apply attention to the concatenated hidden state
        context, attention_weights = self.attention(hidden, rnn_output)

        # Pass the context vector to the fully connected layer
        output = self.fc(context)

        return output, attention_weights

In [28]:
def train_model(model, train_loader, epochs=5):
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters())
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)

    start_time = time.time()
    for epoch in range(epochs):
        total_loss, correct, total = 0, 0, 0
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids, labels = batch['input_ids'].to(device), batch['label'].to(device)
            outputs = model(input_ids)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            correct += (predicted == labels).sum().item()
            total += labels.size(0)
        accuracy = correct / total * 100
        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss:.4f}, Accuracy: {accuracy:.2f}%")
    print(f"Training Time: {time.time() - start_time:.2f}s")


In [41]:
torch.save(model.state_dict(), "lstm_model.pth")
torch.save(model_gru.state_dict(), "gru_model.pth")


In [42]:
model.load_state_dict(torch.load("lstm_model.pth"))
model.eval()

  model.load_state_dict(torch.load("lstm_model.pth"))


PyTorchRNN(
  (embedding): Embedding(30522, 128)
  (rnn): LSTM(128, 128, batch_first=True)
  (fc): Linear(in_features=128, out_features=2, bias=True)
)

In [43]:
def evaluate_custom_sentences(model, sentences):
    model.eval()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)

    inputs = tokenizer(sentences, padding="max_length", truncation=True, max_length=128, return_tensors="pt")
    inputs = {key: value.to(device) for key, value in inputs.items()}

    with torch.no_grad():
        outputs = model(inputs["input_ids"])
        predictions = torch.argmax(outputs, dim=1)
        labels = ["British English", "American English"]
        for i, sentence in enumerate(sentences):
            print(f"Sentence: {sentence}\nPrediction: {labels[predictions[i].item()]}\n")

# Test with some example sentences
custom_sentences = [
    "I love watching football at the weekend with my mates.",
    "I enjoy eating fries while watching a baseball game."
]

evaluate_custom_sentences(model, custom_sentences)


Sentence: I love watching football at the weekend with my mates.
Prediction: American English

Sentence: I enjoy eating fries while watching a baseball game.
Prediction: American English



In [45]:
# Define parameters
vocab_size = tokenizer.vocab_size
input_size = 128
hidden_size = 128
output_size = 2

# Initialize models separately
model_lstm = PyTorchRNN(rnn_type='lstm', input_size=input_size, hidden_size=hidden_size, output_size=output_size, vocab_size=vocab_size)
model_gru = PyTorchRNN(rnn_type='gru', input_size=input_size, hidden_size=hidden_size, output_size=output_size, vocab_size=vocab_size)
model_rnn = PyTorchRNN(rnn_type='vanilla', input_size=input_size, hidden_size=hidden_size, output_size=output_size, vocab_size=vocab_size)

In [46]:
def train_model(model, train_loader, epochs=5):
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)

    print(f"Training {model.__class__.__name__}...")

    for epoch in range(epochs):
        total_loss, correct, total = 0, 0, 0
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids, labels = batch['input_ids'].to(device), batch['label'].to(device)
            outputs = model(input_ids)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            correct += (predicted == labels).sum().item()
            total += labels.size(0)

        accuracy = correct / total * 100
        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss:.4f}, Accuracy: {accuracy:.2f}%")

    print(f"Training for {model.__class__.__name__} completed!\n")


In [49]:
# Train LSTM
train_model(model_lstm, train_loader, epochs=5)

# Train GRU
train_model(model_gru, train_loader, epochs=5)

# Train Vanilla RNN
train_model(model_rnn, train_loader, epochs=5)

Training PyTorchRNN...


TypeError: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found <class 'NoneType'>

In [50]:
def evaluate_model(model, test_loader):
    model.eval()
    correct, total = 0, 0
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)

    with torch.no_grad():
        for batch in test_loader:
            input_ids, labels = batch['input_ids'].to(device), batch['label'].to(device)
            outputs = model(input_ids)
            _, predicted = torch.max(outputs, 1)
            correct += (predicted == labels).sum().item()
            total += labels.size(0)

    accuracy = correct / total * 100
    print(f"Test Accuracy for {model.__class__.__name__}: {accuracy:.2f}%\n")

# Evaluate LSTM
evaluate_model(model_lstm, test_loader)

# Evaluate GRU
evaluate_model(model_gru, test_loader)

# Evaluate Vanilla RNN
evaluate_model(model_rnn, test_loader)


TypeError: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found <class 'NoneType'>

In [51]:
torch.save(model_lstm.state_dict(), "lstm_model.pth")
torch.save(model_gru.state_dict(), "gru_model.pth")
torch.save(model_rnn.state_dict(), "rnn_model.pth")

In [52]:
def evaluate_custom_sentences(model, sentences):
    model.eval()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)

    inputs = tokenizer(sentences, padding="max_length", truncation=True, max_length=128, return_tensors="pt")
    inputs = {key: value.to(device) for key, value in inputs.items()}

    with torch.no_grad():
        outputs = model(inputs["input_ids"])
        predictions = torch.argmax(outputs, dim=1)
        labels = ["British English", "American English"]

        for i, sentence in enumerate(sentences):
            print(f"Sentence: {sentence}\nPrediction by {model.__class__.__name__}: {labels[predictions[i].item()]}\n")

# Test with some sentences
custom_sentences = [
    "I love watching football at the weekend with my mates.",
    "I enjoy eating fries while watching a baseball game."
]

# Evaluate with different models
print("\nLSTM Predictions:")
evaluate_custom_sentences(model_lstm, custom_sentences)

print("\nGRU Predictions:")
evaluate_custom_sentences(model_gru, custom_sentences)

print("\nVanilla RNN Predictions:")
evaluate_custom_sentences(model_rnn, custom_sentences)


LSTM Predictions:
Sentence: I love watching football at the weekend with my mates.
Prediction by PyTorchRNN: American English

Sentence: I enjoy eating fries while watching a baseball game.
Prediction by PyTorchRNN: American English


GRU Predictions:
Sentence: I love watching football at the weekend with my mates.
Prediction by PyTorchRNN: American English

Sentence: I enjoy eating fries while watching a baseball game.
Prediction by PyTorchRNN: American English


Vanilla RNN Predictions:
Sentence: I love watching football at the weekend with my mates.
Prediction by PyTorchRNN: British English

Sentence: I enjoy eating fries while watching a baseball game.
Prediction by PyTorchRNN: British English



In [55]:
def model_parameter_summary(model, model_name="Model"):
    print(f"\n Model: {model_name} Parameters Summary:")
    total_params = 0
    for name, param in model.named_parameters():
        print(f" Parameter: {name}")
        print(f"   - Shape: {param.shape}")
        print(f"   - Number of parameters: {param.numel()}")
        total_params += param.numel()
    print(f"\n Total number of parameters in {model_name}: {total_params}\n")

In [56]:
print("\n LSTM Model:")
model_parameter_summary(model_lstm, "LSTM")

print("\n GRU Model:")
model_parameter_summary(model_gru, "GRU")

print("\n Vanilla RNN Model:")
model_parameter_summary(model_rnn, "Vanilla RNN")


 LSTM Model:

 Model: LSTM Parameters Summary:
 Parameter: embedding.weight
   - Shape: torch.Size([30522, 128])
   - Number of parameters: 3906816
 Parameter: rnn.weight_ih_l0
   - Shape: torch.Size([512, 128])
   - Number of parameters: 65536
 Parameter: rnn.weight_hh_l0
   - Shape: torch.Size([512, 128])
   - Number of parameters: 65536
 Parameter: rnn.bias_ih_l0
   - Shape: torch.Size([512])
   - Number of parameters: 512
 Parameter: rnn.bias_hh_l0
   - Shape: torch.Size([512])
   - Number of parameters: 512
 Parameter: fc.weight
   - Shape: torch.Size([2, 128])
   - Number of parameters: 256
 Parameter: fc.bias
   - Shape: torch.Size([2])
   - Number of parameters: 2

 Total number of parameters in LSTM: 4039170


 GRU Model:

 Model: GRU Parameters Summary:
 Parameter: embedding.weight
   - Shape: torch.Size([30522, 128])
   - Number of parameters: 3906816
 Parameter: rnn.weight_ih_l0
   - Shape: torch.Size([384, 128])
   - Number of parameters: 49152
 Parameter: rnn.weight_hh_l