In [31]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch import nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence

In [32]:
df = pd.read_csv('scraped_data/BABE_scraped.csv')
df['content'] = df['content'].str.lower()  # Convert text to lowercase

df.dropna(subset=['content'], inplace=True) # Drop rows with missing values in the 'content' column
df['type_class'] = LabelEncoder().fit_transform(df['type_class'])
# Split the data into training, validation, and testing sets
train_val_data, test_data = train_test_split(df, test_size=0.2, random_state=42)
train_data, val_data = train_test_split(train_val_data, test_size=0.25, random_state=42)

In [33]:
from torch.utils.data import DataLoader, Dataset
import torch

def encode_text(text, word2id):
    return [word2id.get(word, word2id['<pad>']) for word in text.split()]

class NewsDataset(Dataset):
    def __init__(self, data, word2id):
        self.data = data
        self.word2id = word2id

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data.iloc[idx]['content']
        label = self.data.iloc[idx]['type_class']
        encoded_text = encode_text(text, self.word2id)
        return torch.tensor(encoded_text, dtype=torch.long), torch.tensor(label, dtype=torch.long)

def collate_batch(batch):
    label_list, text_list = [], []
    for _text, _label in batch:
        label_list.append(_label)
        text_list.append(_text)
    text_list = pad_sequence(text_list, batch_first=True, padding_value=0)
    labels = torch.tensor(label_list, dtype=torch.long)
    return text_list, labels

# Create datasets and dataloaders for train, validation, and test sets
train_dataset = NewsDataset(train_data, word2id)
val_dataset = NewsDataset(val_data, word2id)
test_dataset = NewsDataset(test_data, word2id)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_batch)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False, collate_fn=collate_batch)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False, collate_fn=collate_batch)

In [34]:
glove_file = "../Homeworks/HW2_Language_Models_Neural_Networks/glove/glove.6B.50d.txt"

embeddings_dict = {}
with open(glove_file, 'r', encoding='utf8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        embeddings_dict[word] = vector

print(f'Loaded {len(embeddings_dict)} words from GloVe.')

# Create a matrix for the word vectors
vocab_size = len(embeddings_dict) + 1
embedding_dim = 50
embedding_matrix = np.zeros((vocab_size, embedding_dim))

word2id = {'<pad>': 0}
for i, (word, vector) in enumerate(embeddings_dict.items(), 1):
    embedding_matrix[i] = vector
    word2id[word] = i


Loaded 400000 words from GloVe.


In [35]:
class SimpleRNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, pretrained_embeddings):
        super(SimpleRNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.embedding.weight.data.copy_(torch.from_numpy(pretrained_embeddings))
        self.embedding.weight.requires_grad = False  # We do not train the embedding layer
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        # Embedding layer
        x = self.embedding(x)
        
        # Initialize hidden state
        h0 = torch.zeros(1, x.size(0), self.rnn.hidden_size).to(x.device)
        
        # Forward propagate the RNN
        out, _ = self.rnn(x, h0)
        
        # Pass the output of the last time step
        out = self.fc(out[:, -1, :])
        return out


In [36]:
model = SimpleRNN(vocab_size, embedding_dim, hidden_dim=50, output_dim=3, pretrained_embeddings=embedding_matrix)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=0.01)

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    for texts, labels in train_loader:
        outputs = model(texts)
        loss = criterion(outputs, labels)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Validation loop
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for texts, labels in val_loader:
            outputs = model(texts)
            val_loss += criterion(outputs, labels).item()

    print(f'Epoch [{epoch+1}/{num_epochs}], Training Loss: {loss.item():.4f}, Validation Loss: {val_loss / len(val_loader):.4f}')


Epoch [1/10], Training Loss: 0.7844, Validation Loss: 0.8999
Epoch [2/10], Training Loss: 1.0469, Validation Loss: 0.9298
Epoch [3/10], Training Loss: 0.9587, Validation Loss: 0.8998
Epoch [4/10], Training Loss: 0.9311, Validation Loss: 1.0338
Epoch [5/10], Training Loss: 1.1376, Validation Loss: 0.9605
Epoch [6/10], Training Loss: 0.8903, Validation Loss: 0.9115
Epoch [7/10], Training Loss: 0.9706, Validation Loss: 0.9323
Epoch [8/10], Training Loss: 1.0352, Validation Loss: 0.9816
Epoch [9/10], Training Loss: 1.1408, Validation Loss: 0.8975
Epoch [10/10], Training Loss: 0.8828, Validation Loss: 0.9070


In [37]:
model.eval()
total = 0
correct = 0
with torch.no_grad():
    for texts, labels in test_loader:
        outputs = model(texts)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f'Accuracy of the model on the test set: {100 * correct / total}%')

Accuracy of the model on the test set: 50.46153846153846%
