In [22]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.vocab import GloVe
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, TensorDataset
import torchtext
import warnings


In [34]:
df = pd.read_csv('IMDB Dataset.csv')
df = df.iloc[:1000,:]
df.shape


(1000, 2)

In [35]:
le = LabelEncoder()
df['sentiment'] = le.fit_transform(df['sentiment']) 

In [36]:
import re
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    return text

df['review'] = df['review'].apply(preprocess_text)

In [37]:
X_train, X_test, y_train, y_test = train_test_split(df['review'], df['sentiment'], test_size=0.2, random_state=42)

# Define a function to tokenize and pad sequences
from torchtext.data.utils import get_tokenizer
from torch.nn.utils.rnn import pad_sequence

tokenizer = get_tokenizer("basic_english")

def tokenize_and_pad(texts, vocab, max_length):
    tokens = [torch.tensor([vocab[word] for word in tokenizer(text) if word in vocab]) for text in texts]
    return pad_sequence(tokens, batch_first=True, padding_value=vocab['<pad>'])

# Load GloVe embeddings
glove = GloVe(name='6B', dim=100)

# Create vocabulary and embeddings matrix
vocab = glove.stoi.copy()
vocab['<pad>'] = len(vocab)  # Add <pad> token at the end
vocab['<unk>'] = len(vocab)  # Add <unk> token at the end

embedding_dim = 100
embedding_matrix = np.zeros((len(vocab), embedding_dim))

for word, idx in vocab.items():
    if idx < len(glove.vectors):  # Ensure the index does not exceed GloVe size
        embedding_matrix[idx] = glove[word]
    else:
        embedding_matrix[idx] = np.random.normal(scale=0.6, size=(embedding_dim,))  # Random initialization for new tokens

max_length = 200
X_train = tokenize_and_pad(X_train, vocab, max_length)
X_test = tokenize_and_pad(X_test, vocab, max_length)

# Convert labels to tensors
y_train = torch.tensor(y_train.values)
y_test = torch.tensor(y_test.values)

# Create DataLoader
train_data = TensorDataset(X_train, y_train)
test_data = TensorDataset(X_test, y_test)
train_loader = DataLoader(train_data, batch_size=64, shuffle=True)
test_loader = DataLoader(test_data, batch_size=64)

In [38]:
class VanillaRNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, pretrained_embeddings):
        super(VanillaRNN, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(torch.tensor(pretrained_embeddings, dtype=torch.float32), freeze=False)
        self.rnn = nn.RNN(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        output, hidden = self.rnn(embedded)
        return self.fc(hidden.squeeze(0))

# Initialize the model
rnn_model = VanillaRNN(len(vocab), embedding_dim, hidden_dim=128, output_dim=1, pretrained_embeddings=embedding_matrix)


In [39]:
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, pretrained_embeddings):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(torch.tensor(pretrained_embeddings, dtype=torch.float32), freeze=False)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        output, (hidden, cell) = self.lstm(embedded)
        return self.fc(hidden.squeeze(0))

# Initialize the model
lstm_model = LSTMModel(len(vocab), embedding_dim, hidden_dim=128, output_dim=1, pretrained_embeddings=embedding_matrix)


In [40]:
def train(model, loader, criterion, optimizer):
    model.train()
    total_loss = 0
    for inputs, labels in loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs.squeeze(), labels.float())
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)

def evaluate(model, loader, criterion):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for inputs, labels in loader:
            outputs = model(inputs)
            loss = criterion(outputs.squeeze(), labels.float())
            total_loss += loss.item()
    return total_loss / len(loader)


In [41]:
# Hyperparameters
num_epochs = 5
learning_rate = 0.001

# Loss and optimizer
criterion = nn.MSELoss()
rnn_optimizer = optim.Adam(rnn_model.parameters(), lr=learning_rate)
lstm_optimizer = optim.Adam(lstm_model.parameters(), lr=learning_rate)

In [42]:
for epoch in range(num_epochs):
    rnn_train_loss = train(rnn_model, train_loader, criterion, rnn_optimizer)
    rnn_val_loss = evaluate(rnn_model, test_loader, criterion)
    print(f"Epoch {epoch+1}, RNN Train Loss: {rnn_train_loss:.4f}, RNN Val Loss: {rnn_val_loss:.4f}")

Epoch 1, RNN Train Loss: 0.3244, RNN Val Loss: 0.2527
Epoch 2, RNN Train Loss: 0.2568, RNN Val Loss: 0.2534
Epoch 3, RNN Train Loss: 0.2554, RNN Val Loss: 0.2573
Epoch 4, RNN Train Loss: 0.2592, RNN Val Loss: 0.2564
Epoch 5, RNN Train Loss: 0.2554, RNN Val Loss: 0.2604


In [43]:
# Train LSTM
for epoch in range(num_epochs):
    lstm_train_loss = train(lstm_model, train_loader, criterion, lstm_optimizer)
    lstm_val_loss = evaluate(lstm_model, test_loader, criterion)
    print(f"Epoch {epoch+1}, LSTM Train Loss: {lstm_train_loss:.4f}, LSTM Val Loss: {lstm_val_loss:.4f}")

Epoch 1, LSTM Train Loss: 0.2773, LSTM Val Loss: 0.2550
Epoch 2, LSTM Train Loss: 0.2518, LSTM Val Loss: 0.2563
Epoch 3, LSTM Train Loss: 0.2524, LSTM Val Loss: 0.2532
Epoch 4, LSTM Train Loss: 0.2536, LSTM Val Loss: 0.2532
Epoch 5, LSTM Train Loss: 0.2525, LSTM Val Loss: 0.2526


Implement Models with On-the-Fly Embeddings


In [44]:
class VanillaRNNOnTheFly(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim):
        super(VanillaRNNOnTheFly, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.rnn = nn.RNN(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        output, hidden = self.rnn(embedded)
        return self.fc(hidden.squeeze(0))

# Initialize the model
rnn_on_the_fly = VanillaRNNOnTheFly(len(vocab), embedding_dim, hidden_dim=128, output_dim=1)


In [45]:
class LSTMModelOnTheFly(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim):
        super(LSTMModelOnTheFly, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        output, (hidden, cell) = self.lstm(embedded)
        return self.fc(hidden.squeeze(0))

# Initialize the model
lstm_on_the_fly = LSTMModelOnTheFly(len(vocab), embedding_dim, hidden_dim=128, output_dim=1)


In [46]:
for epoch in range(num_epochs):
    rnn_train_loss = train(rnn_on_the_fly, train_loader, criterion, rnn_optimizer)
    rnn_val_loss = evaluate(rnn_on_the_fly, test_loader, criterion)
    print(f"Epoch {epoch+1}, RNN Train Loss: {rnn_train_loss:.4f}, RNN Val Loss: {rnn_val_loss:.4f}")

Epoch 1, RNN Train Loss: 1.0773, RNN Val Loss: 1.0408
Epoch 2, RNN Train Loss: 1.0785, RNN Val Loss: 1.0408
Epoch 3, RNN Train Loss: 1.0817, RNN Val Loss: 1.0408
Epoch 4, RNN Train Loss: 1.0817, RNN Val Loss: 1.0408
Epoch 5, RNN Train Loss: 1.0839, RNN Val Loss: 1.0408


In [47]:
# Train LSTM
for epoch in range(num_epochs):
    lstm_train_loss = train(lstm_on_the_fly, train_loader, criterion, lstm_optimizer)
    lstm_val_loss = evaluate(lstm_on_the_fly, test_loader, criterion)
    print(f"Epoch {epoch+1}, LSTM Train Loss: {lstm_train_loss:.4f}, LSTM Val Loss: {lstm_val_loss:.4f}")

Epoch 1, LSTM Train Loss: 0.8610, LSTM Val Loss: 0.8219
Epoch 2, LSTM Train Loss: 0.8517, LSTM Val Loss: 0.8219
Epoch 3, LSTM Train Loss: 0.8592, LSTM Val Loss: 0.8219
Epoch 4, LSTM Train Loss: 0.8536, LSTM Val Loss: 0.8219
Epoch 5, LSTM Train Loss: 0.8517, LSTM Val Loss: 0.8219
