In [4]:
import pandas as pd
import re
import nltk
from sklearn.model_selection import train_test_split
import torch
from torchtext.vocab import build_vocab_from_iterator
from torch.nn.utils.rnn import pad_sequence
from torchtext.data.utils import get_tokenizer
from sklearn.preprocessing import LabelEncoder

# Seed PyTorch's global RNG before any model or DataLoader is created so that
# weight initialisation and batch shuffling in the later cells are repeatable
# after Restart Kernel -> Run All.
torch.manual_seed(42)

# Load the reviews; later cells read the 'review' and 'sentiment' columns.
df = pd.read_csv('IMDB Dataset.csv')


# Fetch the NLTK tokenizer data used by nltk.word_tokenize in preprocess_text.
nltk.download('punkt')


def preprocess_text(text):
    """Normalise one raw review into a space-separated string of word tokens.

    Steps, in order: lowercase the text, replace literal ``<br />`` tags with
    a space, delete every character that is not ``a-z`` or whitespace, run
    ``nltk.word_tokenize`` on what is left, and join the tokens with single
    spaces.

    Args:
        text: The raw review string.

    Returns:
        The cleaned review as a single string.
    """
    lowered = text.lower()
    without_breaks = re.sub(r'<br />', ' ', lowered)
    # Characters are removed outright (not replaced by a space), so words that
    # were separated only by punctuation end up joined together.
    letters_only = re.sub(r'[^a-z\s]', '', without_breaks)
    tokens = nltk.word_tokenize(letters_only)
    return ' '.join(tokens)

# Apply preprocessing to the review column
# (the cleaned text is stored in a new column; the raw 'review' column is kept).
df['cleaned_review'] = df['review'].apply(preprocess_text)

# Encode the sentiment labels as integer class ids. The 'sentiment' column is
# overwritten in place, so the original label values are no longer in df after
# this cell (label_encoder.classes_ keeps the mapping).
label_encoder = LabelEncoder()
df['sentiment'] = label_encoder.fit_transform(df['sentiment'])

# PyTorch tokenizer
# (torchtext's 'basic_english' tokenizer; used for both vocab building and encoding).
tokenizer = get_tokenizer('basic_english')

def yield_tokens(data_iter):
    """Lazily tokenize every text in ``data_iter``.

    Args:
        data_iter: Any iterable of strings.

    Yields:
        The result of calling the module-level ``tokenizer`` on each text, in
        the same order as the input.
    """
    yield from (tokenizer(entry) for entry in data_iter)

# Build the vocabulary from every cleaned review, registering "<unk>" and "<pad>" as special tokens.
# NOTE(review): this runs on the full DataFrame, before the train/test split
# further down, so tokens that only occur in test reviews are in the vocabulary too.
vocab = build_vocab_from_iterator(yield_tokens(df['cleaned_review']), specials=["<unk>", "<pad>"])
# Tokens missing from the vocabulary resolve to the "<unk>" index.
vocab.set_default_index(vocab["<unk>"])

def encode(text):
    """Convert a text into a list of vocabulary indices.

    The text is split with the module-level ``tokenizer`` and each token is
    looked up in the module-level ``vocab``.

    Args:
        text: The string to encode.

    Returns:
        A list with one vocabulary index per token, in token order.
    """
    token_ids = []
    for token in tokenizer(text):
        token_ids.append(vocab[token])
    return token_ids

# Store each cleaned review as a list of vocabulary indices (one Python list per row).
df['encoded_review'] = df['cleaned_review'].apply(encode)

def pad_sequences_torch(sequences, max_len, padding_value=None):
    """Truncate and right-pad token-id sequences into one integer tensor.

    Every sequence is cut to its first ``max_len`` ids and shorter ones are
    padded on the right, so the result always has exactly ``max_len`` columns
    (previously the width was only that of the longest truncated sequence).
    Rows are built with an explicit ``torch.long`` dtype so an empty sequence
    cannot turn the batch into a float tensor.

    Args:
        sequences: Iterable of token-id sequences that support slicing
            (e.g. a list or pandas Series of Python lists).
        max_len: Number of columns in the returned tensor.
        padding_value: Id used for padding. Defaults to the index of
            ``"<pad>"`` in the module-level ``vocab``.

    Returns:
        A ``torch.long`` tensor of shape ``(len(sequences), max_len)``.
    """
    if padding_value is None:
        padding_value = vocab["<pad>"]

    rows = [torch.tensor(seq[:max_len], dtype=torch.long) for seq in sequences]
    if not rows:
        # pad_sequence rejects an empty list; return an empty batch instead.
        return torch.empty((0, max_len), dtype=torch.long)

    padded = pad_sequence(rows, batch_first=True, padding_value=padding_value)

    # pad_sequence only pads up to the longest row, so top up the remainder
    # when every row is shorter than max_len.
    shortfall = max_len - padded.size(1)
    if shortfall > 0:
        filler = torch.full((padded.size(0), shortfall), padding_value, dtype=torch.long)
        padded = torch.cat([padded, filler], dim=1)
    return padded

# Each review is capped at its first max_seq_length token ids; shorter reviews are padded.
max_seq_length = 100
padded_sequences = pad_sequences_torch(df['encoded_review'], max_seq_length)

# Hold out 20% of the rows for testing; random_state fixes the split.
# y_train / y_test are still pandas objects at this point.
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, df['sentiment'], test_size=0.2, random_state=42)



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Jay\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
import numpy as np

def load_glove_embeddings(glove_file_path, vocab, embedding_dim=100):
    """Build an embedding matrix for ``vocab`` from a GloVe text file.

    The file is streamed once and only vectors for tokens present in ``vocab``
    are parsed and kept, so memory use is bounded by the vocabulary size rather
    than by the size of the GloVe file. Tokens with no GloVe entry keep an
    all-zero row. If a token occurs more than once in the file, the last
    occurrence wins.

    Args:
        glove_file_path: Path to a UTF-8 text file with one
            ``token v1 v2 ... vN`` entry per line, whitespace separated.
        vocab: Object exposing ``get_itos()`` (index -> token list) and
            ``len()``; row ``i`` of the result belongs to ``get_itos()[i]``.
        embedding_dim: Expected number of values per vector.

    Returns:
        A ``torch.float32`` tensor of shape ``(len(vocab), embedding_dim)``.

    Raises:
        ValueError: If a vector for a vocabulary token does not contain
            exactly ``embedding_dim`` values, or a value is not a float.
    """
    # Token -> row lookup so each file line is resolved in O(1).
    token_to_row = {token: row for row, token in enumerate(vocab.get_itos())}
    embedding_matrix = np.zeros((len(vocab), embedding_dim), dtype=np.float32)

    with open(glove_file_path, 'r', encoding='utf-8') as f:
        for line_number, line in enumerate(f, start=1):
            values = line.split()
            if not values:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            row = token_to_row.get(values[0])
            if row is None:
                # Not in our vocabulary: skip without parsing the floats.
                continue
            coeffs = np.asarray(values[1:], dtype='float32')
            if coeffs.shape[0] != embedding_dim:
                raise ValueError(
                    f"{glove_file_path}, line {line_number}: expected "
                    f"{embedding_dim} values for token {values[0]!r}, "
                    f"found {coeffs.shape[0]}"
                )
            embedding_matrix[row] = coeffs

    return torch.from_numpy(embedding_matrix)

glove_file_path = 'glove.6B.100d.txt'  # relative path: resolved against the notebook's working directory
# NOTE(review): must equal the vector length stored in the file above
# (presumably 100, going by the "100d" in its name) - confirm if the file changes.
embedding_dim = 100
embedding_matrix = load_glove_embeddings(glove_file_path, vocab, embedding_dim)


In [6]:
import torch.nn as nn
import torch.optim as optim

class VanillaRNN(nn.Module):
    """Embedding -> nn.RNN -> linear head over the RNN's final hidden state.

    Args:
        vocab_size: Number of rows in the embedding table; only used when
            ``embedding_matrix`` is not given.
        embedding_dim: Size of each embedding vector and the RNN input size.
        hidden_dim: Size of the RNN hidden state.
        output_dim: Number of outputs produced by the linear head.
        embedding_matrix: Optional pretrained weights. When given, the
            embedding layer is created with ``nn.Embedding.from_pretrained``
            using its default arguments (which keep the weights frozen).
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, embedding_matrix=None):
        super().__init__()
        use_pretrained = embedding_matrix is not None
        self.embedding = (
            nn.Embedding.from_pretrained(embedding_matrix)
            if use_pretrained
            else nn.Embedding(vocab_size, embedding_dim)
        )
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return the linear head's output for a batch of token ids.

        Args:
            x: Integer token ids; with ``batch_first=True`` the RNN expects
                the batch dimension first.

        Returns:
            ``self.fc`` applied to the final hidden state with its leading
            dimension squeezed out (size 1 for the single-layer RNN built here).
        """
        _, final_hidden = self.rnn(self.embedding(x))
        return self.fc(final_hidden.squeeze(0))

# Hyperparameters shared by every model built in this notebook.
hidden_dim = 128
# One output per review: a single logit (the training cell pairs it with BCEWithLogitsLoss).
output_dim = 1

# Vanilla RNN whose embedding layer is initialised from the GloVe matrix.
model_rnn_glove = VanillaRNN(vocab_size=len(vocab), embedding_dim=embedding_dim, hidden_dim=hidden_dim, output_dim=output_dim, embedding_matrix=embedding_matrix)


In [7]:
class LSTM(nn.Module):
    """Embedding -> nn.LSTM -> linear head over the LSTM's final hidden state.

    Args:
        vocab_size: Number of rows in the embedding table; only used when
            ``embedding_matrix`` is not given.
        embedding_dim: Size of each embedding vector and the LSTM input size.
        hidden_dim: Size of the LSTM hidden state.
        output_dim: Number of outputs produced by the linear head.
        embedding_matrix: Optional pretrained weights. When given, the
            embedding layer is created with ``nn.Embedding.from_pretrained``
            using its default arguments (which keep the weights frozen).
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, embedding_matrix=None):
        super().__init__()
        use_pretrained = embedding_matrix is not None
        self.embedding = (
            nn.Embedding.from_pretrained(embedding_matrix)
            if use_pretrained
            else nn.Embedding(vocab_size, embedding_dim)
        )
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return the linear head's output for a batch of token ids.

        Args:
            x: Integer token ids; with ``batch_first=True`` the LSTM expects
                the batch dimension first.

        Returns:
            ``self.fc`` applied to the final hidden state (the cell state is
            discarded) with its leading dimension squeezed out (size 1 for the
            single-layer LSTM built here).
        """
        _, (final_hidden, _cell) = self.lstm(self.embedding(x))
        return self.fc(final_hidden.squeeze(0))

# LSTM whose embedding layer is initialised from the GloVe matrix.
model_lstm_glove = LSTM(vocab_size=len(vocab), embedding_dim=embedding_dim, hidden_dim=hidden_dim, output_dim=output_dim, embedding_matrix=embedding_matrix)


In [9]:
# Convert the labels to float32 tensors (the training loss is fed float targets).
# np.asarray accepts both the pandas objects produced by train_test_split and
# the tensors left behind by a previous run of this cell, so the cell can be
# re-executed safely; `.values` on an existing tensor is a method, which made
# a second run fail. `np` is imported in the GloVe cell above.
y_train = torch.tensor(np.asarray(y_train), dtype=torch.float32)
y_test = torch.tensor(np.asarray(y_test), dtype=torch.float32)

In [11]:
from torch.utils.data import DataLoader, TensorDataset

# Pair the padded token-id tensors with their label tensors.
train_data = TensorDataset(X_train, y_train)
test_data = TensorDataset(X_test, y_test)

# Define batch size
batch_size = 64

# Only the training batches are shuffled.
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
# NOTE(review): test_loader is not consumed by evaluate_model in the visible
# cells, which takes the full X_test / y_test tensors instead.
test_loader = DataLoader(test_data, batch_size=batch_size)

def train_model_with_loader(model, train_loader, criterion, optimizer, epochs=5):
    """Train ``model`` in place and print the mean batch loss after each epoch.

    Args:
        model: Module whose output has a size-1 dimension at index 1, which is
            squeezed away before the loss is computed.
        train_loader: Iterable of ``(inputs, targets)`` batches that also
            supports ``len()``.
        criterion: Loss callable invoked as ``criterion(predictions, targets)``
            with the targets cast to float.
        optimizer: Optimizer that updates ``model``'s parameters.
        epochs: Number of full passes over ``train_loader``.

    Returns:
        None. The model is left in training mode.
    """

    def _optimise_batch(inputs, targets):
        """Run one optimisation step and return the batch loss as a float."""
        optimizer.zero_grad()
        logits = model(inputs).squeeze(1)
        batch_loss = criterion(logits, targets.float())
        batch_loss.backward()
        optimizer.step()
        return batch_loss.item()

    model.train()
    for epoch in range(epochs):
        epoch_loss = sum(_optimise_batch(inputs, targets) for inputs, targets in train_loader)
        print(f'Epoch {epoch+1}, Loss: {epoch_loss / len(train_loader)}')

def evaluate_model(model, X_test, y_test):
    """Print the accuracy of ``model`` on ``X_test`` / ``y_test``.

    The whole test set is pushed through the model in one forward pass with
    gradients disabled. Outputs are treated as logits: they go through a
    sigmoid and are rounded to 0/1 before being compared with the labels.

    Args:
        model: Module whose output has a size-1 dimension at index 1.
        X_test: Inputs accepted by ``model``.
        y_test: Tensor of 0/1 labels, one per row of ``X_test``.

    Returns:
        None. The model is left in evaluation mode.
    """
    model.eval()
    with torch.no_grad():
        logits = model(X_test).squeeze(1)
        probabilities = torch.sigmoid(logits)
        predicted_labels = torch.round(probabilities)
        is_correct = (predicted_labels == y_test).float()
        accuracy = is_correct.mean()
        print(f'Accuracy: {accuracy.item()}')


# The models emit raw logits, so the loss applies the sigmoid itself.
criterion = nn.BCEWithLogitsLoss()
# One Adam optimizer (library-default settings) per model.
optimizer_rnn = optim.Adam(model_rnn_glove.parameters())
optimizer_lstm = optim.Adam(model_lstm_glove.parameters())


# GloVe-initialised vanilla RNN: train, then report test accuracy.
train_model_with_loader(model_rnn_glove, train_loader, criterion, optimizer_rnn)
evaluate_model(model_rnn_glove, X_test, y_test)

# GloVe-initialised LSTM: same procedure and the same train_loader.
train_model_with_loader(model_lstm_glove, train_loader, criterion, optimizer_lstm)
evaluate_model(model_lstm_glove, X_test, y_test)


Epoch 1, Loss: 0.6912204095840454
Epoch 2, Loss: 0.6920831412315369
Epoch 3, Loss: 0.6879197552680969
Epoch 4, Loss: 0.6880878195762634
Epoch 5, Loss: 0.6868881858825684
Accuracy: 0.5249000191688538
Epoch 1, Loss: 0.6874509769439697
Epoch 2, Loss: 0.5404983124256134
Epoch 3, Loss: 0.4369895175457001
Epoch 4, Loss: 0.41177299323081973
Epoch 5, Loss: 0.3931473733663559
Accuracy: 0.8169000148773193


In [14]:
from torch.utils.data import DataLoader, TensorDataset

# The datasets, batch size and loaders built in the first training cell are
# reused as-is; rebuilding identical copies here was copy-paste duplication.
# This cell already depends on that cell for train_model_with_loader,
# evaluate_model and criterion.

# Vanilla RNN with an embedding layer learned from scratch (no embedding_matrix).
model_rnn_learned = VanillaRNN(vocab_size=len(vocab), embedding_dim=embedding_dim, hidden_dim=hidden_dim, output_dim=output_dim)

# Optimizer
optimizer_rnn_learned = optim.Adam(model_rnn_learned.parameters())

train_model_with_loader(model_rnn_learned, train_loader, criterion, optimizer_rnn_learned)

evaluate_model(model_rnn_learned, X_test, y_test)


Epoch 1, Loss: 0.6945618580818176
Epoch 2, Loss: 0.6867848712921143
Epoch 3, Loss: 0.6694758591651917
Epoch 4, Loss: 0.656980727481842
Epoch 5, Loss: 0.6226174460887909
Accuracy: 0.5525000095367432


In [17]:
# LSTM with an embedding layer learned from scratch (no embedding_matrix).
model_lstm_learned = LSTM(vocab_size=len(vocab), embedding_dim=embedding_dim, hidden_dim=hidden_dim, output_dim=output_dim)

# Optimizer
optimizer_lstm_learned = optim.Adam(model_lstm_learned.parameters())

# Reuses train_loader and criterion defined in the earlier training cells.
train_model_with_loader(model_lstm_learned, train_loader, criterion, optimizer_lstm_learned)
evaluate_model(model_lstm_learned, X_test, y_test)


Epoch 1, Loss: 0.6868738242149353
Epoch 2, Loss: 0.6636881715774536
Epoch 3, Loss: 0.49238369221687317
Epoch 4, Loss: 0.33487696602344513
Epoch 5, Loss: 0.2565517388224602
Accuracy: 0.8098999857902527
