# **Recurrent Neural Network (RNN) Model**

#### Text Preprocessing

In [2]:
import os
import re
import time
import nltk

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from concurrent.futures import ThreadPoolExecutor, as_completed

# Initialize stopwords and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Function to clean and lemmatize a review
def clean_review(text):
    # start_time = time.time()
    # Convert to lowercase
    text = text.lower()
    # Remove digits and punctuation
    text = re.sub(r'[^a-z\s]', '', text)
    # Tokenize the review
    words = word_tokenize(text)
    # Remove stopwords and apply lemmatization
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    # print(f"Time taken for cleaning a review: {time.time() - start_time:.2f} seconds")
    return words

# Function to read and clean data for one review
def process_file(file_path, label):
    with open(file_path, 'r', encoding='utf-8') as file:
        review_text = file.read()
        # Clean and lemmatize the review
        cleaned_review = clean_review(review_text)
    return cleaned_review, label

# Function to read and clean data in parallel
def load_data(data_dir):
    reviews = []
    labels = []
    tasks = []
    
    with ThreadPoolExecutor() as executor:
        for label_type in ['pos', 'neg']:
            dir_name = os.path.join(data_dir, label_type)
            label = 1 if label_type == 'pos' else 0
            for file_name in os.listdir(dir_name):
                file_path = os.path.join(dir_name, file_name)
                # Submit tasks to process files in parallel
                tasks.append(executor.submit(process_file, file_path, label))
        
        # Collect results as tasks complete
        for task in as_completed(tasks):
            review, label = task.result()
            reviews.append(review)
            labels.append(label)
    
    return reviews, labels

# Load train and test datasets
# start_time = time.time()
train_reviews, train_labels = load_data('train')
# print(f"Time taken to load train data: {time.time() - start_time:.2f} seconds")

# start_time = time.time()
test_reviews, test_labels = load_data('test')
# print(f"Time taken to load test data: {time.time() - start_time:.2f} seconds")

# View an example of a cleaned review
print(train_reviews[0])

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yingy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\yingy\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\yingy\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


['delightful', 'movie', 'overthetop', 'wife', 'daughter', 'found', 'irresistible', 'plot', 'crazy', 'ring', 'true', 'world', 'soap', 'opera', 'outrageous', 'improbability', 'impossibility', 'br', 'br', 'particularly', 'enjoyed', 'kevin', 'kline', 'sally', 'field', 'performance', 'dont', 'anyone', 'better', 'kline', 'playing', 'thickheaded', 'field', 'character', 'truly', 'desperate', 'need', 'attention', 'affirmation', 'almost', 'bipolar', 'swing', 'mood', 'played', 'nicely', 'background', 'field', 'famous', 'infamous', 'like', 'oscar', 'exclamation', 'people', 'take', 'large', 'grain', 'salt', 'rare', 'world', 'br', 'br', 'think', 'movie', 'didnt', 'find', 'impatient', 'whoopi', 'goldberg', 'characterization', 'thought', 'spot', 'every', 'note', 'struck', 'robert', 'downey', 'jr', 'teri', 'hatcher', 'cathy', 'moriarty', 'elizabeth', 'shue', 'also', 'firstrate', 'well', 'great', 'movie', 'youre', 'mood', 'go', 'along', 'ride', 'laugh']


#### Vocabulary Extraction

In [3]:
from collections import Counter

def build_vocab(reviews, max_vocab_size=10000):
    # Flatten the list of reviews into a single list of words
    all_words = [word for review in reviews for word in review]
    # Count word frequencies
    word_counts = Counter(all_words)
    # Get the most common words up to max_vocab_size
    most_common_words = word_counts.most_common(max_vocab_size)
    # Create a vocabulary dictionary, starting with special tokens
    vocab = {'<PAD>': 0, '<UNK>': 1}  # Padding token and unknown token
    for idx, (word, _) in enumerate(most_common_words, start=2):
        vocab[word] = idx
    return vocab

# Build vocabulary from train reviews
vocab = build_vocab(train_reviews)

# View the size of the vocabulary and an example of the vocab
print(f"Vocabulary size: {len(vocab)}")
print(f"First 10 words in the vocabulary: {list(vocab.items())[:10]}")

Vocabulary size: 10002
First 10 words in the vocabulary: [('<PAD>', 0), ('<UNK>', 1), ('br', 2), ('movie', 3), ('film', 4), ('one', 5), ('like', 6), ('time', 7), ('good', 8), ('character', 9)]


#### Model Training

In [4]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim

# Function to encode reviews as sequences of indices
def encode_review(review, vocab, max_len):
    encoded = [vocab.get(word, vocab['<UNK>']) for word in review]
    # Pad or truncate the encoded review
    if len(encoded) > max_len:
        return encoded[:max_len]
    else:
        return encoded + [vocab['<PAD>']] * (max_len - len(encoded))

# Encode and pad all reviews
max_len = 200  # Maximum sequence length
train_encoded = [encode_review(review, vocab, max_len) for review in train_reviews]
test_encoded = [encode_review(review, vocab, max_len) for review in test_reviews]

# Convert lists to PyTorch tensors
train_encoded = torch.tensor(train_encoded)
train_labels = torch.tensor(train_labels)
test_encoded = torch.tensor(test_encoded)
test_labels = torch.tensor(test_labels)

# Create a custom dataset class
class ReviewDataset(Dataset):
    def __init__(self, reviews, labels):
        self.reviews = reviews
        self.labels = labels

    def __len__(self):
        return len(self.reviews)

    def __getitem__(self, idx):
        return self.reviews[idx], self.labels[idx]

# Create DataLoader for batching
train_dataset = ReviewDataset(train_encoded, train_labels)
test_dataset = ReviewDataset(test_encoded, test_labels)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

In [5]:
# RNN model
class SentimentRNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(SentimentRNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=vocab['<PAD>'])
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, x):
        # Embedding layer
        embedded = self.embedding(x)
        # RNN layer
        output, hidden = self.rnn(embedded)
        # Fully connected layer (we use the hidden state of the last RNN cell)
        return self.fc(hidden.squeeze(0))

# Hyperparameters
vocab_size = len(vocab)
embedding_dim = 100
hidden_dim = 128
output_dim = 1  # Binary classification (positive or negative)

# Instantiate the model
model = SentimentRNN(vocab_size, embedding_dim, hidden_dim, output_dim)

# Loss function and optimizer
criterion = nn.BCEWithLogitsLoss()  # Binary classification
optimizer = optim.Adam(model.parameters(), lr=0.001)  # Adam Optimizer

In [6]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import matplotlib.pyplot as plt

# Function to train model and print training convergence plot
def training(model):
    num_epochs = 10
    loss_values = []
    model.train()
    
    for epoch in range(num_epochs):
        all_preds = []
        all_labels = []
        running_loss = 0.0
        
        for reviews, labels in train_loader:
            output = model(reviews)
            loss = criterion(output.squeeze(), labels.float())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            running_loss += loss.item()
            
            # Get the index of the max log-probability
            _, predicted = torch.max(output.data, 1)

            # Append predictions and labels to lists
            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
        
        # Calculate average loss
        average_loss = running_loss / len(train_loader)
        loss_values.append(average_loss)
        
        # Convert the lists to numpy arrays
        all_preds = torch.tensor(all_preds)
        all_labels = torch.tensor(all_labels)
        
        # Calculate Accuracy, Precision, Recall, F1 Score
        accuracy = accuracy_score(all_labels, all_preds)
        precision = precision_score(all_labels, all_preds, average='weighted', zero_division=0)
        recall = recall_score(all_labels, all_preds, average='weighted', zero_division=0)
        f1 = f1_score(all_labels, all_preds, average='weighted', zero_division=0)

        print(f"Epoch [{epoch+1}/{num_epochs}], Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}, Loss: {average_loss:.4f}")
    
    # Plotting the convergence plot
    plt.plot(loss_values, label='Training Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('Training Convergence Plot (Loss Over Epochs)')
    plt.legend()
    plt.grid(True)
    plt.show()

In [None]:
training(model)

In [None]:
# Function to evaluate the model on test data
def evaluating(model, data_loader):
    model.eval()
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for reviews, labels in data_loader:
            output = model(reviews)
            predictions = torch.round(torch.sigmoid(output.squeeze())).cpu().numpy()
            all_preds.extend(predictions)
            all_labels.extend(labels.cpu().numpy())
    
    # Convert the lists to numpy arrays
    all_preds = torch.tensor(all_preds)
    all_labels = torch.tensor(all_labels)

    # Calculate Accuracy, Precision, Recall, F1 Score
    accuracy = accuracy_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds, average='weighted')
    recall = recall_score(all_labels, all_preds, average='weighted')
    f1 = f1_score(all_labels, all_preds, average='weighted')

    print(f"Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")


In [None]:
evaluating(model, test_loader)

### Ablation Studies

In [None]:
# RNN model 2
class SimpleSentimentRNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(SimpleSentimentRNN, self).__init__()
        # Simpler embedding layer with reduced dimensions
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=vocab['<PAD>'])
        # Simpler RNN with fewer hidden units
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        # Fully connected output layer
        self.fc = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, x):
        embedded = self.embedding(x)  # Word embeddings
        output, hidden = self.rnn(embedded)  # RNN layer
        return self.fc(hidden.squeeze(0))

# Instantiate the model with reduced embedding and hidden dimensions
model_2 = SimpleSentimentRNN(vocab_size=1000, embedding_dim=10, hidden_dim=32, output_dim=1)

training(model_2)

In [None]:
evaluating(model_2, test_loader)