In [1]:
# !pip install pandas
# !pip install numpy

import pandas as pd
import numpy as np

## Ensure reproducibility
Use a fixed seed so that all steps and results can be reproduced.

In [2]:
# Seed handpicked to ensure all of the cleaning/pre-processing steps were visually shown
SEED = 544
# Seed NumPy's global random number generator with the fixed value above
np.random.seed(SEED)

# 5. Recurrent NN on my Word2Vec Model
In this section, we will train a Recurrent Neural Network (RNN) for sentiment classification in both the binary and ternary cases.

For part (a), use an RNN cell with a hidden state size of 50. Limit the review length to 50, truncating longer reviews and padding shorter ones with zeros. For part (b), use a gated RNN cell instead.

## Load dataset
Load the pandas dataset from Q1.

In [10]:
# Read the pickled DataFrame from disk
data = pd.read_pickle('dataset.pkl')

# Because this homework takes ages to run, the dataset is reduced by 60%:
# for each class in turn (0, 1, 2), randomly pick 60% of its rows and drop them.
for label in (0, 1, 2):
    rows_to_drop = data[data['label'] == label].sample(frac=.6).index
    data = data.drop(rows_to_drop)

## Load my Word2Vec model
Load the w2v model from Q2.

In [11]:
from gensim.models import KeyedVectors
# Load the word vectors stored in 'my_w2v.w2v' (the model from Q2, per the notebook text)
w2v_own = KeyedVectors.load('my_w2v.w2v')

## Create the vectors for the reviews
The input feature is the first 50 word vectors of the review. Words with no embedding vector are ignored. If a review has fewer than 50 vectors, the rest are filled with zero vectors.

In [12]:
# Transform the given review body text into a fixed-length list of Word2Vec vectors using a given trained word2vec model
def create_first50_input_feature(text, wv, max_len=50, vector_size=None):
    """Return the first `max_len` word vectors of `text`, zero-padded to `max_len`.

    Args:
        text: Review text. It is converted with `str()` and split on whitespace.
        wv: Word-vector lookup supporting `word in wv` and `wv[word]`.
        max_len: Number of vectors in the result (defaults to 50).
        vector_size: Length of each zero padding vector. When None, it is read
            from `wv.vector_size` if that attribute exists, otherwise 300 is used.

    Returns:
        A list of exactly `max_len` float32 numpy arrays: the vectors of the
        first `max_len` words found in `wv`, followed by zero vectors.
    """
    if vector_size is None:
        # Keep the padding width in sync with the model instead of assuming 300
        vector_size = getattr(wv, 'vector_size', 300)

    vectors = []
    for word in str(text).split():
        # Stop as soon as enough vectors were collected (also covers max_len <= 0)
        if len(vectors) >= max_len:
            break
        # Words without a vector are skipped
        if word in wv:
            vectors.append(np.array(wv[word], np.float32))

    # The review does not have enough vectors, so fill the rest with zeros
    missing = max_len - len(vectors)
    vectors.extend(np.zeros((vector_size,), dtype=np.float32) for _ in range(missing))
    return vectors

# Build the 'own_input_features' column: every cleaned review is mapped to its
# first-50 vector list using our own trained Word2Vec model
data['own_input_features'] = data['cleaned_reviews'].apply(
    create_first50_input_feature, args=(w2v_own,)
)

## Common Functionality
Below are functions and classes that group shared implementations for code reuse and readability. They are used throughout Q5.

In [13]:
import torch
from torch.utils.data import TensorDataset, DataLoader

# All tensors and models in this notebook live on the CPU
device = torch.device('cpu')

# Seed PyTorch too: seeding NumPy alone does not make weight initialisation
# or DataLoader shuffling reproducible
torch.manual_seed(SEED)

# Wraps one (features, labels) pair into a batched DataLoader
def _make_loader(features, labels, batch_size, shuffle):
    """Build a DataLoader over `features` and `labels` on the notebook's `device`.

    Features are stored as float32 so they match the default dtype of the
    model parameters; labels are stored as int64 class indices.
    """
    feature_tensor = torch.tensor(features, dtype=torch.float32, device=device)
    # np.asarray accepts a pandas Series as well as plain arrays/lists
    label_tensor = torch.tensor(np.asarray(labels), dtype=torch.long, device=device)
    dataset = TensorDataset(feature_tensor, label_tensor)
    return DataLoader(dataset=dataset, batch_size=batch_size, shuffle=shuffle)


# Returns training and testing data loaders
def prepare_data(X_train, X_test, y_train, y_test, batch_size):
    """Create the training and testing DataLoaders.

    Args:
        X_train, X_test: Array-like numeric features.
        y_train, y_test: Integer class labels (pandas Series or array-like).
        batch_size: Number of samples per batch for both loaders.

    Returns:
        (train_loader, test_loader). Only the training loader is shuffled;
        evaluation does not depend on sample order, so the test loader keeps
        a fixed order.
    """
    train_loader = _make_loader(X_train, y_train, batch_size, shuffle=True)
    test_loader = _make_loader(X_test, y_test, batch_size, shuffle=False)
    return train_loader, test_loader

# Trains the model given the data_loader with max_epochs
def train_model(model, max_epochs, data_loader):
    """Run `max_epochs` passes over `data_loader`, updating `model` in place.

    The model must expose `sequence_length`, `input_size`, `criterion` and
    `optimizer`. The loss of every 100th batch is printed.
    """
    model.train()
    for epoch in range(max_epochs):
        for idx, batch in enumerate(data_loader):
            features, labels = batch
            # Bring the batch into (batch, sequence_length, input_size) form
            features = features.reshape(-1, model.sequence_length, model.input_size)
            features = features.to(device=device)
            labels = labels.to(device=device)

            # Clear old gradients, then forward pass, backward pass and update
            model.optimizer.zero_grad()
            loss = model.criterion(model(features), labels)
            loss.backward()
            model.optimizer.step()

            should_log = (idx+1)%100 == 0
            if should_log:
                print (f'Epoch [{epoch+1}/{max_epochs}], Step [{idx+1}], Loss: {loss.item():.3f}')

# Returns the model's accuracy (0-1) given the data_loader
def evaluate_model(model, data_loader):
    """Compute the fraction of samples in `data_loader` that `model` classifies correctly.

    The model is switched to evaluation mode and gradients are disabled
    while the batches are scored.
    """
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for features, labels in data_loader:
            features = features.reshape(-1, model.sequence_length, model.input_size)
            features = features.to(device=device)
            labels = labels.to(device=device)
            # The predicted class is the index of the highest score per row
            predictions = model(features).max(1).indices
            correct += int((predictions == labels).sum())
            total += predictions.size(0)
    return float(correct) / float(total)

# Reports the accuracy of the model
def report_accuracy(model, text, data_loader):
    """Evaluate `model` on `data_loader` and print the accuracy, labelled with `text`."""
    accuracy = evaluate_model(model=model, data_loader=data_loader)
    print(f'{text}: accuracy is {accuracy:.3f}.')
    # Blank line to separate consecutive reports
    print()

# A Recurrent Neural Network with 1 hidden layer that uses CrossEntropyLoss and Adam optimizer
class RNN(torch.nn.Module):
    """Single-layer RNN followed by a linear classifier.

    The model carries its own loss (`criterion`) and Adam `optimizer`, plus the
    `input_size` and `sequence_length` attributes that the training and
    evaluation helpers use to reshape each batch.
    """

    # Initializes the model with a single RNN layer using CrossEntropyLoss and Adam optimizer
    def __init__(self, input_size, hidden_size, num_classes, learning_rate, sequence_length=50):
        """
        Args:
            input_size: Size of each input vector in the sequence.
            hidden_size: Size of the recurrent hidden state.
            num_classes: Number of output classes.
            learning_rate: Learning rate passed to Adam.
            sequence_length: Number of vectors per sample (defaults to 50).
        """
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.sequence_length = sequence_length
        self.rnn = torch.nn.RNN(input_size, hidden_size, 1, batch_first=True)
        self.linear = torch.nn.Linear(hidden_size, num_classes)
        self.criterion = torch.nn.CrossEntropyLoss()
        self.optimizer = torch.optim.Adam(self.parameters(), lr=learning_rate)

    # Performs a forward pass
    def forward(self, x):
        """Map a (batch, sequence, input_size) tensor to (batch, num_classes) scores."""
        # Initialise the hidden state to zeros on the same device/dtype as the input,
        # so the model does not depend on a global device setting
        h0 = torch.zeros(1, x.size(0), self.hidden_size, device=x.device, dtype=x.dtype)
        out, _ = self.rnn(x, h0)
        # Keep only the output of the last time step and discard the intermediate ones
        last_step = out[:, -1, :]
        return self.linear(last_step)
        
# A Gated Recurrent Neural Network with 1 hidden layer that uses CrossEntropyLoss and Adam optimizer
class GRU(torch.nn.Module):
    """Single-layer GRU followed by a linear classifier.

    The model carries its own loss (`criterion`) and Adam `optimizer`, plus the
    `input_size` and `sequence_length` attributes that the training and
    evaluation helpers use to reshape each batch.
    """

    # Initializes the model with a single GRU layer using CrossEntropyLoss and Adam optimizer
    def __init__(self, input_size, hidden_size, num_classes, learning_rate, sequence_length=50):
        """
        Args:
            input_size: Size of each input vector in the sequence.
            hidden_size: Size of the recurrent hidden state.
            num_classes: Number of output classes.
            learning_rate: Learning rate passed to Adam.
            sequence_length: Number of vectors per sample (defaults to 50).
        """
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.sequence_length = sequence_length
        self.gru = torch.nn.GRU(input_size, hidden_size, 1, batch_first=True)
        self.linear = torch.nn.Linear(hidden_size, num_classes)
        self.criterion = torch.nn.CrossEntropyLoss()
        self.optimizer = torch.optim.Adam(self.parameters(), lr=learning_rate)

    # Performs a forward pass
    def forward(self, x):
        """Map a (batch, sequence, input_size) tensor to (batch, num_classes) scores."""
        # Initialise the hidden state to zeros on the same device/dtype as the input,
        # so the model does not depend on a global device setting
        h0 = torch.zeros(1, x.size(0), self.hidden_size, device=x.device, dtype=x.dtype)
        out, _ = self.gru(x, h0)
        # Keep only the output of the last time step and discard the intermediate ones
        last_step = out[:, -1, :]
        return self.linear(last_step)

## 5. RNN and GRU on My Word2Vec
In this section, we will use an (a)RNN and (b)GRU as the model.

### Training and Testing data split (Binary)
Split the data into two distinct parts (80% training, 20% testing).

In [14]:
from sklearn.model_selection import train_test_split

# Only select class 0 (positive) and class 1 (negative)
binary_data = data[data['label'] <= 1]
own_input_features = binary_data['own_input_features']
binary_labels = binary_data['label']

# Perform an 80-20 split for training and testing data on the binary data only
X_train_own, X_test_own, y_train_own, y_test_own = train_test_split(
    own_input_features,
    binary_labels,
    test_size=0.2,
    random_state=SEED
)

# Each entry is a list of 50 vectors of size 300. Stacking along a new leading
# axis turns (num_samples,) into one contiguous (num_samples, 50, 300) array
# in a single step.
X_train_own = np.stack(list(X_train_own))
X_test_own = np.stack(list(X_test_own))

### 5a.1 RNN with My Word2Vec (Binary Case)
Train an RNN for binary classification using my Word2Vec model.

In [15]:
# Binary RNN - Own Trained Word2Vec

# Wrap the binary train/test split in PyTorch data loaders
train_loader, test_loader = prepare_data(
    X_train=X_train_own,
    X_test=X_test_own,
    y_train=y_train_own,
    y_test=y_test_own,
    batch_size=32,
)

# Plain RNN: 300-dimensional inputs, 50 hidden units, 2 output classes
model = RNN(300, 50, num_classes=2, learning_rate=1e-4)

# Fit the RNN for 5 epochs on the training loader
train_model(model, 5, train_loader)

# Print the RNN's accuracy on the held-out test loader
report_accuracy(model, 'Own W2V - RNN - Binary', test_loader)

Epoch [1/5], Step [100], Loss: 0.693
Epoch [1/5], Step [200], Loss: 0.705
Epoch [1/5], Step [300], Loss: 0.654
Epoch [1/5], Step [400], Loss: 0.687
Epoch [1/5], Step [500], Loss: 0.673
Epoch [1/5], Step [600], Loss: 0.684
Epoch [1/5], Step [700], Loss: 0.676
Epoch [1/5], Step [800], Loss: 0.674
Epoch [1/5], Step [900], Loss: 0.714
Epoch [1/5], Step [1000], Loss: 0.703
Epoch [1/5], Step [1100], Loss: 0.715
Epoch [1/5], Step [1200], Loss: 0.715
Epoch [1/5], Step [1300], Loss: 0.695
Epoch [1/5], Step [1400], Loss: 0.676
Epoch [1/5], Step [1500], Loss: 0.666
Epoch [1/5], Step [1600], Loss: 0.718
Epoch [1/5], Step [1700], Loss: 0.689
Epoch [1/5], Step [1800], Loss: 0.695
Epoch [1/5], Step [1900], Loss: 0.696
Epoch [2/5], Step [100], Loss: 0.569
Epoch [2/5], Step [200], Loss: 0.676
Epoch [2/5], Step [300], Loss: 0.586
Epoch [2/5], Step [400], Loss: 0.656
Epoch [2/5], Step [500], Loss: 0.680
Epoch [2/5], Step [600], Loss: 0.730
Epoch [2/5], Step [700], Loss: 0.764
Epoch [2/5], Step [800], Los

### 5b.1 GRU with My Word2Vec (Binary Case)
Train a GRU for binary classification using my Word2Vec model.

In [16]:
# Binary GRU - Own Trained Word2Vec

# Wrap the binary train/test split in PyTorch data loaders
train_loader, test_loader = prepare_data(
    X_train=X_train_own,
    X_test=X_test_own,
    y_train=y_train_own,
    y_test=y_test_own,
    batch_size=32,
)

# GRU: 300-dimensional inputs, 50 hidden units, 2 output classes
model = GRU(300, 50, num_classes=2, learning_rate=1e-4)

# Fit the GRU for 5 epochs on the training loader
train_model(model, 5, train_loader)

# Print the GRU's accuracy on the held-out test loader
report_accuracy(model, 'Own W2V - GRU - Binary', test_loader)

Epoch [1/5], Step [100], Loss: 0.703
Epoch [1/5], Step [200], Loss: 0.700
Epoch [1/5], Step [300], Loss: 0.719
Epoch [1/5], Step [400], Loss: 0.692
Epoch [1/5], Step [500], Loss: 0.659
Epoch [1/5], Step [600], Loss: 0.681
Epoch [1/5], Step [700], Loss: 0.683
Epoch [1/5], Step [800], Loss: 0.674
Epoch [1/5], Step [900], Loss: 0.677
Epoch [1/5], Step [1000], Loss: 0.678
Epoch [1/5], Step [1100], Loss: 0.653
Epoch [1/5], Step [1200], Loss: 0.627
Epoch [1/5], Step [1300], Loss: 0.674
Epoch [1/5], Step [1400], Loss: 0.662
Epoch [1/5], Step [1500], Loss: 0.678
Epoch [1/5], Step [1600], Loss: 0.674
Epoch [1/5], Step [1700], Loss: 0.680
Epoch [1/5], Step [1800], Loss: 0.656
Epoch [1/5], Step [1900], Loss: 0.666
Epoch [2/5], Step [100], Loss: 0.542
Epoch [2/5], Step [200], Loss: 0.475
Epoch [2/5], Step [300], Loss: 0.811
Epoch [2/5], Step [400], Loss: 0.402
Epoch [2/5], Step [500], Loss: 0.476
Epoch [2/5], Step [600], Loss: 0.511
Epoch [2/5], Step [700], Loss: 0.487
Epoch [2/5], Step [800], Los

### Training and Testing data split (Ternary)
Split the data into two distinct parts (80% training, 20% testing).

In [17]:
# The ternary case uses every class, i.e. the complete (downsampled) dataset
own_input_features = data['own_input_features']
ternary_labels = data['label']

# Perform an 80-20 split for training and testing data on the complete dataset
X_train_own, X_test_own, y_train_own, y_test_own = train_test_split(
    own_input_features,
    ternary_labels,
    test_size=0.2,
    random_state=SEED
)

# Each entry is a list of 50 vectors of size 300. Stacking along a new leading
# axis turns (num_samples,) into one contiguous (num_samples, 50, 300) array
# in a single step.
X_train_own = np.stack(list(X_train_own))
X_test_own = np.stack(list(X_test_own))

### 5a.2 RNN with My Word2Vec (Ternary Case)
Train an RNN for ternary classification using my Word2Vec model.

In [18]:
# Ternary RNN - Own Trained Word2Vec

# Wrap the ternary train/test split in PyTorch data loaders
train_loader, test_loader = prepare_data(
    X_train=X_train_own,
    X_test=X_test_own,
    y_train=y_train_own,
    y_test=y_test_own,
    batch_size=32,
)

# Plain RNN: 300-dimensional inputs, 50 hidden units, 3 output classes
model = RNN(300, 50, num_classes=3, learning_rate=1e-4)

# Fit the RNN for 5 epochs on the training loader
train_model(model, 5, train_loader)

# Print the RNN's accuracy on the held-out test loader
report_accuracy(model, 'Own W2V - RNN - Ternary', test_loader)

Epoch [1/5], Step [100], Loss: 1.047
Epoch [1/5], Step [200], Loss: 0.997
Epoch [1/5], Step [300], Loss: 1.042
Epoch [1/5], Step [400], Loss: 1.106
Epoch [1/5], Step [500], Loss: 1.030
Epoch [1/5], Step [600], Loss: 1.113
Epoch [1/5], Step [700], Loss: 1.019
Epoch [1/5], Step [800], Loss: 1.049
Epoch [1/5], Step [900], Loss: 1.017
Epoch [1/5], Step [1000], Loss: 1.056
Epoch [1/5], Step [1100], Loss: 0.992
Epoch [1/5], Step [1200], Loss: 1.106
Epoch [1/5], Step [1300], Loss: 1.127
Epoch [1/5], Step [1400], Loss: 1.114
Epoch [1/5], Step [1500], Loss: 1.073
Epoch [1/5], Step [1600], Loss: 1.013
Epoch [1/5], Step [1700], Loss: 1.100
Epoch [1/5], Step [1800], Loss: 0.984
Epoch [1/5], Step [1900], Loss: 1.017
Epoch [1/5], Step [2000], Loss: 1.004
Epoch [1/5], Step [2100], Loss: 1.121
Epoch [1/5], Step [2200], Loss: 1.034
Epoch [1/5], Step [2300], Loss: 1.094
Epoch [1/5], Step [2400], Loss: 1.132
Epoch [2/5], Step [100], Loss: 1.048
Epoch [2/5], Step [200], Loss: 1.037
Epoch [2/5], Step [300]

### 5b.2 GRU with My Word2Vec (Ternary Case)
Train a GRU for ternary classification using my Word2Vec model.

In [19]:
# Ternary GRU - Own Trained Word2Vec

# Wrap the ternary train/test split in PyTorch data loaders
train_loader, test_loader = prepare_data(
    X_train=X_train_own,
    X_test=X_test_own,
    y_train=y_train_own,
    y_test=y_test_own,
    batch_size=32,
)

# GRU: 300-dimensional inputs, 50 hidden units, 3 output classes
model = GRU(300, 50, num_classes=3, learning_rate=1e-4)

# Fit the GRU for 5 epochs on the training loader
train_model(model, 5, train_loader)

# Print the GRU's accuracy on the held-out test loader
report_accuracy(model, 'Own W2V - GRU - Ternary', test_loader)

Epoch [1/5], Step [100], Loss: 1.087
Epoch [1/5], Step [200], Loss: 1.091
Epoch [1/5], Step [300], Loss: 1.108
Epoch [1/5], Step [400], Loss: 1.086
Epoch [1/5], Step [500], Loss: 1.016
Epoch [1/5], Step [600], Loss: 1.034
Epoch [1/5], Step [700], Loss: 1.044
Epoch [1/5], Step [800], Loss: 1.005
Epoch [1/5], Step [900], Loss: 1.099
Epoch [1/5], Step [1000], Loss: 1.032
Epoch [1/5], Step [1100], Loss: 1.069
Epoch [1/5], Step [1200], Loss: 1.036
Epoch [1/5], Step [1300], Loss: 1.078
Epoch [1/5], Step [1400], Loss: 1.035
Epoch [1/5], Step [1500], Loss: 1.032
Epoch [1/5], Step [1600], Loss: 1.039
Epoch [1/5], Step [1700], Loss: 1.040
Epoch [1/5], Step [1800], Loss: 1.022
Epoch [1/5], Step [1900], Loss: 0.944
Epoch [1/5], Step [2000], Loss: 1.017
Epoch [1/5], Step [2100], Loss: 1.098
Epoch [1/5], Step [2200], Loss: 1.060
Epoch [1/5], Step [2300], Loss: 1.042
Epoch [1/5], Step [2400], Loss: 1.078
Epoch [2/5], Step [100], Loss: 1.091
Epoch [2/5], Step [200], Loss: 0.977
Epoch [2/5], Step [300]

### Conclusion
At first glance, the accuracies don't seem that great. However, keep in mind that only 40% of the dataset is kept (and then split 80/20 into training and testing) because my PC cannot handle the full dataset. Moreover, the number of epochs was limited to 5 because training was already too time-consuming (hours). Having said that, accuracies in the 60s and 70s are decent, and I strongly believe that, given more RAM and computation power, the RNN and GRU models would reach the 90% accuracy territory, because the loss is slowly decreasing throughout the 5 epochs as shown above. Moreover, the GRU seems to outperform the RNN in our experiment.