In [1]:
# !pip install pandas
# !pip install numpy

import pandas as pd
import numpy as np

## Ensure reproducibility
Use a fixed seed such that all steps and results can be reproduced.

In [2]:
# seed handpicked to ensure all of the cleaning/pre-processing steps were visually shown
SEED = 544
np.random.seed(SEED)

# 4. Feedforward NN
In this section, we will train a Multilayer Perceptron for sentiment analysis classification for both the binary and ternary cases. Per the homework requirements, our network will consists of two hidden layers, each with 50 and 10 nodes, respectively. We will use cross entropy loss and ADAM for optimizing.

## Common Functionality
Below are functions and classes that group up implementations for code reuse and understanding. It is used for Q4.

In [3]:
import torch
from torch.utils.data import TensorDataset, DataLoader

# Credit: From PyTorch's documentation
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = torch.device('cpu')

# Returns training and testing data loaders
def prepare_data(X_train, X_test, y_train, y_test, batch_size):
    train_data = torch.tensor(X_train, device=device)
    train_label = torch.tensor(y_train.values, dtype=torch.long, device=device)
    train_tensor = TensorDataset(train_data, train_label)
    train_loader = DataLoader(dataset=train_tensor, batch_size=batch_size, shuffle=True)
    
    test_data = torch.tensor(X_test, device=device)
    test_label = torch.tensor(y_test.values, dtype=torch.long, device=device)
    test_tensor = TensorDataset(test_data, test_label)
    test_loader = DataLoader(dataset=test_tensor, batch_size=batch_size, shuffle=True)
    
    return train_loader, test_loader

# A Multilayer Perceptron with 2 hidden layers that uses CrossEntropyLoss and Adam optimizer
class MLP:
    # Creates a model with Cross Entropy Loss and Adam optimizer
    def __init__(self, num_input, num_classes, learning_rate):
        self.model = MLP.create_model(num_input, 50, 10, num_classes)
        self.model.apply(MLP.initialize_weights)
        self.criterion = torch.nn.CrossEntropyLoss()
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=learning_rate)
        
    # Trains the model given the data_loader with max_epochs
    def train(self, max_epochs, data_loader):
        self.model.train()
        for epoch in range(max_epochs):
            for idx, (X, y) in enumerate(data_loader):
                X = X.to(device=device)
                y = y.to(device=device)
                y_pred = self.model(X)
                loss = self.criterion(y_pred, y)
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()
                
    # Returns the model's accuracy (0-1) given the data_loader
    def evaluate(self, data_loader):
        self.model.eval()
        num_correct = 0
        num_samples = 0
        with torch.no_grad():
            for X, y in data_loader:
                X = X.to(device=device)
                y = y.to(device=device)
                scores = self.model(X)
                _, predictions = scores.max(1)
                num_correct += (predictions == y).sum()
                num_samples += predictions.size(0)
        return float(num_correct) / float(num_samples)

    # Reports the accuracy of the model
    def report_accuracy(self, text, data_loader):
        accuracy = self.evaluate(data_loader)
        print(f'{text}: accuracy is {accuracy:.3f}.')
        print()

    # Creates a 2 hidden layer NN with the specified sizes, with ReLU activations
    @staticmethod
    def create_model(n_input, n_h1, n_h2, n_classes):
        model = torch.nn.Sequential(
            torch.nn.Linear(n_input, n_h1),
            torch.nn.ReLU(),
            torch.nn.Linear(n_h1, n_h2),
            torch.nn.ReLU(),
            torch.nn.Linear(n_h2, n_classes)
        )
        model.to(device)
        return model
    
    # Initializes linear layers to 0 with a bias of 1
    @staticmethod
    def initialize_weights(m):
        if isinstance(m, torch.nn.Linear):
            torch.nn.init.zeros_(m.weight)
            torch.nn.init.ones_(m.bias)

## Load dataset
Load the pandas dataset from Q1.

In [4]:
# Load the data from disk
data = pd.read_pickle('dataset.pkl')

## Load both Word2Vec models
Load the w2v models from Q2.

In [5]:
import gensim.downloader as api
w2v_google = api.load('word2vec-google-news-300')

from gensim.models import KeyedVectors
w2v_own = KeyedVectors.load('my_w2v.w2v')

## 4a. Averaged Input Features in Multi-Layer Perceptron
In this section, we shall build an MLP that takes in the averaged vectors as its input feature.

### 4a.1 Create the averaged input features for the reviews
Input feature is the average Word2Vec vector for each review. Words with no encoding vectors are ignored.

In [6]:
# Transform the given review body text into the averaged Word2Vec vector using a given trained word2vec model
def create_avg_input_feature(text, wv):
    vec = np.zeros((300,), dtype=np.float32)
    count = 0
    # Will skip words that have no vectors
    for word in str(text).split():
        if word in wv:
            vec += np.array(wv[word], dtype=np.float32)
            count += 1
    if count > 0:
        vec /= count
    return vec

# Create a new column for the review's input feature for both our w2v and googles w2v
data['own_input_features'] = data['cleaned_reviews'].apply(
    lambda text: create_avg_input_feature(text, w2v_own)
)

data['google_input_features'] = data['cleaned_reviews'].apply(
    lambda text: create_avg_input_feature(text, w2v_google)
)

### 4a.2 Training and Testing data split (Binary)
Split the data into two distinct parts (80% training, 20% testing) so that there is no overlap. This is done to ensure no data leakage nor bias influences the training and we can have a better view of the training process (if it overfitted for example).

In [7]:
from sklearn.model_selection import train_test_split

binary_data = data[data['label'] <= 1] # Only select class 0 (positive) and class 1 (negative)
own_input_features = binary_data['own_input_features']
google_input_features = binary_data['google_input_features']
binary_labels = binary_data['label']

# Perform an 80-20 split for training and testing data on the binary data only
X_train_own, X_test_own, y_train_own, y_test_own = train_test_split(
    own_input_features,
    binary_labels,
    test_size=0.2,
    random_state=SEED
)

# Reshape from (num_samples,) to (num_samples, 300)
X_train_own = np.vstack(X_train_own)
X_test_own = np.vstack(X_test_own)

X_train_google, X_test_google, y_train_google, y_test_google = train_test_split(
    google_input_features,
    binary_labels,
    test_size=0.2,
    random_state=SEED
)

# Reshape from (num_samples,) to (num_samples, 300)
X_train_google = np.vstack(X_train_google)
X_test_google = np.vstack(X_test_google)

### 4a.3 Averaged Features with My Word2Vec (Binary Case)
Use the averaged input features from our trained Word2Vec into the binary MLP.

In [8]:
# Binary MLP - Averaged Vectors - Own Trained Word2Vec

# Prepare the data for PyTorch
train_loader, test_loader = prepare_data(X_train_own, X_test_own, y_train_own, y_test_own, batch_size=32)

# Create NN for binary classification
model = MLP(num_input=300, num_classes=2, learning_rate=1e-5)

# Train the NN
model.train(max_epochs=50, data_loader=train_loader)

# Evaluate NN
model.report_accuracy('Own W2V - Averaged MLP - Binary', data_loader=test_loader)

Own W2V - Averaged MLP - Binary: accuracy is 0.839.



### 4a.4 Averaged Features with Google Word2Vec (Binary Case)
Use the averaged input features from Google's pre-trained Word2Vec into the binary MLP.

In [9]:
# Binary MLP - Averaged Vectors - Google News Trained Word2Vec

# Prepare the data for PyTorch
train_loader, test_loader = prepare_data(X_train_google, X_test_google, y_train_google, y_test_google, batch_size=32)

# Create NN for binary classification
model = MLP(num_input=300, num_classes=2, learning_rate=1e-5)

# Train the NN
model.train(max_epochs=50, data_loader=train_loader)

# Evaluate NN
model.report_accuracy('Google W2V - Averaged MLP - Binary', data_loader=test_loader)

Google W2V - Averaged MLP - Binary: accuracy is 0.818.



### 4a.5 Training and Testing data split (Ternary)
Split the data into two distinct parts (80% training, 20% testing).

In [10]:
from sklearn.model_selection import train_test_split

own_input_features = [[col for col in row] for row in data['own_input_features']]
google_input_features = [[col for col in row] for row in data['google_input_features']]
ternary_labels = data['label']

# Perform an 80-20 split for training and testing data on the complete dataset
X_train_own, X_test_own, y_train_own, y_test_own = train_test_split(
    own_input_features,
    ternary_labels,
    test_size=0.2,
    random_state=SEED
)

X_train_google, X_test_google, y_train_google, y_test_google = train_test_split(
    google_input_features,
    ternary_labels,
    test_size=0.2,
    random_state=SEED
)

### 4a.6 Averaged Features with My Word2Vec (Ternary Case)
Use the averaged input features from our trained Word2Vec into the ternary MLP.

In [11]:
# Ternary MLP - Averaged Vectors - Own Trained Word2Vec

# Prepare the data for PyTorch
train_loader, test_loader = prepare_data(X_train_own, X_test_own, y_train_own, y_test_own, batch_size=32)

# Create NN for ternary classification
model = MLP(num_input=300, num_classes=3, learning_rate=1e-5)

# Train the NN
model.train(max_epochs=50, data_loader=train_loader)

# Evaluate NN
model.report_accuracy('Own W2V - Averaged MLP - Ternary', data_loader=test_loader)

Own W2V - Averaged MLP - Ternary: accuracy is 0.668.



### 4a.7 Averaged Features with Google Word2Vec (Ternary Case)
Use the averaged input features from Google's pre-trained Word2Vec into the ternary MLP.

In [12]:
# Ternary MLP - Averaged Vectors - Google News Trained Word2Vec

# Prepare the data for PyTorch
train_loader, test_loader = prepare_data(X_train_google, X_test_google, y_train_google, y_test_google, batch_size=32)

# Create NN for binary classification
model = MLP(num_input=300, num_classes=3, learning_rate=1e-5)

# Train the NN
model.train(max_epochs=50, data_loader=train_loader)

# Evaluate NN
model.report_accuracy('Google W2V - Averaged MLP - Ternary', data_loader=test_loader)

Google W2V - Averaged MLP - Ternary: accuracy is 0.646.



### 4a. Averaged Features - Conclusion
Overall, both models, our Word2Vec and Google News Word2Vec, performed similarly. For the binary case, their accuracy was 0.839 and 0.818 respectively. Likewise, for the ternary case it was 0.668 and 0.646 respectively.

I believe the reason the ternary case has significantly lower accuracy is due to the way the data is imbalanced. The neutral class has half the data that the other classes have (50k vs 100k). Thus, this can skew the training and harm the model's learning.

Moreover, our w2v model has better accuracy than Google's model is because of the nature of the model; our model was trained on dataset specifically for reviews whereas Google used generic news articles.

Additionally, this accuracy is similar to that of earlier models in Q3 where they had an average accuracy of roughly 0.80 aswell. In order of better to worse it was SVM > Averaged MLP > Perceptron.