In [1]:
# !pip install pandas
# !pip install numpy

import pandas as pd
import numpy as np

## Ensure reproducibility
Use a fixed seed such that all steps and results can be reproduced.

In [2]:
# seed handpicked to ensure all of the cleaning/pre-processing steps were visually shown
SEED = 544
np.random.seed(SEED)

# 4. Feedforward NN
In this section, we will train a Multilayer Perceptron for sentiment analysis classification for both the binary and ternary cases. Per the homework requirements, our network will consists of two hidden layers, each with 50 and 10 nodes, respectively. We will use cross entropy loss and ADAM for optimizing.

## Common Functionality
Below are functions and classes that group up implementations for code reuse and understanding. It is used for Q4.

In [3]:
import torch
from torch.utils.data import TensorDataset, DataLoader

# Credit: From PyTorch's documentation
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = torch.device('cpu')

# Returns training and testing data loaders
def prepare_data(X_train, X_test, y_train, y_test, batch_size):
    train_data = torch.tensor(X_train, device=device)
    train_label = torch.tensor(y_train.values, dtype=torch.long, device=device)
    train_tensor = TensorDataset(train_data, train_label)
    train_loader = DataLoader(dataset=train_tensor, batch_size=batch_size, shuffle=True)
    
    test_data = torch.tensor(X_test, device=device)
    test_label = torch.tensor(y_test.values, dtype=torch.long, device=device)
    test_tensor = TensorDataset(test_data, test_label)
    test_loader = DataLoader(dataset=test_tensor, batch_size=batch_size, shuffle=True)
    
    return train_loader, test_loader

# A Multilayer Perceptron with 2 hidden layers that uses CrossEntropyLoss and Adam optimizer
class MLP:
    # Creates a model with Cross Entropy Loss and Adam optimizer
    def __init__(self, num_input, num_classes, learning_rate):
        self.model = MLP.create_model(num_input, 50, 10, num_classes)
        self.model.apply(MLP.initialize_weights)
        self.criterion = torch.nn.CrossEntropyLoss()
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr=learning_rate)
        
    # Trains the model given the data_loader with max_epochs
    def train(self, max_epochs, data_loader):
        self.model.train()
        for epoch in range(max_epochs):
            for idx, (X, y) in enumerate(data_loader):
                X = X.to(device=device)
                y = y.to(device=device)
                y_pred = self.model(X)
                loss = self.criterion(y_pred, y)
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()
                
    # Returns the model's accuracy (0-1) given the data_loader
    def evaluate(self, data_loader):
        self.model.eval()
        num_correct = 0
        num_samples = 0
        with torch.no_grad():
            for X, y in data_loader:
                X = X.to(device=device)
                y = y.to(device=device)
                scores = self.model(X)
                _, predictions = scores.max(1)
                num_correct += (predictions == y).sum()
                num_samples += predictions.size(0)
        return float(num_correct) / float(num_samples)

    # Reports the accuracy of the model
    def report_accuracy(self, text, data_loader):
        accuracy = self.evaluate(data_loader)
        print(f'{text}: accuracy is {accuracy:.3f}.')
        print()

    # Creates a 2 hidden layer NN with the specified sizes, with ReLU activations
    @staticmethod
    def create_model(n_input, n_h1, n_h2, n_classes):
        model = torch.nn.Sequential(
            torch.nn.Linear(n_input, n_h1),
            torch.nn.ReLU(),
            torch.nn.Linear(n_h1, n_h2),
            torch.nn.ReLU(),
            torch.nn.Linear(n_h2, n_classes)
        )
        model.to(device)
        return model
    
    # Initializes linear layers to 0 with a bias of 1
    @staticmethod
    def initialize_weights(m):
        if isinstance(m, torch.nn.Linear):
            torch.nn.init.zeros_(m.weight)
            torch.nn.init.ones_(m.bias)

## Load dataset
Load the pandas dataset from Q1.

In [4]:
# Load the data from disk
data = pd.read_pickle('dataset.pkl')

## Load both Word2Vec models
Load the w2v models from Q2.

In [5]:
import gensim.downloader as api
w2v_google = api.load('word2vec-google-news-300')

from gensim.models import KeyedVectors
w2v_own = KeyedVectors.load('my_w2v.w2v')

## 4b. First-10 Input Features in Multi-Layer Perceptron
In this section, we shall build an MLP that takes in the the first-10 vectors as its input feature.

### 4b.1 Create the first-10 input features for the reviews
Input feature is the first 10 Word2Vec vector concatenated for each review. Words with no encoding vectors are ignored. If a review has less than 10 vectors, the rest are filled with 0s.

In [6]:
# Transform the given review body text into the first10 Word2Vec vector using a given trained word2vec model
def create_first10_input_feature(text, wv):
    vectors = []
    # Will skip words that have no vectors
    for word in str(text).split():
        if word in wv:
            vec = np.array(wv[word], dtype=np.float32)
            vectors.append(vec)
            # If we have our first 10 vectors, we can exit the loop
            if len(vectors) == 10:
                break
    # The review does not have enough vectors, so we fill the rest with zeros
    while len(vectors) < 10:
        vectors.append(np.zeros((300,), dtype=np.float32))
    # returns all the vectors, flattened to a single array of 3000 elements
    return np.concatenate(vectors)

# Replace the existing column for the review's input feature for both our w2v and googles w2v
data['own_input_features'] = data['cleaned_reviews'].apply( 
    lambda text: create_first10_input_feature(text, w2v_own)
)

data['google_input_features'] = data['cleaned_reviews'].apply(
    lambda text: create_first10_input_feature(text, w2v_google)
)

### 4b.2 Training and Testing data split (Binary)
Split the data into two distinct parts (80% training, 20% testing).

In [7]:
from sklearn.model_selection import train_test_split

binary_data = data[data['label'] <= 1] # Only select class 0 (positive) and class 1 (negative)
own_input_features = binary_data['own_input_features']
google_input_features = binary_data['google_input_features']
binary_labels = binary_data['label']

# Perform an 80-20 split for training and testing data on the binary data only
X_train_own, X_test_own, y_train_own, y_test_own = train_test_split(
    own_input_features,
    binary_labels,
    test_size=0.2,
    random_state=SEED
)

# Reshape from (num_samples,) to (num_samples, 3000)
X_train_own = np.vstack(X_train_own)
X_test_own = np.vstack(X_test_own)

X_train_google, X_test_google, y_train_google, y_test_google = train_test_split(
    google_input_features,
    binary_labels,
    test_size=0.2,
    random_state=SEED
)

# Reshape from (num_samples,) to (num_samples, 3000)
X_train_google = np.vstack(X_train_google)
X_test_google = np.vstack(X_test_google)

### 4b.3 First-10 Features with My Word2Vec (Binary Case)
Use the first-10 input features from our trained Word2Vec into the binary MLP.

In [8]:
# Binary MLP - First-10 Vectors - Own Trained Word2Vec

# Prepare the data for PyTorch
train_loader, test_loader = prepare_data(X_train_own, X_test_own, y_train_own, y_test_own, batch_size=32)

# Create NN for binary classification
model = MLP(num_input=300*10, num_classes=2, learning_rate=1e-5)

# Train the NN
model.train(max_epochs=50, data_loader=train_loader)

# Evaluate NN
model.report_accuracy('Own W2V - First-10 MLP - Binary', data_loader=test_loader)

Own W2V - First-10 MLP - Binary: accuracy is 0.779.



### 4b.4 First-10 Features with Google Word2Vec (Binary Case)
Use the first-10 input features from Google's pre-trained Word2Vec into the binary MLP.

In [9]:
# Binary MLP - First-10 Vectors - Google News Trained Word2Vec

# Prepare the data for PyTorch
train_loader, test_loader = prepare_data(X_train_google, X_test_google, y_train_google, y_test_google, batch_size=32)

# Create NN for binary classification
model = MLP(num_input=300*10, num_classes=2, learning_rate=1e-5)

# Train the NN
model.train(max_epochs=50, data_loader=train_loader)

# Evaluate NN
model.report_accuracy('Google W2V - First-10 MLP - Binary', data_loader=test_loader)

Google W2V - First-10 MLP - Binary: accuracy is 0.765.



### 4b.5 Training and Testing data split (Ternary)
Split the data into two distinct parts (80% training, 20% testing).

In [10]:
from sklearn.model_selection import train_test_split

own_input_features = data['own_input_features']
google_input_features = data['google_input_features']
ternary_labels = data['label']

# Perform an 80-20 split for training and testing data on the complete dataset
X_train_own, X_test_own, y_train_own, y_test_own = train_test_split(
    own_input_features,
    ternary_labels,
    test_size=0.2,
    random_state=SEED
)

# Reshape from (num_samples,) to (num_samples, 3000)
X_train_own = np.vstack(X_train_own)
X_test_own = np.vstack(X_test_own)

X_train_google, X_test_google, y_train_google, y_test_google = train_test_split(
    google_input_features,
    ternary_labels,
    test_size=0.2,
    random_state=SEED
)

# Reshape from (num_samples,) to (num_samples, 3000)
X_train_google = np.vstack(X_train_google)
X_test_google = np.vstack(X_test_google)

### 4b.6 First-10 Features with My Word2Vec (Ternary Case)
Use the first-10 input features from our trained Word2Vec into the ternary MLP.

In [11]:
# Ternary MLP - First-10 Vectors - Own Trained Word2Vec

# Prepare the data for PyTorch
train_loader, test_loader = prepare_data(X_train_own, X_test_own, y_train_own, y_test_own, batch_size=32)

# Create NN for ternary classification
model = MLP(num_input=300*10, num_classes=3, learning_rate=1e-5)

# Train the NN
model.train(max_epochs=50, data_loader=train_loader)

# Evaluate NN
model.report_accuracy('Own W2V - First-10 MLP - Ternary', data_loader=test_loader)

Own W2V - First-10 MLP - Ternary: accuracy is 0.622.



### 4b.7 First-10 Features with Google Word2Vec (Ternary Case)
Use the first-10 input features from Google's pre-trained Word2Vec into the ternary MLP.

In [12]:
# Ternary MLP - First-10 Vectors - Google News Trained Word2Vec

# Prepare the data for PyTorch
train_loader, test_loader = prepare_data(X_train_google, X_test_google, y_train_google, y_test_google, batch_size=32)

# Create NN for binary classification
model = MLP(num_input=300*10, num_classes=3, learning_rate=1e-5)

# Train the NN
model.train(max_epochs=50, data_loader=train_loader)

# Evaluate NN
model.report_accuracy('Google W2V - First-10 MLP - Ternary', data_loader=test_loader)

Google W2V - First-10 MLP - Ternary: accuracy is 0.611.



### 4b. First-10 Features - Conclusion
Overall, both models, our Word2Vec and Google News Word2Vec, performed similarly. For the binary case, their accuracy was 0.779 and 0.765 respectively. Likewise, for the ternary case it was 0.622 and 0.611 respectively. These values are similar to that of Q4a (using averaged vectors), but slightly lower.

Similar to my justification in Q4a, the reason the ternary case has significantly lower accuracy is due to the way the data is imbalanced. Moreover, the accuracy of First-10 is lower than Averaged input features; this is because when we take only the first 10 vectors, we discard all the rest, thus incomplete data for the model to work with (i.e. by taking only the first 10, we lost part of the data).