In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


NOTE: because the original datset was so large, approximately three million instances in the traing set, I will be instead sampling from the smaller test set (650,000 instances). Avoiding loading the training set saves me much needed memory. The sample I take from the testing set will only be 10000 entries, as this is what I have determined to be the best when balancing memory usage. I have however ensured that the distribution of labels within the sampled set is even.

In [5]:
## Reading the dataset

import pandas as pd
column_headers = ['Rating', 'Title', 'Review']

df = pd.read_csv('/content/drive/MyDrive/Datasets/amazon_sentiment_analysis/test.csv', header=None, names=column_headers)

#Data Analysis and Processing


In [6]:
def labelDistribution(df, label_name):
  for l in df[label_name].unique():
    n = df[label_name].value_counts()[l]
    print("{}: {:.2f}%".format(l, n/df.shape[0] * 100))

In [7]:
from sklearn.model_selection import StratifiedShuffleSplit

def sample(df, label_name, feature_name, sample_size):
  #Find the frequency of each rating
  label_freq = df[label_name].value_counts()

  # Calculate the sample size for each class based on the desired sample size
  class_sample_size = {label: min(sample_size, freq) for label, freq in label_freq.items()}

  #this is the proportion of the data that will be taken for the sample
  test_size = 1 - (sample_size / df.shape[0])

  # Perform sampling stratified by label
  sample_indexes = StratifiedShuffleSplit(n_splits=1, test_size = test_size, random_state=17)

  # sampled_data = None
  for train_idx, _ in sample_indexes.split(df[feature_name], df[label_name]):
      sampled_df = df.iloc[train_idx]

  return sampled_df

In [8]:
#sample a subset from the full dataset because the dataset is too large
df = sample(df, 'Rating', 'Review', 10000)

In [16]:
#determine the number of null entries in the dataset
df.isnull().sum()

Rating    0
Title     0
Review    0
dtype: int64

In [9]:
print('Data Distribution:\n')
labelDistribution(df, 'Rating')

Data Distribution:

1: 20.00%
4: 20.00%
2: 20.00%
3: 20.00%
5: 20.00%


##Autoencoder



In [10]:
import gensim

#initlaize the size of the embeddings
vector_size = 75

#isolate the text into its own dataframe
df_copy = df[['Rating','Review']].copy()
print("\rDATA LOADED", end = '')

#tokenize each of the reviews in the dataset
tokenized_data = df_copy.Review.apply(gensim.utils.simple_preprocess)
print("\rDATA TOKENIZED", end = '')

#initialize a word2Vec encoder model
autoencoder = gensim.models.Word2Vec(min_count=1, vector_size=vector_size, window=7, workers=10)
print("\rAUTOENCODER INTIALIZED", end = '')

#train the model on the tokenized data
autoencoder.build_vocab(tokenized_data, progress_per=1000) #constructs vocabulary
print("\rVOCABULARY CONSTRUCTED", end = '')

autoencoder.train(tokenized_data, total_examples=autoencoder.corpus_count, epochs=3) #corpus_count: # sentences, model.epochs: # epochs
print("\rAUTOENCODER TRAINING COMPLETE", end = '')

AUTOENCODER TRAINING COMPLETE

In [8]:
import torch
import numpy as np

#find the maximum length sequence in the dataset as determined by the number of words

def encode(tokenized_data, vector_size, encoder, max_len=-1):
  #find how many instances there are in the dataset
  instances = tokenized_data.shape[0]

  #compute the maximum number of tokens in a given instance
  if(max_len == -1):
    max_len = len(max(tokenized_data, key=len))

  #generate embeddings
  encoded_data = np.zeros((instances, max_len, vector_size))

  #initlaize the padding and embedding outside of the loop to reduce memory consumption
  padding = None
  raw_embedding = None
  for i in range(instances):
    print("\rinstances encoded: {}".format(i+1), end = '')

    #calculate the padding and raw embedding for the sequence
    padding = np.zeros((max_len - len(tokenized_data.iloc[i]), vector_size))
    raw_embedding = np.array([encoder.wv[token] for token in tokenized_data.iloc[i]])

    #add the padding to the raw embedding before incorporating it into enocded_data
    if(len(padding) == max_len):
      encoded_data[i] = padding
    else:
      encoded_data[i] = np.concatenate((raw_embedding, padding), axis=0)

  return encoded_data

In [9]:
#this is for seperating the training and testing datasets
ratio = 0.8
n = int(ratio * len(df))

#this is the maxium sequence size in the entire tokenized dataset
max_len = len(max(tokenized_data, key=len))

X_train, y_train = torch.FloatTensor(encode(tokenized_data[:n], vector_size, autoencoder, max_len)), torch.tensor(df.Rating[:n].values)
X_test, y_test = torch.FloatTensor(encode(tokenized_data[n:], vector_size, autoencoder, max_len)), torch.tensor(df.Rating[n:].values)



instances encoded: 2000

In [10]:
#decrement the ratings by 1 and one hot encode them
y_train = torch.nn.functional.one_hot(y_train-1, 5).float()
y_test = torch.nn.functional.one_hot(y_test-1, 5).float()

#Task 2

## Model Description

NOTE: This model, as you will see, is extremely innaccurate. I have tried fixing this in a number of ways, including modifying all the hyperparameters and double checking the data. That all being said, unless I have missed something, I have come to the conclusion that this innaccuracy may be the result of the model simply not being complex enough for the task at hand. Unfortunately due the limitations of my hardward, I cannot not increase the complexity of my model without also crashing my system, I have tried though.

Because I am only classifying text, in this case reviews, into one of five categories each of which represents a customer rating ranging from one to five starts, I went ahead and used a many to one RNN. In this scenario, I am passing in a sequence of word embeddings, which were generated from the reviews. The output of the RNN is a vector of size five. This final layer has a softmax applied to it before it is returned so that each value in the vector represents a probability corresponding to a rating. At each timestep, a tanh activation is applied to the output of each cell. Because my loss was remaining the same across epochs, I initially tried ReLU. My hope was that the stagnation in loss was a result of a vanishing gradient. However, this did nothing really to solve my problem, so I moved to tanh. Tanh did not really fix the problem but it did demonstrate lower loss in general than ReLU so I stuck with it as my activation function.  


In [22]:
import time

import torch.nn as nn
from torch.autograd import Variable
from torch.utils.data import DataLoader, TensorDataset

from sklearn.model_selection import train_test_split

#because this is sentiment analysis, this will have to be a many in single out RNN
class RNN(nn.Module):
  def __init__(self, dim_in, dim_h, dim_out, layers):
    super(RNN, self).__init__()

    self.input_dim = dim_in #the size of the embedding
    self.hidden_dim = dim_h #the size of the hidden layer
    self.output_dim = dim_out #the number of classifiers

    self.num_layers = layers #the number of layers in the RNN

    #The RNN itself
    self.rnn = nn.RNN(self.input_dim, self.hidden_dim, self.num_layers, batch_first=True, nonlinearity = 'tanh')

    #Intialize the output layer
    self.output_layer = nn.Linear(self.hidden_dim, self.output_dim)

  def forward(self, X):
    #Initlaize the first hidden layer based on the size of the batch
    h0 = torch.rand(self.num_layers, X.size(0), self.hidden_dim)

    #iterate one time step forward
    out, hn = self.rnn(X, h0)
    out = self.output_layer(out)[:, -1, :]

    #apply a softmax to the raw output
    out = torch.nn.functional.softmax(out, dim = 1)

    return out

In [12]:
class LSTM(nn.Module):
  def __init__(self, dim_in, dim_h, dim_out, layers):
    super(LSTM, self).__init__()

    self.input_dim = dim_in #the size of the embedding
    self.hidden_dim = dim_h #the size of the hidden layer
    self.output_dim = dim_out #the number of classifiers

    self.num_layers = layers #the number of layers in the RNN

    #The RNN itself
    self.lstm = nn.LSTM(self.input_dim, self.hidden_dim, self.num_layers, batch_first=True)

    #Intialize the output layer
    self.output_layer = nn.Linear(self.hidden_dim, self.output_dim)

  def forward(self, X):
    #Initlaize the first hidden layer and cell state
    h0 = torch.rand(self.num_layers, X.size(0), self.hidden_dim)
    c0 = torch.rand(self.num_layers, X.size(0), self.hidden_dim)

    #iterate one time step forward
    out, hn = self.lstm(X, (h0, c0))
    out = self.output_layer(out)[:, -1, :]

    #apply a softmax to the raw output
    out = torch.nn.functional.softmax(out, dim = 1)

    return out

In [13]:
class GRU(nn.Module):
  def __init__(self, dim_in, dim_h, dim_out, layers):
    super(GRU, self).__init__()

    self.input_dim = dim_in #the size of the embedding
    self.hidden_dim = dim_h #the size of the hidden layer
    self.output_dim = dim_out #the number of classifiers

    self.num_layers = layers #the number of layers in the RNN

    #The RNN itself
    self.gru = nn.GRU(self.input_dim, self.hidden_dim, self.num_layers, batch_first=True)

    #Intialize the output layer
    self.output_layer = nn.Linear(self.hidden_dim, self.output_dim)

  def forward(self, X):
    #Initlaize the first hidden layer and cell state
    h0 = torch.rand(self.num_layers, X.size(0), self.hidden_dim)

    #iterate one time step forward
    out, hn = self.gru(X, h0)

    out = self.output_layer(out)[:, -1, :]

    #apply a softmax to the raw output
    out = torch.nn.functional.softmax(out, dim = 1)

    return out

In [14]:
def train(model, X_train, y_train, num_epochs, batch_size, learning_rate):
  start_time = time.time()

  #initalize the loss function and the optimizer
  criterion = nn.CrossEntropyLoss()
  optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

  #transform the training data into tensor
  training_dataset = TensorDataset(X_train, y_train)

  for epoch_idx in range(num_epochs):
    print(f"EPOCH: {epoch_idx + 1}")

    epoch = DataLoader(training_dataset, batch_size=batch_size, shuffle=True, num_workers=2)
    total_loss = 0

    for batch_idx, (X_minibatch, y_minibatch) in enumerate(epoch):

      #zero the gradient and
      optimizer.zero_grad()  # Clear gradients from the previous iteration
      y_preds = model(X_minibatch)

      #compute the loss
      loss = criterion(y_preds, y_minibatch)
      total_loss += loss

      #preform backwards propogation and update the weights
      loss.backward()
      optimizer.step()

      #print the batch loss
      print("\tminibatch {:<3d} LOSS: {:.6f}   [{:5d}/{:5d}]".format(batch_idx, loss, (batch_idx + 1) * batch_size, X_train.size(0)))

    #print the average loss for the epoch
    print("\n\tAVG. LOSS: {:.6f}".format(total_loss/(X_train.size(0) // batch_size)))


  end_time = time.time()
  print("\nTIME: {:.2f} sec".format(end_time-start_time))

In [23]:
#initialize the models
model_RNN = RNN(vector_size, 125, 5, 1)
model_LSTM = LSTM(vector_size, 125, 5, 1)
model_GRU = GRU(vector_size, 125, 5, 1)

In [24]:
#train the basic RNN
train(model_RNN, X_train, y_train, 4, 1000, 0.01)

EPOCH: 1
	minibatch 0   LOSS: 1.609866   [ 1000/ 8000]
	minibatch 1   LOSS: 1.610573   [ 2000/ 8000]
	minibatch 2   LOSS: 1.609855   [ 3000/ 8000]
	minibatch 3   LOSS: 1.613003   [ 4000/ 8000]
	minibatch 4   LOSS: 1.613163   [ 5000/ 8000]
	minibatch 5   LOSS: 1.629937   [ 6000/ 8000]
	minibatch 6   LOSS: 1.613784   [ 7000/ 8000]
	minibatch 7   LOSS: 1.610171   [ 8000/ 8000]

	AVG. LOSS: 1.613794
EPOCH: 2
	minibatch 0   LOSS: 1.616458   [ 1000/ 8000]
	minibatch 1   LOSS: 1.618359   [ 2000/ 8000]
	minibatch 2   LOSS: 1.613925   [ 3000/ 8000]
	minibatch 3   LOSS: 1.617797   [ 4000/ 8000]
	minibatch 4   LOSS: 1.611846   [ 5000/ 8000]
	minibatch 5   LOSS: 1.617542   [ 6000/ 8000]
	minibatch 6   LOSS: 1.613588   [ 7000/ 8000]
	minibatch 7   LOSS: 1.618344   [ 8000/ 8000]

	AVG. LOSS: 1.615983
EPOCH: 3
	minibatch 0   LOSS: 1.620070   [ 1000/ 8000]
	minibatch 1   LOSS: 1.615829   [ 2000/ 8000]
	minibatch 2   LOSS: 1.616445   [ 3000/ 8000]
	minibatch 3   LOSS: 1.610554   [ 4000/ 8000]
	minibatc

In [25]:
#train the LMST model
train(model_LSTM, X_train, y_train, 4, 1000, 0.01)

EPOCH: 1
	minibatch 0   LOSS: 1.609538   [ 1000/ 8000]
	minibatch 1   LOSS: 1.610042   [ 2000/ 8000]
	minibatch 2   LOSS: 1.609119   [ 3000/ 8000]
	minibatch 3   LOSS: 1.611339   [ 4000/ 8000]
	minibatch 4   LOSS: 1.610391   [ 5000/ 8000]
	minibatch 5   LOSS: 1.609948   [ 6000/ 8000]
	minibatch 6   LOSS: 1.609818   [ 7000/ 8000]
	minibatch 7   LOSS: 1.609499   [ 8000/ 8000]

	AVG. LOSS: 1.609962
EPOCH: 2
	minibatch 0   LOSS: 1.609586   [ 1000/ 8000]
	minibatch 1   LOSS: 1.609543   [ 2000/ 8000]
	minibatch 2   LOSS: 1.609465   [ 3000/ 8000]
	minibatch 3   LOSS: 1.609315   [ 4000/ 8000]
	minibatch 4   LOSS: 1.609344   [ 5000/ 8000]
	minibatch 5   LOSS: 1.609257   [ 6000/ 8000]
	minibatch 6   LOSS: 1.610003   [ 7000/ 8000]
	minibatch 7   LOSS: 1.609361   [ 8000/ 8000]

	AVG. LOSS: 1.609484
EPOCH: 3
	minibatch 0   LOSS: 1.609243   [ 1000/ 8000]
	minibatch 1   LOSS: 1.609559   [ 2000/ 8000]
	minibatch 2   LOSS: 1.609758   [ 3000/ 8000]
	minibatch 3   LOSS: 1.609546   [ 4000/ 8000]
	minibatc

In [26]:
#train the GRU model
train(model_GRU, X_train, y_train, 4, 1000, 0.01)

EPOCH: 1
	minibatch 0   LOSS: 1.609718   [ 1000/ 8000]
	minibatch 1   LOSS: 1.609511   [ 2000/ 8000]
	minibatch 2   LOSS: 1.610316   [ 3000/ 8000]
	minibatch 3   LOSS: 1.610785   [ 4000/ 8000]
	minibatch 4   LOSS: 1.610297   [ 5000/ 8000]
	minibatch 5   LOSS: 1.609922   [ 6000/ 8000]
	minibatch 6   LOSS: 1.609207   [ 7000/ 8000]
	minibatch 7   LOSS: 1.609796   [ 8000/ 8000]

	AVG. LOSS: 1.609944
EPOCH: 2
	minibatch 0   LOSS: 1.609279   [ 1000/ 8000]
	minibatch 1   LOSS: 1.610250   [ 2000/ 8000]
	minibatch 2   LOSS: 1.609377   [ 3000/ 8000]
	minibatch 3   LOSS: 1.609338   [ 4000/ 8000]
	minibatch 4   LOSS: 1.609563   [ 5000/ 8000]
	minibatch 5   LOSS: 1.609644   [ 6000/ 8000]
	minibatch 6   LOSS: 1.609384   [ 7000/ 8000]
	minibatch 7   LOSS: 1.610492   [ 8000/ 8000]

	AVG. LOSS: 1.609666
EPOCH: 3
	minibatch 0   LOSS: 1.608796   [ 1000/ 8000]
	minibatch 1   LOSS: 1.608806   [ 2000/ 8000]
	minibatch 2   LOSS: 1.609390   [ 3000/ 8000]
	minibatch 3   LOSS: 1.610155   [ 4000/ 8000]
	minibatc

## Metrics
As for my metrics, I am using three. My first metric is just regular accuracy which serves as the baseline to communicate how my model is doing overall. Beyond that, I am also comparing the training time for each model. This is mainly to keep track of the difference between the three models I am contracting in terms of their complexity. Finally, I would like to say that it was my original intent to use the area under the ROC curve as another metric, but unfortunately this ended up requiring an extraordinary amount of memory, so this metric was scrapped at the last minute. My hope was to determine my model’s confidence in its choices using this metric. Though the actual processing has been left out of the notebook, the function for computing the ROC area is still there.


In [2]:
from sklearn.metrics import roc_auc_score

def accuracy(model, X_test, y_test):
  #generate predictions and convert them to labels
  y_preds = model(X_test) #probabilites
  y_preds = torch.argmax(y_preds, dim=1)

  #compute the accuracy
  return 100 * torch.eq(torch.argmax(y_test, dim=1), y_preds).sum()/y_test.size(0)

def roc(model, X_test, y_test):
  #generate predictions and convert them to labels
  y_preds = model(X_test) #probabilites
  _, y_preds = torch.max(y_preds, 1) #find the highest probabilities
  y_preds = y_preds.detach().numpy()

  print(y_preds)

  #compute the accuracy
  return roc_auc_score(y_test, y_preds)

In [29]:
#find the accuracy for each model for the train set
accuracy_RNN_train = accuracy(model_RNN, X_train, y_train)
accuracy_LSTM_train = accuracy(model_LSTM, X_train, y_train)
accuracy_GRU_train = accuracy(model_GRU, X_train, y_train)

#display the results
print("TRAIN:\nBASIC RNN ACCURACY: {:.2f}%\nLSTM RNN ACCURACY: {:.2f}%\nGRU RNN ACCURACY: {:.2f}%".format(accuracy_RNN_train,
                                                                                                              accuracy_LSTM_train,
                                                                                                              accuracy_GRU_train))
#find the accuracy for each model for the test set
accuracy_RNN_test = accuracy(model_RNN, X_test, y_test)
accuracy_LSTM_test = accuracy(model_LSTM, X_test, y_test)
accuracy_GRU_test = accuracy(model_GRU, X_test, y_test)

#display the results
print("\nVALIDATION:\nBASIC RNN ACCURACY: {:.2f}%\nLSTM RNN ACCURACY: {:.2f}%\nGRU RNN ACCURACY: {:.2f}%".format(accuracy_RNN_test,
                                                                                                              accuracy_LSTM_test,
                                                                                                              accuracy_GRU_test))

TRAIN:
BASIC RNN ACCURACY: 20.05%
LSTM RNN ACCURACY: 20.33%
GRU RNN ACCURACY: 20.30%

VALIDATION:
BASIC RNN ACCURACY: 19.80%
LSTM RNN ACCURACY: 18.90%
GRU RNN ACCURACY: 18.40%


## Observations about LSTM and GRU
After running the LSTM and GRU RNNs the main thing that noticed was difference in terms of how long it took for each to train. While the basic RNN and GRU took 118 and 220 seconds respectively, the LSTM took much longer, finishing at roughly 1100 seconds or 18 minutes. In terms of accuracy, they all preformed roughly the same, which in this case is poorly. As stated before however, this is more likely a result of the limited complexity of the models when compared to the problem, and the limitations of my hardware. As for why it took the LSTM and GRU RNNs longer, both processes complicate the standard RNN by incorporating gates. The difference is that GRU incorporates significantly few operations than LSTM which would probably explain why it took so much longer.


## Could you use a feed-forward network?
For this problem you almost certainly could not you a standard feed-forward network.  This is because the order of the words in the text is a feature of the data that your model needs to capture. Depending on how the words are arranged, the resulting rating could be completely different. This kind of sequential information cannot be capture by a traditional feed-forward network, and this is why a sequential model like an RNN is necessary.



#Task 3

## Similarity/Dissimilarity
Ok I just used the standard cosine similarity and dissimilarity score equation. That is defined by

$$  \frac{A \cdot B}{\left\lVert A \right\rVert \left\lVert B \right\rVert } $$

and

$$  1 - \frac{A \cdot B}{\left\lVert A \right\rVert \left\lVert B \right\rVert } $$

This is not something that I got from a paper so much as it is something that I just know from this class and prior classes.


In [11]:
#computes the cosine similarity and dissimilarity and returns them
def similiarity(encoder, word1, word2):
  #generate the emebeddings for each word and store them as tensors
  embedding1 = encoder.wv[word1.lower()]
  embedding2 = encoder.wv[word2.lower()]

  #compute the dot product
  numerator = embedding1 @ embedding2.T

  #compute the products of the euclidian norms of each embedding
  denominator = np.linalg.norm(embedding1) * np.linalg.norm(embedding2)

  similarity = numerator/denominator

  return similarity, 1-similarity

In [13]:
import numpy as np

word1 = input("ENTER WORD 1: ")
word2 = input("ENTER WORD 2: ")

similarity, dissimilarity = similiarity(autoencoder, word1, word2)

print("\nCOSINE SIMILARITY: {:.6f}\nCOSINE DISSIMILARITY: {:.6f}".format(similarity, dissimilarity))

ENTER WORD 1: plastic
ENTER WORD 2: good

COSINE SIMILARITY: 0.759757
COSINE DISSIMILARITY: 0.240243
