In [None]:
import pandas as pd
import numpy as np
import nltk
nltk.download('wordnet')
nltk.download('stopwords')
import re
from bs4 import BeautifulSoup
import gensim.models
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
import torch.optim as optim

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
import warnings
# Silences every warning category for the rest of the session, which includes
# deprecation notices raised by the libraries used below.
warnings.filterwarnings("ignore")

In [None]:
! pip install contractions



## 1. Dataset Generation

In this section, we will load the data, keep the important features and remove the missing values. 

We assume there are 3 different sentiments: 
*   a star rating below 3 denotes negative sentiment (class 2), labeled as 0
*   a star rating above 3 denotes positive sentiment (class 1), labeled as 1
*   a star rating equal to 3 denotes neutral sentiment, labeled as 2

We will split the data after performing data cleaning and pre-processing in Section 3.


In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset and the saved Word2Vec
# model are read from paths under this mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Read only the two columns the notebook uses, which keeps the in-memory frame
# small. Malformed rows are skipped without a warning.
df = pd.read_csv("/content/drive/MyDrive/amazon_reviews_us_Kitchen_v1_00.tsv",
                 sep='\t',
                 usecols=['star_rating', 'review_body'],
                 error_bad_lines=False,
                 warn_bad_lines=False)
# Seed NumPy's global random generator.
np.random.seed(2021)

In [None]:
# Keep reviews and ratings only
df = df.loc[:, ["star_rating", "review_body"]]

# Drop every row with a missing rating or review text
df_withoutmissing = df.dropna(axis=0, how="any")

# Balanced sample: 50k reviews for each star_rating group (250k rows overall),
# drawn with a fixed random_state so the selection is repeatable.
data = (
    df_withoutmissing
    .groupby('star_rating')
    .apply(lambda rating_group: rating_group.sample(n=50000, random_state=100))
    .reset_index(drop=True)
)

In [None]:
# Labelling reviews: 0 = negative (<= 2 stars), 1 = positive (>= 4 stars),
# 2 = neutral (exactly 3 stars).
review_df = data.copy()

# Create the label column explicitly as a float column of NaN instead of
# relying on a single-cell assignment to bring it into existence; this keeps
# the dtype numeric regardless of the pandas version.
review_df['sentiment'] = np.nan

row_idx_0 = review_df.index[review_df['star_rating'] <= 2]
row_idx_1 = review_df.index[review_df['star_rating'] >= 4]
row_idx_2 = review_df.index[review_df['star_rating'] == 3]

review_df.loc[row_idx_0, 'sentiment'] = 0
review_df.loc[row_idx_1, 'sentiment'] = 1
review_df.loc[row_idx_2, 'sentiment'] = 2

## 2. Word Embedding

### (a) Load Pretrained Word2Vec Model

In [None]:
import gensim.downloader as api
# return_path=True makes api.load return the path of the
# "word2vec-google-news-300" file rather than a loaded model.
path = api.load("word2vec-google-news-300", return_path=True)

In [None]:
from gensim.models import KeyedVectors
# Load the vectors at `path` in the binary word2vec format.
pretrained_model = KeyedVectors.load_word2vec_format(path, binary=True)

In [None]:
# `pretrained_model` is already a KeyedVectors object, so `similarity` is
# called on it directly; the extra `.wv` hop is only a deprecated alias in
# gensim 3.x and no longer exists in gensim 4.
# example 1
w1="dirty"
w2="smelly"
print("The similarity between", w1, "and", w2, "on pretrained model is: ", pretrained_model.similarity(w1, w2))

# example 2 
w3="dog"
w4="puppy"
print("The similarity between", w3, "and", w4, "on pretrained model is: ", pretrained_model.similarity(w3, w4))

The similarity between dirty and smelly on pretrained model is:  0.57099473
The similarity between dog and puppy on pretrained model is:  0.81064284


### (b) Train Word2Vec model using own dataset.

In [None]:
# One token list per review, produced by gensim's simple_preprocess.
documents = [gensim.models.utils.simple_preprocess(d) for d in review_df.review_body]
# documents = [row.lower().split() for row in review_df['review_body']]

In [None]:
from gensim.models import Word2Vec

# Train 300-dimensional embeddings on the tokenised reviews with a context
# window of 11; min_count=10 ignores words seen fewer than 10 times.
w2v_model = Word2Vec(documents, size=300, window=11, min_count=10)
#w2v_model.build_vocab(documents)
#w2v_model.train(documents, total_examples=len(documents), epochs=10)

In [None]:
"""run here"""
# Saving is commented out; the model stored on Drive is loaded instead and
# replaces whatever `w2v_model` was bound to before this cell ran.
#path = '/content/drive/MyDrive/DSCI 544/hw2/w2v_model'
#w2v_model.save(path)
w2v_model = gensim.models.Word2Vec.load('/content/drive/MyDrive/DSCI 544/hw2/w2v_model')

In [None]:
# examples 1 and 2: the same word pairs as for the pretrained model
for first_word, second_word in ((w1, w2), (w3, w4)):
    print("The similarity between", first_word, "and", second_word, "on my word2vec model is: ",
          w2v_model.wv.similarity(first_word, second_word))

The similarity between dirty and smelly on my word2vec model is:  0.4769745
The similarity between dog and puppy on my word2vec model is:  0.2914985


From Section 2 part (a), the similarity between **dirty** and **smelly** and the similarity between **dog** and **puppy** are both higher on the pretrained model than on our own Word2Vec model. Hence we can conclude that the pretrained model encodes semantic similarities between words better. 

I think the main reason is that the pretrained model has been trained on a far larger amount of data, so it encodes the contextual/semantic similarities between words more reliably.

## 3. Simple models

In this section, we will perform data cleaning and preprocessing, which is the same as HW1, to `review_df`. 
For simple models, we will only use positive and negative sentiment data. Earlier we assigned positive (class 1) the label 1 and negative (class 2) the label 0. We now have 250k reviews that also include neutral sentiment, named `review_df`, and 200k reviews with only positive and negative sentiment, named `samples`.

After cleaning, we will split the data into 80% training and 20% testing for modeling on 200k data.

### Data cleaning
1. convert to lower case
2. remove html and url
3. perform contractions
4. remove non-alphabetical characters
5. remove the extra spaces between the words


In [None]:
##### lower case
review_df["review_body"] = review_df["review_body"].str.lower()

##### remove html and url
def remove_url(text):
    """Return ``text`` with every http(s):// or www. URL replaced by one space."""
    return re.sub(r"https?://\S+|www\.\S+", r" ", text)
def remove_html(text):
    """Return ``text`` with every ``<...>`` tag (shortest match) replaced by one space."""
    return re.sub('<.*?>', r" ", text)
    
# Strip HTML tags before URLs. The URL pattern runs up to the next whitespace,
# so on '<a href="http://...">text</a>' it would otherwise swallow the closing
# quote, the '>' and the first word of the link text along with the URL.
review_df["review_body"] = review_df["review_body"].map(remove_html)
review_df["review_body"] = review_df["review_body"].map(remove_url)

##### perform contractions
import contractions
review_df["review_body"] = review_df["review_body"].apply(
    lambda x: ' '.join(contractions.fix(word) for word in x.split()))

##### remove non-alphabetical characters
regex = '[^a-zA-Z]'
review_df["review_body"] = review_df["review_body"].replace(regex, ' ', regex=True)

##### remove the extra spaces between the words
# Raw string: '\s' in a normal string literal is an invalid escape sequence.
# The trailing strip removes the leading/trailing space left by the steps above.
review_df["review_body"] = review_df["review_body"].replace(r'\s+', ' ', regex=True).str.strip()

### Data pre-processing
1. remove stop word
2. perform lemmatization



In [None]:
##### remove stop word 
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
# Membership is tested once per token across all reviews; a set makes each
# test constant-time instead of scanning the whole stop-word list.
stop_word_set = set(stop_words)

review_df["review_body"] = review_df["review_body"].apply(
    lambda x: ' '.join(word for word in x.split() if word not in stop_word_set))

##### perform lemmatization
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

review_df["review_body"] = review_df["review_body"].apply(
    lambda x: ' '.join(lemmatizer.lemmatize(word) for word in x.split()))


Select only positive and negative reviews.

In [None]:
# Keep positive (class 1, label 1) and negative (class 2, label 0) reviews only;
# positives come first, then negatives, under a fresh 0..n-1 index.
positive_reviews = review_df.loc[review_df['sentiment'] == 1]
negative_reviews = review_df.loc[review_df['sentiment'] == 0]
samples = pd.concat([positive_reviews, negative_reviews], ignore_index=True)

In [None]:
# split data into 80% training dataset and 20% testing dataset.
from sklearn.model_selection import train_test_split
# Binary data (`samples`) and ternary data (`review_df`) are split separately,
# both with test_size=.2 and the same random_state.
training_review, testing_review = train_test_split(samples, test_size=.2, random_state=42)
training_review_250k, testing_review_250k = train_test_split(review_df, test_size=.2, random_state=42)

In [None]:
# Review texts become Python lists, sentiment labels become NumPy arrays.
# binary (200k) split
training_X, training_y = training_review["review_body"].tolist(), training_review["sentiment"].to_numpy()
testing_X, testing_y = testing_review["review_body"].tolist(), testing_review["sentiment"].to_numpy()

# ternary (250k) split
training_X_250k, training_y_250k = (
    training_review_250k["review_body"].tolist(),
    training_review_250k["sentiment"].to_numpy(),
)
testing_X_250k, testing_y_250k = (
    testing_review_250k["review_body"].tolist(),
    testing_review_250k["sentiment"].to_numpy(),
)

Get the mean Word2Vec vector for each review: each word corresponds to one vector, and a review is represented by the average of its word vectors.


In [None]:
"""def get_mean_vector(review):
    output = []
    for x in review:
        if x not in w2v_model.wv.vocab:
            output.append(np.zeros(300))
        else:
            output.append(w2v_model[x])
    return np.array(output).mean(axis=0) """

def get_mean_vector(model, words, dim=300):
    """Average the embedding vectors of the known words in a review.

    Args:
        model: either an object exposing its vectors under ``.wv`` (e.g. a
            gensim ``Word2Vec`` model) or a mapping-like vector store that
            supports ``word in store`` and ``store[word]`` (e.g. ``KeyedVectors``).
        words: whitespace-separated review text.
        dim: length of the embedding vectors (300 for both models used here).

    Returns:
        A float64 array of length ``dim``: the mean of the vectors of all
        in-vocabulary words, or all zeros when no word is in the vocabulary.
    """
    # Look words up in the vector store itself rather than on the model
    # object, whose `in` / `[]` shortcuts are deprecated and later removed.
    vectors = getattr(model, "wv", model)
    total = np.zeros(dim)
    count = 0  # number of in-vocabulary words
    for token in words.split():
        if token in vectors:
            total += vectors[token]
            count += 1
    if count == 0:
        return total
    return total / count

### My word2vec model 

In [None]:
# mean review vectors from my word2vec model for the 200k data
my_x_train = [get_mean_vector(w2v_model, review) for review in training_X]
my_x_test = [get_mean_vector(w2v_model, review) for review in testing_X]

# mean review vectors from my word2vec model for the 250k data
my_x_train_250k = [get_mean_vector(w2v_model, review) for review in training_X_250k]
my_x_test_250k = [get_mean_vector(w2v_model, review) for review in testing_X_250k]

### Google word2vec model

In [None]:
# mean review vectors from the Google pretrained model for the 200k data
google_x_train = [get_mean_vector(pretrained_model, review) for review in training_X]
google_x_test = [get_mean_vector(pretrained_model, review) for review in testing_X]

# mean review vectors from the Google pretrained model for the 250k data
google_x_train_250k = [get_mean_vector(pretrained_model, review) for review in training_X_250k]
google_x_test_250k = [get_mean_vector(pretrained_model, review) for review in testing_X_250k]


### (a) Perceptron: Model 1 & Model 2

In [None]:
from sklearn.linear_model import Perceptron

# Two perceptrons with default hyper-parameters, trained on the same labels
# but on different features.
# fit the training data on google word2vec model
Percet1= Perceptron()
Percet1.fit(google_x_train, training_y)

# fit the training data on my word2vec model
Percet2= Perceptron()
Percet2.fit(my_x_train, training_y)

Perceptron(alpha=0.0001, class_weight=None, early_stopping=False, eta0=1.0,
           fit_intercept=True, max_iter=1000, n_iter_no_change=5, n_jobs=None,
           penalty=None, random_state=0, shuffle=True, tol=0.001,
           validation_fraction=0.1, verbose=0, warm_start=False)

In [None]:
# Percet1 predicts on the Google-based features (train, then test)
predictions_Percet_train_google, predictions_Percet_test_google = (
    Percet1.predict(features) for features in (google_x_train, google_x_test)
)

# Percet2 predicts on the features from my word2vec model (train, then test)
predictions_Percet_train, predictions_Percet_test = (
    Percet2.predict(features) for features in (my_x_train, my_x_test)
)

In [None]:
# The `*_google` predictions come from Percet1 (Google pretrained features) and
# the others from Percet2 (my word2vec features); the labels below follow that.
# accuracy_score takes (y_true, y_pred).
print("Pretrained model Perceptron Training Accuracy Score:", accuracy_score(training_y, predictions_Percet_train_google))
print("Pretrained model Perceptron Testing Accuracy Score:", accuracy_score(testing_y, predictions_Percet_test_google))

print("My Word2Vec model Perceptron Training Accuracy Score:", accuracy_score(training_y, predictions_Percet_train))
print("My Word2Vec model Perceptron Testing Accuracy Score:", accuracy_score(testing_y, predictions_Percet_test))

# TF-IDF figures are hard-coded, not computed in this notebook.
print("TF-IDF model Perceptron Training Accuracy Score: 0.99358125")
print("TF-IDF model Perceptron Testing Accuracy Score: 0.89785")

My Word2Vec model Perceptron Training Accuracy Score: 0.72443125
My Word2Vec model Perceptron Testing Accuracy Score:: 0.7204
Pretrained model Perceptron Training Accuracy Score: 0.7895875
Pretrained model Perceptron Testing Accuracy Score: 0.787475
TF-IDF model Perceptron Training Accuracy Score: 0.99358125
TF-IDF model Perceptron Testing Accuracy Score: 0.89785


### (b) SVM: Model 3 & Model 4

In [None]:
from sklearn import svm
from sklearn.svm import LinearSVC

# Two linear SVMs with default hyper-parameters, trained on the same labels
# but on different features.
# fit the training data on google model
SVM1 = svm.LinearSVC()
SVM1.fit(google_x_train, training_y)

# fit the training data on my word2vec model
SVM2 = svm.LinearSVC()
SVM2.fit(my_x_train, training_y)

In [None]:
# SVM1 predicts on the Google-based features (train, then test)
predictions_SVM_train_google, predictions_SVM_test_google = (
    SVM1.predict(features) for features in (google_x_train, google_x_test)
)

# SVM2 predicts on the features from my word2vec model (train, then test)
predictions_SVM_train, predictions_SVM_test = (
    SVM2.predict(features) for features in (my_x_train, my_x_test)
)

In [None]:
import sys
# Raises NumPy's summarisation threshold to sys.maxsize, so from here on
# arrays are printed in full instead of being abbreviated.
np.set_printoptions(threshold=sys.maxsize)
# predictions_SVM_test_google

In [None]:
# The `*_google` predictions come from SVM1 (Google pretrained features) and
# the others from SVM2 (my word2vec features); the labels below follow that.
# accuracy_score takes (y_true, y_pred).
print("Pretrained model SVM Training Accuracy Score:", accuracy_score(training_y, predictions_SVM_train_google))
print("Pretrained model SVM Testing Accuracy Score:", accuracy_score(testing_y, predictions_SVM_test_google))

print("My Word2Vec model SVM Training Accuracy Score:", accuracy_score(training_y, predictions_SVM_train))
print("My Word2Vec model SVM Testing Accuracy Score:", accuracy_score(testing_y, predictions_SVM_test))

# TF-IDF figures are hard-coded, not computed in this notebook.
print("TF-IDF model SVM Training Accuracy Score: 0.9949125")
print("TF-IDF model SVM Testing Accuracy Score: 0.91585")

From this question, we get that the accuracy scores:

On Perceptron: 
*  word2vec-google-news-300: **0.7204**
*  self trained Word2Vec: **0.787475**
*  TF-IDF model: **0.89785**

On SVM
*  word2vec-google-news-300: **0.845325**
*  self trained Word2Vec: **0.816**
*  TF-IDF model: **0.91585**

I also reported the training accuracy alongside the testing accuracy; the testing scores are only slightly lower than the training scores, which suggests that our models are not overfitting. We can also observe that the SVM models give higher scores than the perceptron. 

In [None]:
# Release the large feature matrices, predictions and fitted models that the
# neural-network sections no longer need. Each name is deleted exactly once:
# deleting a name twice raises NameError.
del my_x_train
del my_x_test
del my_x_test_250k
del my_x_train_250k

del google_x_test
del google_x_train
del google_x_test_250k
del google_x_train_250k

del predictions_Percet_train_google
del predictions_Percet_test_google
del predictions_Percet_train
del predictions_Percet_test

del predictions_SVM_train_google
del predictions_SVM_test_google
del predictions_SVM_train
del predictions_SVM_test

del Percet1
del Percet2
del SVM1
del SVM2

## 4. Feedforward Neural Networks

### (a) Part 1: FF Network for Binary Classification

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
# `device` is used later in the notebook to place models and tensors.
is_cuda = torch.cuda.is_available()
device = torch.device("cuda" if is_cuda else "cpu")
print("GPU is available" if is_cuda else "GPU not available, CPU used")

GPU is available


In [None]:
# Build (review text, label) pairs for the binary task.
# Labels are converted to int64 in one step, as nn.CrossEntropyLoss expects
# class indices. torch.as_tensor accepts NumPy arrays as well as tensors.
training_y_long = torch.as_tensor(training_y, dtype=torch.long)
testing_y_long = torch.as_tensor(testing_y, dtype=torch.long)

train_dat_long = list(zip(training_X, training_y_long))
test_dat_long = list(zip(testing_X, testing_y_long))

# dataloaders
batch_size = 200

# Shuffle the training data only; evaluation does not need a random order.
train_loader_long = DataLoader(train_dat_long, shuffle=True, batch_size=batch_size)
test_loader_long = DataLoader(test_dat_long, shuffle=False, batch_size=batch_size)

In [None]:
def binary_accuracy(preds, y):
    """
    Fraction of correct predictions in a batch (8 right out of 10 gives 0.8).

    ``preds`` are raw scores: they go through a sigmoid and are rounded to
    0/1 before being compared with ``y``.
    """
    hard_preds = torch.sigmoid(preds).round()
    hits = hard_preds.eq(y).float()  # float so that the division below is exact
    return hits.sum() / len(hits)

def categorical_accuracy(preds, y):
    """
    Fraction of rows in a batch whose highest-scoring class equals the label
    (8 right out of 10 gives 0.8). Returned as a 0-dim tensor.
    """
    predicted_class = preds.argmax(dim=1)
    n_correct = predicted_class.eq(y.view(-1)).sum()
    return n_correct.float() / y.shape[0]

def train(model, iterator, optimizer, criterion):
    """Run one training epoch over ``iterator``.

    Each batch is a ``(text, y)`` pair; ``text`` is passed to the model as is
    and ``y`` is moved to the global ``device``. Returns the per-batch mean of
    the loss and of the categorical accuracy.
    """
    model.train()

    loss_sum = 0
    acc_sum = 0
    for text, y in iterator:
        targets = y.to(device)

        optimizer.zero_grad()
        predictions = model(text)

        loss = criterion(predictions, targets)
        acc = categorical_accuracy(predictions, targets)

        loss.backward()
        optimizer.step()

        loss_sum += loss.item()
        acc_sum += acc.item()

    n_batches = len(iterator)
    return loss_sum / n_batches, acc_sum / n_batches

def evaluate(model, iterator, criterion):
    """Score ``model`` on ``iterator`` without updating its parameters.

    The model is put in eval mode and gradients are disabled. Returns the
    per-batch mean of the loss and of the categorical accuracy.
    """
    model.eval()

    loss_sum = 0
    acc_sum = 0
    with torch.no_grad():
        for text, y in iterator:
            targets = y.to(device)
            predictions = model(text)

            loss_sum += criterion(predictions, targets).item()
            acc_sum += categorical_accuracy(predictions, targets).item()

    n_batches = len(iterator)
    return loss_sum / n_batches, acc_sum / n_batches

import time

def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into whole minutes and whole seconds."""
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    return whole_minutes, int(duration - (whole_minutes * 60))

#### Model 1: Using google pretrained model

In [None]:
class binary_FNN1(nn.Module):
    """Feed-forward binary classifier on mean Google-pretrained review vectors.

    Layers: 300 -> 50 -> 10 -> 2, with ReLU after the two hidden layers and
    dropout (p=0.2) before the output layer.
    """

    def __init__(self):
        super(binary_FNN1, self).__init__()
        # number of hidden nodes in each layer 50 and 10
        hidden_1 = 50
        hidden_2 = 10

        self.fc1 = nn.Linear(300, hidden_1)
        self.fc2 = nn.Linear(hidden_1, hidden_2)
        self.fc3 = nn.Linear(hidden_2, 2)
        # dropout layer (p=0.2)
        self.dropout = nn.Dropout(0.2)

    def forward(self, text):
        """Map a batch of review strings to a (batch size, 2) tensor of class scores."""
        # embedded = [batch size, emb dim]
        embedded = np.asarray([get_mean_vector(pretrained_model, review) for review in text])
        embedded = torch.from_numpy(embedded).float().to(device)

        # hidden layers with relu activation
        x = F.relu(self.fc1(embedded))
        x = F.relu(self.fc2(x))
        # The dropout layer was declared but never applied; apply it before the
        # output layer, as the ternary networks in this notebook do.
        x = self.dropout(x)
        return self.fc3(x)
        
# Instantiate the network on the selected device and show its layers.
binary_FNN_model1 = binary_FNN1().to(device)
print(binary_FNN_model1)

# loss function
criterion = nn.CrossEntropyLoss()

# Adam optimizers over the same parameters at three learning rates
optimizer, optimizer2, optimizer3 = (
    torch.optim.Adam(binary_FNN_model1.parameters(), lr=rate)
    for rate in (0.01, 0.02, 0.1)
)

binary_FNN1(
  (fc1): Linear(in_features=300, out_features=50, bias=True)
  (fc2): Linear(in_features=50, out_features=10, bias=True)
  (fc3): Linear(in_features=10, out_features=2, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)


In [None]:
N_EPOCHS = 2

best_valid_loss = float('inf')
for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss, train_acc = train(binary_FNN_model1, train_loader_long, optimizer, criterion)
    valid_loss, valid_acc = evaluate(binary_FNN_model1, test_loader_long, criterion)
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # keep the lowest held-out loss seen so far
    best_valid_loss = min(best_valid_loss, valid_loss)
    #torch.save(model.state_dict(), 'tut1-model.pt')

    print(
        f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s',
        f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%',
        f'\tTest Loss: {valid_loss:.3f} |  Test Acc: {valid_acc*100:.2f}%',
        sep='\n',
    )

Epoch: 01 | Epoch Time: 0m 31s
	Train Loss: 0.413 | Train Acc: 81.19%
	Test Loss: 0.397 |  Test Acc: 82.50%
Epoch: 02 | Epoch Time: 0m 32s
	Train Loss: 0.384 | Train Acc: 82.83%
	Test Loss: 0.383 |  Test Acc: 82.99%


#### Model 2: Using my word2vec model

In [None]:
class binary_FNN2(nn.Module):
    """Feed-forward binary classifier on mean review vectors from my word2vec model.

    Layers: 300 -> 50 -> 10 -> 2, with ReLU after the two hidden layers and
    dropout (p=0.2) before the output layer.
    """

    def __init__(self):
        super(binary_FNN2, self).__init__()
        # number of hidden nodes in each layer 50 and 10
        hidden_1 = 50
        hidden_2 = 10

        self.fc1 = nn.Linear(300, hidden_1)
        self.fc2 = nn.Linear(hidden_1, hidden_2)
        self.fc3 = nn.Linear(hidden_2, 2)
        # dropout layer (p=0.2)
        self.dropout = nn.Dropout(0.2)

    def forward(self, text):
        """Map a batch of review strings to a (batch size, 2) tensor of class scores."""
        # embedded = [batch size, emb dim]
        embedded = np.asarray([get_mean_vector(w2v_model, review) for review in text])
        embedded = torch.from_numpy(embedded).float().to(device)

        # hidden layers with relu activation
        x = F.relu(self.fc1(embedded))
        x = F.relu(self.fc2(x))
        # The dropout layer was declared but never applied; apply it before the
        # output layer, as the ternary networks in this notebook do.
        x = self.dropout(x)
        return self.fc3(x)
        
# Instantiate the network on the selected device and show its layers.
binary_FNN_model2 = binary_FNN2().to(device)
print(binary_FNN_model2)

# loss function
criterion = nn.CrossEntropyLoss()

# Adam optimizers over the same parameters at three learning rates
optimizer, optimizer2, optimizer3 = (
    torch.optim.Adam(binary_FNN_model2.parameters(), lr=rate)
    for rate in (0.01, 0.02, 0.1)
)

binary_FNN2(
  (fc1): Linear(in_features=300, out_features=50, bias=True)
  (fc2): Linear(in_features=50, out_features=10, bias=True)
  (fc3): Linear(in_features=10, out_features=2, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)


In [None]:
N_EPOCHS = 2

best_valid_loss = float('inf')
for epoch in range(N_EPOCHS):
    start_time = time.time()
    # this run passes optimizer3 to train()
    train_loss, train_acc = train(binary_FNN_model2, train_loader_long, optimizer3, criterion)
    valid_loss, valid_acc = evaluate(binary_FNN_model2, test_loader_long, criterion)
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # keep the lowest held-out loss seen so far
    best_valid_loss = min(best_valid_loss, valid_loss)
    #torch.save(model.state_dict(), 'tut1-model.pt')

    print(
        f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s',
        f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%',
        f'\tTest Loss: {valid_loss:.3f} |  Test Acc: {valid_acc*100:.2f}%',
        sep='\n',
    )

  
  from ipykernel import kernelapp as app


Epoch: 01 | Epoch Time: 1m 24s
	Train Loss: 0.425 | Train Acc: 81.88%
	Test Loss: 0.407 |  Test Acc: 82.62%
Epoch: 02 | Epoch Time: 1m 22s
	Train Loss: 0.402 | Train Acc: 83.29%
	Test Loss: 0.382 |  Test Acc: 83.81%


### (a) Part 2: FF Network for Ternary Classification

In [None]:
##################################################################################################

In [None]:
#def accuracy(predictions, labels):
#    classes = torch.argmax(predictions, dim=1)
#    return torch.mean((classes == labels).float())
def accuracy(predictions, labels):
    """Return the fraction of predictions that equal ``labels`` as a Python float.

    ``predictions`` may be class indices with the same shape as ``labels``, or
    per-class scores with one extra dimension (e.g. ``(N, C)`` scores for
    ``(N,)`` labels). Scores are reduced to class indices with ``argmax``
    first; comparing raw scores with labels would broadcast and give a
    meaningless result.
    """
    if predictions.dim() > labels.dim():
        predictions = predictions.argmax(dim=1)
    return (predictions == labels).float().mean().item()

def categorical_accuracy(predictions, labels):
    """
    Fraction of rows in a batch whose highest-scoring class equals the label
    (8 right out of 10 gives 0.8). Returned as a 0-dim tensor.
    """
    predicted_class = predictions.argmax(dim=1)
    n_correct = predicted_class.eq(labels.view(-1)).sum()
    return n_correct.float() / labels.shape[0]

def one_hot_embedding(labels, num_classes):
    """Return one-hot rows for ``labels`` by indexing a ``num_classes`` x ``num_classes`` identity matrix."""
    return torch.eye(num_classes)[labels]

def train_model2(model, x, y, test_x, test_y, optimizer, epochs = 30):
    """Full-batch training loop that prints train/test loss and accuracy every epoch.

    Args:
        model: network returning per-class scores for ``x`` / ``test_x``.
        x, y: training inputs and integer class labels.
        test_x, test_y: held-out inputs and labels, used for reporting only.
        optimizer: optimizer over ``model``'s parameters.
        epochs: number of full-batch update steps.

    Uses the module-level ``criterion`` as the loss. The metrics printed for
    an epoch are measured before that epoch's parameter update.
    """
    for epoch in range(epochs):
        # Held-out metrics: eval mode (dropout off) and no gradient tracking.
        model.eval()
        with torch.no_grad():
            test_scores = model(test_x)
            loss_test = criterion(test_scores, test_y).item()
            test_acc = categorical_accuracy(test_scores, test_y).item()

        # One forward pass serves both the training loss and the training accuracy.
        model.train()
        train_scores = model(x)
        loss = criterion(train_scores, y)
        # Accuracy is taken from the arg-max class, not from raw scores.
        train_acc = categorical_accuracy(train_scores.detach(), y).item()

        print('Epoch: {} \tTrain Loss: {:.4f} \tTrain Acc: {:.4f} \tTest Loss: {:.4f} \tTest Acc: {:.4f}'
        .format(epoch+1, loss.item(), train_acc, loss_test, test_acc))

        optimizer.zero_grad()
        # backward pass: compute gradient of the loss with respect to model parameters
        loss.backward()
        # perform a single optimization step (parameter update)
        optimizer.step()

In [None]:
# Convert the ternary labels to int64 tensors (class indices for
# nn.CrossEntropyLoss). torch.as_tensor accepts a NumPy array as well as an
# existing tensor, so this cell can be run again after the names have already
# been rebound to tensors; torch.from_numpy raises TypeError in that case.
training_y_250k = torch.as_tensor(training_y_250k, dtype=torch.long)
testing_y_250k = torch.as_tensor(testing_y_250k, dtype=torch.long)

train_dat_250k = list(zip(training_X_250k, training_y_250k))
test_dat_250k = list(zip(testing_X_250k, testing_y_250k))

# dataloaders
batch_size = 200

# Shuffle the training data only; evaluation does not need a random order.
train_loader_250k = DataLoader(train_dat_250k, shuffle=True, batch_size=batch_size)
test_loader_250k = DataLoader(test_dat_250k, shuffle=False, batch_size=batch_size)

TypeError: ignored

#### Model 3: Using google pretrained model

In [None]:
class ternary_FNN1(nn.Module):
    """Feed-forward three-class classifier on mean Google-pretrained review vectors.

    Layers: 300 -> 50 -> 10 -> 3, with ReLU after the two hidden layers and
    dropout (p=0.2) before the output layer.
    """

    def __init__(self):
        super().__init__()
        # hidden layer widths
        hidden_1, hidden_2 = 50, 10

        self.fc1 = nn.Linear(300, hidden_1)
        self.fc2 = nn.Linear(hidden_1, hidden_2)
        self.fc3 = nn.Linear(hidden_2, 3)
        # dropout layer (p=0.2)
        self.dropout = nn.Dropout(0.2)

    def forward(self, text):
        """Map a batch of review strings to a (batch size, 3) tensor of class scores."""
        review_vectors = [get_mean_vector(pretrained_model, review) for review in text]
        embedded = torch.from_numpy(np.asarray(review_vectors)).float().to(device)

        hidden = F.relu(self.fc1(embedded))
        hidden = F.relu(self.fc2(hidden))
        # dropout before the output layer
        return self.fc3(self.dropout(hidden))

# Instantiate the network on the selected device and show its layers.
ternary_FNN_model1 = ternary_FNN1().to(device)
print(ternary_FNN_model1)

# loss function
criterion = nn.CrossEntropyLoss()
# Adam optimizer with learning rate 0.01
optimizer = optim.Adam(ternary_FNN_model1.parameters(), lr=0.01)

ternary_FNN1(
  (fc1): Linear(in_features=300, out_features=50, bias=True)
  (fc2): Linear(in_features=50, out_features=10, bias=True)
  (fc3): Linear(in_features=10, out_features=3, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)


In [None]:
N_EPOCHS = 2

best_valid_loss = float('inf')
for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss, train_acc = train(ternary_FNN_model1, train_loader_250k, optimizer, criterion)
    valid_loss, valid_acc = evaluate(ternary_FNN_model1, test_loader_250k, criterion)
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # keep the lowest held-out loss seen so far
    best_valid_loss = min(best_valid_loss, valid_loss)
    #torch.save(model.state_dict(), 'tut1-model.pt')

    print(
        f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s',
        f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%',
        f'\tTest Loss: {valid_loss:.3f} |  Test Acc: {valid_acc*100:.2f}%',
        sep='\n',
    )

Epoch: 01 | Epoch Time: 0m 40s
	Train Loss: 0.811 | Train Acc: 65.23%
	Test Loss: 0.776 |  Test Acc: 66.30%
Epoch: 02 | Epoch Time: 0m 39s
	Train Loss: 0.785 | Train Acc: 66.31%
	Test Loss: 0.763 |  Test Acc: 67.33%


#### Model 4: Using my word2vec model

In [None]:
class ternary_FNN2(nn.Module):
    """Feed-forward three-class classifier on mean review vectors from my word2vec model.

    Layers: 300 -> 50 -> 10 -> 3, with ReLU after the two hidden layers and
    dropout (p=0.2) before the output layer.
    """

    def __init__(self):
        super().__init__()
        # hidden layer widths
        hidden_1, hidden_2 = 50, 10

        self.fc1 = nn.Linear(300, hidden_1)
        self.fc2 = nn.Linear(hidden_1, hidden_2)
        self.fc3 = nn.Linear(hidden_2, 3)
        # dropout layer (p=0.2)
        self.dropout = nn.Dropout(0.2)

    def forward(self, text):
        """Map a batch of review strings to a (batch size, 3) tensor of class scores."""
        review_vectors = [get_mean_vector(w2v_model, review) for review in text]
        embedded = torch.from_numpy(np.asarray(review_vectors)).float().to(device)

        hidden = F.relu(self.fc1(embedded))
        hidden = F.relu(self.fc2(hidden))
        # dropout before the output layer
        return self.fc3(self.dropout(hidden))

# Instantiate the network on the selected device and show its layers.
ternary_FNN_model2 = ternary_FNN2().to(device)
print(ternary_FNN_model2)

# loss function
criterion = nn.CrossEntropyLoss()
# Adam optimizer with learning rate 0.01
optimizer = optim.Adam(ternary_FNN_model2.parameters(), lr=0.01)

ternary_FNN2(
  (fc1): Linear(in_features=300, out_features=50, bias=True)
  (fc2): Linear(in_features=50, out_features=10, bias=True)
  (fc3): Linear(in_features=10, out_features=3, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)


In [None]:
N_EPOCHS = 1

best_valid_loss = float('inf')
for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss, train_acc = train(ternary_FNN_model2, train_loader_250k, optimizer, criterion)
    valid_loss, valid_acc = evaluate(ternary_FNN_model2, test_loader_250k, criterion)
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # keep the lowest held-out loss seen so far
    best_valid_loss = min(best_valid_loss, valid_loss)
    #torch.save(model.state_dict(), 'tut1-model.pt')

    print(
        f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s',
        f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%',
        f'\tTest Loss: {valid_loss:.3f} |  Test Acc: {valid_acc*100:.2f}%',
        sep='\n',
    )

  
  from ipykernel import kernelapp as app


Epoch: 01 | Epoch Time: 1m 40s
	Train Loss: 0.766 | Train Acc: 67.12%
	Test Loss: 0.728 |  Test Acc: 68.44%


### (b) Part 1: FF Network for First 10 Word2Vec and Binary Classification




In [None]:
def first10_vector(model, words, n_words=10, dim=300):
    """Concatenate the embeddings of the first ``n_words`` known words of a review.

    Args:
        model: either an object exposing its vectors under ``.wv`` (e.g. a
            gensim ``Word2Vec`` model) or a mapping-like vector store that
            supports ``word in store`` and ``store[word]`` (e.g. ``KeyedVectors``).
        words: whitespace-separated review text.
        n_words: number of word slots in the output (10 by default).
        dim: length of the embedding vectors (300 for both models used here).

    Returns:
        A flat float64 array of length ``n_words * dim``. Out-of-vocabulary
        words are skipped; unused trailing slots are left as zeros.
    """
    # Look words up in the vector store itself rather than on the model
    # object, whose `in` / `[]` shortcuts are deprecated and later removed.
    vectors = getattr(model, "wv", model)
    features = np.zeros((n_words, dim))
    filled = 0
    for token in words.split():
        if filled == n_words:
            break
        if token in vectors:
            features[filled] = vectors[token]
            filled += 1
    return features.reshape(n_words * dim)

In [None]:
### work on my word2vec
# apply first10_vector to 200k data 
#first10_my_x_train  = []
#first10_my_x_test = []
#for x in training_X:
#    first10_my_x_train.append(first10_vector(w2v_model, x))
#for x in testing_X:
#    first10_my_x_test.append(first10_vector(w2v_model, x))

# apply first10_vector to 250k data
#first10_my_x_train_250k  = []
#first10_my_x_test_250k = []
#for x in training_X_250k:
#    first10_my_x_train_250k.append(first10_vector(w2v_model, x))
#for x in testing_X_250k:
#    first10_my_x_test_250k.append(first10_vector(w2v_model, x))

In [None]:
### work on google pretrainded word2vec
# apply mean vector to 200k data
#first10_google_x_train  = []
#first10_google_x_test = []
#for x in training_X:
#    first10_google_x_train.append(first10_vector(pretrained_model, x))
#for x in testing_X:
#    first10_google_x_test.append(first10_vector(pretrained_model, x))

# apply mean vector to 200k data
#first10_google_x_train_250k  = []
#first10_google_x_test_250k = []
#for x in training_X_250k:
#    first10_google_x_train_250k.append(first10_vector(pretrained_model, x))
#for x in testing_X_250k:
#    first10_google_x_test_250k.append(first10_vector(pretrained_model, x))

#### Model 5: Using google pretrained model

In [None]:
class binary_FNN3(nn.Module):
    """Feed-forward binary classifier on the first ten Google-pretrained word vectors.

    Each review becomes a 3000-value input (ten concatenated 300-d vectors).
    Layers: 3000 -> 50 -> 10 -> 2, with ReLU after the two hidden layers and
    dropout (p=0.2) before the output layer.
    """

    def __init__(self):
        super(binary_FNN3, self).__init__()
        # number of hidden nodes in each layer 50 and 10
        hidden_1 = 50
        hidden_2 = 10

        self.fc1 = nn.Linear(3000, hidden_1)
        self.fc2 = nn.Linear(hidden_1, hidden_2)
        self.fc3 = nn.Linear(hidden_2, 2)
        # dropout layer (p=0.2)
        self.dropout = nn.Dropout(0.2)

    def forward(self, text):
        """Map a batch of review strings to a (batch size, 2) tensor of class scores."""
        embedded = np.asarray([first10_vector(pretrained_model, review) for review in text])
        embedded = torch.from_numpy(embedded).float().to(device)

        # Feed the flattened (batch size, 3000) view into the first layer; it
        # was computed before but then ignored in favour of `embedded`.
        x = embedded.view(-1, 3000)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        # The dropout layer was declared but never applied; apply it before the
        # output layer, as the ternary networks in this notebook do.
        x = self.dropout(x)
        return self.fc3(x)
        
# Instantiate the network on the selected device and show its layers.
binary_FNN_model3 = binary_FNN3().to(device)
print(binary_FNN_model3)

# loss function
criterion = nn.CrossEntropyLoss()

# Adam optimizers over the same parameters at three learning rates
optimizer, optimizer2, optimizer3 = (
    torch.optim.Adam(binary_FNN_model3.parameters(), lr=rate)
    for rate in (0.01, 0.02, 0.1)
)

binary_FNN3(
  (fc1): Linear(in_features=3000, out_features=50, bias=True)
  (fc2): Linear(in_features=50, out_features=10, bias=True)
  (fc3): Linear(in_features=10, out_features=2, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)


In [None]:
# Train binary_FNN_model3 for N_EPOCHS passes over train_loader_long and
# evaluate on test_loader_long after every epoch.
N_EPOCHS = 2
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss, train_acc = train(binary_FNN_model3, train_loader_long, optimizer, criterion)
    valid_loss, valid_acc = evaluate(binary_FNN_model3, test_loader_long, criterion)
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Remember the lowest evaluation loss so far (no checkpoint is written).
    best_valid_loss = min(best_valid_loss, valid_loss)

    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\tTest Loss: {valid_loss:.3f} |  Test Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 12s
	Train Loss: 0.488 | Train Acc: 75.98%
	Test Loss: 0.470 |  Test Acc: 77.36%
Epoch: 02 | Epoch Time: 0m 12s
	Train Loss: 0.443 | Train Acc: 78.85%
	Test Loss: 0.462 |  Test Acc: 77.80%


#### Model 6: Using my word2vec model

In [None]:
class binary_FNN4(nn.Module):
    """Feed-forward binary classifier over ``first10_vector`` review features.

    Every review in a batch is embedded with
    ``first10_vector(w2v_model, review)`` and pushed through a
    3000 -> 50 -> 10 -> 2 fully connected network. The output is one pair of
    raw class logits per review (no softmax is applied).
    """

    def __init__(self):
        super(binary_FNN4, self).__init__()
        # number of hidden nodes in each layer: 50 and 10
        hidden_1 = 50
        hidden_2 = 10

        self.fc1 = nn.Linear(3000, hidden_1)
        self.fc2 = nn.Linear(hidden_1, hidden_2)
        self.fc3 = nn.Linear(hidden_2, 2)
        # dropout layer (p=0.2)
        self.dropout = nn.Dropout(0.2)

    def forward(self, text):
        """Return logits of shape [batch size, 2] for an iterable of reviews."""
        embedded = [first10_vector(w2v_model, review) for review in text]
        embedded = torch.from_numpy(np.asarray(embedded)).float().to(device)
        # Flatten each review to the 3000 features fc1 expects, and actually
        # feed the flattened tensor forward (it used to be discarded).
        x = embedded.view(-1, 3000)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        # Dropout before the output layer, matching the ternary FNN models;
        # it is only active in training mode.
        x = self.dropout(x)
        x = self.fc3(x)
        return x
        
# Build the own-word2vec binary FNN on the target device and show its layers.
binary_FNN_model4 = binary_FNN4().to(device)
print(binary_FNN_model4)

# Cross-entropy loss over the two class logits, optimised with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(binary_FNN_model4.parameters(), lr=0.01)

binary_FNN4(
  (fc1): Linear(in_features=3000, out_features=50, bias=True)
  (fc2): Linear(in_features=50, out_features=10, bias=True)
  (fc3): Linear(in_features=10, out_features=2, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)


In [None]:
# Train binary_FNN_model4 for N_EPOCHS passes over train_loader_long and
# evaluate on test_loader_long after every epoch.
N_EPOCHS = 2
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss, train_acc = train(binary_FNN_model4, train_loader_long, optimizer, criterion)
    valid_loss, valid_acc = evaluate(binary_FNN_model4, test_loader_long, criterion)
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Remember the lowest evaluation loss so far (no checkpoint is written).
    best_valid_loss = min(best_valid_loss, valid_loss)

    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\tTest Loss: {valid_loss:.3f} |  Test Acc: {valid_acc*100:.2f}%')

  """
  


Epoch: 01 | Epoch Time: 0m 22s
	Train Loss: 0.477 | Train Acc: 76.93%
	Test Loss: 0.453 |  Test Acc: 78.53%
Epoch: 02 | Epoch Time: 0m 21s
	Train Loss: 0.428 | Train Acc: 79.98%
	Test Loss: 0.448 |  Test Acc: 79.01%


### (b) Part 2: FF Network for First 10 Word2Vec and Ternary Classification

#### Model 7: Using google pretrained model

In [None]:
class ternary_FNN3(nn.Module):
    """Feed-forward three-class classifier over ``first10_vector`` features.

    Every review in a batch is embedded with
    ``first10_vector(pretrained_model, review)`` and pushed through a
    3000 -> 50 -> 10 -> 3 fully connected network with dropout before the
    output layer. The output is three raw class logits per review.
    """

    def __init__(self):
        super(ternary_FNN3, self).__init__()
        # number of hidden nodes in each layer: 50 and 10
        hidden_1 = 50
        hidden_2 = 10

        self.fc1 = nn.Linear(3000, hidden_1)
        self.fc2 = nn.Linear(hidden_1, hidden_2)
        self.fc3 = nn.Linear(hidden_2, 3)
        # dropout layer (p=0.2)
        self.dropout = nn.Dropout(0.2)

    def forward(self, text):
        """Return logits of shape [batch size, 3] for an iterable of reviews."""
        embedded = [first10_vector(pretrained_model, review) for review in text]
        embedded = torch.from_numpy(np.asarray(embedded)).float().to(device)
        # Flatten each review to the 3000 features fc1 expects, and actually
        # feed the flattened tensor forward (it used to be discarded).
        x = embedded.view(-1, 3000)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        # dropout is only active in training mode
        x = self.dropout(x)
        x = self.fc3(x)
        return x

# Build the Google-embedding ternary FNN on the target device and show its layers.
ternary_FNN_model3 = ternary_FNN3().to(device)
print(ternary_FNN_model3)

# Cross-entropy loss over the three class logits, optimised with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(ternary_FNN_model3.parameters(), lr=0.01)

ternary_FNN3(
  (fc1): Linear(in_features=3000, out_features=50, bias=True)
  (fc2): Linear(in_features=50, out_features=10, bias=True)
  (fc3): Linear(in_features=10, out_features=3, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)


In [None]:
# Train ternary_FNN_model3 for N_EPOCHS passes over train_loader_250k and
# evaluate on test_loader_250k after every epoch.
N_EPOCHS = 2
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss, train_acc = train(ternary_FNN_model3, train_loader_250k, optimizer, criterion)
    valid_loss, valid_acc = evaluate(ternary_FNN_model3, test_loader_250k, criterion)
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Remember the lowest evaluation loss so far (no checkpoint is written).
    best_valid_loss = min(best_valid_loss, valid_loss)

    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\tTestLoss: {valid_loss:.3f} |  Test Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 15s
	Train Loss: 0.885 | Train Acc: 60.42%
	TestLoss: 0.848 |  Test Acc: 62.36%
Epoch: 02 | Epoch Time: 0m 15s
	Train Loss: 0.843 | Train Acc: 62.84%
	TestLoss: 0.839 |  Test Acc: 62.81%


#### Model 8: Using my word2vec model

In [None]:
class ternary_FNN4(nn.Module):
    """Feed-forward three-class classifier over ``first10_vector`` features.

    Every review in a batch is embedded with
    ``first10_vector(w2v_model, review)`` and pushed through a
    3000 -> 50 -> 10 -> 3 fully connected network with dropout before the
    output layer. The output is three raw class logits per review.
    """

    def __init__(self):
        super(ternary_FNN4, self).__init__()
        # number of hidden nodes in each layer: 50 and 10
        hidden_1 = 50
        hidden_2 = 10

        self.fc1 = nn.Linear(3000, hidden_1)
        self.fc2 = nn.Linear(hidden_1, hidden_2)
        self.fc3 = nn.Linear(hidden_2, 3)
        # dropout layer (p=0.2)
        self.dropout = nn.Dropout(0.2)

    def forward(self, text):
        """Return logits of shape [batch size, 3] for an iterable of reviews."""
        embedded = [first10_vector(w2v_model, review) for review in text]
        embedded = torch.from_numpy(np.asarray(embedded)).float().to(device)
        # Flatten each review to the 3000 features fc1 expects, and actually
        # feed the flattened tensor forward (it used to be discarded).
        x = embedded.view(-1, 3000)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        # dropout is only active in training mode
        x = self.dropout(x)
        x = self.fc3(x)
        return x

# Build the own-word2vec ternary FNN on the target device and show its layers.
ternary_FNN_model4 = ternary_FNN4().to(device)
print(ternary_FNN_model4)

# Cross-entropy loss over the three class logits, optimised with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(ternary_FNN_model4.parameters(), lr=0.01)

ternary_FNN4(
  (fc1): Linear(in_features=3000, out_features=50, bias=True)
  (fc2): Linear(in_features=50, out_features=10, bias=True)
  (fc3): Linear(in_features=10, out_features=3, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)


In [None]:
# Train ternary_FNN_model4 for N_EPOCHS passes over train_loader_250k and
# evaluate on test_loader_250k after every epoch.
N_EPOCHS = 2
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss, train_acc = train(ternary_FNN_model4, train_loader_250k, optimizer, criterion)
    valid_loss, valid_acc = evaluate(ternary_FNN_model4, test_loader_250k, criterion)
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Remember the lowest evaluation loss so far (no checkpoint is written).
    best_valid_loss = min(best_valid_loss, valid_loss)

    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

  """
  


Epoch: 01 | Epoch Time: 0m 27s
	Train Loss: 0.889 | Train Acc: 60.53%
	 Val. Loss: 0.847 |  Val. Acc: 62.50%
Epoch: 02 | Epoch Time: 0m 27s
	Train Loss: 0.847 | Train Acc: 62.78%
	 Val. Loss: 0.837 |  Val. Acc: 62.85%


## 5. Recurrent Neural Networks

In [None]:
def trunc_padding_review(model, words, max_len=20, emb_dim=300):
    """Embed a review as a fixed-length list of word vectors.

    The review is lower-cased and split on whitespace. Only the first
    ``max_len`` tokens are kept; each is looked up as ``model[token]`` when
    ``token in model`` and replaced by a zero vector otherwise. Reviews with
    fewer than ``max_len`` tokens are padded with zero vectors at the front,
    so the real words always sit at the end of the sequence.

    Args:
        model: Mapping-like embedding lookup supporting ``in`` and ``[]``.
        words: Review text as a single string.
        max_len: Number of vectors to return (default 20).
        emb_dim: Length of the zero vectors used for unknown words and
            padding (default 300); should match the vectors in ``model``.

    Returns:
        A list of exactly ``max_len`` vectors.
    """
    tokens = words.lower().split()[:max_len]
    vectors = [model[tok] if tok in model else np.zeros(emb_dim) for tok in tokens]
    # Left-pad with fresh zero vectors (one array per slot, never shared).
    padding = [np.zeros(emb_dim) for _ in range(max_len - len(vectors))]
    return padding + vectors

In [None]:
# Mini-batch size shared by both loaders below.
batch_size = 200

# Training split: raw review text paired with its sentiment label.
training_X = training_review["review_body"]
training_y = training_review["sentiment"].values
train_dat = list(zip(training_X, training_y))
train_loader = DataLoader(train_dat, batch_size=batch_size, shuffle=True)

# Test split, built the same way (also shuffled, as before).
testing_X = testing_review["review_body"]
testing_y = testing_review["sentiment"].values
test_dat = list(zip(testing_X, testing_y))
test_loader = DataLoader(test_dat, batch_size=batch_size, shuffle=True)

### (a) Part 1: RNN for Binary Classification

In [None]:
def train2(model, iterator, optimizer, criterion):
    """Run one training epoch of a binary model over ``iterator``.

    Each batch is a ``(text, y)`` pair: ``text`` is passed to the model as-is
    and ``y`` is moved to ``device`` as the target. Returns the per-batch mean
    of the loss and of ``binary_accuracy`` as ``(loss, accuracy)``.
    """
    model.train()
    loss_total, acc_total = 0, 0

    for text, y in iterator:
        target = y.to(device)

        optimizer.zero_grad()
        # squeeze(1) drops the single-logit column so shapes match the targets
        predictions = model(text).squeeze(1)
        loss = criterion(predictions, target)
        acc = binary_accuracy(predictions, target)

        loss.backward()
        optimizer.step()

        loss_total += loss.item()
        acc_total += acc.item()

    n_batches = len(iterator)
    return loss_total / n_batches, acc_total / n_batches

def evaluate2(model, iterator, criterion):
    """Evaluate a binary model over ``iterator`` without updating weights.

    The model is switched to eval mode and gradients are disabled. Returns the
    per-batch mean of the loss and of ``binary_accuracy`` as
    ``(loss, accuracy)``.
    """
    model.eval()
    loss_total, acc_total = 0, 0

    with torch.no_grad():
        for text, y in iterator:
            target = y.to(device)
            # squeeze(1) drops the single-logit column so shapes match the targets
            predictions = model(text).squeeze(1)

            loss_total += criterion(predictions, target).item()
            acc_total += binary_accuracy(predictions, target).item()

    n_batches = len(iterator)
    return loss_total / n_batches, acc_total / n_batches

#### Model 1: Using google pretrained model

In [None]:
class RNN1(nn.Module):
    """Single-layer RNN classifier over ``pretrained_model`` word vectors.

    Reviews are embedded with ``trunc_padding_review(pretrained_model, ...)``,
    run through an ``nn.RNN``, and the final hidden state is mapped to
    ``output_dim`` logits by a linear layer.
    """

    def __init__(self, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.rnn = nn.RNN(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        """Return logits of shape [batch size, output dim] for a batch of reviews."""
        # text is a batch of raw review strings; embed each one separately
        vectors = np.asarray([trunc_padding_review(pretrained_model, review) for review in text])
        # [batch size, sent len, emb dim] -> [sent len, batch size, emb dim]
        sequence = torch.from_numpy(vectors).float().permute(1, 0, 2)

        # output = [sent len, batch size, hid dim], hidden = [1, batch size, hid dim]
        output, hidden = self.rnn(sequence.to(device))
        last_hidden = hidden.squeeze(0)

        # sanity check: the last output step is the final hidden state
        assert torch.equal(output[-1, :, :], last_hidden)
        return self.fc(last_hidden)

# Model 1 sizes: 300-d word vectors in, 50 hidden units, one output logit.
EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM = 300, 50, 1

model1 = RNN1(EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM).to(device)

# Binary cross-entropy on the raw logit, optimised with Adam.
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model1.parameters(), lr=3e-3)

In [None]:
# Train model1 for N_EPOCHS passes over train_loader and evaluate on
# test_loader after every epoch.
N_EPOCHS = 2
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss, train_acc = train2(model1, train_loader, optimizer, criterion)
    valid_loss, valid_acc = evaluate2(model1, test_loader, criterion)
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Remember the lowest evaluation loss so far (no checkpoint is written).
    best_valid_loss = min(best_valid_loss, valid_loss)

    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\tTest Loss: {valid_loss:.3f} |  Test Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 17s
	Train Loss: 0.491 | Train Acc: 77.07%
	Test Loss: 0.475 |  Test Acc: 78.39%
Epoch: 02 | Epoch Time: 0m 17s
	Train Loss: 0.476 | Train Acc: 78.21%
	Test Loss: 0.470 |  Test Acc: 78.36%


#### Model 2: Using my word2vec model

In [None]:
class RNN2(nn.Module):
    """Single-layer RNN classifier over ``w2v_model`` word vectors.

    Reviews are embedded with ``trunc_padding_review(w2v_model, ...)``, run
    through an ``nn.RNN``, and the final hidden state is mapped to
    ``output_dim`` logits by a linear layer.
    """

    def __init__(self, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.rnn = nn.RNN(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        """Return logits of shape [batch size, output dim] for a batch of reviews."""
        # text is a batch of raw review strings; embed each one separately
        vectors = np.asarray([trunc_padding_review(w2v_model, review) for review in text])
        # [batch size, sent len, emb dim] -> [sent len, batch size, emb dim]
        sequence = torch.from_numpy(vectors).float().permute(1, 0, 2)

        # output = [sent len, batch size, hid dim], hidden = [1, batch size, hid dim]
        output, hidden = self.rnn(sequence.to(device))
        last_hidden = hidden.squeeze(0)

        # sanity check: the last output step is the final hidden state
        assert torch.equal(output[-1, :, :], last_hidden)
        return self.fc(last_hidden)

# Model 2 sizes: 300-d word vectors in, 50 hidden units, one output logit.
EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM = 300, 50, 1

model2 = RNN2(EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM).to(device)

# Binary cross-entropy on the raw logit, optimised with Adam.
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model2.parameters(), lr=3e-3)

In [None]:
# Train model2 for N_EPOCHS passes over train_loader and evaluate on
# test_loader after every epoch.
N_EPOCHS = 2
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss, train_acc = train2(model2, train_loader, optimizer, criterion)
    valid_loss, valid_acc = evaluate2(model2, test_loader, criterion)
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Remember the lowest evaluation loss so far (no checkpoint is written).
    best_valid_loss = min(best_valid_loss, valid_loss)

    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\tTest Loss: {valid_loss:.3f} |  Test Acc: {valid_acc*100:.2f}%')

  """
  


Epoch: 01 | Epoch Time: 0m 32s
	Train Loss: 0.532 | Train Acc: 74.58%
	Test Loss: 0.498 |  Test Acc: 77.40%
Epoch: 02 | Epoch Time: 0m 33s
	Train Loss: 0.504 | Train Acc: 76.93%
	Test Loss: 0.568 |  Test Acc: 71.45%


### (a) Part 2: RNN for Ternary Classification

#### Model 3: Using google pretrained model

In [None]:
class RNN3(nn.Module):
    """Single-layer RNN classifier over ``pretrained_model`` word vectors.

    Reviews are embedded with ``trunc_padding_review(pretrained_model, ...)``,
    run through an ``nn.RNN``, and the final hidden state is mapped to
    ``output_dim`` logits by a linear layer.
    """

    def __init__(self, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.rnn = nn.RNN(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        """Return logits of shape [batch size, output dim] for a batch of reviews."""
        # text is a batch of raw review strings; embed each one separately
        vectors = np.asarray([trunc_padding_review(pretrained_model, review) for review in text])
        # [batch size, sent len, emb dim] -> [sent len, batch size, emb dim]
        sequence = torch.from_numpy(vectors).float().permute(1, 0, 2)

        # output = [sent len, batch size, hid dim], hidden = [1, batch size, hid dim]
        output, hidden = self.rnn(sequence.to(device))
        last_hidden = hidden.squeeze(0)

        # sanity check: the last output step is the final hidden state
        assert torch.equal(output[-1, :, :], last_hidden)
        return self.fc(last_hidden)

# Model 3 sizes: 300-d word vectors in, 50 hidden units, three class logits out.
EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM = 300, 50, 3

model3 = RNN3(EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM).to(device)

# Cross-entropy over the three class logits, optimised with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model3.parameters(), lr=3e-3)

In [None]:
# Train model3 for N_EPOCHS passes over train_loader_250k and evaluate on
# test_loader_250k after every epoch.
N_EPOCHS = 2
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss, train_acc = train(model3, train_loader_250k, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model3, test_loader_250k, criterion)
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Remember the lowest evaluation loss so far (no checkpoint is written).
    best_valid_loss = min(best_valid_loss, valid_loss)

    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 21s
	Train Loss: 0.835 | Train Acc: 63.32%
	 Val. Loss: 0.818 |  Val. Acc: 64.28%
Epoch: 02 | Epoch Time: 0m 21s
	Train Loss: 0.829 | Train Acc: 63.64%
	 Val. Loss: 0.798 |  Val. Acc: 65.42%


#### Model 4: Using my word2vec model

In [None]:
class RNN4(nn.Module):
    """Single-layer RNN classifier over ``w2v_model`` word vectors.

    Reviews are embedded with ``trunc_padding_review(w2v_model, ...)``, run
    through an ``nn.RNN``, and the final hidden state is mapped to
    ``output_dim`` logits by a linear layer.
    """

    def __init__(self, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.rnn = nn.RNN(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        """Return logits of shape [batch size, output dim] for a batch of reviews."""
        # text is a batch of raw review strings; embed each one separately
        vectors = np.asarray([trunc_padding_review(w2v_model, review) for review in text])
        # [batch size, sent len, emb dim] -> [sent len, batch size, emb dim]
        sequence = torch.from_numpy(vectors).float().permute(1, 0, 2)

        # output = [sent len, batch size, hid dim], hidden = [1, batch size, hid dim]
        output, hidden = self.rnn(sequence.to(device))
        last_hidden = hidden.squeeze(0)

        # sanity check: the last output step is the final hidden state
        assert torch.equal(output[-1, :, :], last_hidden)
        return self.fc(last_hidden)

# Model 4 sizes: 300-d word vectors in, 50 hidden units, three class logits out.
EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM = 300, 50, 3

model4 = RNN4(EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM).to(device)

# Cross-entropy over the three class logits, optimised with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model4.parameters(), lr=3e-3)

In [None]:
# Train model4 for N_EPOCHS passes over train_loader_250k and evaluate on
# test_loader_250k after every epoch.
N_EPOCHS = 2
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss, train_acc = train(model4, train_loader_250k, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model4, test_loader_250k, criterion)
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Remember the lowest evaluation loss so far (no checkpoint is written).
    best_valid_loss = min(best_valid_loss, valid_loss)

    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

  """
  


Epoch: 01 | Epoch Time: 0m 40s
	Train Loss: 0.923 | Train Acc: 59.07%
	 Val. Loss: 0.893 |  Val. Acc: 61.30%
Epoch: 02 | Epoch Time: 0m 41s
	Train Loss: 0.897 | Train Acc: 60.96%
	 Val. Loss: 0.886 |  Val. Acc: 61.89%


### (b) Part 1: Gated RNN Binary Classification

In [None]:
def train3(model, iterator, optimizer, criterion):
    """Run one training epoch of a gated binary model over ``iterator``.

    Each batch is a ``(text, y)`` pair. A fresh initial hidden state is
    requested from ``model.init_hidden`` for every batch, sized from the batch
    itself rather than from the notebook-level ``batch_size``, so a smaller
    final batch or a loader built with another batch size still works.

    Returns:
        ``(loss, accuracy)``: per-batch means of ``criterion`` and of
        ``binary_accuracy`` over the epoch.
    """
    epoch_loss = 0
    epoch_acc = 0

    model.train()

    for text, y in iterator:
        target = y.to(device)

        # Hidden state must match the number of samples actually in this batch.
        h = model.init_hidden(y.size(0))
        h = h.data

        optimizer.zero_grad()
        # squeeze(1) drops the single-logit column so shapes match the targets
        predictions = model(text, h).squeeze(1)
        loss = criterion(predictions, target)
        acc = binary_accuracy(predictions, target)

        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def evaluate3(model, iterator, criterion):
    """Evaluate a gated binary model over ``iterator`` without updating weights.

    The model is switched to eval mode and gradients are disabled. The initial
    hidden state for every batch is sized from the batch itself rather than
    from the notebook-level ``batch_size``, so a smaller final batch or a
    loader built with another batch size still works.

    Returns:
        ``(loss, accuracy)``: per-batch means of ``criterion`` and of
        ``binary_accuracy``.
    """
    epoch_loss = 0
    epoch_acc = 0

    model.eval()

    with torch.no_grad():
        for text, y in iterator:
            target = y.to(device)

            # Hidden state must match the number of samples in this batch.
            h = model.init_hidden(y.size(0))
            h = h.data

            # squeeze(1) drops the single-logit column so shapes match the targets
            predictions = model(text, h).squeeze(1)
            loss = criterion(predictions, target)
            acc = binary_accuracy(predictions, target)

            epoch_loss += loss.item()
            epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

#### Model 5: Using google pretrained model

In [None]:
class GRUNet1(nn.Module):
    """Stacked GRU classifier over ``pretrained_model`` word vectors.

    Reviews are embedded with ``trunc_padding_review(pretrained_model, ...)``,
    run through an ``n_layers``-deep ``nn.GRU``, and the last time step of the
    GRU output goes through ReLU and a linear layer to give ``output_dim``
    logits.
    """

    def __init__(self, embedding_dim, hidden_dim, output_dim, n_layers = 2, dropout=0.1):
        super().__init__()
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim

        self.GRN = nn.GRU(embedding_dim, hidden_dim, n_layers, dropout=dropout)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.relu = nn.ReLU()

    def forward(self, text, h):
        """Return logits for a batch of reviews.

        ``text`` is a batch of raw review strings and ``h`` the initial hidden
        state of shape [n layers, batch size, hidden dim].
        See https://pytorch.org/docs/stable/generated/torch.nn.GRU.html
        """
        vectors = np.asarray([trunc_padding_review(pretrained_model, review) for review in text])
        # [batch size, sent len, emb dim] -> [sent len, batch size, emb dim]
        sequence = torch.from_numpy(vectors).float().permute(1, 0, 2)

        # output = [sent len, batch size, hidden dim]
        output, hidden = self.GRN(sequence.to(device), h)
        last_step = output[-1]
        return self.fc(self.relu(last_step))

    def init_hidden(self, batch_size):
        """Return an all-zero hidden state of shape [n layers, batch size, hidden dim]."""
        # Borrow dtype (and device) from an existing parameter, then move to `device`.
        reference = next(self.parameters()).data
        return reference.new_zeros((self.n_layers, batch_size, self.hidden_dim)).to(device)

# Model 5 sizes: 300-d word vectors in, 50 hidden units, one output logit.
EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM = 300, 50, 1

model5 = GRUNet1(EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM).to(device)

# Binary cross-entropy on the raw logit, optimised with Adam.
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model5.parameters(), lr=3e-3)

In [None]:
# Train model5 for N_EPOCHS passes over train_loader and evaluate on
# test_loader after every epoch.
N_EPOCHS = 2
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss, train_acc = train3(model5, train_loader, optimizer, criterion)
    valid_loss, valid_acc = evaluate3(model5, test_loader, criterion)
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Remember the lowest evaluation loss so far (no checkpoint is written).
    best_valid_loss = min(best_valid_loss, valid_loss)

    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 18s
	Train Loss: 0.402 | Train Acc: 81.62%
	 Val. Loss: 0.363 |  Val. Acc: 83.81%
Epoch: 02 | Epoch Time: 0m 18s
	Train Loss: 0.343 | Train Acc: 84.91%
	 Val. Loss: 0.338 |  Val. Acc: 85.16%


#### Model 6: Using my word2vec model

In [None]:
class GRUNet2(nn.Module):
    """Stacked GRU classifier over ``w2v_model`` word vectors.

    Reviews are embedded with ``trunc_padding_review(w2v_model, ...)``, run
    through an ``n_layers``-deep ``nn.GRU``, and the last time step of the GRU
    output goes through ReLU and a linear layer to give ``output_dim`` logits.
    """

    def __init__(self, embedding_dim, hidden_dim, output_dim, n_layers = 2, dropout=0.1):
        super().__init__()
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim

        self.GRN = nn.GRU(embedding_dim, hidden_dim, n_layers, dropout=dropout)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.relu = nn.ReLU()

    def forward(self, text, h):
        """Return logits for a batch of reviews.

        ``text`` is a batch of raw review strings and ``h`` the initial hidden
        state of shape [n layers, batch size, hidden dim].
        See https://pytorch.org/docs/stable/generated/torch.nn.GRU.html
        """
        vectors = np.asarray([trunc_padding_review(w2v_model, review) for review in text])
        # [batch size, sent len, emb dim] -> [sent len, batch size, emb dim]
        sequence = torch.from_numpy(vectors).float().permute(1, 0, 2)

        # output = [sent len, batch size, hidden dim]
        output, hidden = self.GRN(sequence.to(device), h)
        last_step = output[-1]
        return self.fc(self.relu(last_step))

    def init_hidden(self, batch_size):
        """Return an all-zero hidden state of shape [n layers, batch size, hidden dim]."""
        # Borrow dtype (and device) from an existing parameter, then move to `device`.
        reference = next(self.parameters()).data
        return reference.new_zeros((self.n_layers, batch_size, self.hidden_dim)).to(device)

# Model 6 sizes: 300-d word vectors in, 50 hidden units, one output logit.
EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM = 300, 50, 1

model6 = GRUNet2(EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM).to(device)

# Binary cross-entropy on the raw logit, optimised with Adam.
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model6.parameters(), lr=3e-3)

In [None]:
# Train model6 for N_EPOCHS passes over train_loader and evaluate on
# test_loader after every epoch.
N_EPOCHS = 2
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss, train_acc = train3(model6, train_loader, optimizer, criterion)
    valid_loss, valid_acc = evaluate3(model6, test_loader, criterion)
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Remember the lowest evaluation loss so far (no checkpoint is written).
    best_valid_loss = min(best_valid_loss, valid_loss)

    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

  """
  


Epoch: 01 | Epoch Time: 0m 33s
	Train Loss: 0.374 | Train Acc: 83.20%
	 Val. Loss: 0.344 |  Val. Acc: 84.90%
Epoch: 02 | Epoch Time: 0m 33s
	Train Loss: 0.331 | Train Acc: 85.52%
	 Val. Loss: 0.342 |  Val. Acc: 85.10%


### (b) Part 2: Gated RNN for Ternary Classification

In [None]:
def train4(model, iterator, optimizer, criterion):
    """Run one training epoch of a gated multi-class model over ``iterator``.

    Each batch is a ``(text, y)`` pair. A fresh initial hidden state is
    requested from ``model.init_hidden`` for every batch, sized from the batch
    itself rather than from the notebook-level ``batch_size``, so a smaller
    final batch or a loader built with another batch size still works.

    Returns:
        ``(loss, accuracy)``: per-batch means of ``criterion`` and of
        ``categorical_accuracy`` over the epoch.
    """
    epoch_loss = 0
    epoch_acc = 0

    model.train()

    for text, y in iterator:
        target = y.to(device)

        # Hidden state must match the number of samples actually in this batch.
        h = model.init_hidden(y.size(0))
        h = h.data

        optimizer.zero_grad()
        predictions = model(text, h).squeeze(1)
        loss = criterion(predictions, target)
        acc = categorical_accuracy(predictions, target)

        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def evaluate4(model, iterator, criterion):
    """Evaluate a gated multi-class model over ``iterator`` without updating weights.

    The model is switched to eval mode and gradients are disabled. The initial
    hidden state for every batch is sized from the batch itself rather than
    from the notebook-level ``batch_size``, so a smaller final batch or a
    loader built with another batch size still works.

    Returns:
        ``(loss, accuracy)``: per-batch means of ``criterion`` and of
        ``categorical_accuracy``.
    """
    epoch_loss = 0
    epoch_acc = 0

    model.eval()

    with torch.no_grad():
        for text, y in iterator:
            target = y.to(device)

            # Hidden state must match the number of samples in this batch.
            h = model.init_hidden(y.size(0))
            h = h.data

            predictions = model(text, h).squeeze(1)
            loss = criterion(predictions, target)
            acc = categorical_accuracy(predictions, target)

            epoch_loss += loss.item()
            epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

#### Model 7: Using google pretrained model

In [None]:
class GRUNet3(nn.Module):
    """Stacked GRU classifier over ``pretrained_model`` word vectors.

    Reviews are embedded with ``trunc_padding_review(pretrained_model, ...)``,
    run through an ``n_layers``-deep ``nn.GRU``, and the last time step of the
    GRU output goes through ReLU and a linear layer to give ``output_dim``
    logits.
    """

    def __init__(self, embedding_dim, hidden_dim, output_dim, n_layers = 2, dropout=0.1):
        super().__init__()
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim

        self.GRN = nn.GRU(embedding_dim, hidden_dim, n_layers, dropout=dropout)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.relu = nn.ReLU()

    def forward(self, text, h):
        """Return logits for a batch of reviews.

        ``text`` is a batch of raw review strings and ``h`` the initial hidden
        state of shape [n layers, batch size, hidden dim].
        See https://pytorch.org/docs/stable/generated/torch.nn.GRU.html
        """
        vectors = np.asarray([trunc_padding_review(pretrained_model, review) for review in text])
        # [batch size, sent len, emb dim] -> [sent len, batch size, emb dim]
        sequence = torch.from_numpy(vectors).float().permute(1, 0, 2)

        # output = [sent len, batch size, hidden dim]
        output, hidden = self.GRN(sequence.to(device), h)
        last_step = output[-1]
        return self.fc(self.relu(last_step))

    def init_hidden(self, batch_size):
        """Return an all-zero hidden state of shape [n layers, batch size, hidden dim]."""
        # Borrow dtype (and device) from an existing parameter, then move to `device`.
        reference = next(self.parameters()).data
        return reference.new_zeros((self.n_layers, batch_size, self.hidden_dim)).to(device)

# Model 7 sizes: 300-d word vectors in, 50 hidden units, three class logits out.
EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM = 300, 50, 3

model7 = GRUNet3(EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM).to(device)

# Cross-entropy over the three class logits, optimised with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model7.parameters(), lr=3e-3)

In [None]:
# Train model7 for N_EPOCHS passes over train_loader_250k and evaluate on
# test_loader_250k after every epoch.
N_EPOCHS = 2
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    start_time = time.time()
    train_loss, train_acc = train4(model7, train_loader_250k, optimizer, criterion)
    valid_loss, valid_acc = evaluate4(model7, test_loader_250k, criterion)
    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Remember the lowest evaluation loss so far (no checkpoint is written).
    best_valid_loss = min(best_valid_loss, valid_loss)

    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

Epoch: 01 | Epoch Time: 0m 23s
	Train Loss: 0.774 | Train Acc: 66.32%
	 Val. Loss: 0.726 |  Val. Acc: 68.62%
Epoch: 02 | Epoch Time: 0m 23s
	Train Loss: 0.712 | Train Acc: 69.59%
	 Val. Loss: 0.709 |  Val. Acc: 69.71%


#### Model 8: Using my word2vec model

In [None]:
class GRUNet4(nn.Module):
    """GRU classifier that embeds raw reviews with a word-vector model on the fly.

    Each review in a batch is converted to a fixed-length matrix of word
    vectors by ``trunc_padding_review``; the sequence is run through a
    multi-layer GRU and the output at the last time step is mapped to class
    logits by a ReLU followed by a linear layer.

    Args:
        embedding_dim: size of each word vector fed to the GRU.
        hidden_dim: size of the GRU hidden state.
        output_dim: number of output logits.
        n_layers: number of stacked GRU layers.
        dropout: dropout applied between GRU layers.
        embedding_model: optional word-vector model passed to
            ``trunc_padding_review``. When omitted, the notebook-level
            ``w2v_model`` is looked up at call time, which is the original
            behaviour.
    """

    def __init__(self, embedding_dim, hidden_dim, output_dim, n_layers=2, dropout=0.1,
                 embedding_model=None):
        super().__init__()
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        self.embedding_model = embedding_model

        # The attribute is intentionally still called `GRN` so that existing
        # state_dict keys remain valid.
        self.GRN = nn.GRU(embedding_dim, hidden_dim, n_layers, dropout=dropout)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.relu = nn.ReLU()

    def forward(self, text, h):
        """Compute class logits for a batch of reviews.

        Args:
            text: iterable of reviews; every element is handed to
                ``trunc_padding_review`` individually.
            h: initial GRU hidden state, e.g. from ``init_hidden``.

        Returns:
            The output of the final linear layer applied to the GRU output at
            the last time step.
        """
        # Fall back to the notebook global only when no model was injected.
        lookup = w2v_model if self.embedding_model is None else self.embedding_model
        embedded = [trunc_padding_review(lookup, review) for review in text]

        # Stacked result is assumed to be (batch, seq_len, embedding_dim) - TODO confirm
        # against trunc_padding_review. nn.GRU without batch_first expects
        # (seq_len, batch, embedding_dim), hence the permute.
        # https://pytorch.org/docs/stable/generated/torch.nn.GRU.html
        embedded = torch.from_numpy(np.asarray(embedded)).float()
        embedded = embedded.permute(1, 0, 2)

        # Send the inputs to wherever the weights live rather than to a global `device`.
        target_device = next(self.parameters()).device
        output, _ = self.GRN(embedded.to(target_device), h)

        # output[-1] is the top-layer GRU output at the last time step.
        return self.fc(self.relu(output[-1]))

    def init_hidden(self, batch_size):
        """Return a zero hidden state of shape (n_layers, batch_size, hidden_dim).

        The tensor takes the dtype and device of the module's first parameter,
        so it always matches the model's current placement.
        """
        reference = next(self.parameters()).data
        return reference.new_zeros((self.n_layers, batch_size, self.hidden_dim))

# Sizes handed positionally to GRUNet4 below as
# (embedding_dim, hidden_dim, output_dim).
EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM = 300, 50, 3

# Build model 8 and move it to the notebook's `device`; nn.Module.to returns
# the module itself, so the construction and the move can be chained.
model8 = GRUNet4(EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM).to(device)

# Loss and optimizer used by the training cell that follows.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model8.parameters(), lr=3e-3)

In [None]:
N_EPOCHS = 2

# Lowest evaluation loss seen so far; +inf guarantees the first epoch improves on it.
best_valid_loss = float('inf')
for epoch in range(N_EPOCHS):

    # Time one full pass: training on train_loader_250k, then evaluation.
    start_time = time.time()

    train_loss, train_acc = train4(model8, train_loader_250k, optimizer, criterion)
    # NOTE(review): the "Val." figures below are computed on test_loader_250k.
    valid_loss, valid_acc = evaluate4(model8, test_loader_250k, criterion)
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # Only the best loss value is tracked; no checkpoint is written here.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss

    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')
  


Epoch: 01 | Epoch Time: 0m 42s
	Train Loss: 0.747 | Train Acc: 67.86%
	 Val. Loss: 0.714 |  Val. Acc: 69.39%
Epoch: 02 | Epoch Time: 0m 42s
	Train Loss: 0.702 | Train Acc: 69.96%
	 Val. Loss: 0.703 |  Val. Acc: 69.86%
