In [75]:
import pandas as pd
import numpy as np
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/jadriantan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [76]:
# Might need this 
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jadriantan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [77]:
! pip3 install bs4 # in case you don't have it installed



# Data Generation

In [78]:
df = pd.read_csv("data.csv", sep=',', on_bad_lines='skip')

  df = pd.read_csv("data.csv", sep=',', on_bad_lines='skip')


In [79]:
# Extract Reviews and Ratings fields
df = df.loc[:, ['review_body', 'star_rating']]

print(df.head())

                                         review_body star_rating
0                                     Great product.           5
1  What's to say about this commodity item except...           5
2    Haven't used yet, but I am sure I will like it.           5
3  Although this was labeled as &#34;new&#34; the...           1
4                    Gorgeous colors and easy to use           4


In [80]:
# Converting into binary classification problem
df['label'] = df['star_rating'].apply(lambda x: 1 if x in [1,2,3] else 2)
df['review'] = df['review_body']

# Selecting 50,000 random reviews from each rating class
# Randomizing to avoid biases 
df_class_1 = df[df['label'] == 1].sample(n=50000, random_state=55)
df_class_2 = df[df['label'] == 2].sample(n=50000, random_state=55)

# Creating a new df concatenating both classes 
balanced_df = pd.concat([df_class_1, df_class_2])

print(balanced_df.head(10))

                                              review_body star_rating  label  \
98510   These do not work for refill ink it states one...           1      1   
233824  Once the ink cartriges were installed, things ...           3      1   
877033  Nothing special to say here - they are postcar...           3      1   
929541               It is a machine for counting bills.?           1      1   
201881  I guess you get what you pay for.  The magenta...           2      1   
456205               Not the best quality. Pale in color.           3      1   
645041               They do what they're supposed to do.           3      1   
862255               Very light weight, barely hold paper           1      1   
633612                          Only for 5 sheets or less           1      1   
736944  Not happy with these. I am not able to print w...           1      1   

                                                   review  
98510   These do not work for refill ink it states one...  

# Preprocessing

In [81]:
average_len_before = balanced_df['review'].str.len().mean()

# 1)converting all reviews into lowercase. 
# Ensures consistency: "Hello" and "hello" are now the same. 
balanced_df['review'] = balanced_df['review'].str.lower()

# 2)removing the HTML and URLs from the reviews
# HTML/URLs don't provide valuable information for sentiment analysis, so we remove them. 
balanced_df['review'] = balanced_df['review'].str.replace(r'<.*?>', '', regex=True)
balanced_df['review'] = balanced_df['review'].str.replace(r'http\S+', '', regex=True)

# 5)performing contractions on the reviews
    # Need to process this before before removing non-alphanum chars and extra spaces 
# This task provides uniformity and simplifies tokenization
def contractions_helper(ss):

    # To avoid attributionError
    if type(ss) != str: 
        return
    contractions_dict = {
        "won't": "will not",
        "ain't": "am not",
        "aren't": "are not",
        "can't": "cannot",
        "hasn't": "has not",
        "coudn't": "could not",
        "they're": "they are",
        "you're": "you are",
        "we'll": "we will",
        "it's": "it is",
        "i'll": "i will",
        "he's": "he is",
        "she's": "she is"
    }
    # loop through the string and replace all contractions
    for cont, exp in contractions_dict.items():
            if cont in ss:
                 ss = ss.replace(cont, exp)
    return ss

balanced_df['review'] = balanced_df['review'].apply(contractions_helper)

# 3)removing non-alphabetical characters
# Removing non-alphanum characters since they could be noise in sentiment analysis
balanced_df['review'] = balanced_df['review'].str.replace(r'[^a-zA-Z\s]', '', regex=True)

# 4)removing extra spaces
balanced_df['review'] = balanced_df['review'].str.replace(r'\s+', ' ', regex=True)


average_len_after = balanced_df['review'].str.len().mean()

# average length decreased after cleaning due to the removal of unwanted characters, spaces, and expansion of contractions
print(f'Average length before data cleaning:{average_len_before:.4f}, Average length after data cleaning:{average_len_after:.4f}')

Average length before data cleaning:189.4582, Average length after data cleaning:179.7509


# Feature Extraction

In [None]:
# imports 
from gensim.models import Word2Vec
import gensim.downloader as api


In [84]:
# Filtering out rows where 'review' is not a string
balanced_df = balanced_df[balanced_df['review'].apply(lambda x: isinstance(x, str))]

# Word2Vec model trained with amazon reviews
sentences = balanced_df['review'].str.split().tolist() # get review sentences
# train word2vec model
my_model = Word2Vec(sentences, vector_size=300, window=13, min_count=9, workers=4)
my_model.train(sentences, total_examples=len(sentences), epochs=10)

# Pre-trained word2vec model
wv_model = api.load('word2vec-google-news-300')

In [85]:
# Pretrained: Check for semantic similarities
print("Word2Vec Model: Similarity for words 'excellent' and 'outstanding':", wv_model.similarity('excellent', 'outstanding'))

# Pretrained: Check for analogy: King - Man + Woman
result = wv_model.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)
print("Word2Vec Model: King - Man + Woman = ", result[0][0])


# My model: Check semantic similarities
if 'excellent' in my_model.wv and 'outstanding' in my_model.wv:
    print("My Model: Similarity between 'excellent' and 'outstanding':", my_model.wv.similarity('excellent', 'outstanding'))

# My model: Check analogy: King - Man + Woman
if all(word in my_model.wv for word in ['woman', 'king', 'man']):
    result = my_model.wv.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)
    print("My Model: King - Man + Woman = ", result[0][0])

Word2Vec Model: Similarity for words 'excellent' and 'outstanding': 0.55674857
Word2Vec Model: King - Man + Woman =  queen
My Model: Similarity between 'excellent' and 'outstanding': 0.67850655
My Model: King - Man + Woman =  comfortable


# Simple Models: Perceptron and SVM

In [86]:
# imports
import numpy as np
from sklearn.linear_model import Perceptron
from sklearn.svm import SVC
from sklearn.metrics import f1_score, precision_score, recall_score

# TFIDF 
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer


TFIDF Features

In [87]:
from sklearn.model_selection import train_test_split

# TFIDF Features
tfidf_vector = TfidfVectorizer(max_features=5000)
tfidf_features = tfidf_vector.fit_transform(balanced_df['review'])

# Splitting data into train and test sets
x_train_tfidf, x_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(
    tfidf_features, 
    balanced_df['label'],
    test_size= 0.2,
    random_state=55
)

Word2Vec Features

In [88]:
# Word2Vec Features

# Calculate average Word2Vec for each review
def average_word2vec(review, model, dimension):
    avg_w2v = np.zeros((dimension,))
    num_words = 0
    for word in review:
        if word in model:  
            avg_w2v += model[word]  
            num_words += 1
    if num_words > 0:
        avg_w2v /= num_words
    return avg_w2v

sentences = balanced_df['review'].str.split().tolist()

# Convert reviews into feature vectors using average Word2Vec
word2vec_features = np.array([average_word2vec(review, wv_model, 300) for review in sentences])

# Splitting into train and test sets
x_train_word2vec, x_test_word2vec, y_train_word2vec, y_test_word2vec = train_test_split(
    word2vec_features, 
    balanced_df['label'],
    test_size= 0.2,
    random_state=55
)

Perceptron w/ TFIDF features and word2vec features

In [89]:
# Perceptron on tfidf features
tfidf_perceptron = Perceptron(max_iter=1000, random_state=55)

# Fit training set into Perceptron Model 
tfidf_perceptron.fit(x_train_tfidf, y_train_tfidf)

# Make predictions w/ testing set
tfidf_prediction = tfidf_perceptron.predict(x_test_tfidf)

# Report Precision, Recall, and f1-score
tfidf_test_precision = precision_score(y_test_tfidf, tfidf_prediction, average='binary')
tfidf_test_recall = recall_score(y_test_tfidf, tfidf_prediction, average='binary')
tfidf_test_f1 = f1_score(y_test_tfidf, tfidf_prediction, average='binary')

# Print
print(f"TF-IDF Perceptron Model~ Precision:{tfidf_test_precision:.4f}, Recall:{tfidf_test_recall:.4f}, F1-Score:{tfidf_test_f1:.4f}")


# Perceptron on word2vec features
word2vec_perceptron = Perceptron(max_iter=1000, random_state=55)

# Fit training set into Perceptron Model
word2vec_perceptron.fit(x_train_word2vec, y_train_word2vec)

# Make predictions w/ testing set
word2vec_prediction = word2vec_perceptron.predict(x_test_word2vec)

# Report Precision, Recall, and f1-score
word2vec_test_precision = precision_score(y_test_word2vec, word2vec_prediction, average='binary')
word2vec_test_recall = recall_score(y_test_word2vec, word2vec_prediction, average='binary')
word2vec_test_f1 = f1_score(y_test_word2vec, word2vec_prediction, average='binary')

# Print
print(f"Word2Vec Perceptron Model~ Precision:{word2vec_test_precision:.4f}, Recall:{word2vec_test_recall:.4f}, F1-Score:{word2vec_test_f1:.4f}")

TF-IDF Perceptron Model~ Precision:0.7969, Recall:0.8345, F1-Score:0.8153
Word2Vec Perceptron Model~ Precision:0.8374, Recall:0.7370, F1-Score:0.7840


My explanation:
- Deep learning models perform better with embeddings like Word2Vec than with sparse representations like TF-IDF. 
- Linear models like perceptrons might sometimes favor the discriminative power of TF-IDF.

# Feedforward Neural Networks

In [90]:
# import
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

In [91]:
# Convert to PyTorch tensors
x_train_tensor = torch.FloatTensor(x_train_word2vec)
y_train_tensor = torch.LongTensor(y_train_word2vec.values) 
x_test_tensor = torch.FloatTensor(x_test_word2vec)
y_test_tensor = torch.LongTensor(y_test_word2vec.values)

# Check before moving forward
print("Unique values in y_train_tensor:", torch.unique(y_train_tensor))
print("Unique values in y_test_tensor:", torch.unique(y_test_tensor))

print("Shape of X_train_tensor:", x_train_tensor.shape)
# print("Sample values from X_train_tensor:", X_train_tensor[:5])
print("Shape of X_test_tensor:", x_test_tensor.shape)
# print("Sample values from X_test_tensor:", X_test_tensor[:5])

y_train_tensor -= 1
y_test_tensor -= 1

print("Updated unique values in y_train_tensor:", torch.unique(y_train_tensor))
print("Updated unique values in y_test_tensor:", torch.unique(y_test_tensor))

Unique values in y_train_tensor: tensor([1, 2])
Unique values in y_test_tensor: tensor([1, 2])
Shape of X_train_tensor: torch.Size([79988, 300])
Shape of X_test_tensor: torch.Size([19998, 300])
Updated unique values in y_train_tensor: tensor([0, 1])
Updated unique values in y_test_tensor: tensor([0, 1])


In [92]:
train_data = TensorDataset(x_train_tensor, y_train_tensor)
train_loader = DataLoader(train_data, batch_size=64, shuffle=True)

class MLP(nn.Module):
    def __init__(self):
        super(MLP, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(300, 50),
            nn.ReLU(),
            nn.Linear(50, 5),
            nn.ReLU(),
            nn.Linear(5, 2),
            nn.Softmax(dim=1)
        )
    
    def forward(self, x):
        return self.layers(x)

model = MLP()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
epochs = 10
for epoch in range(epochs):
    for batch_idx, (data, target) in enumerate(train_loader):
        optimizer.zero_grad()
        outputs = model(data)
        loss = criterion(outputs, target)
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item()}")

# Testing
with torch.no_grad():
    test_outputs = model(x_test_tensor)
    _, predicted = test_outputs.max(1)
    accuracy = (predicted == y_test_tensor).sum().item() / len(y_test_word2vec)
    print(f"Test Accuracy: {accuracy * 100:.2f}%")

Epoch 1/10, Loss: 0.4475921392440796
Epoch 2/10, Loss: 0.5642859935760498
Epoch 3/10, Loss: 0.46849915385246277
Epoch 4/10, Loss: 0.44438159465789795
Epoch 5/10, Loss: 0.5127902030944824
Epoch 6/10, Loss: 0.4660440683364868
Epoch 7/10, Loss: 0.4226842522621155
Epoch 8/10, Loss: 0.5010776519775391
Epoch 9/10, Loss: 0.5440772175788879
Epoch 10/10, Loss: 0.4745391309261322
Test Accuracy: 83.72%


Concatenate the first 10 Word2Vec vectors for each review as the input feature

In [93]:
def concat_first_10_word2vec(review, model, dimension):
    feature_vec = []
    for i in range(10):
        if i < len(review) and review[i] in model:
            feature_vec.extend(model[review[i]])
        else:
            feature_vec.extend(np.zeros((dimension,)))
    return feature_vec

sentences = balanced_df['review'].str.split().tolist()

# Convert reviews into feature vectors using concatenated Word2Vec
word2vec_10words_features = np.array([concat_first_10_word2vec(review, wv_model, 300) for review in sentences])

In [94]:
# split into train and test sets
x_train_10words, x_test_10words, y_train_10words, y_test_10words = train_test_split(
    word2vec_10words_features, 
    balanced_df['label'],
    test_size= 0.2,
    random_state=55
)

x_train_tensor = torch.FloatTensor(x_train_10words)
y_train_tensor = torch.LongTensor(y_train_10words.values) 
x_test_tensor = torch.FloatTensor(x_test_10words)
y_test_tensor = torch.LongTensor(y_test_10words.values)

y_train_tensor -= 1
y_test_tensor -= 1



In [95]:
train_data = TensorDataset(x_train_tensor, y_train_tensor)
train_loader = DataLoader(train_data, batch_size=64, shuffle=True)

class MLP(nn.Module):
    def __init__(self):
        super(MLP, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(3000, 50),
            nn.ReLU(),
            nn.Linear(50, 5),
            nn.ReLU(),
            nn.Linear(5, 2),
            nn.Softmax(dim=1)
        )
    
    def forward(self, x):
        return self.layers(x)

model = MLP()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
epochs = 10
for epoch in range(epochs):
    for batch_idx, (data, target) in enumerate(train_loader):
        optimizer.zero_grad()
        outputs = model(data)
        loss = criterion(outputs, target)
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item()}")

# Testing
with torch.no_grad():
    test_outputs = model(x_test_tensor)
    _, predicted = test_outputs.max(1)
    accuracy = (predicted == y_test_tensor).sum().item() / len(y_test_10words)
    print(f"Test Accuracy: {accuracy * 100:.2f}%")

Epoch 1/10, Loss: 0.4758097231388092
Epoch 2/10, Loss: 0.467458575963974
Epoch 3/10, Loss: 0.475098580121994
Epoch 4/10, Loss: 0.48342689871788025
Epoch 5/10, Loss: 0.4066973328590393
Epoch 6/10, Loss: 0.49539807438850403
Epoch 7/10, Loss: 0.4893752336502075
Epoch 8/10, Loss: 0.41361528635025024
Epoch 9/10, Loss: 0.38959094882011414
Epoch 10/10, Loss: 0.39206886291503906
Test Accuracy: 80.78%


# Recurrent Neural Network

In [96]:
# imports
import torch.nn.functional as F

A little bit of preprocessing

In [116]:
# Using this: x_train_word2vec, x_test_word2vec, y_train_word2vec, y_test_word2vec

# To feed your data into our RNN, limit the maximum review length to 10 
# by truncating longer reviews and padding shorter reviews with a null value (0)

def get_word2vec_sequence(review, model, dimension):
    w2v_sequence = []
    for word in review:
        if word in model:
            w2v_sequence.append(model[word])
        else:
            w2v_sequence.append(np.zeros((dimension,)))  # Using a zero vector for unknown words
    return w2v_sequence

sentences = balanced_df['review'].str.split().tolist()

# Convert reviews into sequences of Word2Vec vectors
word2vec_sequences = [get_word2vec_sequence(review, wv_model, 300) for review in sentences]

max_sequence_length = 10  # or whatever length you decide

def pad_or_truncate_sequence(sequence, max_length):
    if len(sequence) > max_length:
        sequence = sequence[:max_length]
    else:
        while len(sequence) < max_length:
            sequence.append(np.zeros((300,)))  # Padding with zero vectors
    return sequence

padded_word2vec_sequences = np.array([pad_or_truncate_sequence(seq, max_sequence_length) for seq in word2vec_sequences])

x_train_word2vec, x_test_word2vec, y_train_word2vec, y_test_word2vec = train_test_split(
    padded_word2vec_sequences, 
    balanced_df['label'],
    test_size= 0.2,
    random_state=55
)

In [117]:
# check 
print(x_train_word2vec.shape, x_test_word2vec.shape)


(79988, 300) (19998, 300)


### Simple RNN

In [104]:
class SimpleRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(SimpleRNN, self).__init__()
        
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        h0 = torch.zeros(1, x.size(0), hidden_size).to(x.device)  # Initialize hidden state
        out, _ = self.rnn(x, h0)
        out = self.fc(out[:, -1, :])  # Only take the last time step's output for classification
        return out

input_size = 300  # Word2Vec dimension
hidden_size = 50
output_size = 2   # Number of classes
model = SimpleRNN(input_size, hidden_size, output_size)

In [118]:
x_train_rnn_tensor = torch.FloatTensor(x_train_word2vec)
x_test_rnn_tensor = torch.FloatTensor(x_test_word2vec)
y_train_rnn_tensor = torch.LongTensor(y_train_word2vec.values)
y_test_rnn_tensor = torch.LongTensor(y_test_word2vec.values)

print("Unique values in y_train_tensor:", torch.unique(y_train_rnn_tensor))
print("Unique values in y_test_tensor:", torch.unique(y_test_rnn_tensor))


Unique values in y_train_tensor: tensor([0, 1])
Unique values in y_test_tensor: tensor([0, 1])
