In [1]:
# !pip install pandas
# !pip install numpy

import pandas as pd
import numpy as np

## Ensure reproducibility
Use a fixed seed so that all steps and results can be reproduced.

In [2]:
# Hand-picked seed: chosen so that every cleaning/pre-processing step is visibly demonstrated
SEED = 544
# Seed NumPy's global random state with it
np.random.seed(seed=SEED)

# Q3. Simple Models
In this section, we use the Word2Vec features from the section above to train a Perceptron and an SVM for binary classification. The input feature is the average of the Word2Vec vectors for each review; words with no encoding vector are ignored. Data cleaning and preprocessing steps similar to those in HW1 are applied. Finally, the dataset is split into an 80%-20% training/testing split.

## Load dataset
Load the pandas dataset from Q1.

In [3]:
# Load the data from disk
# NOTE: read_pickle unpickles the file, and unpickling untrusted data can execute
# arbitrary code - only load a 'dataset.pkl' that you produced yourself.
data = pd.read_pickle('dataset.pkl')

## Load both Word2Vec models
Load the w2v models from Q2.

In [6]:
import gensim.downloader as api
from gensim.models import KeyedVectors

# Pre-trained Google News vectors, obtained through the gensim downloader
w2v_google = api.load('word2vec-google-news-300')
# Our own vectors, restored from the local 'my_w2v.w2v' file
w2v_own = KeyedVectors.load('my_w2v.w2v')

## Create the averaged input features for the reviews
Input feature is the average Word2Vec vector for each review. Words with no encoding vectors are ignored.

In [7]:
# Transform the given review body text into the averaged Word2Vec vector using a given trained word2vec model
def create_avg_input_feature(text, wv):
    """Return the mean word vector of a review as a plain Python list.

    The text is split on whitespace and every token found in ``wv`` contributes
    its vector to the average; tokens without a vector are skipped.

    Parameters
    ----------
    text
        The review body. Non-string values are converted with ``str``. Missing
        values (``None`` or a float NaN) are treated as empty text, so they are
        not looked up as the literal tokens ``'None'`` / ``'nan'``.
    wv
        Word-vector model supporting ``word in wv`` and ``wv[word]``. If it
        exposes a ``vector_size`` attribute, that is used as the output
        dimension; otherwise 300 is assumed (the previous hard-coded value).

    Returns
    -------
    list of float
        The averaged vector; all zeros when no token has a vector.
    """
    dim = getattr(wv, 'vector_size', 300)
    vec = np.zeros((dim,), dtype=float)

    # Missing review: nothing to average, return the zero vector.
    # (text != text is only true for NaN.)
    if text is None or (isinstance(text, float) and text != text):
        return vec.tolist()

    count = 0
    # Will skip words that have no vectors
    for word in str(text).split():
        if word in wv:
            vec += np.asarray(wv[word], dtype=float)
            count += 1

    # Guard against division by zero when no token was found in the model
    if count > 0:
        vec /= count
    return vec.tolist()

# Add one averaged-embedding column per Word2Vec model: first ours, then Google's.
# The model is forwarded to create_avg_input_feature as its second positional argument.
data['own_input_features'] = data['cleaned_reviews'].apply(
    create_avg_input_feature, args=(w2v_own,)
)

data['google_input_features'] = data['cleaned_reviews'].apply(
    create_avg_input_feature, args=(w2v_google,)
)

## Helper function to report results for each model

In [8]:
from sklearn.metrics import accuracy_score

def report_accuracy(text, y_true, y_pred):
    """Print the accuracy of ``y_pred`` against ``y_true``, labelled with ``text``.

    The score is formatted to three decimals and the message is followed by
    one empty line.
    """
    accuracy = accuracy_score(y_true, y_pred)
    # end='\n\n' emits the message line plus the trailing blank line in one call
    print(f'{text}: accuracy is {accuracy:.3f}.', end='\n\n')

## Training and Testing data split
Split the data into two distinct parts (80% training, 20% testing) so that there is no overlap. This ensures that neither data leakage nor bias influences the training, and it gives us a better view of the training process (for example, whether the model has overfitted).

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Keep only rows with label <= 1 (original note: class 0 = positive, class 1 = negative)
binary_data = data[data['label'] <= 1]
binary_labels = binary_data['label']

# Each entry of the feature columns is already a list of floats, so tolist() yields the
# list-of-lists directly; no element-wise copy is needed.
own_input_features = binary_data['own_input_features'].tolist()
google_input_features = binary_data['google_input_features'].tolist()

# One place for the 80-20 split settings shared by all three feature sets
SPLIT_KWARGS = dict(test_size=0.2, random_state=SEED)

# TF-IDF: split the raw text first, then fit the vectorizer on the training part only.
# The raw text gets its own names so X_*_tfidf always refers to the TF-IDF matrices.
X_train_text, X_test_text, y_train_tfidf, y_test_tfidf = train_test_split(
    binary_data['cleaned_reviews'],
    binary_labels,
    **SPLIT_KWARGS
)
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train_text)
X_test_tfidf = vectorizer.transform(X_test_text)

# Averaged vectors from our own Word2Vec model
X_train_own, X_test_own, y_train_own, y_test_own = train_test_split(
    own_input_features,
    binary_labels,
    **SPLIT_KWARGS
)

# Averaged vectors from the Google News Word2Vec model
X_train_google, X_test_google, y_train_google, y_test_google = train_test_split(
    google_input_features,
    binary_labels,
    **SPLIT_KWARGS
)

## Perceptron - My Word2Vec Model
Use our Word2Vec model as an input to the Perceptron.

In [10]:
from sklearn.linear_model import Perceptron

# Fit a default Perceptron on the averaged vectors from our own Word2Vec model
model = Perceptron().fit(X_train_own, y_train_own)

# Predict on the held-out split and report the accuracy
y_test_pred = model.predict(X_test_own)
report_accuracy('Own W2C - Perceptron - Binary', y_test_own, y_test_pred)

Own W2C - Perceptron - Binary: accuracy is 0.737.



## Perceptron - Google News Word2Vec Model
Use pre-trained Word2Vec model as an input to the Perceptron.

In [11]:
# Fit a default Perceptron on the averaged Google News Word2Vec vectors
model = Perceptron().fit(X_train_google, y_train_google)

# Predict on the held-out split and report the accuracy
y_test_pred = model.predict(X_test_google)
report_accuracy('Google News W2V - Perceptron - Binary', y_test_google, y_test_pred)

Google News W2V - Perceptron - Binary: accuracy is 0.787.



## Perceptron - TF-IDF (HW1)
Use TF-IDF Feature Extraction to train the Perceptron; identical to HW1.

In [12]:
# Fit a default Perceptron on the TF-IDF features
model = Perceptron().fit(X_train_tfidf, y_train_tfidf)

# Predict on the held-out split and report the accuracy
y_test_pred = model.predict(X_test_tfidf)
report_accuracy('TF-IDF - Perceptron - Binary', y_test_tfidf, y_test_pred)

TF-IDF - Perceptron - Binary: accuracy is 0.822.



## Perceptron - Conclusion
Our Word2Vec model achieved a testing accuracy of 0.737, whereas Google's model got 0.787. This difference is minuscule and we can conclude that our model is decent. When compared to HW1's TF-IDF approach, which scored a testing accuracy of 0.822, we can see that Word2Vec falls slightly behind. Nevertheless, they are all at approximately 80% accuracy.

Initially, I believed our model should be better than Google's model because our model was tailored specifically to the reviews and was trained on a dataset from the same domain, whereas Google's model was trained on a news dataset, so its semantic relationships could differ. However, our Word2Vec model was trained on data that had a lot of typos and other features that could harm the training, whereas Google's data had a more professional tone.

## SVM - My Word2Vec Model
Use our Word2Vec model as an input to the SVM.

In [13]:
from sklearn.svm import LinearSVC

# Fit a linear SVM (max_iter=2500) on the averaged vectors from our own Word2Vec model
model = LinearSVC(max_iter=2500).fit(X_train_own, y_train_own)

# Predict on the held-out split and report the accuracy
y_test_pred = model.predict(X_test_own)
report_accuracy('Own W2C - SVM - Binary', y_test_own, y_test_pred)



Own W2C - SVM - Binary: accuracy is 0.836.



## SVM - Google News Word2Vec Model
Use pre-trained Word2Vec model as an input to the SVM.

In [14]:
# Fit a linear SVM (max_iter=2500) on the averaged Google News Word2Vec vectors
model = LinearSVC(max_iter=2500).fit(X_train_google, y_train_google)

# Predict on the held-out split and report the accuracy
y_test_pred = model.predict(X_test_google)
report_accuracy('Google News W2V - SVM - Binary', y_test_google, y_test_pred)

Google News W2V - SVM - Binary: accuracy is 0.819.



## SVM - TF-IDF (HW1)
Use TF-IDF Feature Extraction to train the SVM; identical to HW1.

In [15]:
# Fit a linear SVM (max_iter=2500) on the TF-IDF features
model = LinearSVC(max_iter=2500).fit(X_train_tfidf, y_train_tfidf)

# Predict on the held-out split and report the accuracy
y_test_pred = model.predict(X_test_tfidf)
report_accuracy('TF-IDF - SVM - Binary', y_test_tfidf, y_test_pred)

TF-IDF - SVM - Binary: accuracy is 0.870.



## SVM - Conclusion
Our Word2Vec model achieved a testing accuracy of 0.836, whereas Google's model got 0.819. While this difference is small, our model performed better in SVM than in Perceptron. Moreover, when compared to HW1's TF-IDF approach, which scored a testing accuracy of 0.870, we can see that Word2Vec falls slightly behind as well. Nevertheless, they are all at approximately 85% accuracy. TF-IDF still reigns supreme in both SVM and Perceptron.

Unlike before, I believe our model is better than Google's model because our model used a domain-specific dataset tailored to reviews, whereas Google used a news dataset, which is generic for this task.