In [1]:
# !pip install pandas
# !pip install numpy
# !pip install nltk
# !pip install bs4

import pandas as pd
import numpy as np
import nltk
nltk.download('wordnet')
import re
from bs4 import BeautifulSoup

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Sanavesa\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Ensure reproducibility
Use a fixed seed so that all steps and results can be reproduced.

In [2]:
# Single seed value reused by the stochastic steps of this notebook
SEED = 544
# Seed NumPy's global random number generator
np.random.seed(seed=SEED)

# Q1. Dataset Generation

## Read Data
Load the data from the dataset URL, skipping any lines that contain errors.

In [3]:
# Dataset: https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Kitchen_v1_00.tsv.gz
# Read the tab-separated reviews file straight from its URL.
# read_table already defaults to a tab separator; malformed rows are skipped instead of raising.
url = 'https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Kitchen_v1_00.tsv.gz'
data = pd.read_table(url, on_bad_lines='skip')

## Keep Reviews and Ratings
Strip the dataset to use only the two columns we're interested in, while dropping all rows that have missing values.

In [4]:
# Keep only the review text and its star rating, then discard every row
# in which either of the two values is missing
data = data[['review_body', 'star_rating']].dropna()

## Build a Balanced Dataset
Only keep 250K reviews along with their ratings. That is 50K reviews for each rating score (1-5).

In [5]:
# Draw 50,000 reviews for each star rating (1 through 5), in ascending rating order.
# NOTE(review): sample() receives no random_state here, so reproducibility presumably
# relies on the global NumPy seed set earlier in the notebook — confirm.
rating_samples = [
    data[data['star_rating'] == stars].sample(50000)
    for stars in range(1, 6)
]
# Stack the five per-rating samples into one balanced frame
data = pd.concat(rating_samples)

## Labelling Reviews
Positive sentiment (class 1, stored as label 0) is a rating greater than 3. Negative sentiment (class 2, stored as label 1) is a rating less than 3. Neutral sentiment (class 3, stored as label 2) is a rating of exactly 3. Here, we create a new column for the review's sentiment label.

In [6]:
# Transform the given rating (1-5) into its respective sentiment class
def label_review(rating):
    """Map a star rating to its integer sentiment label.

    Returns 0 for ratings above 3 (positive), 1 for ratings below 3 (negative)
    and 2 for everything else (a rating of 3, i.e. neutral).
    """
    if rating > 3:
        return 0
    if rating < 3:
        return 1
    return 2

# Derive the sentiment label of every review from its star rating and
# store it compactly as int8 in a new 'label' column
data['label'] = data['star_rating'].map(label_review).astype('int8')

## Data Cleaning
Clean and pre-process the data to improve performance, right before generating input features for each review.

In [7]:
# Create a new column for the cleaned/processed reviews
# It shall follow the same procedure from HW1.

###### Convert to lower case
# The cleaned column starts as a lower-cased version of the raw review text;
# the 'review_body' column itself is not modified by this step.
data['cleaned_reviews'] = data['review_body'].str.lower()

###### Remove HTML tags and URLs from a string
def sanitize_review(text):
    """Return `text` (coerced to str) without HTML markup and without 'http...' tokens."""
    # Let the HTML parser extract the visible text only
    plain_text = BeautifulSoup(str(text), 'html.parser').get_text()
    # Delete every run of non-whitespace characters that starts with 'http'
    url_free_text = re.sub(r'http\S+', '', str(plain_text))
    return url_free_text
data['cleaned_reviews'] = data['cleaned_reviews'].map(sanitize_review)

###### Use a library to expand the contractions as it includes a plethora of pre-defined contractions
# !pip install contractions
import contractions
def fix_contractions(text):
    """Run `text` through contractions.fix and return the result."""
    expanded_text = contractions.fix(text)
    return expanded_text
data['cleaned_reviews'] = data['cleaned_reviews'].map(fix_contractions)

###### Remove all characters but keep english characters and space
# Every character that is neither an English letter nor whitespace becomes a space.
# regex=True must be explicit: pandas >= 2.0 treats the pattern as a literal string by
# default, which would silently leave the text unchanged. The raw string avoids the
# invalid '\s' escape sequence of a plain string literal.
data['cleaned_reviews'] = data['cleaned_reviews'].str.replace(r'[^a-zA-Z\s]', ' ', regex=True)

###### Remove all unnecessary spaces
def remove_extra_spaces(text):
    """Collapse every run of whitespace in `text` (coerced to str) into one space.

    Leading and trailing whitespace is dropped as well.
    """
    words = str(text).split()
    return ' '.join(words)
# Normalise the whitespace of every review in place of the previous cleaned text
data['cleaned_reviews'] = data['cleaned_reviews'].map(remove_extra_spaces)

###### Remove stopwords
from nltk.corpus import stopwords
nltk.download('stopwords')
# Keep the stopwords in a set: membership tests are then constant-time, whereas the
# plain list returned by NLTK would be scanned from the start for every single word.
stop = set(stopwords.words('english'))
# Split each review into a list of words, then eliminate those words that are in the stopwords set as provided by nltk
def remove_stop_words(text):
    """Return `text` (coerced to str) without the words contained in `stop`.

    Words are separated on whitespace and the survivors are re-joined with single spaces.
    """
    return ' '.join(word for word in str(text).split() if word not in stop)
data['cleaned_reviews'] = data['cleaned_reviews'].apply(remove_stop_words)

###### Lemmatize reviews
from nltk.stem import WordNetLemmatizer
tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = WordNetLemmatizer()
def lemmatize(text):
    """Lemmatize every whitespace-delimited token of `text` as a verb and re-join with spaces."""
    lemmas = []
    for token in tokenizer.tokenize(text):
        lemmas.append(lemmatizer.lemmatize(token, pos='v'))
    return ' '.join(lemmas)
# Use NLTK lemmatizer with verb as its part of speech to reduce inflections
data['cleaned_reviews'] = data['cleaned_reviews'].map(lemmatize)

###### Drop rows that have no text after cleaning and preprocessing
# Keep a row only if its cleaned review is not the empty string
has_text = data['cleaned_reviews'] != ''
data = data[has_text]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Sanavesa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Q2. Word Embedding

## 2a. Google News Word2Vec

### Load Google News Word2Vec

In [8]:
# Load the pretrained 'word2vec-google-news-300' vectors through gensim's downloader API.
# NOTE(review): presumably the model is downloaded on first use and cached locally — confirm.
import gensim.downloader as api
w2v_google = api.load('word2vec-google-news-300')

### Check semantic similarities

In [9]:
# Reference: https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html

# Sanity-check queries, each given as (printed caption, positive words, negative words).
# Expected nearest neighbours: king - man + woman =~ queen, car =~ vehicle,
# excellent =~ terrific, beautiful =~ gorgeous, angry =~ irate.
google_queries = [
    ('king-man+woman =~ ', ['woman', 'king'], ['man']),
    ('car =~ ', ['car'], None),
    ('excellent =~ ', ['excellent'], None),
    ('beautiful =~ ', ['beautiful'], None),
    ('angry =~ ', ['angry'], None),
]
# Print the three most similar words (with their similarity scores) for every query
for caption, positive_words, negative_words in google_queries:
    print(caption, w2v_google.most_similar(positive=positive_words, negative=negative_words, topn=3))

king-man+woman =~  [('queen', 0.7118193507194519), ('monarch', 0.6189674139022827), ('princess', 0.5902431011199951)]
car =~  [('vehicle', 0.7821096181869507), ('cars', 0.7423831224441528), ('SUV', 0.7160962224006653)]
excellent =~  [('terrific', 0.7409726977348328), ('superb', 0.7062715888023376), ('exceptional', 0.681470513343811)]
beautiful =~  [('gorgeous', 0.8353005051612854), ('lovely', 0.8106936812400818), ('stunningly_beautiful', 0.7329413294792175)]
angry =~  [('irate', 0.8138925433158875), ('enraged', 0.7705066800117493), ('indignant', 0.7013434171676636)]


## 2b. My Word2Vec

### Train Word2Vec on our dataset
Use gensim's Word2Vec implementation to train on our dataset and learn vector encodings from the unprocessed reviews. Then save the model to disk to use it in other parts of the assignment.

In [10]:
from gensim import utils

# Stream the reviews one by one
class MyCorpus:
    """Re-iterable corpus that yields one tokenized review per iteration step."""

    def __init__(self, data):
        # Object indexable by 'review_body'; the raw reviews are read from that key
        self.data = data

    def __iter__(self):
        """Yield the gensim `simple_preprocess` token list of each raw review, in order."""
        raw_reviews = self.data['review_body']
        for raw_review in raw_reviews:
            tokens = utils.simple_preprocess(raw_review)
            yield tokens

# Train a Word2Vec model using the UNPROCESSED reviews with the specified parameters
from gensim.models import Word2Vec
sentences = MyCorpus(data)
# A single worker combined with a fixed seed is used so that training is repeatable.
# NOTE(review): gensim's documentation additionally asks for a fixed PYTHONHASHSEED to get
# identical vectors across interpreter launches — confirm before relying on exact values.
w2v_own = Word2Vec(
    sentences=sentences,
    vector_size=300,
    window=11,
    min_count=10,
    epochs=10,
    workers=1,
    seed=SEED,
)

### Check semantic similarities

In [11]:
# Same sanity-check queries as for the pretrained model, run against our own vectors.
# Each entry is (printed caption, positive words, negative words); the targets are
# king - man + woman =~ queen, car =~ vehicle, excellent =~ terrific,
# beautiful =~ gorgeous and angry =~ irate.
own_queries = [
    ('king-man+woman =~ ', ['woman', 'king'], ['man']),
    ('car =~ ', ['car'], None),
    ('excellent =~ ', ['excellent'], None),
    ('beautiful =~ ', ['beautiful'], None),
    ('angry =~ ', ['angry'], None),
]
# Print the three most similar words (with their similarity scores) for every query
for caption, positive_words, negative_words in own_queries:
    print(caption, w2v_own.wv.most_similar(positive=positive_words, negative=negative_words, topn=3))

king-man+woman =~  [('queen', 0.4555205702781677), ('arthur', 0.38737979531288147), ('kaiser', 0.37041571736335754)]
car =~  [('backpack', 0.7148886919021606), ('purse', 0.7141773104667664), ('vehicle', 0.6302756071090698)]
excellent =~  [('outstanding', 0.7944633960723877), ('exceptional', 0.7194543480873108), ('incredible', 0.6430865526199341)]
beautiful =~  [('gorgeous', 0.7997910976409912), ('lovely', 0.7912963032722473), ('stunning', 0.7248261570930481)]
angry =~  [('upset', 0.7107461094856262), ('annoyed', 0.7074971795082092), ('irritated', 0.6425274014472961)]


### Save My Word2Vec to disk
Save the keyed vectors of the trained word2vec to be used for the other parts of the assignment.

In [12]:
# Persist only the keyed vectors (w2v_own.wv) of the trained model to 'my_w2v.w2v'
w2v_own.wv.save('my_w2v.w2v')

## Conclusion: Comparison between Google W2V and My W2V
Overall, the vectors generated by our model and the pretrained model are similar.

In both models, the 'excellent', 'beautiful', and 'angry' cases produced appropriately similar words with high semantic similarity (e.g., beautiful ~= gorgeous with 0.80 similarity). Likewise, 'king-man+woman' results in 'queen' in both models; however, the Google News model yields a higher semantic similarity than our model, 0.712 vs 0.456 respectively. This is partly due to the differing nature of the datasets: ours consists of product reviews, whereas the other was trained on professionally written Google News articles.

Finally, in the Google News model, 'vehicle' was the word most similar to 'car' with 0.78 semantic similarity, whereas in our model 'vehicle' had a 0.63 semantic similarity to 'car' and ranked 3rd after 'backpack' and 'purse'.

# 3. Save dataset to disk
Save the trimmed, processed dataset to disk to be used in the other parts of the assignment.

## 3.1 Remove unnecessary columns in the dataset

In [13]:
# Remove rating and review_body columns as they are not needed to save space.
# The columns are named through the `columns=` keyword: passing the axis positionally
# (drop(label, 1)) is deprecated and raises a TypeError on pandas >= 2.0.
data = data.drop(columns=['review_body', 'star_rating'])

  
  This is separate from the ipykernel package so we can avoid doing imports until


## 3.2 Save processed dataset to disk

In [14]:
# Serialize the remaining columns of `data` to 'dataset.pkl' in the working directory
data.to_pickle('dataset.pkl')