# Beta Testing Opinions | Sentiment Analysis Model

_Author: Karolina Mamczarz_

_Based on: [Deep Learning Nanodegree Program | Udacity](https://www.udacity.com/course/deep-learning-nanodegree--nd101)_

## Description

PyTorch, an open-source machine learning framework, is used as the training tool.

## Load dataset

This research uses the [Amazon Review Data (2018)](https://nijianmo.github.io/amazon/index.html) datasets (downloaded on March 4th, 2020):
* [Video Games subset](http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/Video_Games_5.json.gz)
* [Software subset](http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/Software_5.json.gz)

See the citation below:

> Jianmo Ni, Jiacheng Li, Julian McAuley, **Justifying recommendations using distantly-labeled reviews and fine-grained aspects**, _Empirical Methods in Natural Language Processing (EMNLP)_, 2019

### Read sentiment data

In [1]:
import gzip
import json

def parse_dataset(path):
    """Lazily yield one parsed JSON object per line of a gzip-compressed file.

    Args:
        path: Path to a gzip file that holds one JSON document per line.

    Yields:
        The object decoded by ``json.loads`` from each non-blank line.

    Raises:
        OSError: If the file cannot be opened or read as gzip data.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # The context manager closes the handle once the generator is exhausted,
    # explicitly closed, or garbage-collected; previously it was never closed.
    with gzip.open(path, 'rb') as handle:
        for line in handle:
            # Tolerate blank lines (e.g. a stray trailing newline) instead of
            # letting json.loads fail on them.
            if line.strip():
                yield json.loads(line)

In [2]:
def get_sentiment_data(path):
    """Split the reviews stored at ``path`` into positive and negative buckets.

    Reviews without a ``reviewText`` field are skipped. A rating (``overall``)
    of 4.0 or more is labelled positive (1), a rating of 2.0 or less is
    labelled negative (0); anything in between is dropped.

    Args:
        path: Location of the gzip-compressed review file read by ``parse_dataset``.

    Returns:
        A ``(data, labels)`` pair of dicts keyed by ``'pos'`` and ``'neg'``;
        ``data`` holds the review texts and ``labels`` the matching 1/0 values.
    """
    data = {'pos': [], 'neg': []}
    labels = {'pos': [], 'neg': []}

    for entry in parse_dataset(path):
        if 'reviewText' not in entry:
            continue

        rating = entry['overall']
        if rating >= 4.0:
            bucket, target = 'pos', 1
        elif rating <= 2.0:
            bucket, target = 'neg', 0
        else:
            # Middle-of-the-road ratings carry no clear sentiment.
            continue

        data[bucket].append(entry['reviewText'])
        labels[bucket].append(target)

    # Sanity check: every text must have exactly one label.
    for bucket in ('pos', 'neg'):
        assert len(data[bucket]) == len(labels[bucket]), \
            "{} data size does not match labels size".format(bucket)

    return data, labels

In [3]:
# Load both Amazon review subsets from the local ./data directory and split
# each into positive / negative texts with matching labels.
video_games_data, video_games_labels = get_sentiment_data('./data/Video_Games_5.json.gz')
software_data, software_labels = get_sentiment_data('./data/Software_5.json.gz')

# Report the class sizes per subset.
print('Reviews Video Games: {} pos / {} neg'.format(len(video_games_data['pos']), len(video_games_data['neg'])))
print('Reviews Software: {} pos / {} neg'.format(len(software_data['pos']), len(software_data['neg'])))

Reviews Video Games: 393267 pos / 55012 neg
Reviews Software: 8987 pos / 2219 neg


In [4]:
def join_sentiment_data(data1, data2, labels1, labels2):
    """Concatenate two sentiment datasets bucket by bucket.

    Args:
        data1, data2: Dicts with ``'pos'`` and ``'neg'`` entries holding review texts.
        labels1, labels2: Dicts with ``'pos'`` and ``'neg'`` entries holding labels.

    Returns:
        A ``(data, labels)`` pair of new dicts in which each bucket is the
        first dataset's entries followed by the second's.
    """
    sentiments = ('pos', 'neg')
    data = {key: data1[key] + data2[key] for key in sentiments}
    labels = {key: labels1[key] + labels2[key] for key in sentiments}
    return data, labels

In [5]:
# Merge the Video Games and Software subsets into a single dataset
# (video game reviews come first within each sentiment bucket).
pre_data, pre_labels = join_sentiment_data(video_games_data, software_data, video_games_labels, software_labels)

print('Data: {} pos / {} neg'.format(len(pre_data['pos']), len(pre_data['neg'])))

Data: 402254 pos / 57231 neg


In [6]:
def crop_sentiment_data(data, limit=25000):
    """Keep only the first ``limit`` entries of each sentiment bucket.

    Args:
        data: Dict with ``'pos'`` and ``'neg'`` sequences.
        limit: Maximum number of leading entries to keep per bucket.

    Returns:
        A new dict with the same two keys, each holding a slice of the input.
        Entries are taken in their existing order; nothing is shuffled.
    """
    return {key: data[key][0:limit] for key in ('pos', 'neg')}

In [7]:
# Cap each sentiment bucket at the default limit (25000) so both classes end
# up with at most the same number of samples. Texts and labels are cropped
# with the same limit, so they stay aligned.
data = crop_sentiment_data(pre_data)
labels = crop_sentiment_data(pre_labels)

print('Data: {} pos / {} neg'.format(len(data['pos']), len(data['neg'])))
print('Labels: {} pos / {} neg'.format(len(labels['pos']), len(labels['neg'])))

Data: 25000 pos / 25000 neg
Labels: 25000 pos / 25000 neg


In [8]:
def combine_sentiment_data(data, labels):
    """Flatten the per-sentiment buckets into two parallel sequences.

    Args:
        data: Dict with ``'pos'`` and ``'neg'`` review texts.
        labels: Dict with ``'pos'`` and ``'neg'`` labels.

    Returns:
        ``(all_data, all_labels)`` where the positive entries come first,
        followed by the negative ones, in both sequences.
    """
    return data['pos'] + data['neg'], labels['pos'] + labels['neg']

In [9]:
# Flatten the pos/neg buckets into two parallel lists (positives first).
all_data, all_labels = combine_sentiment_data(data, labels)
print('Data: {}'.format(len(all_data)))
print('Labels: {}'.format(len(all_labels)))

Data: 50000
Labels: 50000


### Clean up sentiment data

In [10]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import *

import re
from bs4 import BeautifulSoup

def review_to_words(review):
    """Turn a raw review string into a list of stemmed, lower-case tokens.

    Steps: strip HTML markup, lower-case the text, replace every character
    that is not an ASCII letter or digit with a space, split on whitespace,
    drop English stopwords, and apply Porter stemming.

    Args:
        review: Raw review text, possibly containing HTML.

    Returns:
        List of stemmed tokens in their original order.
    """
    # Load the stopword list once per call (not once per word) and keep it in
    # a set for O(1) membership tests. Only download the corpus when it is
    # actually missing rather than invoking the downloader on every review.
    try:
        stop_words = set(stopwords.words("english"))
    except LookupError:
        nltk.download("stopwords", quiet=True)
        stop_words = set(stopwords.words("english"))

    # One stemmer instance is enough; it was previously rebuilt for every word.
    stemmer = PorterStemmer()

    text = BeautifulSoup(review, "html.parser").get_text()  # Remove HTML tags
    # Lower-case, then blank out everything except ASCII letters and digits.
    text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())
    words = text.split()  # Split string into words
    words = [w for w in words if w not in stop_words]  # Remove stopwords
    words = [stemmer.stem(w) for w in words]  # Stem

    return words

In [11]:
# Spot-check the preprocessing on a single review.
print(review_to_words(all_data[1266]))

['rememb', 'first', 'time', 'laid', 'eye', 'game', 'hear', 'awesom', 'nintendo', '64', 'go', 'schoolmat', 'eagerli', 'anticip', 'usa', 'releas', 'system', 'rememb', 'hear', 'demand', 'ensu', 'shortag', 'upcom', 'christma', 'even', 'saw', 'one', 'day', 'christma', 'shop', 'dad', 'happen', 'come', 'across', 'demo', 'set', 'mall', 'final', 'got', 'see', 'massiv', 'hype', 'game', 'bright', 'color', 'mario', 'fulli', '3d', 'larg', 'crowd', 'gather', 'around', 'catch', 'glimps', 'latest', 'video', 'game', 'technolog', 'linger', 'moment', 'yet', 'first', 'impress', 'still', 'fresh', 'memori', 'mario', '64', 'look', 'like', 'noth', 'ever', 'seen', 'also', 'look', 'leap', 'bound', 'better', 'anyth', 'ever', 'seen', 'playstat', 'sega', 'saturn', 'system', 'impress', 'sever', 'month', 'later', 'final', 'save', 'enough', 'money', 'purchas', 'system', 'bundl', 'mario', '64', 'love', 'game', 'dearli', 'back', 'day', 'still', 'love', 'game', 'still', 'look', 'feel', 'play', 'unlik', 'game', 'came', '

In [14]:
import pickle, os

# Directory for the pickled preprocessing cache; created up front
# (no error if it already exists).
cache_dir = os.path.join("./cache", "sentiment_analysis")
os.makedirs(cache_dir, exist_ok=True)

def reviews_to_words(data, cache_dir=cache_dir, cache_file="preprocessed_data.pkl"):
    """Preprocess every review with ``review_to_words``, using a pickle cache.

    Args:
        data: Iterable of raw review strings.
        cache_dir: Directory that holds the cache file.
        cache_file: Cache file name, or ``None`` to disable caching entirely.

    Returns:
        List with one token list per review.

    Notes:
        The cache is NOT keyed on ``data``: if a cache file exists its content
        is returned as-is, even when ``data`` has changed since it was written.
        Delete the file (or pass another ``cache_file``) after changing the input.
    """
    cache_path = None if cache_file is None else os.path.join(cache_dir, cache_file)

    cache_data = None
    if cache_path is not None:
        try:
            # SECURITY: pickle.load can execute arbitrary code. Only point this
            # at cache files that this notebook wrote itself.
            with open(cache_path, "rb") as f:
                loaded = pickle.load(f)
        except FileNotFoundError:
            # No cache yet: fall through and compute from scratch.
            pass
        except Exception as err:
            # Unreadable or corrupt cache: stay best-effort, but say so instead
            # of hiding the problem (KeyboardInterrupt is no longer swallowed).
            print("Could not read cache file {} ({}); recomputing".format(cache_file, err))
        else:
            if isinstance(loaded, dict) and 'words' in loaded:
                cache_data = loaded
                print("Read preprocessed data from cache file:", cache_file)
            elif loaded is not None:
                print("Cache file {} has an unexpected layout; recomputing".format(cache_file))

    if cache_data is not None:
        return cache_data['words']

    words = [review_to_words(review) for review in data]

    if cache_path is not None:
        # Make sure a caller-supplied cache directory exists before writing.
        os.makedirs(cache_dir, exist_ok=True)
        with open(cache_path, "wb") as f:
            pickle.dump(dict(words=words), f)
        print("Wrote preprocessed data to cache file:", cache_file)

    return words

In [29]:
# Preprocess all reviews (served from the pickle cache when one exists).
reviews_words = reviews_to_words(all_data)
# Flatten the per-review token lists into one list of every token occurrence.
all_words = [item for sublist in reviews_words for item in sublist]

Read preprocessed data from cache file: preprocessed_data.pkl


### Tokenize words

In [30]:
from collections import Counter

def tokenize_words(all_words, reviews_words):
    """Build a frequency-ranked vocabulary and encode reviews as integer ids.

    Args:
        all_words: Flat iterable of every token occurrence, used for counting.
        reviews_words: Iterable of token lists, one per review.

    Returns:
        ``(vocab_to_int, reviews_ints)`` where ``vocab_to_int`` maps each word
        to an id (1 = most frequent) and ``reviews_ints`` holds, per review,
        the list of ids of its tokens.

    Raises:
        KeyError: If a review contains a word that is absent from ``all_words``.
    """
    counts = Counter(all_words)
    vocab = sorted(counts, key=counts.get, reverse=True)
    # Ids start at 1, so 0 is never assigned to a word.
    vocab_to_int = {word: ii for ii, word in enumerate(vocab, 1)}

    # Iterate over the ``reviews_words`` argument; the previous code read an
    # undefined/global ``words`` name and failed on a fresh kernel.
    reviews_ints = [
        [vocab_to_int[word] for word in review_words]
        for review_words in reviews_words
    ]

    return vocab_to_int, reviews_ints

In [43]:
# Build the vocabulary and encode every review as a list of integer ids.
vocab_to_int, pre_reviews_ints = tokenize_words(all_words, reviews_words)

# Show the vocabulary size and the first encoded review as a sanity check.
print('Unique words: {}'.format(len((vocab_to_int))))
print('Tokenized review: \n {}'.format(pre_reviews_ints[:1]))

Unique words: 50770
Tokenized review: 
 [[1, 137, 83, 4, 1219, 11]]


### Remove zero-length reviews

In [44]:
# Histogram of review lengths: key = number of tokens, value = number of reviews.
review_lens = Counter([len(x) for x in pre_reviews_ints])

print('Zero-length reviews: {}'.format(review_lens[0]))
# max() over a Counter iterates its keys, i.e. the longest review length.
print('Maximum review length: {}'.format(max(review_lens)))

Zero-length reviews: 55
Maximum review length: 2996


In [45]:
import numpy as np

def remove_zero_length_reviews(review_ints, labels):
    """Drop empty reviews together with their labels.

    Args:
        review_ints: Sequence of encoded reviews (each a sequence of ids).
        labels: Sequence of labels aligned index-by-index with ``review_ints``.

    Returns:
        ``(new_reviews_ints, labels)`` where ``new_reviews_ints`` is a list of
        the non-empty reviews and ``labels`` is a NumPy array of their labels.
    """
    # Use the ``review_ints`` argument; the previous body read a global named
    # ``reviews_ints``, so it ignored its input or failed on a fresh kernel.
    non_zero_idx = [ii for ii, review in enumerate(review_ints) if len(review) != 0]

    new_reviews_ints = [review_ints[ii] for ii in non_zero_idx]
    kept_labels = np.array([labels[ii] for ii in non_zero_idx])

    return new_reviews_ints, kept_labels

In [46]:
# Drop reviews that became empty after preprocessing, keeping labels aligned.
reviews_ints, encoded_labels = remove_zero_length_reviews(pre_reviews_ints, all_labels)

print('Number of reviews after removing zero-length review: {}'.format(len(reviews_ints)))
print('Number of labels after removing zero-length review: {}'.format(len(encoded_labels)))

Number of reviews after removing zero-length review: 49945
Number of labels after removing zero-length review: 49945


### Pad features

In [50]:
def pad_features(reviews_ints, seq_length=200):
    """Left-pad with zeros, or truncate, every review to exactly ``seq_length`` ids.

    Reviews shorter than ``seq_length`` are right-aligned and filled with
    leading zeros; longer reviews keep only their first ``seq_length`` ids.
    Empty reviews produce an all-zero row.

    Args:
        reviews_ints: Sequence of encoded reviews (each a sequence of ints).
        seq_length: Number of columns in the resulting matrix.

    Returns:
        Integer NumPy array of shape ``(len(reviews_ints), seq_length)``.
    """
    features = np.zeros((len(reviews_ints), seq_length), dtype=int)
    for i, row in enumerate(reviews_ints):
        clipped = np.asarray(row)[:seq_length]
        # Skip empty rows: ``-0:`` would select the whole row and the
        # assignment of an empty array would fail to broadcast.
        if len(clipped) == 0:
            continue
        features[i, -len(clipped):] = clipped

    assert len(features) == len(reviews_ints), "Features should have as many rows as reviews."
    # Check the column count via ``shape`` so an empty input does not index row 0.
    assert features.shape[1] == seq_length, "Each feature row should contain seq_length values."

    return features

In [51]:
# Pad/truncate every review to the default sequence length (200).
features = pad_features(reviews_ints)
# Preview the first 10 columns of the first 40 rows; rows that are all zeros
# here are reviews shorter than the sequence length (left padding).
print(features[:40,:10])

[[    0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0]
 [  269   488   339  3626   184    17    42   339   110    35]
 [    0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0