# 01 - Modeling

## 1. Import Packages <a name="import"></a>

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

import gzip
import pickle

In [2]:
import glob
import os
from collections import defaultdict
import re

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import naive_bayes
from sklearn.model_selection import GridSearchCV

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.corpus import stopwords

In [61]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

import re
import string
import nltk
from nltk.stem import WordNetLemmatizer
import gzip

In [4]:
import spacy

# Ask spaCy to run on a GPU when one is available.
spacy.prefer_gpu()
# Small English pipeline; `nlp` is used later to tag the parts of speech of n-grams.
nlp = spacy.load('en_core_web_sm')

In [7]:
import gensim

# Location of the pre-trained Google News word2vec vectors:
# <current working directory>/data/GoogleNews-vectors-negative300.bin
# NOTE(review): nothing in this cell downloads the file — it must already be at that path.
file_dir = os.path.abspath('.')
folder = 'data'
google_vec_file = os.path.join(file_dir, folder, 'GoogleNews-vectors-negative300.bin')

# Load the binary word2vec file as gensim KeyedVectors.
# `model` is used later to score the similarity between n-grams and product aspects.
model = gensim.models.KeyedVectors.load_word2vec_format(google_vec_file, binary=True)

## Table of Contents <a name="table"></a>
1. [Import Packages](#import)
2. [Load Data](#load)
3. [Clean Data](#clean)
4. [Tokenize](#token)
5. [Text Cleaning](#text)
6. [Modeling](#model)

## 2. Load Data <a name="load"></a>

The data set can be found at https://s3.amazonaws.com/amazon-reviews-pds/readme.html

In [4]:
def file_to_df(filepath):  
    """
        Function to parse an Amazon file for a product category.
        Some Amazon files have some characters that prevent delimiters
        from working correctly for the pandas method.
        
        The file is read as a gzip-compressed, UTF-8, tab-separated text file
        whose first line holds the column names. Every value is kept as a string.
        The file is decompressed only once.
        
        Only the line terminator is stripped from each row, so an empty first
        or last field keeps its position. A row with fewer fields than the 
        header is padded with None; fields beyond the header are ignored.
        
        :param filepath: path to the file
        
        :returns df: dataframe version of the file 
                     (an empty dataframe if the file has no lines)
    """
    
    filename = os.path.basename(filepath)
    
    print(filename)
    
    col_names = []
    columns = []     #one list of values per column, in header order
    n_rows = 0
    
    with gzip.open(filepath, 'rb') as f:
        for row_idx, raw_row in enumerate(f):
            #strip only the newline: a plain strip() would also eat 
            #leading/trailing tabs and shift or drop empty fields
            fields = raw_row.decode('UTF-8').rstrip('\r\n').split('\t')
            
            if row_idx == 0:
                col_names = fields
                columns = [[] for _ in col_names]
                continue
            
            n_fields = len(fields)
            for col_idx, column in enumerate(columns):
                column.append(fields[col_idx] if col_idx < n_fields else None)
            
            n_rows += 1

    print('Number of Rows: ' + str(n_rows))
    
    print('Making entry_dict\n')
    
    #repeated column names collapse to a single column (last one wins)
    entry_dict = dict(zip(col_names, columns))

    df = pd.DataFrame(entry_dict)
        
    return df

In [5]:
# Parse the PC-category review dump into a DataFrame (file_to_df prints the file name and row count).
product_df = file_to_df('./data/amazon_reviews_us_PC_v1_00.tsv.gz')

amazon_reviews_us_PC_v1_00.tsv.gz
Number of Rows: 6908554
Making entry_dict



In [6]:
product_df.head()

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
0,US,22873041,R3ARRMDEGED8RD,B00KJWQIIC,335625766,Plemo 14-Inch Laptop Sleeve Case Waterproof Fa...,PC,5,0,0,N,Y,Pleasantly surprised,I was very surprised at the high quality of th...,2015-08-31
1,US,30088427,RQ28TSA020Y6J,B013ALA9LA,671157305,TP-Link OnHub AC1900 Wireless Wi-Fi Router,PC,5,24,31,N,N,OnHub is a pretty no nonsense type router that...,I am a Google employee and had to chance to us...,2015-08-31
2,US,20329786,RUXJRZCT6953M,B00PML2GQ8,982036237,AmazonBasics USB 3.0 A Male to A Male Cable - ...,PC,1,2,2,N,N,None of them worked. No functionality at all.,"Bought cables in 3ft, 6ft and 9ft. NONE of th...",2015-08-31
3,US,14215710,R7EO0UO6BPB71,B001NS0OZ4,576587596,Transcend P8 15-in-1 USB 2.0 Flash Memory Card...,PC,1,0,0,N,Y,just keep searching.,"nope, cheap and slow",2015-08-31
4,US,38264512,R39NJY2YJ1JFSV,B00AQMTND2,964759214,Aleratec SATA Data Cable 2.0 20in Serial ATA S...,PC,5,0,0,N,Y,Five Stars,Excellent! Great value and does the job.,2015-08-31


Return to [Table of Contents](#table)

## 3. Clean Data <a name="clean"></a>

We will remove some columns: <br>
- 'marketplace' because it is US for all products
- 'review_id' because it is unique to reviews
- 'product_id' because 'product_parent' supersedes it

In [10]:
# Drop the constant / purely identifying columns described above.
product_df = product_df.drop(columns=['marketplace', 'review_id', 'product_id'])

In [8]:
# Every column was parsed as text; the two vote counts are cast to
# 64-bit integers so they can be compared and divided numerically.
vote_columns = ['helpful_votes', 'total_votes']
product_df = product_df.astype({column: np.int64 for column in vote_columns})

In [11]:
product_df.head()

Unnamed: 0,customer_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
0,22873041,335625766,Plemo 14-Inch Laptop Sleeve Case Waterproof Fa...,PC,5,0,0,N,Y,Pleasantly surprised,I was very surprised at the high quality of th...,2015-08-31
1,30088427,671157305,TP-Link OnHub AC1900 Wireless Wi-Fi Router,PC,5,24,31,N,N,OnHub is a pretty no nonsense type router that...,I am a Google employee and had to chance to us...,2015-08-31
2,20329786,982036237,AmazonBasics USB 3.0 A Male to A Male Cable - ...,PC,1,2,2,N,N,None of them worked. No functionality at all.,"Bought cables in 3ft, 6ft and 9ft. NONE of th...",2015-08-31
3,14215710,576587596,Transcend P8 15-in-1 USB 2.0 Flash Memory Card...,PC,1,0,0,N,Y,just keep searching.,"nope, cheap and slow",2015-08-31
4,38264512,964759214,Aleratec SATA Data Cable 2.0 20in Serial ATA S...,PC,5,0,0,N,Y,Five Stars,Excellent! Great value and does the job.,2015-08-31


In order to improve the results of NLP modeling, we will only focus on select reviews. <br>
We will only look at reviews with at least 5 helpful upvotes. <br>
We will also consider reviews written by Vine reviewers, since their feedback is generally considered more credible. 

In [12]:
# Keep a review when it has at least 5 helpful votes or was written by a Vine reviewer.
has_enough_upvotes = product_df['helpful_votes'] >= 5
is_vine_review = product_df['vine'] == 'Y'
helpful_df = product_df[has_enough_upvotes | is_vine_review]

In [13]:
#find the word laptop in title to focus on laptop products

laptop_brands = ['ASUS', 'Acer', 'Alienware', 'Apple', 'Dell', 'HP', 'Lenovo', 'Samsung']
laptop_types = ['Laptop', 'Chromebook', 'MacBook']
accessory_types = ['Case', 'Adapter', 'Charger']

def find_laptops(title):
    """ Helper function to find only laptops within the Amazon file
        
        A title counts as a laptop when its first word is one of 
        laptop_brands, none of its words is in accessory_types and 
        at least one of its words is in laptop_types. 
        Matching is case-sensitive and on whole whitespace-separated words.
        
        :params title: the title of the product
        
        :returns boolean: returns whether the product is a laptop or not
                          (False for an empty, blank or non-string title)
    """
    
    #missing values cannot be a laptop title
    if not isinstance(title, str):
        return False
    
    title_words = title.split()
    
    #an empty or blank title has no brand to check
    if not title_words:
        return False
    
    #check if the product is within the laptop brands of interest
    if title_words[0] not in laptop_brands:
        return False
    
    #if the product is a laptop accessory, we will ignore it
    if any(word in accessory_types for word in title_words):
        return False
    
    #otherwise, we will say it is a laptop if the title names a laptop type
    return any(word in laptop_types for word in title_words)


# Boolean mask: True for the reviews whose product title find_laptops accepts.
laptop_products_mask = helpful_df['product_title'].apply(find_laptops)

# Keep only the laptop reviews.
helpful_df = helpful_df[laptop_products_mask]

helpful_df.head()

Unnamed: 0,customer_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
561,33788671,942854828,"ASUS C201 11.6 Inch Chromebook (Rockchip, 4 GB...",PC,4,5,5,N,Y,Awesome Chromebook!,I was worried that the Rockchip processor woul...,2015-08-31
622,108604,218670083,ASUS K501UX 15-inch Gaming Laptop (Intel Core ...,PC,1,6,6,N,Y,Don't buy this junk.,"Don't buy this laptop, it's good at first but ...",2015-08-31
996,39145251,774078247,Apple MacBook Pro Laptop,PC,1,7,7,N,Y,Bitter experience. Expensive too!,"I almost forgot about this order, found it acc...",2015-08-31
1454,25575022,516262603,Lenovo S41 Laptop,PC,4,2,4,Y,N,Laptop of The Times - No DVD Drive,Its a great little laptop. I like the size and...,2015-08-31
4110,52924529,251834115,"HP Chromebook 14 Inch Laptop (NVIDIA Tegra K1,...",PC,2,6,7,N,N,"Will Never, Ever Purchase an HP Product…","Will Never, Ever Purchase an HP Product…<br />...",2015-08-31


There are cases where a review may receive at least 5 helpful upvotes, but many more downvotes. <br>
Downvoting was a previously removed feature of Amazon that indicated unhelpful reviews. <br>
However, we can still account for the presence of downvotes by subtracting the number of upvotes from the total number of votes. <br>  We will use a ratio of upvotes to downvotes to filter out more reviews.

In [14]:
# Upvote-to-downvote ratio, where downvotes = total_votes - helpful_votes.
# A review without downvotes divides by zero: the ratio is inf
# (or NaN when the review has no votes at all).
downvotes = helpful_df['total_votes'] - helpful_df['helpful_votes']

# .assign returns a new frame, so the column is not written into a slice
# of an earlier DataFrame (avoids pandas' SettingWithCopyWarning).
helpful_df = helpful_df.assign(ratio=np.round(helpful_df['helpful_votes'] / downvotes, 2))

# Keep reviews with more upvotes than downvotes, plus every Vine review.
helpful_df = helpful_df[(helpful_df['ratio'] > 1) | (helpful_df['vine'] == 'Y')]

helpful_df.head()

Unnamed: 0,customer_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date,ratio
561,33788671,942854828,"ASUS C201 11.6 Inch Chromebook (Rockchip, 4 GB...",PC,4,5,5,N,Y,Awesome Chromebook!,I was worried that the Rockchip processor woul...,2015-08-31,inf
622,108604,218670083,ASUS K501UX 15-inch Gaming Laptop (Intel Core ...,PC,1,6,6,N,Y,Don't buy this junk.,"Don't buy this laptop, it's good at first but ...",2015-08-31,inf
996,39145251,774078247,Apple MacBook Pro Laptop,PC,1,7,7,N,Y,Bitter experience. Expensive too!,"I almost forgot about this order, found it acc...",2015-08-31,inf
1454,25575022,516262603,Lenovo S41 Laptop,PC,4,2,4,Y,N,Laptop of The Times - No DVD Drive,Its a great little laptop. I like the size and...,2015-08-31,1.0
4110,52924529,251834115,"HP Chromebook 14 Inch Laptop (NVIDIA Tegra K1,...",PC,2,6,7,N,N,"Will Never, Ever Purchase an HP Product…","Will Never, Ever Purchase an HP Product…<br />...",2015-08-31,6.0


We will now focus on products with at least 25 reviews that were either mostly helpful or written by those in the Amazon vine program. <br>

In [15]:
# Count the retained reviews per product parent and collect the
# product parents that have at least 25 of them.
helpful_review_count = helpful_df['product_parent'].value_counts()
has_enough_reviews = helpful_review_count >= 25
select_product_parents = set(helpful_review_count[has_enough_reviews].index)

In [16]:
# Restrict to the selected product parents, then order the reviews by product title.
in_selected_products = helpful_df['product_parent'].isin(select_product_parents)
helpful_df = helpful_df[in_selected_products].sort_values('product_title')

In [17]:
helpful_df

Unnamed: 0,customer_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date,ratio
304725,28267438,519804412,"ASUS Chromebook 11.6""",PC,3,495,529,N,N,"Great Chromebook, but not so great for watchin...",My old Chromebook was the very first HP 11&#34...,2015-07-21,14.56
588697,51997017,519804412,"ASUS Chromebook 11.6""",PC,5,1,2,Y,N,Great Chromebook,"I’ve always been a traditional laptop guy, but...",2015-06-09,1.00
782848,52717702,519804412,"ASUS Chromebook 11.6""",PC,5,3,4,Y,N,ASUS Chromebook C201PA-DS02 11.6-Inch Laptop,I’ve been a long time user of tablets (Kindle ...,2015-05-11,3.00
407648,50450152,519804412,"ASUS Chromebook 11.6""",PC,5,6,8,N,Y,Super pleased!,"Just to be clear upfront, this is my first Chr...",2015-07-07,3.00
786258,47792218,519804412,"ASUS Chromebook 11.6""",PC,5,4,5,Y,N,Best Chromebook At The Best Price That I Have ...,This is a simple to set up chromebook that so ...,2015-05-11,4.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6167364,25840050,904063220,Samsung Series 9 NP900X3A-A03US 13.3-Inch Lapt...,PC,1,17,21,N,N,OVERPRICED & TERRIBLE CUSTOMER SERVICE,I bought this laptop for work after considerin...,2011-06-25,4.25
6155600,48561582,904063220,Samsung Series 9 NP900X3A-A03US 13.3-Inch Lapt...,PC,2,12,13,N,N,"Amazing Laptop, Finally an Air2 Contender ...B...","This is a great laptop, and I really wanted it...",2011-07-08,12.00
6145948,14700391,904063220,Samsung Series 9 NP900X3A-A03US 13.3-Inch Lapt...,PC,4,37,37,N,N,"Good, BUT Know What You're Getting Into",Fist let me tell you what/where I am coming fr...,2011-07-18,inf
6200288,52323503,904063220,Samsung Series 9 NP900X3A-A03US 13.3-Inch Lapt...,PC,1,15,20,N,N,Weak Wifi is a killer,I have owned this for over a month now and I h...,2011-05-19,3.00


We then further narrow the products to those with negative reviews. <br>
We define a positive review as a review that gave the product a rating greater than 3. <br>
We define a negative review as a review that gave the product a rating less than or equal to 3. <br>
Otherwise, we will not be able to produce pros and cons for each product. <br>

In [19]:
# star_rating was parsed as text; cast it to a small integer
# so that it can be compared numerically against the 3-star threshold.
helpful_df = helpful_df.astype({"star_rating":np.int8})

In [20]:
# A review is positive when its rating is above 3 stars and negative otherwise.
# The mask does not depend on the product, so it is computed once
# instead of once per product parent.
pos_review_mask = helpful_df['star_rating'] > 3

# product parents that have at least one positive / negative review
parents_with_pos = set(helpful_df.loc[pos_review_mask, 'product_parent'])
parents_with_neg = set(helpful_df.loc[~pos_review_mask, 'product_parent'])

missing_pos = set() #product parents missing positive reviews
missing_neg = set() #product parents missing negative reviews

for product_parent in select_product_parents:
    if product_parent not in parents_with_pos:
        # str() keeps the message working whatever the dtype of the ids is
        print('No positive reviews for product_parent ' + str(product_parent))
        missing_pos.add(product_parent)

    if product_parent not in parents_with_neg:
        print('No negative reviews for product_parent ' + str(product_parent))
        missing_neg.add(product_parent)

#we only want products with negative reviews
# .copy() makes the result an independent frame, so columns added to it
# later are not written into a slice of the previous DataFrame
helpful_df = helpful_df[~helpful_df['product_parent'].isin(missing_neg)].copy()

No negative reviews for product_parent 204894625


In [21]:
helpful_df

Unnamed: 0,customer_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date,ratio
304725,28267438,519804412,"ASUS Chromebook 11.6""",PC,3,495,529,N,N,"Great Chromebook, but not so great for watchin...",My old Chromebook was the very first HP 11&#34...,2015-07-21,14.56
588697,51997017,519804412,"ASUS Chromebook 11.6""",PC,5,1,2,Y,N,Great Chromebook,"I’ve always been a traditional laptop guy, but...",2015-06-09,1.00
782848,52717702,519804412,"ASUS Chromebook 11.6""",PC,5,3,4,Y,N,ASUS Chromebook C201PA-DS02 11.6-Inch Laptop,I’ve been a long time user of tablets (Kindle ...,2015-05-11,3.00
407648,50450152,519804412,"ASUS Chromebook 11.6""",PC,5,6,8,N,Y,Super pleased!,"Just to be clear upfront, this is my first Chr...",2015-07-07,3.00
786258,47792218,519804412,"ASUS Chromebook 11.6""",PC,5,4,5,Y,N,Best Chromebook At The Best Price That I Have ...,This is a simple to set up chromebook that so ...,2015-05-11,4.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6167364,25840050,904063220,Samsung Series 9 NP900X3A-A03US 13.3-Inch Lapt...,PC,1,17,21,N,N,OVERPRICED & TERRIBLE CUSTOMER SERVICE,I bought this laptop for work after considerin...,2011-06-25,4.25
6155600,48561582,904063220,Samsung Series 9 NP900X3A-A03US 13.3-Inch Lapt...,PC,2,12,13,N,N,"Amazing Laptop, Finally an Air2 Contender ...B...","This is a great laptop, and I really wanted it...",2011-07-08,12.00
6145948,14700391,904063220,Samsung Series 9 NP900X3A-A03US 13.3-Inch Lapt...,PC,4,37,37,N,N,"Good, BUT Know What You're Getting Into",Fist let me tell you what/where I am coming fr...,2011-07-18,inf
6200288,52323503,904063220,Samsung Series 9 NP900X3A-A03US 13.3-Inch Lapt...,PC,1,15,20,N,N,Weak Wifi is a killer,I have owned this for over a month now and I h...,2011-05-19,3.00


Return to [Table of Contents](#table)

## 4. Tokenize <a name="token"></a>

A review can have a lot of mixed sentiment due to discussion of pros and cons. <br>
Even a 5 star review can have a lot of mixed sentiment, which might affect our modeling. <br>
To achieve better results, we create sentence tokens from the reviews. <br>
This allows us to more effectively capture the sentiment behind each product feature in the review. 

We will be using `VADER` to help us parse sentiment, specifically the compound score. <br>
Essentially, the compound score is a measure of whether a sentence is more positive than negative. <br>

According to the documentation, we can group sentences with:
- a score higher than 0.05 as positive
- a score between -0.05 and 0.05 as neutral
- a score lower than -0.05 as negative

We are specifically interested in sentences with positive and negative sentiment since they are related to pros and cons. <br>
Sentences with neutral sentiment offer no insight. <br>

In [25]:
# Shared VADER analyzer; sentence_tokenizer reads the 'compound' value of its polarity scores.
analyzer = SentimentIntensityAnalyzer() #ok for product reviews


def sentence_tokenizer(review):
    """
        Helper function for identifying sentences with 
        positive and negative sentiment in a review via VADER
        
        The review is split on '.', exact duplicate pieces are dropped and
        every remaining non-empty piece is scored with the module-level 
        `analyzer`. Pieces keep the order in which they first appear in 
        the review, so the result is the same on every run.
        
        :params review: a string representation of the review
        
        :returns positive_sentences: a list of strings with positive sentiment
                                     (compound score above 0.05)
        :returns negative_sentences: a list of strings with negative sentiment
                                     (compound score below -0.05)
                                     
        Both lists are empty when review is not a string.
    """
    
    positive_sentences = []
    negative_sentences = []
    
    #a missing review has no sentences to score
    if not isinstance(review, str):
        return positive_sentences, negative_sentences
    
    #dict.fromkeys removes duplicates like a set does but keeps first-seen
    #order; iterating a set of strings changes order between Python sessions
    sentences = dict.fromkeys(review.split('.'))
    
    for sentence in sentences:
        if len(sentence) == 0:
            continue
            
        compound = analyzer.polarity_scores(sentence)['compound']
        
        #if the sentence has mostly positive sentiment
        if compound > 0.05:
            positive_sentences.append(sentence)
        
        #if the sentence has mostly negative sentiment
        elif compound < -0.05:
            negative_sentences.append(sentence)
    
    return positive_sentences, negative_sentences

In [26]:
# Score every review body; each element is a (positive, negative) pair of sentence lists.
sentences_list = helpful_df['review_body'].apply(sentence_tokenizer).values

# Split the pairs into two parallel lists, one entry per review.
positive_sentences_list = [pair[0] for pair in sentences_list]
negative_sentences_list = [pair[1] for pair in sentences_list]

helpful_df['positive_sentences'] = positive_sentences_list
helpful_df['negative_sentences'] = negative_sentences_list

In [27]:
helpful_df.head()

Unnamed: 0,customer_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date,ratio,positive_sentences,negative_sentences
304725,28267438,519804412,"ASUS Chromebook 11.6""",PC,3,495,529,N,N,"Great Chromebook, but not so great for watchin...",My old Chromebook was the very first HP 11&#34...,2015-07-21,14.56,[ The HP11 was only able to last all day if I ...,"[Well, the quality of the speakers on the C201..."
588697,51997017,519804412,"ASUS Chromebook 11.6""",PC,5,1,2,Y,N,Great Chromebook,"I’ve always been a traditional laptop guy, but...",2015-06-09,1.0,"[ However, the price is amazing, <br /><br />O...","[<br /><br />Basically, this is a laptop that ..."
782848,52717702,519804412,"ASUS Chromebook 11.6""",PC,5,3,4,Y,N,ASUS Chromebook C201PA-DS02 11.6-Inch Laptop,I’ve been a long time user of tablets (Kindle ...,2015-05-11,3.0,[ I wasn’t sure it would be as fast as my lapt...,[I’ve been a long time user of tablets (Kindle...
407648,50450152,519804412,"ASUS Chromebook 11.6""",PC,5,6,8,N,Y,Super pleased!,"Just to be clear upfront, this is my first Chr...",2015-07-07,3.0,"[ It is uber lightweight and compact, so easy ...",[]
786258,47792218,519804412,"ASUS Chromebook 11.6""",PC,5,4,5,Y,N,Best Chromebook At The Best Price That I Have ...,This is a simple to set up chromebook that so ...,2015-05-11,4.0,"[ Movies stream well, The navy blue cove...",[]


Return to [Table of Contents](#table)

## 5. Text Cleaning <a name="text"></a>

In [39]:
#matches an HTML tag or an HTML character entity such as &amp; or &#34;
tag_re = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
#despite its name, this pattern matches a double quote character
backslash_re = re.compile(r'\"')

#for getting rid of strange characters in text
def remove_strange_chars(text):
    """
        Helper function that strips markup residue from a text string
        
        HTML tags, HTML character entities and double quotes are each
        replaced by a single space; backslashes are deleted.
        
        :params text: text string
        
        :returns text: text string after the replacements above
    """
    #tags and entities become spaces first, then double quotes
    without_markup = backslash_re.sub(' ', tag_re.sub(' ', text))
    
    #finally drop every literal backslash
    return without_markup.replace("\\", '')



# Shared WordNet lemmatizer instance used by text_preprocessing.
lemmatizer = WordNetLemmatizer()

#for processing the text

#any run of word characters that contains at least one digit
#(raw string: '\w' and '\d' are invalid escapes in a normal string literal)
_DIGIT_TOKEN_RE = re.compile(r'\w*\d\w*')

#any single ASCII punctuation character
_PUNCTUATION_RE = re.compile('[%s]' % re.escape(string.punctuation))


def text_preprocessing(text):
    
    """
        Helper function for cleaning and lemmatizing the text
        
        Steps, in order: 
        - replace every token that contains a digit with a space
        - lowercase the text and replace punctuation with spaces
        - tokenize with nltk, lemmatize each token with the module-level
          `lemmatizer` and join the results with single spaces
        
        :params text: text string
        
        :returns text: text string after cleaning and lemmatization
    """
    
    #drop tokens that contain digits (e.g. model numbers, sizes)
    text = _DIGIT_TOKEN_RE.sub(' ', text) 
    
    #lowercase all text and remove punctuation
    text = _PUNCTUATION_RE.sub(' ', text.lower()) 
    
    #lemmatize text and recombine into one string
    word_list = nltk.word_tokenize(text)
    text = ' '.join(lemmatizer.lemmatize(word) for word in word_list) 
    
    return text

In [30]:
# Strip HTML residue, double quotes and backslashes from every extracted
# sentence, overwriting both sentence columns.
def _strip_strange_chars(sentences):
    """Apply remove_strange_chars to each sentence of one review."""
    return [remove_strange_chars(sentence) for sentence in sentences]

helpful_df['positive_sentences'] = helpful_df['positive_sentences'].apply(_strip_strange_chars)

helpful_df['negative_sentences'] = helpful_df['negative_sentences'].apply(_strip_strange_chars)

In [31]:
# Store a cleaned, lemmatized copy of every sentence in two new columns;
# the original sentence columns are left untouched.
def _lemmatize_sentences(sentences):
    """Apply text_preprocessing to each sentence of one review."""
    return [text_preprocessing(sentence) for sentence in sentences]

helpful_df['lemmatized_positives'] = helpful_df['positive_sentences'].apply(_lemmatize_sentences)

helpful_df['lemmatized_negatives'] = helpful_df['negative_sentences'].apply(_lemmatize_sentences)

In [920]:
# Order the reviews by product title, then preview the first rows.
helpful_df = helpful_df.sort_values('product_title')

helpful_df.head()

Unnamed: 0,customer_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date,ratio,positive_sentences,negative_sentences,lemmatized_positives,lemmatized_negatives
304725,28267438,519804412,"ASUS Chromebook 11.6""",PC,3,495,529,N,N,"Great Chromebook, but not so great for watchin...",My old Chromebook was the very first HP 11&#34...,2015-07-21,14.56,[ PORTABILITY(5/5): The C201 is slightly ligh...,[ My C201 has been dropped several times and s...,[portability the is slightly lighter and ha a ...,[my ha been dropped several time and still ha ...
72820,21910022,519804412,"ASUS Chromebook 11.6""",PC,1,6,9,N,Y,The second one I bought is excellent! / Beware...,"Previous post: Actually, it had been my favori...",2015-08-21,2.00,[ 3) Additional storage It comes with a mi...,[ I virtually have an extremely inexpensive an...,[additional storage it come with a micro sd sl...,[i virtually have an extremely inexpensive and...
593272,51925793,519804412,"ASUS Chromebook 11.6""",PC,5,0,0,Y,N,I LOVE This Little ASUS Chromebook,This ASUS Chromebook C201PA-DS02 is super fast...,2015-06-09,,"[ I like that a LOT, It's lightweight and ex...",[ There is only one thing that I don't like a...,"[i like that a lot, it s lightweight and extre...",[there is only one thing that i don t like abo...
482481,16913265,519804412,"ASUS Chromebook 11.6""",PC,5,9,10,N,Y,"The ASUS C201PA: Affordable, Stylish, Fast, Ri...",Just buy it. It's light. The battery lasts a l...,2015-06-25,9.00,"[ It gets you where you want to go, It picks ...",[ I have no intention of installing Linux on t...,"[it get you where you want to go, it pick up a...",[i have no intention of installing linux on th...
429628,478265,519804412,"ASUS Chromebook 11.6""",PC,5,12,15,N,Y,A capable little machine,I've never used a Chrome OS device before. Wit...,2015-07-04,4.00,"[ For the touchpad, well, I've used an old Del...","[ This is disappointing, But amazingly I have...",[for the touchpad well i ve used an old dell a...,"[this is disappointing, but amazingly i have n..."
725001,49945042,519804412,"ASUS Chromebook 11.6""",PC,4,17,22,Y,N,Excellent Device Overall,If you've not seen a Chromebook before THIS IS...,2015-05-20,3.40,"[5 GB free after space taken by the OS, Shor...","[pdf (read-only) --Update-- One last thing, ...","[gb free after space taken by the o, short ver...",[pdf read only update one last thing if your a...
721719,51133542,519804412,"ASUS Chromebook 11.6""",PC,5,5,6,N,Y,"A snappy little Chromebook, feel free to say g...","A recent convert from Windows 8.1, I was fed u...",2015-05-21,5.00,[ When you create documents you have the optio...,[ Another thing you'll notice with a Chromeboo...,[when you create document you have the option ...,[another thing you ll notice with a chromebook...
608572,42951718,519804412,"ASUS Chromebook 11.6""",PC,5,4,5,Y,N,"Price, speed, battery life and portability...y...",I have had the pleasure of using this ASUS Chr...,2015-06-06,4.00,"[ Yep, 10 seconds! Another 10 seconds to boot ...","[ I was wrong, It seems odd not to have a 3, ...",[yep second another second to boot up chrome a...,"[i wa wrong, it seems odd not to have a, thin ..."
620514,33299926,519804412,"ASUS Chromebook 11.6""",PC,5,6,7,N,Y,One word - AMAZING,"This product has exceeded my expectations, it ...",2015-06-05,6.00,[ Sound quality is decently good - Speakers at...,[ The 13 hours included about 7-8 hours of non...,[sound quality is decently good speaker at the...,[the hour included about hour of non stop yout...
291685,34536212,519804412,"ASUS Chromebook 11.6""",PC,5,10,10,N,Y,Chromebook + Rockchip,Chromebooks are awesome. If you are into brows...,2015-07-23,inf,[ Their C series laptops are tops for design...,[],[their c series laptop are top for design batt...,[]


We save the data at this stage since it can take quite a few minutes to reach this checkpoint.

In [12]:
# Checkpoint for the processed reviews: written on the first run,
# reloaded (replacing the in-memory frame) on every later run.
helpful_df_cache = './data/helpful_df.pkl.gz'

if not os.path.isfile(helpful_df_cache):
    # to_pickle does not create missing directories
    os.makedirs(os.path.dirname(helpful_df_cache), exist_ok=True)
    helpful_df.to_pickle(helpful_df_cache, compression = 'gzip')

else:
    # NOTE: unpickling can execute arbitrary code; only load a cache file
    # that this notebook wrote itself.
    helpful_df = pd.read_pickle(helpful_df_cache, compression = 'gzip')

Return to [Table of Contents](#table)

## 6. Modeling <a name="model"></a>

We use the bidirectional feature importance of logistic regression to identify pros and cons from the n-grams produced from the sentence tokens of reviews. <br>
We then use word2vec to map these specific pros and cons n-grams to more general product features i.e. aspects.

In [35]:
def find_bad_phrases(vocab_dict):
    """
        Helper function to identify n-grams 
        to ignore because they are unhelpful
        
        An n-gram is flagged as soon as one of these checks applies:
        - it contains the same word more than once
        - one of its spaCy tokens is shorter than 3 characters
        - spaCy gives one of its tokens the part-of-speech label 'X'
        - its part-of-speech sequence is one of the excluded combinations
        - its word2vec similarity is below 0.4 for every aspect
        
        N-grams that contain 'ssd', or both 'solid' and 'state', 
        are exempt from the similarity check.
        
        :params vocab_dict: dictionary of indices and words from the vectorizer
                            with indices as the keys and words as the values
                            
        :returns bad_keys: a set with the keys (vectorizer indices) 
                           of the n-grams to ignore
    """
    
    #parts-of-speech combinations of words to ignore
    #(tuples in a set so that membership is a single hash lookup)
    bad_pos_combos = {
        ('NOUN', 'ADV'), ('ADV', 'ADP'), ('ADV', 'NOUN'), ('VERB', 'ADJ'), 
        ('NOUN', 'VERB'), ('NUM', 'NOUN'), ('VERB', 'VERB'), ('ADV', 'ADJ'), 
        ('PROPN', 'NOUN'), ('ADV', 'ADV'), ('ADV', 'VERB'), ('ADP', 'ADV'), 
        ('ADP', 'ADJ'), ('NUM', 'ADJ'), ('VERB', 'ADV'), ('ADP', 'VERB'), 
        ('ADJ', 'ADJ'), ('ADV', 'NUM'), ('VERB', 'NUM'), ('ADJ', 'VERB'), 
        ('DET', 'NOUN'), ('NOUN', 'NUM'), ('ADJ', 'ADV'), ('NOUN', 'ADJ'), 
        ('VERB', 'NOUN', 'VERB'), ('VERB', 'VERB', 'NOUN'), 
        ('PRON', 'VERB'), ('ADP', 'NOUN'),
    }
    
    #product aspects an n-gram must be related to
    aspects = ['battery', 'ports', 'price', 'hard drive', 'processor speed', 'screen', 
               'ergonomic', 'fan cooling', 'sound', 'build quality', 'ram']
    aspect_words_list = [aspect.split(' ') for aspect in aspects]
    
    bad_keys = set()

    for key, value in vocab_dict.items():
        tokens = value.split(' ')
        
        #ignore n-grams that contain duplicate words
        #(checked first because it does not need the spaCy pipeline)
        if len(tokens) != len(set(tokens)):
            bad_keys.add(key)
            continue
        
        doc_tokens = list(nlp(value))
        
        #ignore n-grams with a token that is 
        #less than 3 characters long
        if any(len(token.text) < 3 for token in doc_tokens):
            bad_keys.add(key)
            continue
        
        pos_list = [token.pos_ for token in doc_tokens] #parts of speech list
        
        #ignore ngrams with the label 'X'
        #this means that they cannot be assigned a part of speech tag
        if 'X' in pos_list:
            bad_keys.add(key)
            continue
        
        #ignore certain parts-of-speech combinations of words
        if tuple(pos_list) in bad_pos_combos:
            bad_keys.add(key)
            continue
        
        #some values need to be hard coded in since word2vec
        #does not recognize ssd or solid state as well
        if 'ssd' in tokens or ('solid' in tokens and 'state' in tokens):
            continue
        
        #check to see if the n-gram is related to any aspect
        #we consider it to be related if it has a 
        #cosine similarity score of at least 0.4
        is_related = False
        
        for aspect_words in aspect_words_list:
            try:
                if model.n_similarity(aspect_words, tokens) >= 0.4:
                    is_related = True
                    break
            except (KeyError, ZeroDivisionError):
                #a word is missing from the word2vec vocabulary 
                #(or a word list is empty): no score for this aspect
                continue
            
        #if the ngram is unrelated to any aspects, ignore it
        if not is_related:
            bad_keys.add(key)

    return bad_keys

In [43]:
def modeling(X, y, cv, random_state=None):
    """
        Function that uses bidirectional feature importance of logistic
        regression to separate out pros and cons
        
        A logistic regression is tuned over the regularization strength C 
        with a 3-fold grid search, and the coefficients of the best model
        are used to rank the n-grams. N-grams flagged by find_bad_phrases
        are left out of both lists.
        
        :params X: frequency count of words used in review sentences as a matrix
        :params y: labels of whether the sentence is positive or not
        :params cv: vectorizer that was used to produce X
        :params random_state: optional seed passed to LogisticRegression; 
                              the default None leaves the solver unseeded
        
        :returns pros_list: a list of (n-gram, coefficient) tuples that could 
                            indicate positive product features, 
                            largest coefficient first
        :returns cons_list: a list of (n-gram, coefficient) tuples that could 
                            indicate negative product features, 
                            most negative coefficient first
    """
    
    #use the best logistic regression model to fit on the 
    #matrix representation of the review sentences
    
    hyper_param_grid = {'C': [0.001, 0.01, 0.1, 1, 
                              10, 100, 1000, 10000]}
    

    lr = GridSearchCV(LogisticRegression(solver='saga', max_iter = 10000, 
                                         random_state = random_state), 
                      hyper_param_grid, cv = 3, n_jobs=-1)

    lr.fit(X, y)


    # we use the beta coefficient values from logistic regression
    # to identify the ngrams associated with positive and negative sentiment

    model_coefs = lr.best_estimator_.coef_[0,:]
    
    #we sort the ngrams based on their beta coefficient value
    #which could be negative to positive,
    #indicating negative or positive sentiment respectively
    model_keys = np.argsort(model_coefs)

    
    #use vectorizer to generate a dictionary that maps indices to words
    #in other words, the key is the index and the value is the word
    vocab_dict = {v: k for k, v in cv.vocabulary_.items()}
    
    #find bad ngrams to ignore because they are unhelpful
    bad_keys = find_bad_phrases(vocab_dict)
    
    
    #find where the positive beta coefficients begin
    #in order to separate negative sentiment ngrams from positive ones
    #the keys are sorted in ascending order, so the number of coefficients 
    #that are not positive is the position of the first positive one
    #counting also gives the right split when no coefficient is positive 
    #(every ngram is then a con instead of a pro)
    split_idx = int(np.count_nonzero(model_coefs <= 0))
    
    #split ngrams into positive and negative groups
    #sorted by their respective beta coefficient value
    neg_idx_list = model_keys[:split_idx]
    pos_idx_list = model_keys[split_idx:][::-1]
    
    
    #generate a filtered list of ngrams with positive sentiment
    #this is what we will consider as the pros
    #we will include ngrams if they are not in the list of bad ngrams to ignore
    pros_list = [(vocab_dict[pos_idx], model_coefs[pos_idx]) 
                 for pos_idx in pos_idx_list if pos_idx not in bad_keys]
    
    
    #generate a filtered list of ngrams with negative sentiment
    #this is what we will consider as the cons
    #we will include ngrams if they are not in the list of bad ngrams to ignore
    cons_list = [(vocab_dict[neg_idx], model_coefs[neg_idx])
                for neg_idx in neg_idx_list if neg_idx not in bad_keys]

    
    return pros_list, cons_list

In [54]:
def pros_and_cons(df, product_parent, stopwords):
    """
        Function for generating specific pros and cons for each product parent
        
        :params df: dataframe containing the columns 'product_parent', 
                    'product_title', 'lemmatized_positives' and 
                    'lemmatized_negatives'
        :params product_parent: product parent ID string
        :params stopwords: a set of stopwords to modify and use; 
                           the collection passed in is not mutated
        
        :returns modeling: output from modeling function, 
                           which is a list of pros and a list of cons
    """
    
    
    product_parent_mask = df['product_parent'] == product_parent
    
    #add words in the title to the stopwords
    #make sure to process the title the same way the sentences were done
    #(raises IndexError if no row matches product_parent)
    title = df.loc[product_parent_mask, 'product_title'].values[0]
    title = text_preprocessing(remove_strange_chars(title))
    title_words = set(title.split(' '))
    
    #start from the stopwords handed in by the caller instead of reaching
    #for a notebook-level variable, and build a new set so the caller's
    #collection stays untouched between products
    product_stopwords = set(stopwords).union(title_words)
    
    #clean the stopwords with the same two helpers used on the title
    product_stopwords = [text_preprocessing(remove_strange_chars(stopword))
                         for stopword in product_stopwords]
    
    
    #get positive and negative sentences
    #each row is expected to hold a list of sentences, 
    #so flatten one level to get a single list per sentiment
    positive_sentences = [sentence
                          for sentences in df.loc[product_parent_mask, 'lemmatized_positives'].values
                          for sentence in sentences]
    
    negative_sentences = [sentence
                          for sentences in df.loc[product_parent_mask, 'lemmatized_negatives'].values
                          for sentence in sentences]
    
    #concatenate positive and negative sentences together
    all_sentences = positive_sentences + negative_sentences
    

    #create labels based on order sentences were added, 
    #1 being positive, 0 being negative
    y = [1]*len(positive_sentences) + [0]*len(negative_sentences)
    
    
    #create features from words: bigrams and trigrams that appear 
    #in at least two sentences
    cv = CountVectorizer(ngram_range=(2,3), stop_words=product_stopwords, min_df = 2)
    X = cv.fit_transform(all_sentences)
    
    return modeling(X, y, cv)

In [41]:
def aspect_generator(aspects, ngram_list, threshold=0.4, w2v_model=None):
    
    """
        Function for mapping n-grams to aspects
        
        
        :params aspects: list of aspects as strings
        :params ngram_list: list of n-grams to match aspects to; 
                            only the first element of each entry 
                            (the n-gram text) is used
        :params threshold: minimum similarity score for an n-gram 
                           to be matched to an aspect
        :params w2v_model: object exposing n_similarity(words_a, words_b); 
                           when None, the notebook-level `model` is used
        
        :returns aspect_dict: dictionary with aspects as keys and, 
                              as values, the list of n-grams matched 
                              to that aspect; an n-gram may appear under
                              more than one aspect, but at most once 
                              per aspect
    """

    
    if w2v_model is None:
        w2v_model = model
    
    aspect_dict = defaultdict(list)
    
    ngrams = [entry[0] for entry in ngram_list]
    
    for aspect in aspects:
        aspect_words = aspect.split(' ')
        
        for ngram in ngrams:
            #the similarity lookup may fail, e.g. for words missing from 
            #the vocabulary or empty word lists (depends on gensim version);
            #treat that as "no similarity match" rather than skipping 
            #the n-gram altogether
            try:
                similarity_score = w2v_model.n_similarity(aspect_words, ngram.split(' '))
                matched = bool(similarity_score >= threshold)
            except (KeyError, ValueError, ZeroDivisionError):
                matched = False
            
            #some values need to be hard coded in since word2vec
            #does not recognize ssd or solid state as well;
            #this check is kept outside the try block so it still runs 
            #when the similarity lookup fails
            if aspect == 'hard drive':
                if 'ssd' in ngram or ('solid' in ngram and 'state' in ngram):
                    matched = True
            
            #single append so an n-gram is never listed twice for one aspect
            if matched:
                aspect_dict[aspect].append(ngram)

    return aspect_dict

In [55]:
#build the pros/cons summary for every product parent in helpful_df
unique_product_parents = sorted(helpful_df['product_parent'].unique())

#positions (in unique_product_parents) of the products that failed
errors = []
#nested mapping: brand -> product parent -> summary dictionary
pros_and_cons_dict = defaultdict(dict)


#define stopwords
orig_stopwords = set(stopwords.words('english'))

laptop_brands = ['asus', 'acer', 'alienware', 'apple', 'dell', 'hp', 'lenovo', 'samsung']
laptop_types = ['laptop', 'chromebook', 'macbook']

orig_stopwords = orig_stopwords.union(set(laptop_brands))
orig_stopwords = orig_stopwords.union(set(laptop_types))

misc = ['wa', 'windows', 'mac', 'computer', 'window', 'star', 'comment', 
        'macs', 'hewlett', 'thing', 'review', 'rating', 'like', 'laptop', 
        'pc', 'chrome', 'book', 'bill', 'packard', 'netflix', 'gates', 
        'steve', 'job', 'microsoft', 'http', 'www', 'people', 'google', 
        'amazon', 'grammar', 'youtube', 'geek', 'squad', 'reviewer', 
        'read', 'guy', 'bird', 'lap', 'top', 'luck', 'gift', 'day', 'bar']

orig_stopwords = orig_stopwords.union(set(misc))


#define aspects
aspects = ['battery', 'ports', 'price', 'hard drive', 'processor speed', 'screen', 
           'ergonomic', 'fan cooling', 'sound', 'build quality', 'ram']


def _flatten(list_of_lists):
    """Flatten one level of nesting into a single list."""
    return [item for sublist in list_of_lists for item in sublist]


#columns holding one list of sentences per review; 
#they are flattened into one list per product
sentence_columns = ['positive_sentences', 'negative_sentences', 
                    'lemmatized_positives', 'lemmatized_negatives']


for idx, product_parent in enumerate(unique_product_parents):    
    #reset every iteration so a failure before the title lookup 
    #cannot print the previous product's title
    title = None
    
    try:
        product_parent_mask = helpful_df['product_parent'] == product_parent

        title = helpful_df.loc[product_parent_mask, 'product_title'].values[0]

        #the first word of the title is used as the brand key
        brand = title.split(' ')[0]

        #find the pros and cons for a product parent
        pros, cons = pros_and_cons(helpful_df, product_parent, orig_stopwords)

        #find how pros and cons are related to our defined aspects
        pro_aspects = aspect_generator(aspects, pros)
        con_aspects = aspect_generator(aspects, cons)

        product_summary = {'product_title': title,
                           'pros': pros, 
                           'cons': cons, 
                           'pro_aspects': pro_aspects, 
                           'con_aspects': con_aspects}

        for column in sentence_columns:
            product_summary[column] = _flatten(helpful_df.loc[product_parent_mask, column].values)

        #only store the product once everything above succeeded
        pros_and_cons_dict[brand][product_parent] = product_summary

    except Exception as exc:
        #best effort: keep going with the next product, but say why this one 
        #failed; KeyboardInterrupt is not caught, so the loop can be stopped
        errors.append(idx)
        print(idx, 'failed:', repr(exc))
        
    print(idx, title)
  

0 Apple MacBook Pro 13.3-Inch
1 Acer 13 Chromebook 13.3-inch Full HD
2 Samsung 11-Inch Chromebook 2 11.6-Inch Cloud Computer
3 Dell Inspiron 15 5000 Series 15-Inch Touchscreen Laptop Silver
4 ASUS T100 10-Inch Laptop OLD VERSION
5 HP Pavilion G6-2235us 15.6" Laptop (2.7 GHz AMD A6-4400M Accelerated Processor, 4GB RAM, 750GB Hard Drive, SuperMulti DVD Burner, Windows 8)
6 ASUS N550J 15.6-Inch Laptop (Intel Core i7-4700HQ 2.4GHz Processor, 1TB Hard Drive, 8GB RAM, Windows 8.1 64-bit) Silver Grey
7 Samsung ATIV Book 6 15.6-Inch Full HD Touchscreen Laptop (Mineral Ash Black)
8 Apple 13-Inch MacBook T7200 2.0 GHz Intel Core 2 Duo Processor, White
9 Apple MacBook MA472LL/A 13.3" Laptop (2.0 GHz Intel Core Duo, 512 MB RAM, 80 GB Hard Drive, SuperDrive)-Black
10 Acer C720 Chromebook (11.6-Inch, 4GB)
11 Acer Aspire AS5251-1805 15.6-Inch Laptop (Black)
12 Acer C720 Chromebook (11.6-Inch, 2GB) Discontinued by Manufacturer
13 ASUS K501UX 15-inch Gaming Laptop (Intel Core i7 Processor, 8GB RAM, 256

In [60]:
#set to True to overwrite the cached results with the freshly computed
#pros_and_cons_dict; leave False to load the cached results if they exist
rewrite = False

pros_and_cons_path = './data/pros_and_cons_dict.pkl.gz'

if rewrite:
    with gzip.open(pros_and_cons_path, 'wb') as f:
        pickle.dump(pros_and_cons_dict, f)

elif os.path.exists(pros_and_cons_path):
    #open for reading ('rb'): opening with 'wb' here would truncate the cache
    #NOTE: unpickling can execute arbitrary code, 
    #so only load a file this notebook produced
    with gzip.open(pros_and_cons_path, 'rb') as f:
        pros_and_cons_dict = pickle.load(f)

Return to [Table of Contents](#table)