This notebook covers:
    
- leverages recleaned data from v1 
- reruns sentiment analysis at the sentence level
- merges results of vader and textblob
- identifies subjects of highest and lowest sentiment sentences
- if above is not clear, identifies nouns in preceeding sentence 


In [92]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import collections


import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# tokenization
from sklearn.feature_extraction.text import  ENGLISH_STOP_WORDS  
import re 
import nltk
from nltk import tokenize
import string 

# spacy
from __future__ import unicode_literals
import spacy 
nlp = spacy.load('en')

# sentiment analysis
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob

In [282]:
data = pd.read_pickle('../data/data_tokenized') # better to use pickle

In [291]:
# add review_id
data['review_id'] = data.reset_index().index+1

In [337]:
# remove extra white space from review text 
# dont use concatenated review title + text due to duplication
def remove_white_space(text_raw):
    text_clean = re.sub('\s+', ' ', text_raw).strip()
    return text_clean

data.reviewText_clean = data.reviewText_clean.apply(remove_white_space)

In [338]:
# subset to cols of interest 
data_sub = data[['asin','review_id','reviewText_clean']] # _clean has extra white space removed 

In [339]:
data_sub.review_id.nunique(), data_sub.reviewText_clean.nunique()

(355178, 353829)

In [315]:
# num duplicates
data_sub.review_id.nunique()-data_sub.review_all_clean.nunique()

1349

In [316]:
# find duplicates
a = data.review_all_clean.tolist()
print [item for item, count in collections.Counter(a).items() if count > 1]



In [310]:
# example of different description, same text --> fake review? 
data[data.review_all_clean == 'I love this stuff . I have lost ten pounds in about 2 weeks. It does what they say it does. you have nothing to lose if you dont like it send it back.. I love this stuff . I have lost ten pounds in about 2 weeks. It does what they say it does. you have nothing to lose if you dont like it send it back.']

Unnamed: 0,asin,helpful,reviewText,overall,summary,description,title,categories_clean,review_length,reviewText_clean,summary_clean,review_all,review_all_clean,review_all_no_punct,review_all_tokens,title_clean,title_tokens,title_tokens_lower,review_tokens_sub,review_id
331357,B00G1SQ874,"[0, 0]",I love this stuff . I have lost ten pounds in ...,5.0,Garcinia Cambogia,NaturaBest Garcinia Cambogia ExtractGarcinia C...,Garcinia Cambogia - NaturaBest - 100% PURE Gar...,"Health & Personal Care, Vitamins & Dietary Sup...",150,I love this stuff . I have lost ten pounds in ...,I love this stuff . I have lost ten pounds in ...,I love this stuff . I have lost ten pounds in ...,I love this stuff . I have lost ten pounds in ...,I love this stuff I have lost ten pounds in ...,"[I, love, this, stuff, , , I, have, lost, ten,...",Garcinia Cambogia NaturaBest PURE Garcinia Cap...,"[Garcinia, Cambogia, NaturaBest, PURE, Garcini...",garcinia cambogia naturabest pure garcinia cap...,"[love, this, stuff, have, lost, ten, pounds, a...",331295
332229,B00G8L58CA,"[0, 0]",I love this stuff . I have lost ten pounds in...,5.0,Garcinia Cambogia,What if YOU could lose that weight you need to...,Garcinia Cambogia with HCA Pure Extract Weight...,"Health & Personal Care, Vitamins & Dietary Sup...",152,I love this stuff . I have lost ten pounds in...,I love this stuff . I have lost ten pounds in...,I love this stuff . I have lost ten pounds in...,I love this stuff . I have lost ten pounds in ...,I love this stuff I have lost ten pounds in...,"[I, love, this, stuff, , , , I, have, lost, te...",Garcinia Cambogia with HCA Pure Extract Weight...,"[Garcinia, Cambogia, with, HCA, Pure, Extract,...",garcinia cambogia with hca pure extract weight...,"[love, this, stuff, have, lost, ten, pounds, a...",332167


## Sentiment Analysis

### Vader

#### At the sentence level

Break down each review into composite sentences using `NLTK`'s `tokenize.sent_tokenize()`,  find sentiments for all sentences, choose min and max.

In [340]:
analyser = SentimentIntensityAnalyzer()

In [342]:
data_sub['tokenized_sentences'] = data_sub.reviewText_clean.apply(tokenize.sent_tokenize)

In [377]:
data_sub[:10] # notice that ix 6 is missing 

Unnamed: 0,asin,review_id,reviewText_clean,tokenized_sentences
0,929619730,1,B-flax-D is a regular at our house. It does it...,"[B-flax-D is a regular at our house., It does ..."
1,978559088,2,I started taking this after both my parents di...,[I started taking this after both my parents d...
2,978559088,3,I really liked this product because it stayed ...,[I really liked this product because it stayed...
3,978559088,4,"Resveratrol is a polar compound, very insolubl...","[Resveratrol is a polar compound, very insolub..."
4,1427600228,5,I bought several of these bracelets for my YMC...,[I bought several of these bracelets for my YM...
5,1427600228,6,I bought a few the other week just to see what...,[I bought a few the other week just to see wha...
7,1427600228,7,This bracelet is so simple yet so elegant. I l...,"[This bracelet is so simple yet so elegant., I..."
8,1427600228,8,I recently bought a bunch of these to hand out...,[I recently bought a bunch of these to hand ou...
9,1928926215,9,I'm obsessed with this stuff and found it is t...,[I'm obsessed with this stuff and found it is ...
10,1928926215,10,I have been using this for awhile I dont know ...,[I have been using this for awhile I dont know...


In [349]:
# list of number of sentences per review 
num_sentences = []
for i in data_sub.tokenized_sentences:
    num_sentences.append(len(i))

In [393]:
# lists of lists of asins and number of sentences
n = len(data_sub.review_id)
review_id_ls_of_ls = []
asin_ls_of_ls = []
for i in range(n):  # work around because index is off
    asin_i = data_sub[data_sub.review_id == i+1].asin.values.tolist() 
    asin_ls_of_ls.append(asin_i*num_sentences[i])
    review_id_ls_of_ls.append([i+1]*num_sentences[i])

In [394]:
asin_ls = [i for sublist in asin_ls_of_ls for i in sublist] 
review_id_ls = [i for sublist in review_id_ls_of_ls for i in sublist]
len(asin_ls), len(review_id_ls)

(1488555, 1488555)

In [395]:
sentences_ls_of_ls = data_sub.tokenized_sentences.values.tolist()
sentences_ls = [i for sublist in sentences_ls_of_ls for i in sublist]
len(sentences_ls)

1488555

In [396]:
data_sub_long = pd.DataFrame(
    {'asin': asin_ls,
     'review_id': review_id_ls,
     'sentence': sentences_ls
    })
len(data_sub_long)
#data_small_long = data_small_long.drop_duplicates()
#data_small_long[:2]

1488555

In [404]:
# find duplicate sentences
b = data_sub_long['sentence'].tolist()
print len([item for item, count in collections.Counter(b).items() if count > 1])

28328


In [402]:
data_sub_long[data_sub_long.sentence == 'I have ordered this product several times and I will definitely reorder.']

Unnamed: 0,asin,review_id,sentence
1164647,B00A39O8UG,283802,I have ordered this product several times and ...
1281672,B00CKXJ06K,309383,I have ordered this product several times and ...


In [412]:
sum(data_sub_long.duplicated())

1213

In [413]:
# what do these duplicated records look like?
data_sub_long[data_sub_long.duplicated()]

Unnamed: 0,asin,review_id,sentence
2611,B0000533AJ,696,.
2612,B0000533AJ,696,.
2613,B0000533AJ,696,.
2614,B0000533AJ,696,.
2615,B0000533AJ,696,.
10451,B0000U1OCI,2608,.
14735,B0000U1OCI,3513,wonderful!
18101,B0000U1OCI,4254,.
18485,B0000U1OCI,4338,.
20993,B0000U1OCI,4870,.


In [414]:
data[data.review_id == 696]

Unnamed: 0,asin,helpful,reviewText,overall,summary,description,title,categories_clean,review_length,reviewText_clean,summary_clean,review_all,review_all_clean,review_all_no_punct,review_all_tokens,title_clean,title_tokens,title_tokens_lower,review_tokens_sub,review_id
696,B0000533AJ,"[0, 1]",Makes me feel good ..... .... .... .... .... ....,5.0,Great supplement,"&lt;P&gt;&lt;P style=""LEFT: 92px; TOP: 301px""&...","Nature Made St. John's Wort, 450mg Capsules - ...","Health & Personal Care, Vitamins & Dietary Sup...",102,Makes me feel good ..... .... .... .... .... ....,Makes me feel good ..... .... .... .... .... ....,Makes me feel good ..... .... .... .... .... ....,Makes me feel good ..... .... .... .... .... ....,Makes me feel good ...,"[Makes, me, feel, good, , , , , , , , , , , , ...",Nature Made St John s Wort mg Capsules count,"[Nature, Made, St, John, s, Wort, mg, Capsules...",nature made st john s wort mg capsules count,"[Makes, me, feel, good, Makes, me, feel, good]",696


In [415]:
data_sub_dedup = data_sub_long.drop_duplicates()
len(data_sub_dedup)

1487342

In [416]:
data_sub_dedup['vader_score'] = data_sub_dedup.sentence.apply(lambda x: analyser.polarity_scores(x)['compound'])

In [417]:
data_sub_dedup['blob_score'] = data_sub_dedup.sentence.apply(lambda x: TextBlob(x).sentiment[0])

In [462]:
n = len(data_sub_dedup)

round(sum(data_sub_dedup.vader_score == data_sub_dedup.blob_score)*1.0/n*1.0,2),\
round(sum(data_sub_dedup.vader_score > data_sub_dedup.blob_score)*1.0/n*1.0,2),\
round(sum(data_sub_dedup.vader_score < data_sub_dedup.blob_score)*1.0/n*1.0,2)

(0.17, 0.48, 0.35)

In [487]:
# Indentify sentences with min/max sentiments 

max_scores = data_sub_dedup.groupby(['asin', 'review_id'])[['vader_score', 'blob_score']].max().reset_index()
max_scores.rename( columns={"vader_score": "vader_max", "blob_score": "blob_max"}, inplace = True)

In [489]:
min_scores = data_sub_dedup.groupby(['asin', 'review_id'])[['vader_score', 'blob_score']].min().reset_index()
min_scores.rename(columns={"vader_score": "vader_min", "blob_score": "blob_min"}, inplace = True)

all_scores = min_scores.merge(max_scores)

In [491]:
data_wide = data_sub_dedup.merge(all_scores)

In [492]:
data_wide[:1]

Unnamed: 0,asin,review_id,sentence,vader_score,blob_score,vader_min,blob_min,vader_max,blob_max
0,929619730,1,B-flax-D is a regular at our house.,0.0,0.0,0.0,0.0,0.8271,0.7


In [493]:
data_wide['min_sent_blob'] = (data_wide.vader_score == data_wide.vader_min).astype(int)
data_wide['max_sent_blob'] = (data_wide.vader_score == data_wide.vader_max).astype(int)
data_wide['min_sent_vader'] = (data_wide.blob_score == data_wide.blob_min).astype(int)
data_wide['max_sent_vader'] = (data_wide.blob_score == data_wide.blob_max).astype(int)
data_wide.head()

Unnamed: 0,asin,review_id,sentence,vader_score,blob_score,vader_min,blob_min,vader_max,blob_max,min_sent_blob,max_sent_blob,min_sent_vader,max_sent_vader
0,929619730,1,B-flax-D is a regular at our house.,0.0,0.0,0.0,0.0,0.8271,0.7,1,0,1,0
1,929619730,1,It does its job simply and with good results.,0.4404,0.35,0.0,0.0,0.8271,0.7,0,0,0,0
2,929619730,1,"It is reasonable, lasts a long time, and is ab...",0.5106,0.2625,0.0,0.0,0.8271,0.7,0,0,0,0
3,929619730,1,"Good product, good price, good results.",0.8271,0.7,0.0,0.0,0.8271,0.7,0,1,0,1
4,978559088,2,I started taking this after both my parents di...,-0.5729,0.5,-0.5729,-0.357143,0.711,0.8,1,0,0,0


In [495]:
# num differing
sum(data_wide.max_sent_vader <> data_wide.max_sent_blob), sum(data_wide.min_sent_vader <> data_wide.min_sent_blob)

(317508, 372993)

In [502]:
# num matches  
sum(data_wide.max_sent_vader == data_wide.max_sent_blob), sum(data_wide.min_sent_vader == data_wide.min_sent_blob)

(1169834, 1114349)

In [500]:
len(data_wide)

1487342

In [505]:
data_wide.to_csv('../data/sentence_level_sentiments', sep =str('|'), index = False)

### Nouns only

In [504]:
data_wide[:1]

Unnamed: 0,asin,review_id,sentence,vader_score,blob_score,vader_min,blob_min,vader_max,blob_max,min_sent_blob,max_sent_blob,min_sent_vader,max_sent_vader
0,929619730,1,B-flax-D is a regular at our house.,0.0,0.0,0.0,0.0,0.8271,0.7,1,0,1,0


In [506]:
def pull_nouns(sentence):
    nouns = []
    doc = nlp(u'{}'.format(sentence))
    for np in doc.noun_chunks:
        nouns.append(np.text)
    return nouns

In [507]:
data_wide['nouns'] = data_wide.sentence.apply(pull_nouns)

In [509]:
data_wide.to_pickle('../data/sentence_w_nouns') # better to use pickle

In [516]:
data_wide[data_wide.max_sent_vader == 1].nouns.values[1001:1100]

array([[u'Great price', u'this excellent product'],
       [u'I', u'a bigger bottle', u'the price', u'ounce'], [u'The price'],
       [u'This oil'], [u'It', u'the best face', u'you'],
       [u'you', u'the shower'],
       [u'I', u"Jason's vitamin E oil", u'a long time', u'a dry area', u'my skin'],
       [u'I', u'it'], [u'great stuff', u'I', u'a difference', u'a week'],
       [u'I', u'this purchase'], [u'Great value'],
       [u'I', u'great things', u'Vitamin E use', u'I', u'it', u'an oil blend', u'I', u'my hair'],
       [],
       [u'I', u'many things', u'nothing', u'streachmarks', u'doctors', u'me', u'nothing', u'i', u'this oil', u'a week', u'i', u'a difference', u'the way', u'the lines', u'my skin', u'days', u'even some discoloration', u'it', u'almonds', u'i', u'i', u'it', u'i', u'my 5th month pregcy', u'i', u'6 bottles', u'i', u'i', u'a few months', u'I', u'this product', u'it', u'time'],
       [u'I', u'vitamin E oil', u'it', u'my dark circles', u'my eyes', u'it', u'my sunburns