In [1]:
import numpy as np
import pandas as pd
import pickle
import gzip

# Avoid restarting Kernel
%load_ext autoreload
%autoreload 2

%autosave 50

Autosaving every 50 seconds


## Get user comments

In [2]:
def parse(path):
  g = gzip.open(path, 'rb')
  for l in g:
    yield eval(l)

def getDF(path, trunc=0):
  i = 0
  df = {}
  for d in parse(path):
    df[i] = d
    i += 1        
    if trunc > 0 and i > trunc: 
        break
  return pd.DataFrame.from_dict(df, orient='index')

In [3]:
data_path = '../../datasets/'
file_name = 'reviews_Movies_and_TV.json.gz'

In [4]:
# comments_df = getDF(data_path + file_name, 200000)
# comments_df.loc[0,'reviewText']
# print(comments_df.shape)
# comments_df.head(1)

In [5]:
# Save the records
# import pickle
# pickle_out = open(data_path + "amzn_200k.pickle","wb")
# pickle.dump(comments_df, pickle_out)
# pickle_out.close()

In [6]:
# Reads records
import pickle
pickle_in = open(data_path + "amzn_200k.pickle","rb")
comments_df = pickle.load(pickle_in)

## Split comments into separate sentences

In [7]:
small = comments_df.loc[:2, :]
len(small.index)

3

In [8]:
from nltk.tokenize import sent_tokenize
small['sentence'] = small['reviewText'].map(sent_tokenize)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [9]:
small.columns

Index(['reviewerID', 'asin', 'reviewerName', 'helpful', 'reviewText',
       'overall', 'summary', 'unixReviewTime', 'reviewTime', 'sentence'],
      dtype='object')

In [10]:
small.drop(['reviewerName', 'helpful', 'reviewText', 'summary', 
            'unixReviewTime', 'reviewTime'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [11]:
small

Unnamed: 0,reviewerID,asin,overall,sentence
0,A3R5OBKS7OM2IR,143502,5.0,[This has some great tips as always and is hel...
1,A3R5OBKS7OM2IR,143529,5.0,"[This is a great pastry guide., I love how Alt..."
2,AH3QC2PC1VTGP,143561,2.0,[I have to admit that I am a fan of Giada's co...


In [12]:
sentences = small['sentence'] \
.apply(pd.Series) \
.merge(small, left_index = True, right_index = True) \
.drop(['sentence'], axis = 1) \
.melt(id_vars = ['reviewerID', 'asin','overall'], value_name = 'sentence') \
.drop(['variable'], axis = 1) \
.dropna()

print(sentences.shape)
sentences.head(30)

(23, 4)


Unnamed: 0,reviewerID,asin,overall,sentence
0,A3R5OBKS7OM2IR,143502,5.0,This has some great tips as always and is help...
1,A3R5OBKS7OM2IR,143529,5.0,This is a great pastry guide.
2,AH3QC2PC1VTGP,143561,2.0,I have to admit that I am a fan of Giada's coo...
3,A3R5OBKS7OM2IR,143502,5.0,"I haven't tried any of the recipes yet, but I ..."
4,A3R5OBKS7OM2IR,143529,5.0,I love how Alton's collections can break it do...
5,AH3QC2PC1VTGP,143561,2.0,"They were however, crushed."
6,A3R5OBKS7OM2IR,143502,5.0,Sometimes it's just lovely to let Alton entert...
7,A3R5OBKS7OM2IR,143529,5.0,I might even try some of these recipes some day.
8,AH3QC2PC1VTGP,143561,2.0,"While I still love Giada's cooking, this set i..."
11,AH3QC2PC1VTGP,143561,2.0,They really cheated with these DVD's.


## Sentence-level prep & cleaning

In [13]:
%reload_ext autoreload
from utils import split_n_lower, not_about_support

### Split into words and lower the case

In [14]:
words = sentences['sentence'].apply(lambda s: split_n_lower(s))

In [15]:
print(words.shape)
words.head(2)

(23,)


0    [this, has, some, great, tips, as, always, and...
1               [this, is, a, great, pastry, guide, .]
Name: sentence, dtype: object

### Remove support-related sentences

In [16]:
sentences_on_movie = sentences[[not_about_support(word) for word in words]]
print('Removing {} records'.format(sentences.shape[0]- sentences_on_movie.shape[0]))

Removing 5 records


### Base case: A reviews with objective and subjective sentences

In [35]:
all_reviews_groups = sentences_on_movie.groupby(['reviewerID','asin'])
all_reviews_stars = all_reviews_groups['overall'].mean()
all_reviews_stars

reviewerID      asin      
A3R5OBKS7OM2IR  0000143502    5.0
                0000143529    5.0
AH3QC2PC1VTGP   0000143561    2.0
Name: overall, dtype: float64

In [36]:
all_reviews_comments = all_reviews_groups['sentence'].sum()
print(all_reviews_comments.shape)
print(all_reviews_comments[0])
all_reviews_comments

(3,)
This has some great tips as always and is helping me to complete my Good Eats collection.I haven't tried any of the recipes yet, but I will soon.Sometimes it's just lovely to let Alton entertain us.


reviewerID      asin      
A3R5OBKS7OM2IR  0000143502    This has some great tips as always and is help...
                0000143529    This is a great pastry guide.I love how Alton'...
AH3QC2PC1VTGP   0000143561    I have to admit that I am a fan of Giada's coo...
Name: sentence, dtype: object

### Remove objective sentences for case B using obj-subj model

In [32]:
# pd.set_option('display.max_colwidth', -1)
# sentences_on_movie['sentence']
sentences_on_movie.shape

(18, 4)

In [33]:
sentences_on_movie.head(20)

Unnamed: 0,reviewerID,asin,overall,sentence
0,A3R5OBKS7OM2IR,143502,5.0,This has some great tips as always and is help...
1,A3R5OBKS7OM2IR,143529,5.0,This is a great pastry guide.
2,AH3QC2PC1VTGP,143561,2.0,I have to admit that I am a fan of Giada's coo...
3,A3R5OBKS7OM2IR,143502,5.0,"I haven't tried any of the recipes yet, but I ..."
4,A3R5OBKS7OM2IR,143529,5.0,I love how Alton's collections can break it do...
5,AH3QC2PC1VTGP,143561,2.0,"They were however, crushed."
6,A3R5OBKS7OM2IR,143502,5.0,Sometimes it's just lovely to let Alton entert...
7,A3R5OBKS7OM2IR,143529,5.0,I might even try some of these recipes some day.
8,AH3QC2PC1VTGP,143561,2.0,"While I still love Giada's cooking, this set i..."
14,AH3QC2PC1VTGP,143561,2.0,"All they have are the video from the show, no ..."


#### Vectorize along the word space of the obj-subj training set

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = pickle.load(open('Obj-Subj_tfidf.pkl', 'rb'))
len(tfidf.vocabulary_)

20893

In [20]:
mat = tfidf.transform(sentences_on_movie['sentence']).todense()

In [21]:
mat

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

#### Apply the obj-subj model

In [22]:
N_TREES = 100
LEARN_RATE = 0.1
MIN_IN_LEAF = 10
pickle_in = open('GBC_'+ str(N_TREES) +'_' + str(LEARN_RATE) 
                        +'_' + str(MIN_IN_LEAF) + '_20min.pkl', 'rb')
gb_model = pickle.load(pickle_in)

In [23]:
y_test = gb_model.predict(mat)
y_test

array([1., 1., 1., 0., 1., 0., 1., 0., 0., 0., 0., 1., 0., 1., 0., 1., 1.,
       0.])

In [24]:
subjective_sentences = sentences_on_movie[y_test == 1]

In [25]:
subjective_sentences

Unnamed: 0,reviewerID,asin,overall,sentence
0,A3R5OBKS7OM2IR,143502,5.0,This has some great tips as always and is help...
1,A3R5OBKS7OM2IR,143529,5.0,This is a great pastry guide.
2,AH3QC2PC1VTGP,143561,2.0,I have to admit that I am a fan of Giada's coo...
4,A3R5OBKS7OM2IR,143529,5.0,I love how Alton's collections can break it do...
6,A3R5OBKS7OM2IR,143502,5.0,Sometimes it's just lovely to let Alton entert...
20,AH3QC2PC1VTGP,143561,2.0,You can get more by taping the shows and then ...
38,AH3QC2PC1VTGP,143561,2.0,"But, that's all you get is watching Giada."
44,AH3QC2PC1VTGP,143561,2.0,But to get consistent results on some of the d...
47,AH3QC2PC1VTGP,143561,2.0,"While Giada herself is great, this set is a wa..."


#### Merge the sentences back into paragraph reviews

In [26]:
subj_groups = subjective_sentences.groupby(['reviewerID','asin'])
subj_reviews_stars = subj_groups['overall'].mean()
# subjective_reviewssubjective_reviews['sentence'].apply(lambda x: x.sum())
# subjective_reviews_reviews = 
subj_reviews_stars

reviewerID      asin      
A3R5OBKS7OM2IR  0000143502    5.0
                0000143529    5.0
AH3QC2PC1VTGP   0000143561    2.0
Name: overall, dtype: float64

In [27]:
subj_review_comments = subj_groups['sentence'].sum()
print(subj_review_comments.shape)
print(subj_review_comments[0])
subj_review_comments

(3,)
This has some great tips as always and is helping me to complete my Good Eats collection.Sometimes it's just lovely to let Alton entertain us.


reviewerID      asin      
A3R5OBKS7OM2IR  0000143502    This has some great tips as always and is help...
                0000143529    This is a great pastry guide.I love how Alton'...
AH3QC2PC1VTGP   0000143561    I have to admit that I am a fan of Giada's coo...
Name: sentence, dtype: object

## Create emotion vectors

In [37]:
print('Total number of reviews:', all_reviews_comments.shape[0])
print('Total number of subjective reviews:', subj_review_comments.shape[0])

Total number of reviews: 3
Total number of subjective reviews: 3


In [42]:
%reload_ext autoreload
from emotions_seven import Emotions7
emote = Emotions7()

In [43]:
all_reviews_emotions = emote.vectorize(all_reviews_comments)
all_reviews_emotions

array([[0.00249115, 0.00249115, 0.99998138, 0.00249115, 0.00249115,
        0.00249115, 0.00249115],
       [0.00249115, 0.00249115, 0.99998138, 0.00249115, 0.00249115,
        0.00249115, 0.00249115],
       [0.00249115, 0.00249115, 0.99998138, 0.00249115, 0.00249115,
        0.00249115, 0.00249115]])

In [31]:
# comments_emotions.shape