In [7]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import pickle

In [11]:
# Load the pre-extracted records; each one is a (review dict, noun/adjective features) pair.
# NOTE(review): pickle.load can execute arbitrary code -- only open trusted files.
with open("../../electronics_10000.pickle", 'rb') as pickle_file:
    noun_adj_data = pickle.load(pickle_file)

In [15]:
# Peek at the first five records to check the (review dict, features) structure.
noun_adj_data[:5]

[({'asin': 'B000RGF29Q',
   'helpful': [0, 3],
   'overall': 3.0,
   'reviewText': 'The P4460 is an ok product but customer support is non existant. They never answer the phone so I left 2 voice mail messages & 1 email. They never responded.Update 10/09/10 They finally responded by email so I changed my rating from 1 to 3.Mike NY',
   'reviewTime': '10 6, 2010',
   'reviewerID': 'A35804IR4ZMEAF',
   'reviewerName': 'Mike NY',
   'summary': 'Ok Product But No Customer Support!',
   'unixReviewTime': 1286323200},
  [('support', ['existant']),
   ('P4460', ['product']),
   ('existant', ['non']),
   ('product', ['ok']),
   ('They', []),
   ('I', []),
   ('They', []),
   ('I', []),
   ('They', [])]),
 ({'asin': 'B000RGF29Q',
   'helpful': [2, 2],
   'overall': 5.0,
   'reviewText': 'We got ours last night and got down to business. Serious home-economics business. The big ones I was worried about were entertainment center with Wii, stereo, etc. "on idle", a laptop station in another room tha

In [47]:
# Bucket every (review, features) record under its product id ('asin'),
# then rank the products from most- to least-reviewed.
from collections import defaultdict
noun_adj_data_of_pid = defaultdict(list)

for review_info, review_features in noun_adj_data:
    product_id = review_info['asin']
    noun_adj_data_of_pid[product_id].append((review_info, review_features))

# list.sort is stable, so products with equally many reviews keep insertion order.
sorted_noun_adj_data = list(noun_adj_data_of_pid.items())
sorted_noun_adj_data.sort(key=lambda item: len(item[1]), reverse=True)

In [48]:
# Number of reviews of the most-reviewed product (the list is sorted by review count, descending).
len(sorted_noun_adj_data[0][1])

628

In [49]:
# Convert noun adj data into vocab/corpus
# vocab: set of all nouns
# corpus: all reviews of product

def get_vocab_and_corpus(reviews):
    """
    Split one product's reviews into a noun vocabulary and a text corpus.

    reviews: iterable of (review_data, review_features) pairs, where
             review_data is a dict with a 'reviewText' entry and
             review_features is a list of (noun, adjectives) pairs.

    Returns (vocab, corpus): the set of all nouns seen in the features, and
    the list of review texts in input order.
    """
    corpus = []
    vocab = set()
    for review_data, review_features in reviews:
        # Only the noun of each (noun, adjectives) pair goes into the vocabulary.
        vocab.update(noun for noun, _ in review_features)
        corpus.append(review_data['reviewText'])
    return vocab, corpus


In [80]:
def count_vocab_frequency(vocab, corpus):
    """
    Count, for every vocabulary term, how many documents mention it.

    vocab: iterable of unigram/bigram terms (list or set), e.g.
           ['battery life', 'picture quality', 'size']
    corpus: list of document strings, e.g.
            ['The battery life sucks.',
             'The picture quality is great, but poor battery life.',
             'Loved this product, but the size was too large.']

    Returns a dict mapping each term, spelled exactly as given in `vocab`,
    to its document frequency; for the example above:
    {'battery life': 2, 'picture quality': 1, 'size': 1}.
    Matching is case-insensitive: terms that differ only in case
    ('It' / 'it') each receive the same count. An empty vocabulary
    yields an empty dict.
    """
    terms = list(vocab)
    if not terms:
        # CountVectorizer refuses an empty vocabulary; there is nothing to count.
        return {}

    # CountVectorizer lowercases every document by default, so a term that
    # contains capitals ('They', 'P4460') could never match. Fold the terms
    # the same way and remember which folded form each original term uses.
    folded_of = {term: term.lower() for term in terms}

    vectorizer = CountVectorizer(
        ngram_range=(1, 2),
        token_pattern=r'\b\w+\b',
        vocabulary=sorted(set(folded_of.values())),
        binary=True,  # 1 per document containing the term, so column sums are df
    )
    X = vectorizer.fit_transform(corpus)
    df = np.asarray(X.sum(axis=0)).ravel()

    # Look columns up through vocabulary_ rather than zipping against `vocab`:
    # a set is iterated in arbitrary order, which need not match column order.
    column_of = vectorizer.vocabulary_
    return {term: int(df[column_of[folded_of[term]]]) for term in terms}


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [71]:
# Sample product: the first (most-reviewed) entry of sorted_noun_adj_data.
sample_reviews = sorted_noun_adj_data[0][1]
vocab, corpus = get_vocab_and_corpus(sample_reviews)


In [87]:
# Document frequency of each extracted noun in the sample product's reviews
# (the vocab set is turned into a list before being handed over).
vocab_freq_dict = count_vocab_frequency(list(vocab), corpus)

In [91]:
# Rank the (term, count) pairs from most to least frequent.
sorted_vocab_freqs = list(vocab_freq_dict.items())
sorted_vocab_freqs.sort(key=lambda pair: pair[1], reverse=True)

In [92]:
from stop_words import get_stop_words
# Stop-word list for 'english', held as a set for the membership test used when filtering word counts.
stop_words = set(get_stop_words('english'))

In [105]:
import datetime
import csv

def export(asin, word_freqs):
    """
    Write (word, count) pairs to '<asin>_<timestamp>.csv' in the working directory.

    asin: product id, used as the file-name prefix.
    word_freqs: iterable of (word, count) pairs; rows are written in the
                given order under the header 'Word,Count'.
    """
    # Keep ':' and spaces out of the stamp: colons are not valid in Windows file names.
    now = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M")
    # newline='' is what the csv module expects of its file object (otherwise
    # platforms that translate line endings emit an extra '\r' per row);
    # an explicit utf-8 keeps non-ASCII words writable whatever the locale is.
    with open('%s_%s.csv' % (asin, now), 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=['Word', 'Count'])
        writer.writeheader()
        writer.writerows({'Word': word, 'Count': count} for word, count in word_freqs)
    
def process_product_reviews_and_features(asin, reviews_and_features):
    """
    Export the stop-word-filtered noun frequencies of one product to CSV.

    asin: product id, forwarded to export() for the file name.
    reviews_and_features: [(review_data, features), ...] for a particular product.
    """
    vocab, corpus = get_vocab_and_corpus(reviews_and_features)
    # get_vocab_and_corpus returns a set, and count_vocab_frequency pairs terms
    # with counts by position; hand over an ordered list so that pairing is
    # well-defined and the result is deterministic.
    vocab_freq_dict = count_vocab_frequency(sorted(vocab), corpus)
    sorted_vocab_freqs = sorted(vocab_freq_dict.items(), key=lambda x: x[1], reverse=True)
    # Extracted nouns keep their capitalisation ('They', 'I'), so compare
    # case-insensitively -- assumes the stop_words entries are lowercase.
    filtered_freqs = [(word, count) for word, count in sorted_vocab_freqs
                      if word.lower() not in stop_words]
    # export
    export(asin, filtered_freqs)

In [106]:
# Export word frequencies for the five most-reviewed products.
for top_product in sorted_noun_adj_data[:5]:
    # Each entry is an (asin, reviews_and_features) pair.
    process_product_reviews_and_features(*top_product)

In [57]:
# Inspect the (review, features) pairs of the most-reviewed product.
sorted_noun_adj_data[0][1]

[({'asin': 'B000S5Q9CA',
   'helpful': [0, 0],
   'overall': 5.0,
   'reviewText': 'ok , fit my iphone',
   'reviewTime': '07 15, 2014',
   'reviewerID': 'AGWKQIBF6VNXB',
   'summary': 'Five Stars',
   'unixReviewTime': 1405382400},
  []),
 ({'asin': 'B000S5Q9CA',
   'helpful': [0, 0],
   'overall': 5.0,
   'reviewText': "Got this for my Dad to replace his car charger from Wal-****.  He loves this one.  Says it's much heavier and better quality and charges faster.  I have no personal interaction with it, but he loves it.",
   'reviewTime': '11 20, 2013',
   'reviewerID': 'A3UN4DWLYLRIKN',
   'reviewerName': '04Blackram "Dantie"',
   'summary': 'Great charger for a great price!!',
   'unixReviewTime': 1384905600},
  [('He', []),
   ('it', ['quality']),
   ('quality', ['much', 'heavier', 'better']),
   ('I', []),
   ('he', []),
   ('interaction', ['personal'])]),
 ({'asin': 'B000S5Q9CA',
   'helpful': [0, 0],
   'overall': 4.0,
   'reviewText': 'i ordered this as a back up to ensure I wo

holograms. It matches exactly the charger I bought at the Verizon store 2 years ago for my Original Droid.Yes the charger will last a long time.


holograms. It matches exactly the charger I bought at the Verizon store 2 years ago for my Original Droid.Yes the charger will last a long time.


['The charger came with all of the correct Motorola markings and holograms. It matches exactly the charger I bought at the Verizon store 2 years ago for my Original Droid.Yes the charger will last a long time.']

In [74]:
# Tally raw whitespace-separated tokens over the sample corpus.
counts = defaultdict(int)
for token in (word for review in corpus for word in review.split()):
    counts[token] += 1

In [76]:
# Whitespace splitting keeps 'Droid.Yes' as a single token: the review text has no space after the period.
counts['Droid.Yes']

1

In [None]:
# Remove 