In [1]:
import os
from pprint import pprint

import mysql.connector

# Connection settings for the MySQL database holding the `review` table.
# Every value can be overridden through an environment variable so that real
# credentials never have to be written into the notebook; the fallbacks are
# the original local-development values.
config = {
    "user": os.environ.get("MYSQL_USER", "root"),
    "password": os.environ.get("MYSQL_PASSWORD", "root"),
    "host": os.environ.get("MYSQL_HOST", "127.0.0.1"),
    "database": os.environ.get("MYSQL_DATABASE", "senior_design"),
}
connection = mysql.connector.connect(**config)
# One shared cursor is reused by every query in this notebook.
cursor = connection.cursor(buffered=True)

In [2]:
# Rank every product (ASIN) by its number of reviews, most-reviewed first.
query = "SELECT distinct asin, COUNT(asin) AS count FROM review GROUP BY asin ORDER BY count DESC"
cursor.execute(query)
asins = [(asin, count) for asin, count in cursor]
# The rows are re-sorted client-side by review count (stable, descending),
# exactly as the query already requests.
asins.sort(key=lambda pair: pair[1], reverse=True)

In [5]:
# Peek at five (asin, review count) pairs from positions 8000-8004 of the ranking.
pprint(asins[8000:8005])

[('B000Q3IUV2', 158),
 ('B000QSN3O6', 158),
 ('B000TXNS6G', 158),
 ('B000J1CCGA', 158),
 ('B000GYU9IS', 158)]


In [5]:
def get_all_reviews(asin):
    """Fetch the text of every review stored for one product.

    Uses the notebook-level ``cursor`` and prints the number of reviews found.

    Args:
        asin: Product identifier matched against the ``asin`` column of the
            ``review`` table.

    Returns:
        list: The ``review_text`` value of each matching row, in cursor order.
    """
    # Bind the ASIN as a query parameter instead of formatting it into the SQL
    # text, so quotes or SQL fragments in the value cannot alter the statement.
    query = "SELECT review_text FROM review WHERE asin = %s"
    cursor.execute(query, (asin,))
    # Only one column is selected, so the text is the first item of each row.
    reviews = [row[0] for row in cursor]
    print("# reviews: {}".format(len(reviews)))
    return reviews


In [6]:
# Seed opinion words. extract_features_opinions starts from the union of both
# sets and gives words in positive_lexicon a sentiment of +1 and the remaining
# seed words a sentiment of -1.
positive_lexicon = {
    "good", "great", "better", "excellent", "best",
    "easy", "nice", "simple", "clear", "strong",
    "perfect", "comfortable", "friendly", "solid", "precise",
    "awesome", "amazing", "bright", "vibrant", "fantastic",
    "realistic", "stunning", "superior", "super", "rich",
    "exceptional", "impressive", "ideal",
}
negative_lexicon = {
    "poor", "old", "bad", "weak", "annoying",
    "defective", "horrible", "buggy", "worst", "mediocre",
    "difficult", "unstable", "inferior", "lousy", "complicated",
    "useless", "unreliable", "sloppy", "strange", "weird",
    "malfunctioning", "miserable", "terrible", "misleading",
}

In [7]:
from nltk.parse.corenlp import CoreNLPDependencyParser
# `nlp` is a client for a CoreNLP server reachable at the URL given below;
# the server is not started by this notebook.
# Start the CoreNLP server with:
# java -mx4g -cp "./CoreNLP/*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 15000
#     (on my Mac, java8 bin located at /Library/Internet\ Plug-Ins/JavaAppletPlugin.plugin/Contents/Home/bin/java)
nlp = CoreNLPDependencyParser(url="http://localhost:9000")

In [8]:
from collections import defaultdict
from nltk.tokenize import sent_tokenize

def extract_relevant_dependencies(parsed_sentence, FO_dict, OF_dict, FF_dict, OO_dict, features_count, opinions_count):
    """Collect opinion/feature links from one dependency-parsed sentence.

    Iterates ``parsed_sentence.triples()`` and, for word pairs that are both
    purely alphabetic, lowercases them and updates the caller's dictionaries
    in place:

    * ``nsubj`` whose dependent is tagged ``NN``: the governor is stored as the
      opinion and the dependent as the feature.
    * ``amod`` whose governor is tagged ``NN``: the dependent is stored as the
      opinion and the governor as the feature.
    * ``conj`` between two ``JJ`` words: both are linked in ``OO_dict``;
      between two ``NN`` words: both are linked in ``FF_dict``.

    Args:
        parsed_sentence: Object exposing ``triples()`` that yields
            ``((governor, governor_pos), dependency, (dependent, dependent_pos))``.
        FO_dict: feature -> opinion mapping (later pairs overwrite earlier ones).
        OF_dict: opinion -> feature mapping (later pairs overwrite earlier ones).
        FF_dict: feature -> list of conjoined features.
        OO_dict: opinion -> list of conjoined opinions.
        features_count: running count per feature word.
        opinions_count: running count per opinion word.

    Returns:
        list: Every alphabetic triple of the sentence with lowercased words,
        whether or not it matched one of the rules above.
    """
    kept_triples = []
    for (head, head_tag), relation, (child, child_tag) in parsed_sentence.triples():
        # Ignore triples involving punctuation, numbers or mixed tokens.
        if not (head.isalpha() and child.isalpha()):
            continue
        head, child = head.lower(), child.lower()

        if relation == "nsubj":
            if child_tag == "NN":
                OF_dict[head] = child
                FO_dict[child] = head
                features_count[child] += 1
                opinions_count[head] += 1
        elif relation == "amod":
            if head_tag == "NN":
                OF_dict[child] = head
                FO_dict[head] = child
                opinions_count[child] += 1
                features_count[head] += 1
        elif relation == "conj":
            tag_pair = (head_tag, child_tag)
            if tag_pair == ("JJ", "JJ"):
                OO_dict[head].append(child)
                OO_dict[child].append(head)
                opinions_count[head] += 1
                opinions_count[child] += 1
            elif tag_pair == ("NN", "NN"):
                FF_dict[head].append(child)
                FF_dict[child].append(head)
                features_count[head] += 1
                features_count[child] += 1

        kept_triples.append(((head, head_tag), relation, (child, child_tag)))
    return kept_triples


def double_propagation_iterate(all_review_info,
                               features,
                               feature_words_by_review,
                               feature_sentiments_by_review,
                               feature_sentiments_cumulative,
                               feature_sentiments_pos,
                               feature_sentiments_neg,
                               opinions,
                               opinion_words_by_review,
                               opinion_sentiments):
    """Run one pass of double propagation over every review.

    Known opinion words reveal new features, known features reveal new opinion
    words, and polarities are propagated along the same links. All cumulative
    structures are updated in place; ``features`` and ``opinions`` themselves
    are only read, and the caller is expected to merge the returned sets into
    them before the next pass.

    Args:
        all_review_info: review key -> dict with the entries ``'index'``,
            ``'OF_dict'``, ``'FO_dict'``, ``'OO_dict'``, ``'FF_dict'`` and
            ``'cumulative_polarity'``.
        features: set of feature words known before this pass.
        feature_words_by_review: review key -> set of features already given a
            sentiment in that review.
        feature_sentiments_by_review: review key -> {feature: polarity}.
        feature_sentiments_cumulative: feature -> sum of recorded polarities.
        feature_sentiments_pos: feature -> review indices that contributed a
            positive polarity.
        feature_sentiments_neg: feature -> review indices that contributed a
            negative polarity.
        opinions: set of opinion words known before this pass.
        opinion_words_by_review: review key -> set of opinion words already
            counted towards that review's cumulative polarity.
        opinion_sentiments: opinion word -> polarity.

    Returns:
        tuple: ``(new_features, new_opinions)``, the words discovered during
        this pass that were not in ``features`` / ``opinions``.
    """
    new_opinions = set()
    new_features = set()

    for index, info in all_review_info.items():
        # Keep the sentiments recorded for this review by earlier passes.
        # Resetting them on every pass (while feature_words_by_review persists)
        # made each later pass record the same feature again, which inflated
        # the cumulative score and appended duplicate review indices to the
        # pos/neg lists. A plain dict left by an earlier lookup is upgraded so
        # that missing features still read as 0.
        review_sentiments = feature_sentiments_by_review.get(index)
        if not isinstance(review_sentiments, defaultdict):
            review_sentiments = defaultdict(int, review_sentiments or {})
            feature_sentiments_by_review[index] = review_sentiments

        def record_feature_sentiment(feature, polarity):
            # Mark the feature as seen in this review, store its polarity and
            # fold it into the corpus-wide tallies.
            feature_words_by_review[index].add(feature)
            review_sentiments[feature] = polarity
            feature_sentiments_cumulative[feature] += polarity
            if polarity > 0:
                feature_sentiments_pos[feature].append(info['index'])
            elif polarity < 0:
                feature_sentiments_neg[feature].append(info['index'])

        # Known opinion word -> the feature it is linked to.
        for opinion, feature in info['OF_dict'].items():
            if opinion in opinions:
                if feature not in features:
                    new_features.add(feature)

                if feature not in feature_words_by_review[index]:
                    # target takes polarity of modifying opinion word
                    record_feature_sentiment(feature, opinion_sentiments[opinion])

                # have we seen this opinion word in this review?
                if opinion not in opinion_words_by_review[index]:
                    opinion_words_by_review[index].add(opinion)
                    info['cumulative_polarity'] += opinion_sentiments[opinion]

        # Known opinion word -> opinion words conjoined with it.
        for opinion1, related in info['OO_dict'].items():
            if opinion1 in opinions:
                for opinion in related:
                    if opinion not in opinions:
                        new_opinions.add(opinion)
                        # conjoined opinion words share the same polarity
                        opinion_sentiments[opinion] = opinion_sentiments[opinion1]

                    # have we seen this opinion word in this review?
                    if opinion not in opinion_words_by_review[index]:
                        opinion_words_by_review[index].add(opinion)
                        info['cumulative_polarity'] += opinion_sentiments[opinion]

                # have we seen this opinion word in this review?
                if opinion1 not in opinion_words_by_review[index]:
                    opinion_words_by_review[index].add(opinion1)
                    info['cumulative_polarity'] += opinion_sentiments[opinion1]

        # Known feature -> the opinion word linked to it.
        for feature, opinion in info['FO_dict'].items():
            if feature in features:
                if opinion not in opinions:
                    new_opinions.add(opinion)

                    # if target has sentiment in current review
                    if feature in feature_words_by_review[index]:
                        # then opinion takes polarity of target (Homogenous Rule)
                        opinion_sentiments[opinion] = review_sentiments[feature]
                    else:
                        # else target is from another review
                        # opinion takes the sign (-1, 0 or 1) of the review's
                        # cumulative polarity (Intra-review Rule)
                        try:
                            cumulative_polarity = int(info['cumulative_polarity'] / abs(info['cumulative_polarity']))
                        except ZeroDivisionError:
                            cumulative_polarity = 0
                        opinion_sentiments[opinion] = cumulative_polarity

                        # also apply that polarity to the feature (should we do this?)
                        record_feature_sentiment(feature, opinion_sentiments[opinion])

                if opinion not in opinion_words_by_review[index]:
                    opinion_words_by_review[index].add(opinion)
                    info['cumulative_polarity'] += opinion_sentiments[opinion]

                if feature not in review_sentiments:
                    record_feature_sentiment(feature, opinion_sentiments[opinion])

        # Known feature with a sentiment in this review -> conjoined features.
        for feature1, related in info['FF_dict'].items():
            if feature1 in features and feature1 in review_sentiments:
                for feature in related:
                    if feature not in features:
                        new_features.add(feature)

                    # have we seen this target word in this review?
                    if feature not in feature_words_by_review[index]:
                        # Homogenous Rule
                        record_feature_sentiment(feature, review_sentiments[feature1])
    return new_features, new_opinions


def extract_features_opinions(reviews):
    """Extract features and an expanded opinion lexicon from review texts.

    Each review is sentence-tokenized and dependency-parsed, the relevant
    dependency links are collected per review, and double propagation is then
    repeated until a pass discovers no new feature and no new opinion word.

    Args:
        reviews: list of review texts.

    Returns:
        tuple: ``(features, features_count, opinions, opinions_count,
        raw_sentences, parsed_sentences, review_indices,
        feature_sentiments_by_review, feature_words_by_review,
        feature_sentiments_cumulative, feature_sentiments_pos,
        feature_sentiments_neg, opinion_words_by_review, opinion_sentiments)``.
    """
    features = set()
    features_count = defaultdict(int)
    opinions = positive_lexicon.union(negative_lexicon)
    opinions_count = defaultdict(int)

    # NOTE(review): raw_sentences comes from sent_tokenize while
    # parsed_sentences / review_indices come from the CoreNLP parse (and get no
    # entries for a review whose parse raises ValueError). Nothing here forces
    # the two to have the same length or order - confirm before indexing one
    # by positions of the other.
    raw_sentences = []
    parsed_sentences = []
    review_indices = []
    review_info = {} # store info about deps on per review basis
    for review_number, review in enumerate(reviews):
        if review_number % 500 == 0:
            print(review_number)
        OF_dict = {}
        FO_dict = {}
        OO_dict = defaultdict(list)
        FF_dict = defaultdict(list)

        raw_sentences.extend(sent_tokenize(review))
        parse = nlp.parse_text(review)
        try:
            for sentence in parse:
                # extract relevant dependency information
                extracted_sentence = extract_relevant_dependencies(sentence, FO_dict, OF_dict, FF_dict, OO_dict, features_count, opinions_count)

                review_indices.append(review_number)
                parsed_sentences.append(extracted_sentence)
        except ValueError:
            # probably not English
            pass

        review_info[review_number] = { 'index' : review_number,
                                       'OF_dict' : OF_dict,
                                       'FO_dict' : FO_dict,
                                       'OO_dict' : OO_dict,
                                       'FF_dict' : FF_dict,
                                       'cumulative_polarity' : 0 }

    # instantiate cumulative data structures
    feature_sentiments_by_review = defaultdict(dict) # same sentiment for target words within review (this is an assumption [Observation 1])
    feature_sentiments_cumulative = defaultdict(int)
    feature_sentiments_pos = defaultdict(list) # keep track of the review indices that contributed positive sentiments toward each feature
    feature_sentiments_neg = defaultdict(list) # keep track of the review indices that contributed negative sentiments toward each feature
    feature_words_by_review = defaultdict(set) # keep track of the feature words in each review
    opinion_words_by_review = defaultdict(set) # keep track of the opinion words in each review
    # same sentiment for opinion words throughout the corpus (this is an assumption [Observation 2])
    opinion_sentiments = {op: (1 if op in positive_lexicon else -1) for op in opinions}

    iteration = 0
    while True:
        print("DP Iteration: {}".format(iteration))
        iteration += 1

        # double propagation step
        new_features, new_opinions = double_propagation_iterate(review_info,
                                                                features,
                                                                feature_words_by_review,
                                                                feature_sentiments_by_review,
                                                                feature_sentiments_cumulative,
                                                                feature_sentiments_pos,
                                                                feature_sentiments_neg,
                                                                opinions,
                                                                opinion_words_by_review,
                                                                opinion_sentiments)

        features = features.union(new_features)
        opinions = opinions.union(new_opinions)
        # Fixed point reached: the pass found nothing new.
        if not new_opinions and not new_features:
            break

    return (features,
            features_count,
            opinions,
            opinions_count,
            raw_sentences,
            parsed_sentences,
            review_indices,
            feature_sentiments_by_review,
            feature_words_by_review,
            feature_sentiments_cumulative,
            feature_sentiments_pos,
            feature_sentiments_neg,
            opinion_words_by_review,
            opinion_sentiments)


In [9]:
# Product quality clustering
import pickle
from collections import defaultdict

CLASS_FILE = "./clustering/results/clean-classes.pkl"
# Maps a feature word to the number of the class it belongs to.
# NOTE: unpickling can execute arbitrary code, so CLASS_FILE must come from a
# trusted source. The `with` block closes the file once it has been read.
with open(CLASS_FILE, 'rb') as class_file:
    feature_to_class = pickle.load(class_file)

def get_sorted_classes(features_by_count):
    """Group frequent features by class and rank the classes by total count.

    Features with a count below 5 are ignored. Features missing from the
    module-level ``feature_to_class`` mapping are skipped and reported in a
    printed message.

    Args:
        features_by_count: iterable of ``(feature, count)`` pairs.

    Returns:
        list: One ``[feature list, total count]`` entry per class, ordered by
        total count, largest first.
    """
    grouped = {}  # class number -> [feature list, total count]
    unclustered = set()

    for name, occurrences in features_by_count:
        if occurrences < 5:
            continue
        if name not in feature_to_class:
            unclustered.add(name)
            continue
        entry = grouped.setdefault(feature_to_class[name], [[], 0])
        entry[0].append(name)
        entry[1] += occurrences

    ranked = list(grouped.values())
    ranked.sort(key=lambda entry: entry[1], reverse=True)
    print("Not found in any cluster: " + str(unclustered))
    return ranked


In [10]:
def process_asin(asin):
    """Run the whole extraction pipeline for one product.

    Fetches the product's reviews, extracts features / opinions / sentiments
    from them and groups the frequent features into ranked classes.

    Args:
        asin: Product identifier.

    Returns:
        dict: ``'asin'`` plus one entry per value returned by
        ``extract_features_opinions`` (same order), followed by
        ``'features_by_count'`` (``(feature, count)`` pairs, most frequent
        first) and ``'sorted_classes'``.

    Raises:
        ValueError: if ``extract_features_opinions`` does not return exactly
            one value per expected field.
    """
    # The class mapping is not reloaded here: get_sorted_classes reads the
    # module-level feature_to_class, so the previous per-call pickle load only
    # filled an unused local and left its file handle open.
    print(asin)
    product_reviews = get_all_reviews(asin)

    field_names = ('features',
                   'features_count',
                   'opinions',
                   'opinions_count',
                   'raw_sentences',
                   'parsed_sentences',
                   'review_indices',
                   'feature_sentiments_by_review',
                   'feature_words_by_review',
                   'feature_sentiments_cumulative',
                   'feature_sentiments_pos',
                   'feature_sentiments_neg',
                   'opinion_words_by_review',
                   'opinion_sentiments')
    field_values = tuple(extract_features_opinions(product_reviews))
    # zip() would silently truncate, so keep the failure explicit.
    if len(field_values) != len(field_names):
        raise ValueError("expected {} values from extract_features_opinions, got {}".format(
            len(field_names), len(field_values)))

    product_info = {'asin': asin}
    product_info.update(zip(field_names, field_values))

    features_by_count = sorted(product_info['features_count'].items(), key=lambda x: x[1], reverse=True)
    product_info['features_by_count'] = features_by_count

    product_info['sorted_classes'] = get_sorted_classes(features_by_count)

    return product_info


In [12]:
# import nltk
# nltk.download("punkt")

# Run the pipeline for a single product: the entry at position 1000 of the
# review-count ranking in `asins`.
asins_to_process = [asins[1000]]
# ASIN -> the product_info dict returned by process_asin.
product_infos = {}

for asin,_ in asins_to_process:
    product_infos[asin] = process_asin(asin)


B000CRFOMK
# reviews: 770
0
500
DP Iteration: 0
DP Iteration: 1
DP Iteration: 2
DP Iteration: 3
DP Iteration: 4
DP Iteration: 5
DP Iteration: 6
DP Iteration: 7
DP Iteration: 8
Not found in any cluster: {'solution', 'something', 'anyone', 'issue', 'work', 'everything', 'side', 'rating', 'job', 'nothing', 'idea', 'use', 'thing', 'product', 'equipment', 'duty', 'everyone', 'lot', 'problem', 'purpose'}


In [None]:
import pathlib, pickle, time

# Persist every product's results as its own pickle file under OUTPUT_DIR.
OUTPUT_DIR = 'output/product_info'
pathlib.Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)

# The loop variable names are kept as-is: a later cell calls
# print_detailed_summary(product_info) and relies on the value left behind here.
for asin, product_info in product_infos.items():
    pickle_path = '{}/{}.pkl'.format(OUTPUT_DIR, asin)
    with open(pickle_path, 'wb') as handle:
        pickle.dump(product_info, handle, protocol=pickle.HIGHEST_PROTOCOL)


In [None]:
def print_detailed_summary(product_info):
    """Print every sentence of a product, grouped by review, with its feature links.

    For each parsed sentence the raw text is printed, followed by one line per
    ``nsubj`` / ``amod`` dependency that involves a known feature, together
    with that feature's sentiment in the review.

    Args:
        product_info: dict providing ``'features'``, ``'parsed_sentences'``,
            ``'review_indices'``, ``'raw_sentences'`` and
            ``'feature_sentiments_by_review'``. ``raw_sentences`` and
            ``review_indices`` are indexed by the position of each entry of
            ``parsed_sentences``.
    """
    features = product_info['features']
    parsed_sentences = product_info['parsed_sentences']
    review_indices = product_info['review_indices']
    raw_sentences = product_info['raw_sentences']
    feature_sentiments_by_review = product_info['feature_sentiments_by_review']

    sentence_in_review = 0
    for i, sentence in enumerate(parsed_sentences):
        review_index = review_indices[i]
        # The first sentence always opens a review. Comparing it with
        # review_indices[i-1] would wrap around to the last element and skip
        # the header whenever the first and last sentence share a review.
        if i == 0 or review_index != review_indices[i - 1]:
            sentence_in_review = 0
            print("==========================================\n\nReview #{}".format(review_index))
        else:
            sentence_in_review += 1
        print("\n\tSentence #{}".format(sentence_in_review))
        print("\t{}".format(raw_sentences[i]))
        for (gov, gov_pos), dependency, (dep, dep_pos) in sentence:
            if not gov.isalpha() or not dep.isalpha():
                continue
            gov = gov.lower()
            dep = dep.lower()
            if dependency == "nsubj" and dep in features:
                print("\t\tnsubj: {} -> {}, {}".format(gov, dep, feature_sentiments_by_review[review_index][dep]))
            elif dependency == "amod" and gov in features:
                print("\t\tamod: {} -> {}, {}".format(dep, gov, feature_sentiments_by_review[review_index][gov]))


In [None]:
# NOTE(review): `product_info` is not assigned in this cell. In the visible
# notebook its only module-level binding is the loop variable of the pickling
# loop, so this prints whichever product that loop visited last.
print_detailed_summary(product_info)

In [None]:
import pickle

# Replace `product_infos` with the contents of a previously saved pickle.
# NOTE(review): the saving cell visible in this notebook writes one file per
# ASIN under output/product_info/; nothing visible here writes
# 'product_infos.pkl' - confirm where this file comes from. Unpickling can run
# arbitrary code, so only load a file you trust.
with open('product_infos.pkl', 'rb') as handle:
    product_infos = pickle.load(handle)


In [77]:
# TODO: ADD TO DOUBLE_PROP.PY

from collections import defaultdict

# quality cluster table
def get_quality_clusters(asin):
    """Build the quality-cluster rows for one product.

    The product's features are grouped by the class assigned to them in
    ``./clustering/results/classes.pkl`` (features without a class are
    ignored). Clusters are numbered in the order their first feature appears
    when features are sorted by count, most frequent first.

    Side effect: stores ``'cluster_sentiments'``, ``'class_of_cluster'`` and
    ``'clusters'`` in ``product_infos[asin]``.

    Args:
        asin: Product identifier, a key of the module-level ``product_infos``.

    Returns:
        list: One tuple ``(asin, class_id, cluster_id, cluster_features,
        num_positive, num_negative)`` per cluster.
    """
    product_info = product_infos[asin]

    # NOTE: unpickling can execute arbitrary code; the file must be trusted.
    # The `with` block closes the handle that was previously left open.
    with open("./clustering/results/classes.pkl", "rb") as class_file:
        feature_to_class = pickle.load(class_file)

    clusters_dict = defaultdict(list)  # class id -> features of this product
    features_by_count = sorted(product_info['features_count'].items(), key=lambda x: x[1], reverse=True)
    for feature, _ in features_by_count:
        try:
            class_of_feature = feature_to_class[feature]
        except KeyError:
            # feature was never assigned to a class
            continue
        clusters_dict[class_of_feature].append(feature)

    clusters = list(clusters_dict.values())
    class_of_cluster = {}
    cluster_sentiments = {}  # cluster id -> [num_positive, num_negative]
    quality_clusters = []
    for cluster_id, (class_id, cluster_features) in enumerate(clusters_dict.items()):
        num_positive = sum(len(product_info['feature_sentiments_pos'][feature]) for feature in cluster_features)
        num_negative = sum(len(product_info['feature_sentiments_neg'][feature]) for feature in cluster_features)
        class_of_cluster[cluster_id] = class_id
        cluster_sentiments[cluster_id] = [num_positive, num_negative]
        quality_clusters.append((asin, class_id, cluster_id, cluster_features, num_positive, num_negative))

    product_info['cluster_sentiments'] = cluster_sentiments
    product_info['class_of_cluster'] = class_of_cluster
    product_info['clusters'] = clusters
    return quality_clusters


# product-quality relationship table
def get_product_quality_relationships(asin):
    """Build the product-quality relationship rows for one product.

    Reads the ``'clusters'`` and ``'class_of_cluster'`` entries that
    ``get_quality_clusters`` stores in ``product_infos[asin]``, so that
    function has to run first for the same ASIN.

    Args:
        asin: Product identifier, a key of the module-level ``product_infos``.

    Returns:
        list: One tuple ``(asin, quality, quality_cluster_id,
        quality_class_id, num_positive, num_negative)`` per feature, where the
        two counts are the lengths of that feature's positive / negative
        review-index lists.
    """
    product_info = product_infos[asin]
    clusters = product_info['clusters']
    class_of_cluster = product_info['class_of_cluster']
    positive_reviews = product_info['feature_sentiments_pos']
    negative_reviews = product_info['feature_sentiments_neg']

    # The per-cluster totals the original computed here were overwritten by
    # the per-feature counts before ever being used, so they are not computed.
    return [
        (asin,
         feature,
         cluster_id,
         class_of_cluster[cluster_id],
         len(positive_reviews[feature]),
         len(negative_reviews[feature]))
        for cluster_id, cluster in enumerate(clusters)
        for feature in cluster
    ]


# product quality class table
def get_class_table(feature_to_class):
    """Invert a feature -> class mapping into a class table.

    Args:
        feature_to_class: dict mapping each feature to its class id.

    Returns:
        list: ``(class id, [features])`` pairs sorted by class id; features
        keep the order in which they appear in ``feature_to_class``.
    """
    classes = defaultdict(list)
    for feature, class_ in feature_to_class.items():
        classes[class_].append(feature)
    return sorted(classes.items(), key=lambda item: item[0])

# Class table built from the module-level `feature_to_class`.
# NOTE(review): that mapping is loaded from CLASS_FILE (clean-classes.pkl),
# whereas get_quality_clusters loads ./clustering/results/classes.pkl - confirm
# that class ids from the two files are meant to be comparable.
product_quality_class_table_columns = ['id', 'quality_list']
product_quality_class_table = get_class_table(feature_to_class)

product_quality_relationship_table = []
quality_clusters_table = []

# Only the product at position 35 of `asins` is processed here.
# get_quality_clusters must run before get_product_quality_relationships,
# which reads the entries it stores in product_infos[asin].
for asin, _ in [asins[35]]:
    quality_clusters = get_quality_clusters(asin)
    quality_clusters_table.extend(quality_clusters)
    
    product_quality_relationships = get_product_quality_relationships(asin)
    product_quality_relationship_table.extend(product_quality_relationships)
quality_clusters_table_columns = ['asin', 'class_id', 'quality_cluster_id', 'quality_list', 'num_positive', 'num_negative']
# Rank rows by num_positive + num_negative (tuple positions 4 and 5), largest first.
quality_clusters_table = sorted(quality_clusters_table, key=lambda x: x[4]+x[5], reverse=True)

product_quality_relationship_table_columns = ['asin', 'quality', 'quality_cluster_id', 'quality_class_id', 'num_positive', 'num_negative']
# Same ranking for the per-feature rows.
product_quality_relationship_table = sorted(product_quality_relationship_table, key=lambda x: x[4]+x[5], reverse=True)


In [78]:
# First five rows of the ranked table; columns are listed in
# product_quality_relationship_table_columns.
product_quality_relationship_table[:5]

[('B003DZ165W', 'cover', 169, 208, 7405, 484),
 ('B003DZ165W', 'light', 340, 433, 7089, 363),
 ('B003DZ165W', 'kindle', 169, 208, 2325, 223),
 ('B003DZ165W', 'product', 226, 288, 2283, 86),
 ('B003DZ165W', 'case', 169, 208, 2190, 175)]

In [79]:
# First ten rows of the ranked table; columns are listed in
# quality_clusters_table_columns.
quality_clusters_table[:10]


[('B003DZ165W',
  208,
  169,
  ['cover',
   'kindle',
   'case',
   'leather',
   'protection',
   'tab',
   'bulk',
   'closure',
   'flap',
   'clasp',
   'spine',
   'ipad',
   'fire',
   'tablet',
   'folio',
   'closing',
   'kindel',
   'portfolio',
   'glove',
   'snug',
   'paperwhite'],
  14162,
  964),
 ('B003DZ165W',
  433,
  340,
  ['light',
   'lighting',
   'night',
   'illumination',
   'darkness',
   'brightness',
   'intensity',
   'booklight',
   'dark',
   'dim',
   'flashlight',
   'shine',
   'backlight',
   'spotlight',
   'shone',
   'dimmer'],
  8482,
  434),
 ('B003DZ165W',
  288,
  226,
  ['product',
   'item',
   'purchase',
   'shipping',
   'delivery',
   'seller',
   'order',
   'manner',
   'company',
   'transaction',
   'shipment',
   'merchant',
   'fashion',
   'promptness',
   'vendor',
   'merchandise'],
  3084,
  155),
 ('B003DZ165W',
  102,
  79,
  ['corner',
   'side',
   'right',
   'left',
   'spot',
   'bottom',
   'front',
   'top',
   'rest

In [None]:
def print_summary(asin, product_info):
    """Print sentiment counts per feature and the sentences behind the worst one.

    Prints the 15 features with the most positive and the 15 with the most
    negative review entries. For the feature with the most negative entries
    it then prints, for every review where that feature's sentiment is
    negative, the review number, the sentiment, and each of the review's
    sentences that contains the feature as a whitespace-separated token.

    Nothing is printed when ``product_info`` contains an ``'error'`` key.

    Args:
        asin: Product identifier, used only for the heading.
        product_info: dict providing ``'feature_sentiments_pos'``,
            ``'feature_sentiments_neg'``, ``'feature_words_by_review'``,
            ``'feature_sentiments_by_review'``, ``'review_indices'`` and
            ``'raw_sentences'``.
    """
    if 'error' in product_info:
        return

    print('ASIN: {}'.format(asin))

    print('# of Positive Sentiments')
    feature_sentiments_pos_sorted = sorted([(k,len(v)) for k,v in product_info['feature_sentiments_pos'].items()], key=lambda x: x[1], reverse=True)
    pprint(feature_sentiments_pos_sorted[:15])

    print('# of Negative Sentiments')
    feature_sentiments_neg_sorted = sorted([(k,len(v)) for k,v in product_info['feature_sentiments_neg'].items()], key=lambda x: x[1], reverse=True)
    pprint(feature_sentiments_neg_sorted[:15])

    # Without any negative entry there is no feature to drill into; indexing
    # the empty list would raise IndexError.
    if not feature_sentiments_neg_sorted:
        return

    feature = feature_sentiments_neg_sorted[0][0]
    print('FEATURE: {}'.format(feature))
    sentiments_by_review = product_info['feature_sentiments_by_review']
    for review_num, words in product_info['feature_words_by_review'].items():
        if feature not in words:
            continue
        polarity = sentiments_by_review[review_num][feature]
        if polarity >= 0:
            continue
        print(review_num, polarity)
        for sentence_num, review_num_ in enumerate(product_info['review_indices']):
            if review_num == review_num_:
                sentence = product_info['raw_sentences'][sentence_num]
                # Tokens keep attached punctuation, so "battery." does not match "battery".
                if feature in [w.lower() for w in sentence.split()]:
                    print(sentence)

#                print([product_info['raw_sentences'][sentence_num] \
#                       for sentence_num,review_num in enumerate(product_info['review_indices']) \
#                       if feature in product_info['feature_words_by_review'][review_num] ])                    


In [None]:
#product_infos['B00DR0PDNE']['opinion_words_by_review']


('B003DZ165W', 4567)

In [None]:
# possible statistics
# # pos sentiments vs # neg sentiments

In [92]:
# TODO: ADD TO DOUBLE_PROP.PY

# snippet table

from nltk import word_tokenize
# NOTE(review): get_snippet_table takes `asin` as a parameter and a later cell
# reassigns this module-level name before using it, so this value looks unused.
asin = 'B000LRMS66'
def get_snippet_table(asin, k):
    """Collect the sentences of a product that mention selected features.

    Args:
        asin: Product identifier, a key of the module-level ``product_infos``.
        k: Either an ``int`` (use the ``k`` most frequent features of the
            product) or a list / tuple / set of feature words to look for.

    Returns:
        list: One tuple ``(asin, word, review_id, sentence_id, sentence,
        polarity)`` per matching token, in sentence order. ``polarity`` is the
        sentiment stored for the word in that review, or 0 when none is stored.

    Raises:
        TypeError: if ``k`` is neither an int nor a list / tuple / set.
    """
    product_info = product_infos[asin]
    raw_sentences = product_info['raw_sentences']
    review_indices = product_info['review_indices']
    sentiments_by_review = product_info['feature_sentiments_by_review']

    # Resolve the wanted words once, before scanning. Slicing with a list `k`
    # used to raise TypeError before the list branch could ever run.
    if isinstance(k, (list, tuple, set, frozenset)):
        wanted = set(k)
    elif isinstance(k, int):
        ranked = sorted(product_info['features_count'].items(), key=lambda x: x[1], reverse=True)
        wanted = {feature for feature, _ in ranked[:k]}
    else:
        raise TypeError("k must be an int or a collection of feature words")

    snippets = []
    # NOTE(review): this pairs raw_sentences with review_indices by position;
    # confirm the two lists are aligned for the data being used.
    for sentence_id, (sentence, review_id) in enumerate(zip(raw_sentences, review_indices)):
        for word in word_tokenize(sentence):
            word = word.lower()
            if word in wanted:
                # .get() avoids inserting empty entries into the stored
                # default dictionaries while reading.
                polarity = sentiments_by_review.get(review_id, {}).get(word, 0)
                snippets.append((asin, word, review_id, sentence_id, sentence, polarity))

    # Return only after every sentence has been scanned; the return statement
    # previously sat inside the sentence loop and ended it after one sentence.
    return snippets

# TODO: get_positive/negative_snippets
#pprint([(a,b) for a,b in {k:list(filter(lambda w: (product_infos['B00DR0PDNE']['opinion_sentiments'][w] < 0),v)) for k,v in product_infos['B00DR0PDNE']['opinion_words_by_review'].items()}.items() if len(b)>0])



In [99]:
# Snippets for the 10 most frequent features of the product at position 35 of
# the ranking.
asin = asins[35][0]
snippets = get_snippet_table(asin, 10)
# Split by polarity (field 5 of each snippet tuple); snippets with polarity 0
# end up in neither list.
positive_snippets = [snippet for snippet in snippets if snippet[5] > 0]
negative_snippets = [snippet for snippet in snippets if snippet[5] < 0]


In [101]:
# First twenty (asin, word, review_id, sentence_id, sentence, polarity) tuples.
snippets[:20]

[('B003DZ165W', 'kindle', 0, 0, 'The lighted Kindle cover works great.', 0),
 ('B003DZ165W', 'cover', 0, 0, 'The lighted Kindle cover works great.', 1),
 ('B003DZ165W',
  'kindle',
  0,
  1,
  'Protects my Kindle and allows me to read in the dark without a bulky book light.',
  0),
 ('B003DZ165W',
  'light',
  0,
  1,
  'Protects my Kindle and allows me to read in the dark without a bulky book light.',
  -1),
 ('B003DZ165W',
  'leather',
  1,
  3,
  'I purchased a black leather kindle cover with a built in light.',
  0),
 ('B003DZ165W',
  'kindle',
  1,
  3,
  'I purchased a black leather kindle cover with a built in light.',
  0),
 ('B003DZ165W',
  'cover',
  1,
  3,
  'I purchased a black leather kindle cover with a built in light.',
  1),
 ('B003DZ165W',
  'light',
  1,
  3,
  'I purchased a black leather kindle cover with a built in light.',
  0),
 ('B003DZ165W', 'cover', 2, 5, 'This is a really nice cover.', 1),
 ('B003DZ165W',
  'light',
  2,
  7,
  'The light is perfect - I use 