In [1]:
import mysql.connector
from pprint import pprint

config = {
    "user": "root",
    "password": "root",
    "host": "127.0.0.1",
    "database": "senior_design"
}
connection = mysql.connector.connect(**config)
cursor = connection.cursor(buffered=True)

In [2]:
query = "SELECT distinct asin, COUNT(asin) AS count FROM review GROUP BY asin ORDER BY count DESC"
cursor.execute(query)
asins = []
for asin, count in cursor:
    asins.append((asin,count))


KeyboardInterrupt: 

In [None]:
pprint(asins[:20])

In [2]:
def get_all_reviews(asin):
    query = "SELECT review_text FROM review WHERE asin = '{}'".format(asin)
    cursor.execute(query)
    reviews = []
    for (review_text) in cursor:
        reviews.append(review_text[0])
    print("# reviews: {}".format(len(reviews)))
    return reviews


In [7]:
reviews = get_all_reviews("B00004ZC8Y")


# reviews: 720


In [None]:
print(reviews[0])

In [4]:
positive_lexicon = {"good", "great", "better", "excellent", "best", "easy", "nice", "simple", "clear", "strong", 
                    "perfect", "comfortable", "friendly", "solid", "precise", "awesome", "amazing", "bright", "vibrant",
                    "fantastic", "vibrant", "realistic", "stunning", "superior", "super", "rich", "exceptional",
                    "impressive", "ideal"}
negative_lexicon = {"poor", "old", "bad", "weak", "annoying", "defective", "horrible", "buggy", "worst", "mediocre",
                    "difficult", "unstable", "inferior", "lousy", "complicated", "useless", "unreliable", "sloppy",
                    "strange", "weird", "malfunctioning", "miserable", "terrible", "misleading"}

In [3]:
from nltk.parse.corenlp import CoreNLPDependencyParser
# Start the CoreNLP server with:
# java -mx4g -cp "./CoreNLP/*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 15000
#     (on my Mac, java8 bin located at /Library/Internet\ Plug-Ins/JavaAppletPlugin.plugin/Contents/Home/bin/java)
nlp = CoreNLPDependencyParser(url="http://localhost:9000")

In [5]:
from collections import defaultdict
from nltk.tokenize import sent_tokenize

# input: parsed_sentence, cumulative information dictionaries (FO_dict, OF_dict, FF_dict, OO_dict, features_count, opinions_count)
# output: extracted dependency features
def extract_relevant_dependencies(parsed_sentence, FO_dict, OF_dict, FF_dict, OO_dict, features_count, opinions_count):
    extracted_sentence = []
    for (gov, gov_pos), dependency, (dep, dep_pos) in parsed_sentence.triples():
        if not gov.isalpha() or not dep.isalpha():
            continue
        gov = gov.lower()
        dep = dep.lower()
        if dependency == "nsubj" and dep_pos == "NN":
            OF_dict[gov] = dep
            FO_dict[dep] = gov
            features_count[dep] += 1
            opinions_count[gov] += 1
        elif dependency == "amod" and gov_pos == "NN":
            OF_dict[dep] = gov
            FO_dict[gov] = dep
            opinions_count[dep] += 1
            features_count[gov] += 1
        elif dependency == "conj":
            if gov_pos == "JJ" and dep_pos == "JJ":
                OO_dict[gov].append(dep)
                OO_dict[dep].append(gov)
                opinions_count[gov] += 1
                opinions_count[dep] += 1
            elif gov_pos == "NN" and dep_pos == "NN":
                FF_dict[gov].append(dep)
                FF_dict[dep].append(gov)
                features_count[gov] += 1
                features_count[dep] += 1
        extracted_sentence.append(((gov, gov_pos), dependency, (dep, dep_pos)))
    #parsed_sentences.append(extracted_sentence)
    return extracted_sentence


# input: all_review_info, cumulative information dictionaries
# output: new_features, new_opinions
def double_propagation_iterate(all_review_info,
                               features,
                               feature_words_by_review,
                               feature_sentiments_by_review,
                               feature_sentiments_cumulative,
                               feature_sentiments_pos,
                               feature_sentiments_neg,
                               opinions,
                               opinion_words_by_review,
                               opinion_sentiments):
    new_opinions = set()
    new_features = set()

    for index, info in all_review_info.items():
        feature_sentiments_by_review[index] = defaultdict(int)

        for opinion, feature in info['OF_dict'].items():
            if opinion in opinions:
                if feature not in features:
                    new_features.add(feature)

                if feature not in feature_words_by_review[index]:
                    feature_words_by_review[index].add(feature)

                    # target takes polarity of modifying opinion word
                    feature_sentiments_by_review[index][feature] = opinion_sentiments[opinion]
                    # add to target's cumulative sentiment score
                    feature_sentiments_cumulative[feature] += opinion_sentiments[opinion]
                    if opinion_sentiments[opinion] > 0:
                        feature_sentiments_pos[feature].append(info['index'])
                    elif opinion_sentiments[opinion] < 0:
                        feature_sentiments_neg[feature].append(info['index'])

                # have we seen this opinion word in this review?
                if opinion not in opinion_words_by_review[index]:
                    opinion_words_by_review[index].add(opinion)
                    info['cumulative_polarity'] += opinion_sentiments[opinion]

        for opinion1, related in info['OO_dict'].items():
            if opinion1 in opinions:
                for opinion in related:
                    if opinion not in opinions:
                        new_opinions.add(opinion)
                        opinion_sentiments[opinion] = opinion_sentiments[opinion1]

                    # have we seen this opinion word in this review?
                    if opinion not in opinion_words_by_review[index]:
                        opinion_words_by_review[index].add(opinion)
                        info['cumulative_polarity'] += opinion_sentiments[opinion]

                # have we seen this opinion word in this review?
                if opinion1 not in opinion_words_by_review[index]:
                    opinion_words_by_review[index].add(opinion1)
                    info['cumulative_polarity'] += opinion_sentiments[opinion1]

        for feature, opinion in info['FO_dict'].items():
            if feature in features:
                if opinion not in opinions:
                    new_opinions.add(opinion)

                    # if target has sentiment in current review
                    if feature in feature_words_by_review[index]:
                        # then opinion takes polarity of target (Homogenous Rule)
                        opinion_sentiments[opinion] = feature_sentiments_by_review[index][feature]
                    else:
                        # else target is from another review
                        # opinion takes cumulative sentiment of entire review (Intra-review Rule)
                        try:
                            cumulative_polarity = int(info['cumulative_polarity'] / abs(info['cumulative_polarity']))
                        except ZeroDivisionError:
                            cumulative_polarity = 0
                        opinion_sentiments[opinion] = cumulative_polarity

                        # also apply that polarity to the feature (should we do this?)
                        feature_words_by_review[index].add(feature)
                        feature_sentiments_by_review[index][feature] = opinion_sentiments[opinion]
                        # add to target's cumulative sentiment
                        feature_sentiments_cumulative[feature] += opinion_sentiments[opinion]
                        if opinion_sentiments[opinion] > 0:
                            feature_sentiments_pos[feature].append(info['index'])
                        elif opinion_sentiments[opinion] < 0:
                            feature_sentiments_neg[feature].append(info['index'])

                if opinion not in opinion_words_by_review[index]:
                    opinion_words_by_review[index].add(opinion)
                    info['cumulative_polarity'] += opinion_sentiments[opinion]

                if feature not in feature_sentiments_by_review[index]:
                    feature_words_by_review[index].add(feature)
                    feature_sentiments_by_review[index][feature] = opinion_sentiments[opinion]
                    # add to target's cumulative sentiment
                    feature_sentiments_cumulative[feature] += opinion_sentiments[opinion]
                    if opinion_sentiments[opinion] > 0:
                        feature_sentiments_pos[feature].append(info['index'])
                    elif opinion_sentiments[opinion] < 0:
                        feature_sentiments_neg[feature].append(info['index'])

        for feature1, related in info['FF_dict'].items():
            if feature1 in features and feature1 in feature_sentiments_by_review[index]:
                for feature in related:
                    if feature not in features:
                        new_features.add(feature)

                    # have we seen this target word in this review?
                    if feature not in feature_words_by_review[index]:
                        feature_words_by_review[index].add(feature)

                        # Homogenous Rule
                        feature_sentiments_by_review[index][feature] = feature_sentiments_by_review[index][feature1]
                        feature_sentiments_cumulative[feature] += feature_sentiments_by_review[index][feature]
                        if feature_sentiments_by_review[index][feature] > 0:
                            feature_sentiments_pos[feature].append(info['index'])
                        elif feature_sentiments_by_review[index][feature] < 0:
                            feature_sentiments_neg[feature].append(info['index'])
    return new_features, new_opinions


# input: list of review texts
# output: all features, expanded opinion lexicon
def extract_features_opinions(reviews):
    features = set()
    features_count = defaultdict(int)
    opinions = positive_lexicon.union(negative_lexicon)
    opinions_count = defaultdict(int)
    
    raw_sentences = []
    parsed_sentences = []
    parses = []
    review_indices = []
    review_info = {} # store info about deps on per review basis
    for i, review in enumerate(reviews):
        if i % 500 == 0:
            print(i)
        OF_dict = {}
        FO_dict = {}
        OO_dict = defaultdict(list)
        FF_dict = defaultdict(list)
        
        raw_sentences.extend(sent_tokenize(review))
        parse = nlp.parse_text(review)
        parses.append(parse)
        for sentence in parse:
            # extract relevant dependency information
            extracted_sentence = extract_relevant_dependencies(sentence, FO_dict, OF_dict, FF_dict, OO_dict, features_count, opinions_count)
            
            review_indices.append(i)
            parsed_sentences.append(extracted_sentence)

        review_info[i] = { 'index' : i,
                           'OF_dict' : OF_dict,
                           'FO_dict' : FO_dict,
                           'OO_dict' : OO_dict,
                           'FF_dict' : FF_dict,
                           'cumulative_polarity' : 0 }

    # instantiate cumulative data structures
    i = 0
    feature_sentiments_by_review = defaultdict(dict) # same sentiment for target words within review (this is an assumption [Observation 1])
    feature_sentiments_cumulative = defaultdict(int)
    feature_sentiments_pos = defaultdict(list) # keep track of the review indices that contributed negative sentiments toward each feature
    feature_sentiments_neg = defaultdict(list) # keep track of the review indices that contributed negative sentiments toward each feature
    feature_words_by_review = defaultdict(set) # keep track of the feature words in each review
    opinion_words_by_review = defaultdict(set) # keep track of the opinion words in each review
    opinion_sentiments = {} # same sentiment for opinion words throughout the corpus (this is an assumption [Observation 2])
    opinion_sentiments.update({op:(1 if op in positive_lexicon else -1) for op in opinions})

    while (True):
        print("DP Iteration: {}".format(i))
        i += 1

        # double propagation step
        new_features, \
        new_opinions = double_propagation_iterate(review_info,
                                                  features,
                                                  feature_words_by_review,
                                                  feature_sentiments_by_review,
                                                  feature_sentiments_cumulative,
                                                  feature_sentiments_pos,
                                                  feature_sentiments_neg,
                                                  opinions,
                                                  opinion_words_by_review,
                                                  opinion_sentiments)
        
        features = features.union(new_features)
        opinions = opinions.union(new_opinions)
        if len(new_opinions) == 0 and len(new_features) == 0:
            break

    res = (features,
           features_count,
           opinions,
           opinions_count,
           raw_sentences,
           parsed_sentences,
           review_indices,
           feature_sentiments_by_review,
           feature_words_by_review,
           feature_sentiments_cumulative,
           feature_sentiments_pos,
           feature_sentiments_neg,
           opinion_words_by_review,
           opinion_sentiments)
    return res


In [6]:
# Product quality clustering
import pickle
from collections import defaultdict

feature_to_class = pickle.load(open("./clustering/results/clean-classes.pkl", "rb"))

def get_sorted_classes(features_by_count):
    frequent_features = [(f, cnt) for f, cnt in features_by_count if cnt >= 5]
    features_by_class = dict()
    not_found = set()

    # features_by_class is a dict from class number -> [feature list, total count]
    for feature, cnt in frequent_features:
        if feature not in feature_to_class:
            not_found.add(feature)
            continue
        class_num = feature_to_class[feature]
        if class_num not in features_by_class:
            features_by_class[class_num] = [[], 0]
        features_by_class[class_num][0].append(feature)
        features_by_class[class_num][1] += cnt

    sorted_classes = sorted(features_by_class.values(), key=lambda x: x[1], reverse=True)
    print("Not found in any cluster: " + str(not_found))
    return sorted_classes

In [8]:
# import nltk
# nltk.download("punkt")

features, \
features_count, \
opinions, \
opinions_count, \
raw_sentences, \
parsed_sentences, \
review_indices, \
feature_sentiments_by_review, \
feature_words_by_review, \
feature_sentiments_cumulative, \
feature_sentiments_pos, \
feature_sentiments_neg, \
opinion_words_by_review, \
opinion_sentiments = extract_features_opinions(reviews)


0
500
DP Iteration: 0
DP Iteration: 1
DP Iteration: 2
DP Iteration: 3
DP Iteration: 4
DP Iteration: 5
DP Iteration: 6
DP Iteration: 7


In [9]:
print("Feature words, occurrences:")
features_by_count = sorted(features_count.items(), key=lambda x:x[1], reverse=True)
pprint(features_by_count)


Feature words, occurrences:
[('filter', 344),
 ('polarizer', 171),
 ('lens', 91),
 ('quality', 77),
 ('price', 73),
 ('product', 57),
 ('sky', 52),
 ('glass', 40),
 ('effect', 38),
 ('job', 35),
 ('time', 28),
 ('light', 27),
 ('ring', 26),
 ('glare', 24),
 ('fit', 22),
 ('day', 21),
 ('value', 20),
 ('camera', 20),
 ('water', 19),
 ('difference', 18),
 ('contrast', 17),
 ('thing', 17),
 ('sun', 17),
 ('way', 16),
 ('problem', 15),
 ('brand', 15),
 ('part', 15),
 ('item', 15),
 ('color', 15),
 ('size', 14),
 ('photography', 13),
 ('use', 12),
 ('case', 12),
 ('reflection', 12),
 ('condition', 11),
 ('nothing', 11),
 ('cap', 10),
 ('issue', 10),
 ('box', 9),
 ('addition', 9),
 ('sunlight', 9),
 ('angle', 9),
 ('rotation', 9),
 ('work', 9),
 ('mm', 9),
 ('construction', 9),
 ('look', 8),
 ('saturation', 8),
 ('delivery', 7),
 ('blue', 7),
 ('point', 7),
 ('loss', 7),
 ('experience', 7),
 ('shooting', 7),
 ('piece', 7),
 ('shipping', 7),
 ('challenge', 7),
 ('right', 7),
 ('money', 7),
 (

In [10]:
print("Feature clusters:")
sorted_classes = get_sorted_classes(features_by_count)
pprint(sorted_classes)

Feature clusters:
Not found in any cluster: {'issue', 'clarity', 'problem', 'ger', 'work', 'photography', 'lot', 'equipment', 'difference', 'reason', 'day', 'nothing', 'product', 'right', 'everything', 'disappointment', 'thing', 'side', 'trip', 'result', 'use', 'something', 'job', 'addition'}
[[['filter', 'polarizer', 'tiffen', 'element', 'cpl'], 531],
 [['lens', 'shooting'], 98],
 [['quality', 'value'], 97],
 [['price', 'money', 'buy'], 87],
 [['glare', 'sun', 'reflection', 'sunlight'], 62],
 [['sky'], 52],
 [['effect', 'haze'], 44],
 [['glass'], 40],
 [['contrast', 'saturation', 'point', 'image'], 38],
 [['ring', 'cap'], 36],
 [['item', 'delivery', 'shipping', 'purchase'], 36],
 [['way', 'angle', 'rotation'], 34],
 [['time'], 28],
 [['light'], 27],
 [['fit'], 22],
 [['part', 'piece'], 22],
 [['camera'], 20],
 [['water'], 19],
 [['brand'], 15],
 [['color'], 15],
 [['size'], 14],
 [['box', 'packaging'], 14],
 [['case'], 12],
 [['condition'], 11],
 [['mm'], 9],
 [['construction'], 9],
 

In [12]:
print("Opinion words, occurrences:")
opinions_by_count = sorted(opinions_count.items(), key=lambda x:x[1], reverse=True)
pprint(opinions_by_count)


Opinion words, occurrences:
[('good', 35),
 ('great', 25),
 ('usb', 22),
 ('little', 21),
 ('external', 19),
 ('is', 19),
 ('small', 18),
 ('better', 15),
 ('easy', 15),
 ('nice', 15),
 ('other', 13),
 ('only', 12),
 ('video', 12),
 ('same', 11),
 ('more', 11),
 ('quick', 11),
 ('first', 11),
 ('old', 10),
 ('high', 10),
 ('low', 9),
 ('short', 9),
 ('clear', 9),
 ('mic', 8),
 ('sound', 8),
 ('hard', 8),
 ('excellent', 7),
 ('bad', 7),
 ('sensitive', 7),
 ('last', 7),
 ('wide', 7),
 ('was', 6),
 ('best', 6),
 ('full', 6),
 ('awesome', 6),
 ('creative', 6),
 ('digital', 6),
 ('fine', 6),
 ('new', 5),
 ('available', 5),
 ('poor', 5),
 ('fantastic', 5),
 ('handy', 5),
 ('have', 5),
 ('seems', 5),
 ('internal', 5),
 ('included', 5),
 ('perfect', 5),
 ('next', 4),
 ('extra', 4),
 ('inexpensive', 4),
 ('right', 4),
 ('works', 4),
 ('impressed', 4),
 ('removable', 4),
 ('audio', 4),
 ('big', 4),
 ('plus', 3),
 ('defective', 3),
 ('recording', 3),
 ('lightweight', 3),
 ('entire', 3),
 ('tiny',

In [52]:
import nltk
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import wordnet as wn

def is_sentiment_bearing(adj):
    for ss in wn.synsets(adj):
        if ss.pos() == "a" or ss.pos() == "s":
            return True
    return False

In [54]:
found = set()
not_found = set()

for opinion, _ in opinions_by_count:
    if is_sentiment_bearing(opinion):
        found.add(opinion)
    else:
        not_found.add(opinion)
    
print("NOT FOUND: " + str(not_found))
print("FOUND: " + str(found))

NOT FOUND: {'turn', 'record', 'gimmick', 'camcorder', 'point', 'flimsyindependent', 'option', 'mounting', 'boasted', 'video', 'charging', 'genius', 'stayed', 'supports', 'need', 'suggested', 'did', 'tooo', 'increases', 'beats', 'match', 'functions', 'arm', 'suffers', 'improve', 'sensitivity', 'was', 'added', 'is', 'highly', 'microphone', 'turns', 'test', 'playing', 'update', 'puts', 'unintuitive', 'makes', 'means', 'factor', 'surpasses', 'hold', 'sunlight', 'sucks', 'recording', 'stay', 'try', 'eats', 'sub', 'addition', 'help', 'accommodate', 'suck', 'needs', 'way', 'varies', 'cameras', 'power', 'shooting', 'writing', 'videoweighs', 'exited', 'mega', 'work', 'died', 'has', 'indicates', 'fails', 'smudgeable', 'shoots', 'integrates', 'picked', 'emailed', 'feature', 'intuitivethe', 'takes', 'accepts', 'produces', 'arrived', 'fluctuates', 'do', 'asked', 'user', 'weight', 'drive', 'inches', 'occur', 'boils', 'lasts', 'remains', 'comes', 'me', 'extrarounded', 'areas', 'version', 'say', 'runs

In [None]:
print("Feature cumulative sentiments:")
feature_sentiments_cumulative_sorted = sorted(feature_sentiments_cumulative.items(), key=lambda x: x[1], reverse=True)
feature_sentiments_cumulative_sorted


In [12]:
print("Opinion word sentiments:")
pprint(sorted(opinion_sentiments.items(), key=lambda x:x[1], reverse=False))


Opinion word sentiments:
[('horrible', -1),
 ('poor', -1),
 ('miserable', -1),
 ('unreliable', -1),
 ('complicated', -1),
 ('annoying', -1),
 ('worst', -1),
 ('defective', -1),
 ('weak', -1),
 ('lousy', -1),
 ('difficult', -1),
 ('sloppy', -1),
 ('malfunctioning', -1),
 ('useless', -1),
 ('mediocre', -1),
 ('unstable', -1),
 ('inferior', -1),
 ('old', -1),
 ('buggy', -1),
 ('weird', -1),
 ('bad', -1),
 ('terrible', -1),
 ('strange', -1),
 ('misleading', -1),
 ('narrow', -1),
 ('okay', -1),
 ('superb', -1),
 ('turns', -1),
 ('higher', -1),
 ('lot', -1),
 ('unusable', -1),
 ('intense', -1),
 ('facing', -1),
 ('general', -1),
 ('allowing', -1),
 ('different', 0),
 ('polarizing', 0),
 ('renowned', 0),
 ('decent', 0),
 ('original', 0),
 ('much', 0),
 ('fine', 0),
 ('competitive', 0),
 ('loose', 0),
 ('third', 0),
 ('circular', 0),
 ('rotating', 0),
 ('come', 0),
 ('useful', 0),
 ('continued', 0),
 ('deep', 0),
 ('minimalist', 0),
 ('west', 0),
 ('cheaper', 0),
 ('affordable', 0),
 ('reasona

In [None]:
print("Newly discovered opinion words:")
pprint(opinions.difference(positive_lexicon.union(negative_lexicon)))


In [None]:
compiled_results = {}
for i in review_indices:
    compiled_results[i] = {}
    compiled_results[i]['opinion_words'] = opinion_words_by_review[i]
    compiled_results[i]['feature_sentiments'] = feature_sentiments_by_review[i]
    
pprint(compiled_results)


In [None]:
def process_reviews(features, opinions, parsed_sentences, review_indices, raw_sentences, feature_sentiments_by_review):
    FO_dict = defaultdict(list)
    j = 0
    k = -1
    for i, sentence in enumerate(parsed_sentences):
        phrase_dict = {}
        FO_dict_sentence = defaultdict(list)
        review_index = review_indices[i]
        if review_index != review_indices[i-1]:
            j = 0
            print("==========================================\n\nReview #{}".format(review_index))
        else:
            j += 1
        k += 1
        print("\n\tSentence #{}".format(j))
        print("\t{}".format(raw_sentences[k]))
        for (gov, gov_pos), dependency, (dep, dep_pos) in sentence:
            if not gov.isalpha() or not dep.isalpha():
                continue
            gov = gov.lower()
            dep = dep.lower()
            if dependency == "nsubj" and dep in features:
                FO_dict_sentence[dep].append(gov)
                print("\t\tnsubj: {} -> {}, {}".format(gov, dep, feature_sentiments_by_review[review_index][dep]))
            elif dependency == "amod" and gov in features:
                FO_dict_sentence[gov].append(dep)
                print("\t\tamod: {} -> {}, {}".format(dep, gov, feature_sentiments_by_review[review_index][gov]))
            elif dependency == "compound" and gov in features:
                phrase_dict[gov] = dep + " " + gov
        for feature, opinions in FO_dict_sentence.items():
            if feature in phrase_dict:
                FO_dict[phrase_dict[feature]] += opinions
            else:
                FO_dict[feature] += opinions
    return FO_dict


In [None]:
process_reviews(features,
                opinions,
                parsed_sentences,
                review_indices,
                raw_sentences,
                feature_sentiments_by_review)

In [None]:
# some notes

# still need to handle negations

# "due to the two observations, multiple polarities may be assigned to an opinion word or target"
#   we should keep a running total of observations
#   num_negative_sentiments = total_observations - cumulative_polarity

# if target is from another review, we use the cumulative polarity to assign sentiment to the opinion word
#   should we also assign the cumulative polarity to the target itself?
#   we are right now, that way every observed feature gets a sentiment for each occurence

# CCB: compute opinion sentiment priors based on whole data set

In [None]:
# most reviewed asins
top_asins = asins[:30]
pprint(top_asins)


In [None]:
# plug and play
asin_ = "B003ELYQGG"

reviews_ = get_all_reviews(asin_)

features_, \
features_count_, \
opinions_, \
opinions_count_, \
raw_sentences_, \
parsed_sentences_, \
review_indices_, \
feature_sentiments_by_review_, \
feature_words_by_review_, \
feature_sentiments_cumulative_, \
feature_sentiments_pos_, \
feature_sentiments_neg_, \
opinion_words_by_review_, \
opinion_sentiments_ = extract_features_opinions(reviews_)

In [None]:
print("Feature words, occurrences:")
features_by_count_ = sorted(features_count_.items(), key=lambda x:x[1], reverse=True)
pprint(features_by_count_)


In [None]:
sorted_classes = get_sorted_classes(features_by_count_)
pprint(sorted_classes)

In [None]:
print("Opinion words, occurrences:")
opinions_by_count_ = sorted(opinions_count_.items(), key=lambda x:x[1], reverse=True)
pprint(opinions_by_count_)


In [None]:
print("Feature cumulative sentiments:")
feature_sentiments_cumulative_sorted_ = sorted(feature_sentiments_cumulative_.items(), key=lambda x: x[1], reverse=True)
feature_sentiments_cumulative_sorted_


In [None]:
print("Opinion word sentiments:")
pprint(sorted(opinion_sentiments_.items(), key=lambda x:x[1], reverse=False))


In [None]:
print("Newly discovered opinion words:")
pprint(opinions_.difference(positive_lexicon.union(negative_lexicon)))


In [None]:
process_reviews(features_,
                opinions_,
                parsed_sentences_,
                review_indices_,
                raw_sentences_,
                feature_sentiments_by_review_)

In [None]:
# bulk processing
asins_to_process = asins[:20]
product_infos = {}
for i, (asin, _) in enumerate(asins_to_process):
    try:
        print('{}, {}\n'.format(i, asin))
        product_reviews = get_all_reviews(asin)

        product_info = {}
        product_info['features'], \
        product_info['features_count'], \
        product_info['opinions'], \
        product_info['opinions_count'], \
        product_info['raw_sentences'], \
        product_info['parsed_sentences'], \
        product_info['review_indices'], \
        product_info['feature_sentiments_by_review'], \
        product_info['feature_words_by_review'], \
        product_info['feature_sentiments_cumulative'], \
        product_info['feature_sentiments_pos'], \
        product_info['feature_sentiments_neg'], \
        product_info['opinion_words_by_review'], \
        product_info['opinion_sentiments'] = extract_features_opinions(product_reviews)
        
        product_infos[asin] = product_info
    except ValueError as e:
        product_infos[asin] = {'error' : str(e)}
        print('Could not process product {} ({})'.format(asin, str(e)))


In [None]:
# todo: remove non-english reviews to avoid errors such as "Invalid control character at: line 1 column 23732 (char 23731)"


In [None]:
for asin,product_info in list(product_infos.items())[1:2]:
    if 'error' not in product_info:
        print('ASIN: {}'.format(asin))
        
        print('# of Positive Sentiments')
        feature_sentiments_pos_sorted = sorted([(k,len(v)) for k,v in product_info['feature_sentiments_pos'].items()], key=lambda x: x[1], reverse=True)
        pprint(feature_sentiments_pos_sorted[:15])
        
        print('# of Negative Sentiments')
        feature_sentiments_neg_sorted = sorted([(k,len(v)) for k,v in product_info['feature_sentiments_neg'].items()], key=lambda x: x[1], reverse=True)
        pprint(feature_sentiments_neg_sorted[:15])
        
        feature = feature_sentiments_neg_sorted[0][0]
        print('FEATURE: {}'.format(feature))
        for review_num, words in product_info['feature_words_by_review'].items():
            if feature in words:
                if product_info['feature_sentiments_by_review'][review_num][feature] < 0:
                    print(review_num, product_info['feature_sentiments_by_review'][review_num][feature])
                    for sentence_num,review_num_ in enumerate(product_info['review_indices']):
                        if review_num == review_num_:
                            sentence = product_info['raw_sentences'][sentence_num]
                            if feature in [w.lower() for w in sentence.split()]:
                                print(sentence)

#                print([product_info['raw_sentences'][sentence_num] \
#                       for sentence_num,review_num in enumerate(product_info['review_indices']) \
#                       if feature in product_info['feature_words_by_review'][review_num] ])                    


In [None]:
#product_infos['B00DR0PDNE']['opinion_words_by_review']
pprint([(a,b) for a,b in {k:list(filter(lambda w: (product_infos['B00DR0PDNE']['opinion_sentiments'][w] < 0),v)) for k,v in product_infos['B00DR0PDNE']['opinion_words_by_review'].items()}.items() if len(b)>0])


In [None]:
product_infos['B00DR0PDNE']['opinion_sentiments']['amazon']


In [None]:
print(len(product_infos['B000LRMS66']['raw_sentences']))
print(len(product_infos['B000LRMS66']['feature_words_by_review']))
print(product_infos['B000LRMS66']['review_indices'][-1])

In [None]:
# possible statistics
# # pos sentiments vs # neg sentiments