In [12]:
import pickle
import pprint

# product_dict maps asin -> ([(feature phrase, [adjectives]), ...], count)
pp = pprint.PrettyPrinter(width=110, compact=True)

# Alternative input path: "./results/features.pickle"
# NOTE(review): pickle.load can execute arbitrary code; only point this at trusted files.
with open("../../bad_products_features.pickle", "rb") as file:
    product_dict = pickle.load(file)

# Order the products by the count stored in each value (largest first) and preview the top 20.
sorted_products = list(product_dict.items())
sorted_products.sort(key=lambda entry: entry[1][1], reverse=True)
pp.pprint(sorted_products[:20])

[('B000CS1TLE',
  ([('software',
     ['willing', 'prompted', 'online', 'basic', 'has', 'have', 'starts', 'capable', 'bit', 'performs', 'easy',
      'accompanying', 'pain', 'buggy', 'latest', 'guide', 'set', 'poor', 'work', 'latest', 'recognized',
      'worked', 'improved', 'buggy', 'remote', 'clunky', 'wants']),
    ('thing',
     ['great', 'taken', 'DOA', 'next', 'is', 'good', 'need', 'whole', 'sliver', 'only', 'whole', 'only',
      'First', 'is', 'easy', 'worked', 'only', 'is', 'no', 'make', 'bad', 'make', 'only']),
    ('setup',
     ['took', 'remote', 'easy', 'take', 'online', 'consume', 'audio', 'visual', 'based', 'Online', 'easy',
      'online', 'benefit', 'friendly', 'guided', 'breeze', 'involves', 'complicated', 'adequate', 'initial',
      'allows', 'initial', 'initial', 'failed', 'initial']),
    ('time',
     ['front', 'considerable', 'next', 'is', 'next', 'remote', 'more', 'hard', 'more', 'more', 'related',
      'look', 'remote', 'remote', 'prime', 'much']),
    ('Har

In [13]:
# Cleaning/preprocessing: remove negation
# A leading "*" on a modifier is treated as the negation marker and is stripped in place,
# so the negated and plain forms of a modifier are counted together by the cells below.
# NOTE(review): this mutates the modifier lists inside product_dict; the negation
# information is discarded.

for feature_modifier_list, _ in product_dict.values():
    for feature, modifiers in feature_modifier_list:
        for i, mod in enumerate(modifiers):
            # startswith() also copes with empty strings, where mod[0] would raise IndexError.
            if mod.startswith("*"): # remove negation
                modifiers[i] = mod[1:]

In [14]:
# Finds the most mentioned features across all *products*
from collections import Counter

def get_frequent_features_products():
    """Count how many per-product feature entries exist for each feature.

    Walks the global ``product_dict`` and adds 1 for every (feature, modifiers)
    entry in every product's feature list, regardless of how many modifiers the
    entry carries.

    Returns:
        Counter mapping feature -> number of entries across all products.
    """
    return Counter(
        feature
        for product_features, _count in product_dict.values()
        for feature, _mods in product_features
    )

# Report how many products (top-level keys of product_dict) were loaded.
print("Total products: {0}".format(len(product_dict)))
# Uncomment to inspect the per-feature entry counts:
#pp.pprint(get_frequent_features_products())

Total products: 7


In [15]:
# Finds most mentioned features across all *reviews*

def get_frequent_features_reviews():
    """Count the total number of modifier mentions attached to each feature.

    Walks the global ``product_dict`` and, for every (feature, modifiers) entry,
    adds ``len(modifiers)`` to that feature's tally. A feature whose modifier
    list is empty still appears in the result with a count of 0.

    Returns:
        Counter mapping feature -> total number of modifiers across all products.
    """
    mention_totals = Counter()
    # Flatten every product's feature list into one sequence of (feature, modifiers) pairs.
    all_pairs = [pair for product_features, _count in product_dict.values() for pair in product_features]
    for name, mods in all_pairs:
        mention_totals[name] += len(mods)
    return mention_totals
        
#pp.pprint(get_frequent_features_reviews())

In [16]:
def get_frequent_adjs():
    """Count every modifier (adjective) occurrence across all products and features.

    Walks the global ``product_dict``; each occurrence of a modifier in any
    feature's modifier list adds 1 to that modifier's tally.

    Returns:
        Counter mapping modifier -> number of occurrences.
    """
    return Counter(
        mod
        for product_features, _count in product_dict.values()
        for _feature, mods in product_features
        for mod in mods
    )

# Snapshot of modifier frequencies at this point in the notebook; later cells read this
# global (it is not recomputed after product_dict is filtered further down).
modifier_counts = get_frequent_adjs()
# len() of the Counter is the number of distinct modifiers, not the number of occurrences.
print("Total modifiers: {}".format(len(modifier_counts)))
# Uncomment to inspect the full frequency table:
#pp.pprint(modifier_counts)

Total modifiers: 1503


In [17]:
def get_all_modifiers(feature):
    """Tally the modifiers used to describe ``feature`` across all products.

    Args:
        feature: feature name to look up; compared with ``==`` against each
            feature entry in the global ``product_dict``.

    Returns:
        Counter mapping modifier -> number of occurrences for that feature
        (empty when the feature never appears).
    """
    return Counter(
        modifier
        for product_features, _count in product_dict.values()
        for name, mods in product_features
        if name == feature
        for modifier in mods
    )

#pp.pprint(get_all_modifiers("lens"))

In [18]:
def get_adjs_above_threshold(counter, threshold):
    """Return the most common adjectives that together reach a share of all counts.

    Adjectives are taken in ``counter.most_common()`` order and accumulated until
    their combined count divided by the total count is at least ``threshold``.
    The adjective that crosses the threshold is included.

    Args:
        counter: Counter-like object providing ``values()`` and ``most_common()``.
        threshold: cumulative fraction of the total count to reach (e.g. 0.5).

    Returns:
        Set of selected adjectives; empty when ``counter`` is empty, and every
        adjective when the threshold is never reached.
    """
    grand_total = sum(counter.values())
    selected = set()
    running = 0
    for word, freq in counter.most_common():
        selected.add(word)
        running += freq
        if running * 1.0 / grand_total >= threshold:
            return selected
    return selected

In [28]:
import nltk
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import wordnet as wn

def get_adj_subjectivity(adj):
    """Score how subjective an adjective is using SentiWordNet objectivity scores.

    Only WordNet synsets of ``adj`` whose part of speech is "a" or "s" are used.
    Their SentiWordNet objectivity scores are averaged, the mean is rounded to
    3 decimals, and the result is ``1 - mean``.

    Args:
        adj: word to look up in WordNet.

    Returns:
        ``1 - mean objectivity`` (0 = fully objective, 1 = fully subjective),
        or -1 when the word has no "a"/"s" synsets.
    """
    obj_scores = [
        swn.senti_synset(synset.name()).obj_score()
        for synset in wn.synsets(adj)
        if synset.pos() in ("a", "s")
    ]
    if not obj_scores:
        return -1  # lookup failed: no adjective synsets for this word
    running = 0
    for score in obj_scores:
        running += score
    mean_obj = round(running * 1.0 / len(obj_scores), 3)
    return 1 - mean_obj

#get_adj_subjectivity("different")

In [29]:
def analyze_feature_subjectivity(feature, threshold=0.5):
    """Score the subjectivity of the adjectives most often used with ``feature``.

    Intended as the basis for deciding whether a feature is valid by looking at
    its adjectives. The scores used to be computed and thrown away; they are now
    returned so the caller can inspect them.

    Args:
        feature: feature name whose modifiers are analysed.
        threshold: cumulative share of modifier mentions that the selected
            adjectives must cover (passed to ``get_adjs_above_threshold``).
            Defaults to 0.5, the value that was previously hard-coded.

    Returns:
        Dict mapping each selected adjective to its ``get_adj_subjectivity``
        score (-1 marks a failed lookup); empty when the feature has no modifiers.
    """
    counter = get_all_modifiers(feature)
    top_adjs = get_adjs_above_threshold(counter, threshold)
    return {adj: get_adj_subjectivity(adj) for adj in top_adjs}

In [30]:
'''
Determine the set of non-sentiment-bearing adjectives.
After trying different intervals, 0.2 seems like a good bucket width
(better to be conservative and manually add to the blacklist).
'''
import math

def bucket_adjs_by_objectivity(interval, num_adjs):
    """Group the most common adjectives into equal-width subjectivity buckets.

    The ``num_adjs`` most common entries of the global ``modifier_counts`` are
    scored with ``get_adj_subjectivity`` and placed in bucket
    ``floor(score / interval)``; bucket 0 therefore holds the least subjective
    adjectives.

    Args:
        interval: bucket width on the 0..1 subjectivity scale (e.g. 0.2).
        num_adjs: how many of the most common adjectives to bucket.

    Returns:
        List of ``round(1 / interval)`` sets (at least one), ordered from least
        to most subjective.

    Notes:
        * Adjectives with a negative score (failed lookup) go into bucket 0,
          together with the most objective adjectives.
        * A score of exactly 1, or any score whose index would fall past the
          end (possible when ``interval`` does not divide 1 evenly), goes into
          the last bucket instead of raising IndexError.
        * The quotient is rounded to 9 decimals before flooring so that scores
          sitting on a bucket boundary (e.g. ``1 - 0.8``, which is slightly
          below 0.2 in floating point) land in the upper bucket as intended.
    """
    num_intervals = max(1, round(1.0 / interval))
    buckets = [set() for _ in range(num_intervals)]
    last_bucket = num_intervals - 1
    for adj, _ in modifier_counts.most_common(num_adjs):
        subj_score = get_adj_subjectivity(adj)
        if subj_score < 0:
            bucket_num = 0
        else:
            raw_index = math.floor(round(subj_score / interval, 9))
            bucket_num = min(raw_index, last_bucket)
        buckets[bucket_num].add(adj)
    return buckets

# Bucket every distinct adjective into 0.2-wide subjectivity intervals.
buckets = bucket_adjs_by_objectivity(0.2, len(modifier_counts)) # do len(modifier_counts) for all adjs
# Seed the filter with a copy of the lowest bucket: the least subjective adjectives plus
# those whose subjectivity lookup failed (negative score).
adj_filter = set(buckets[0])
print("# adjectives in lowest bucket: {}".format(len(buckets[0])))
# Uncomment to inspect all buckets:
#pp.pprint(buckets)

# adjectives in lowest bucket: 961


In [31]:
# Manually add some adjectives to filter, went through top 100 adjs using this function

def get_associated_features(adj):
    """Count, per feature, how often ``adj`` is used as one of its modifiers.

    Args:
        adj: modifier to look for; compared with ``==`` against every modifier
            in the global ``product_dict``.

    Returns:
        Counter mapping feature -> number of occurrences of ``adj`` among that
        feature's modifiers (features that never use it are absent).
    """
    return Counter(
        feature
        for product_features, _count in product_dict.values()
        for feature, mods in product_features
        for mod in mods
        if mod == adj
    )

# Hand-picked additions to the adjective filter, on top of the automatically chosen lowest bucket.
adj_filter.update(["main", "manual", "new", "original", "right", "last", "different", "other", "sound", "usb"])

In [32]:
# Remove adjs in filter
# For every product, drop modifiers that are in adj_filter and drop features that are left
# without any modifier. The per-product count is carried over unchanged.

print("# adjs in filter: {}".format(len(adj_filter)))

for asin, (feature_modifier_list, count) in product_dict.items():
    new_list = []
    for feature, modifiers in feature_modifier_list:
        filtered_modifiers = [x for x in modifiers if x not in adj_filter]
        if filtered_modifiers:
            new_list.append((feature, filtered_modifiers))
    # Store the result once per product (this was previously re-assigned on every inner
    # iteration). Rebinding an existing key keeps the dict size fixed, so it is safe
    # while iterating over items().
    product_dict[asin] = new_list, count

#pp.pprint(get_frequent_features_reviews())

# adjs in filter: 971


In [33]:
# Hand-picked noun filter: feature nouns to exclude from the results
# (chosen by looking at all features with over 50 adjs).
# TODO: take into account compound noun phrases

# Helper used while picking candidates; uncomment to inspect one noun's modifiers:
#pp.pprint(get_all_modifiers("lot"))
noun_filter = {"product", "thing", "unit", "deal", "way", "size", "job", "use", "buy", "problem", "choice",
               "something", "amount", "side", "end", "nothing", "review", "version", "idea", "solution", "difference",
               "anything", "addition", "music", "work", "head", "reason", "day", "room", "stuff", "point", "line",
               "everything", "year", "luck", "issue", "option", "place", "user", "complaint", "touch", "one",
               "number", "experience", "card", "company", "effect", "bit", "money", "view", "look", "hand", "time"}
# Second batch of exclusions; these appear to name the product itself rather than
# one of its features (NOTE(review): confirm intent).
noun_filter.update(["device", "pair", "set", "piece", "item", "player", "purchase", "headset", "earbuds",
                    "headphones", "computer", "router", "system", "machine", "tool", "model", "radio"])

In [34]:
# Apply noun filter: keep only the (feature, modifiers) pairs whose feature is not in
# noun_filter; each product's count is carried over unchanged.
for asin, (feature_modifier_list, count) in product_dict.items():
    product_dict[asin] = (
        [(name, mods) for (name, mods) in feature_modifier_list if name not in noun_filter],
        count,
    )
    
#pp.pprint(get_frequent_features_reviews())

In [35]:
# Final cleanup: per product, drop features whose number of modifiers relative to the
# product's count is below the threshold, then order the remaining features by number
# of modifiers (most first).

threshold = 0.02
for asin, (feat_mod_list, count) in product_dict.items():
    feat_mod_list = [
        (name, mods)
        for (name, mods) in feat_mod_list
        if len(mods) * 1.0 / count >= threshold
    ]
    product_dict[asin] = (
        sorted(feat_mod_list, key=lambda pair: len(pair[1]), reverse=True),
        count,
    )

In [36]:
# Persist the cleaned product_dict. Protocol 0 is pickle's original, text-based format.
# The ./results directory must already exist.
with open("./results/bad_products_improved_features.pickle", "wb") as file:
    pickle.dump(product_dict, file, protocol=0)