In [28]:
import pandas as pd
import json
import ast
import spacy
from sklearn import cluster
from collections import defaultdict
from time import time

# Number of k-means clusters built per product. get_word_clusters skips
# clustering when a product has this many unique aspects or fewer.
NUM_CLUSTERS = 5

In [2]:
# Read the aspect results file as raw text lines (newline characters kept);
# the lines are parsed in a later cell.
with open('results_file_aspects.txt', 'r') as aspects_file:
    data = list(aspects_file)

In [12]:
# Parse the JSON document stored in data.json into `data1`.
with open('data.json', 'r') as json_file:
    data1 = json.loads(json_file.read())

In [5]:
# Every line of `data` is a Python literal; evaluate each one safely into a
# review record.
reviews_data = [ast.literal_eval(raw_line) for raw_line in data]

In [8]:
# Display the first parsed review record to check its structure.
reviews_data[0]

{'review_id': 'R3W4P9UBGNGH1U',
 'aspect_pairs': [('as', 'usual', 0.0, 1),
  ('charging', 'wireless', 0.0, 1),
  ('product', 'nice', 0.4215, 3),
  ('battery', 'twice fast', 0.0, 4),
  ('charging', "n't second", 0.0, 4),
  ('watch', 'fine', 0.2023, 4)],
 'review_marketplace': 'US',
 'customer_id': 16414143,
 'product_id': 'B00YL0EKWE',
 'product_parent': 852431543,
 'product_title': 'LG G4 Case Hard Transparent Slim Clear Cover for LG G4',
 'product_category': 'Wireless',
 'date': '2015-08-31 00:00:00',
 'star_rating': 2,
 'url': 'http://amazon.com/dp/B00YL0EKWE'}

In [30]:
def get_unique_product_ids(reviews_data):
    """Return the distinct product IDs found in ``reviews_data``.

    Args:
        reviews_data: iterable of review dicts, each with a ``'product_id'`` key.

    Returns:
        list: every product ID exactly once, in order of first appearance.
    """
    # dict.fromkeys de-duplicates while keeping insertion order, so the result
    # is the same on every run (the iteration order of a set of strings is not).
    return list(dict.fromkeys(r['product_id'] for r in reviews_data))

def get_aspects(reviews_data):
    """Collect the aspect noun of every aspect pair across all reviews.

    Each entry of a review's ``"aspect_pairs"`` is unpacked as a 4-tuple and
    only its first element (the noun) is kept. Duplicates are preserved and
    the nouns come out in review order, then pair order.
    """
    return [
        pair_noun
        for rev in reviews_data
        for pair_noun, _adj, _polarity, _rule in rev["aspect_pairs"]
    ]

def get_aspect_freq_map(aspects):
    """Count how often each aspect occurs in ``aspects``.

    Returns:
        defaultdict(int): aspect -> number of occurrences, keyed in order of
        first appearance.
    """
    counts = defaultdict(int)
    for name in aspects:
        counts[name] = counts.get(name, 0) + 1
    return counts

def get_unique_aspects(aspects):
    """Return the distinct aspects in ``aspects``, in order of first appearance.

    The list is meant to be fed to clustering, so its order is kept
    deterministic: dict.fromkeys de-duplicates while preserving insertion
    order, whereas the iteration order of a set of strings can change between
    interpreter runs.
    """
    return list(dict.fromkeys(aspects))


def get_word_vectors(unique_aspects, nlp):
    """Return the ``.vector`` of ``nlp(aspect)`` for each aspect, in input order.

    Args:
        unique_aspects: iterable of aspect strings.
        nlp: callable applied to each aspect; its result must expose ``.vector``.

    Returns:
        list: one vector per aspect.
    """
    return [nlp(term).vector for term in unique_aspects]

def get_word_clusters(unique_aspects, nlp, random_state=0):
    """Assign a cluster label to every aspect in ``unique_aspects``.

    Args:
        unique_aspects: sized iterable of aspect strings (a list or a dict
            keys view both work).
        nlp: callable passed through to ``get_word_vectors``.
        random_state: seed forwarded to ``KMeans`` so repeated runs produce
            the same labels. Defaults to 0.

    Returns:
        One label per aspect, aligned with the iteration order of
        ``unique_aspects``. When there are ``NUM_CLUSTERS`` aspects or fewer,
        no clustering is run and each aspect gets its own label
        (``[0, 1, ..., n-1]`` as a list); otherwise the ``labels_`` array of
        the fitted ``KMeans`` model is returned.
    """
    n_aspects = len(unique_aspects)
    if n_aspects <= NUM_CLUSTERS:
        # Too few aspects to cluster. Return before computing any vectors so
        # small products never pay for the nlp calls.
        return list(range(n_aspects))

    asp_vectors = get_word_vectors(unique_aspects, nlp)
    kmeans = cluster.KMeans(n_clusters=NUM_CLUSTERS, random_state=random_state)
    kmeans.fit(asp_vectors)
    return kmeans.labels_

def get_cluster_names_map(asp_to_cluster_map, aspect_freq_map):
    """Name each cluster after its most frequent aspect.

    Args:
        asp_to_cluster_map: dict mapping aspect -> cluster id.
        aspect_freq_map: dict mapping aspect -> occurrence count.

    Returns:
        dict: cluster id -> the aspect with the highest count in that cluster.
        On a tie, the aspect that comes first in ``aspect_freq_map`` wins.
        Aspects missing from ``asp_to_cluster_map`` are ignored.

    Cluster ids do not have to be the contiguous range ``0..n-1``: the map is
    built from the ids actually present, so a skipped id neither raises nor
    leaves a later cluster without a name.
    """
    cluster_id_to_name_map = {}
    best_freq = {}
    # Single pass over the frequency map instead of re-filtering both maps
    # once per cluster.
    for aspect, freq in aspect_freq_map.items():
        if aspect not in asp_to_cluster_map:
            continue
        cluster_id = asp_to_cluster_map[aspect]
        # Strict '>' keeps the first-seen aspect on ties.
        if cluster_id not in best_freq or freq > best_freq[cluster_id]:
            best_freq[cluster_id] = freq
            cluster_id_to_name_map[cluster_id] = aspect
    return cluster_id_to_name_map



In [31]:
def add_clusters_to_reviews(reviews_data, nlp):
    """Attach a cluster name to every aspect pair of the given reviews.

    The aspect nouns of all reviews are counted, clustered with
    ``get_word_clusters`` and each cluster is named via
    ``get_cluster_names_map``.

    Args:
        reviews_data: list of review dicts with ``'review_id'``,
            ``'product_id'`` and ``'aspect_pairs'`` (4-tuples of
            noun, adj, polarity, rule).
        nlp: callable forwarded to ``get_word_clusters``.

    Returns:
        list: one dict per review holding ``'review_id'``, ``'product_id'``
        and ``'aspect_pairs'``, where each pair is a dict with the keys
        ``noun``, ``adj``, ``rule``, ``polarity`` and ``cluster``.
    """
    freq_by_aspect = get_aspect_freq_map(get_aspects(reviews_data))
    aspect_names = freq_by_aspect.keys()

    cluster_ids = get_word_clusters(aspect_names, nlp)
    cluster_id_by_aspect = dict(zip(aspect_names, cluster_ids))
    cluster_name_by_id = get_cluster_names_map(cluster_id_by_aspect, freq_by_aspect)

    def _label(pair):
        # Turn one (noun, adj, polarity, rule) tuple into its labelled dict.
        noun, adj, polarity, rule = pair
        cluster_name = cluster_name_by_id[cluster_id_by_aspect[noun]]
        return {'noun':noun, 'adj':adj, 'rule':rule, 'polarity':polarity, 'cluster':cluster_name}

    updated_reviews = []
    for review in reviews_data:
        pairs = review["aspect_pairs"]
        labelled = [_label(pair) for pair in pairs]
        assert len(labelled) == len(pairs)
        updated_reviews.append({
            'review_id': review['review_id'],
            'product_id': review['product_id'],
            'aspect_pairs': labelled,
        })

    return updated_reviews

def update_reviews_data(reviews_data, nlp):
    """Cluster aspects per product and write the labelled reviews to disk.

    Reviews are grouped by ``'product_id'``, each group is passed to
    ``add_clusters_to_reviews``, and the combined result is dumped as a single
    JSON array to ``results_file.json``.

    Args:
        reviews_data: list of review dicts with ``'product_id'`` and
            ``'aspect_pairs'`` keys.
        nlp: callable forwarded to ``add_clusters_to_reviews``.

    Returns:
        None. The output is the JSON file plus progress messages on stdout.
    """
    # Group in one pass. Filtering the full review list once per product is
    # O(products * reviews), which does not finish on large categories.
    reviews_by_product = defaultdict(list)
    for review in reviews_data:
        reviews_by_product[review['product_id']].append(review)
    print("Total number of unique products in this category: {}".format(len(reviews_by_product)))

    no_asp_count = sum(1 for r in reviews_data if len(r['aspect_pairs']) == 0)
    print("Total reviews found with no aspect pairs: {}".format(no_asp_count))

    updated_reviews = []
    # Products are processed in order of first appearance in reviews_data.
    for this_product_reviews in reviews_by_product.values():
        updated_reviews.extend(add_clusters_to_reviews(this_product_reviews, nlp))

    print("\n----------------***----------------")
    print("Updating final results")
    # Write mode, not append: a second JSON array appended to an existing
    # file would make the file unparseable.
    with open('results_file.json', 'w') as f:
        json.dump(updated_reviews, f)
    print("Finished writing results to json!!")
    print("----------------***----------------")
    

In [11]:
# Load the spaCy pipeline 'en_core_web_lg'; `nlp` is passed to the clustering
# functions, which read `.vector` from its output for each aspect.
print("\nLoading spaCy Model....")
nlp = spacy.load('en_core_web_lg')
print("spaCy successfully loaded")


Loading spaCy Model....
spaCy successfully loaded


In [32]:
# Run the per-product clustering over all parsed reviews; as a side effect
# update_reviews_data writes its output to results_file.json.
print("Running clustering...")
update_reviews_data(reviews_data, nlp)

Running clustering...
Total number of unique products in this category: 248565
Total reviews found with no aspect pairs: 367423


KeyboardInterrupt: 