In [28]:
import spacy
import pandas as pd
import os
import sys



BASE_PATH = os.path.abspath(os.path.join('..')) # base path of project

file_path = BASE_PATH + '/data/raw/amazon_reviews_us_Electronics_v1_00.tsv'

raw_data = pd.read_table(file_path,error_bad_lines=False, nrows=100)

#review_body = raw_data['review_body']
reviews = raw_data[['review_id', 'review_body']]
nlp=spacy.load("en_core_web_lg")


In [29]:
def apply_extraction(row,nlp):
    review_body = row['review_body']
    review_id = row['review_id']

    doc=nlp(review_body)


    ## FIRST RULE OF DEPENDANCY PARSE -
    ## M - Sentiment modifier || A - Aspect
    ## RULE = M is child of A with a relationshio of amod
    rule1_pairs = []
    for token in doc:
        if token.dep_ == "amod":
            rule1_pairs.append((token.head.text, token.text))
            #return row['height'] * row['width']


    ## SECOND RULE OF DEPENDANCY PARSE -
    ## M - Sentiment modifier || A - Aspect
    #Direct Object - A is a child of something with relationship of nsubj, while
    # M is a child of the same something with relationship of dobj
    #Assumption - A verb will have only one NSUBJ and DOBJ

    rule2_pairs = []
    for token in doc:
        children = token.children
        A = "999999"
        M = "999999"
        for child in children :
            if(child.dep_ == "nsubj"):
                A = child.text
            if(child.dep_ == "dobj"):
                M = child.text
        if(A != "999999" and M != "999999"):
            rule2_pairs.append((A, M))


    ## THIRD RULE OF DEPENDANCY PARSE -
    ## M - Sentiment modifier || A - Aspect
    #Adjectival Complement - A is a child of something with relationship of nsubj, while
    # M is a child of the same something with relationship of acomp
    #Assumption - A verb will have only one NSUBJ and DOBJ

    rule3_pairs = []

    for token in doc:

        children = token.children
        A = "999999"
        M = "999999"
        for child in children :
            if(child.dep_ == "nsubj"):
                A = child.text

            if(child.dep_ == "acomp"):
                M = child.text

        if(A != "999999" and M != "999999"):
            rule3_pairs.append((A, M))

    ## FOURTH RULE OF DEPENDANCY PARSE -
    ## M - Sentiment modifier || A - Aspect

    #Adverbial modifier to a passive verb - A is a child of something with relationship of nsubjpass, while
    # M is a child of the same something with relationship of advmod

    #Assumption - A verb will have only one NSUBJ and DOBJ

    rule4_pairs = []
    for token in doc:


        children = token.children
        A = "999999"
        M = "999999"
        for child in children :
            if(child.dep_ == "nsubjpass"):
                A = child.text

            if(child.dep_ == "advmod"):
                M = child.text

        if(A != "999999" and M != "999999"):
            rule4_pairs.append((A, M))


    ## FIFTH RULE OF DEPENDANCY PARSE -
    ## M - Sentiment modifier || A - Aspect

    #Complement of a copular verb - A is a child of M with relationship of nsubj, while
    # M has a child with relationship of cop

    #Assumption - A verb will have only one NSUBJ and DOBJ

    rule5_pairs = []
    for token in doc:
        children = token.children
        A = "999999"
        buf_var = "999999"
        for child in children :
            if(child.dep_ == "nsubj"):
                A = child.text

            if(child.dep_ == "cop"):
                buf_var = child.text

        if(A != "999999" and buf_var != "999999"):
            rule3_pairs.append((A, token.text))

    aspects = []
    aspects = rule1_pairs + rule2_pairs + rule3_pairs +rule4_pairs +rule5_pairs
    dic = {"review_id" : review_id , "aspect_pairs" : aspects}
    return dic


In [31]:
review_decomp = reviews.apply(lambda row: apply_extraction(row,nlp), axis=1)


{'review_id': 'R372S58V6D11AT', 'aspect_pairs': [('Bass', 'lacking')]}

In [2]:
#from gensim.models import Word2Vec
import spacy
#from spacy import displacy
from sklearn import cluster 
from collections import defaultdict

#from src/models import aspect_extraction

In [3]:
#from src/models import aspect_extraction

In [4]:
NUM_CLUSTERS = 2

In [5]:
review_decomp = [{"aspect_pairs": [("sound", "great"),("music", "loud"),("shipping", "poor")]},
                 {"aspect_pairs": [("packaging", "pathetic"),("sound", "perfect")]},
                 {"aspect_pairs": [("quality", "wonderful"),("photos", "great")]}]

In [6]:
#nlp = spacy.load('en_core_web_lg')  # make sure to use larger model!
#tokens = nlp(u'sound music shipping packaging bass')
#for token1 in tokens:
#    for token2 in tokens:
  #      print(token1.text, token2.text, token1.similarity(token2))

In [7]:
aspects = []
for review in review_decomp:
    aspect_pairs = review["aspect_pairs"]
    for noun, adj in aspect_pairs:
        aspects.append(noun)

unique_aspects = list(set(aspects)) # use this list for clustering

In [8]:
# need this mapping later for tagging clusters
aspects_map = defaultdict(int) 
for asp in aspects:
    aspects_map[asp] += 1

In [9]:
nlp = spacy.load('en_core_web_lg')  # make sure to use larger model!

asp_vectors = []
for aspect in unique_aspects:
    token = nlp(aspect)
    asp_vectors.append(token.vector)

In [24]:
kmeans = cluster.KMeans(n_clusters=NUM_CLUSTERS)
kmeans.fit(asp_vectors)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=2, n_init=10, n_jobs=None, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [25]:
#centroids = kmeans.cluster_centers_
labels = kmeans.labels_ 
print ("Cluster id labels for inputted data")
print (unique_aspects)
print (labels)

asp_to_cluster_map = dict(zip(unique_aspects,labels))
print(asp_to_cluster_map)


Cluster id labels for inputted data
['sound', 'music', 'packaging', 'photos', 'shipping', 'quality']
[1 1 0 1 0 0]
{'sound': 1, 'music': 1, 'packaging': 0, 'photos': 1, 'shipping': 0, 'quality': 0}


In [27]:
cluster_map = defaultdict()
for i in range(NUM_CLUSTERS):
    cluster_nouns = [k for k,v in asp_to_cluster_map.items() if v == i]
#     print(cluster_nouns)
    freq_map = {k:v for k,v in aspects_map.items() if k in cluster_nouns}
    freq_map = sorted(freq_map.items(), key = lambda x: x[1])
#     print(freq_map)
    cluster_map[i] = freq_map[0][0]

['packaging', 'shipping', 'quality']
{'shipping': 1, 'packaging': 1, 'quality': 1}
[('shipping', 1), ('packaging', 1), ('quality', 1)]
['sound', 'music', 'photos']
{'sound': 2, 'music': 1, 'photos': 1}
[('music', 1), ('photos', 1), ('sound', 2)]
defaultdict(None, {0: 'shipping', 1: 'music'})


In [None]:
print(cluster_map)