In [1]:
import spacy
import pandas as pd
import os
import sys


In [3]:
#Get this from some place else. This is just for dummy usage
stop_words=["i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", 
            "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", 
            "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", 
            "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", 
            "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", 
            "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", 
            "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", 
            "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", 
            "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than",
            "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"]


nlp=spacy.load("en_core_web_lg") 



In [4]:
BASE_PATH = os.path.abspath(os.path.join('..')) # base path of project

file_path = BASE_PATH + '/data/raw/amazon_reviews_us_Electronics_v1_00.tsv'

raw_data = pd.read_table(file_path,error_bad_lines=False, nrows=100)

review_body = raw_data['review_body']

In [5]:
def get_aspects(x):
    doc=nlp(x) 
    doc=[i.text for i in doc if i.text not in stop_words and i.pos_=="NOUN"] ## Remove common words and retain only nouns
    doc=list(map(lambda i: i.lower(),doc)) ## Normalize text to lower case
    doc=pd.Series(doc)
    doc=doc.value_counts().head().index.tolist() ## Get 5 most frequent nouns
    return doc

aspects = []

for review in review_body :
    aspects.append(get_aspects(review))

In [6]:
dummy = "The sound quality of the speakers are wonderful. However, the packaging could have been better. Photos under low lighting is poor - both front and back cameras"
doc=nlp(dummy) 
for token in doc:
    print(token.text,token.tag_, token.dep_, token.pos_)

The DT det DET
sound JJ amod ADJ
quality NN nsubj NOUN
of IN prep ADP
the DT det DET
speakers NNS pobj NOUN
are VBP ROOT VERB
wonderful JJ acomp ADJ
. . punct PUNCT
However RB advmod ADV
, , punct PUNCT
the DT det DET
packaging NN nsubj NOUN
could MD aux VERB
have VB aux VERB
been VBN ROOT VERB
better JJR acomp ADJ
. . punct PUNCT
Photos NNS nsubj NOUN
under IN prep ADP
low JJ amod ADJ
lighting NN pobj NOUN
is VBZ ROOT VERB
poor JJ amod ADJ
- : punct PUNCT
both DT preconj DET
front NN amod NOUN
and CC cc CCONJ
back NN conj NOUN
cameras NNS attr NOUN


In [7]:
#noun chunks

for chunk in doc.noun_chunks:
    print(chunk.text," ***", chunk.root.text, " ***",chunk.root.dep_," ***", chunk.root.head.text)


The sound quality  *** quality  *** nsubj  *** are
the speakers  *** speakers  *** pobj  *** of
the packaging  *** packaging  *** nsubj  *** been
Photos  *** Photos  *** nsubj  *** is
low lighting  *** lighting  *** pobj  *** under
poor - both front and back cameras  *** cameras  *** attr  *** is


In [12]:
for token in doc:
    print(token.text,"****" ,token.dep_, "****" ,token.head.text, "****" ,token.is_stop,token.head.pos_,
          [child for child in token.children])

The **** det **** quality **** NOUN []
sound **** amod **** quality **** NOUN []
quality **** nsubj **** are **** VERB [The, sound, of]
of **** prep **** quality **** NOUN [speakers]
the **** det **** speakers **** NOUN []
speakers **** pobj **** of **** ADP [the]
are **** ROOT **** are **** VERB [quality, wonderful, .]
wonderful **** acomp **** are **** VERB []
. **** punct **** are **** VERB []
However **** advmod **** been **** VERB []
, **** punct **** been **** VERB []
the **** det **** packaging **** NOUN []
packaging **** nsubj **** been **** VERB [the]
could **** aux **** been **** VERB []
have **** aux **** been **** VERB []
been **** ROOT **** been **** VERB [However, ,, packaging, could, have, better, .]
better **** acomp **** been **** VERB []
. **** punct **** been **** VERB []
Photos **** nsubj **** is **** VERB [under]
under **** prep **** Photos **** NOUN [lighting]
low **** amod **** lighting **** NOUN []
lighting **** pobj **** under **** ADP [low]
is **** ROOT **** i

In [13]:
# Run this if you want to visualise dependancy tree
#spacy.displacy.serve(doc, style='dep')


[93m    Serving on port 5000...[0m
    Using the 'dep' visualizer



127.0.0.1 - - [06/Nov/2018 14:12:52] "GET / HTTP/1.1" 200 18731
127.0.0.1 - - [06/Nov/2018 14:12:53] "GET /favicon.ico HTTP/1.1" 200 18731



    Shutting down server on port 5000.



In [10]:
## Zero RULE 
## Noun - Adjective pairs

## Very basic rule. Should be least weightage

noun_adj_pairs = []
for i,token in enumerate(doc):
    if token.pos_ not in ('NOUN','PROPN'):
        continue
    for j in range(i+1,len(doc)):
        if doc[j].pos_ == 'ADJ':
            noun_adj_pairs.append((token,doc[j]))
            break
noun_adj_pairs

[(quality, wonderful),
 (speakers, wonderful),
 (packaging, better),
 (Photos, low),
 (lighting, poor)]

In [9]:
## FIRST RULE OF DEPENDANCY PARSE -
## M - Sentiment modifier || A - Aspect

## RULE = M is child of A with a relationshio of amod

rule1_pairs = []
for token in doc:
    if token.dep_ == "amod":
        rule1_pairs.append((token.head.text, token.text))

rule1_pairs

[('quality', 'sound'),
 ('lighting', 'low'),
 ('front', 'poor'),
 ('cameras', 'front')]

In [53]:
## SECOND RULE OF DEPENDANCY PARSE -
## M - Sentiment modifier || A - Aspect

#Direct Object - A is a child of something with relationship of nsubj, while 
# M is a child of the same something with relationship of dobj

#Assumption - A verb will have only one NSUBJ and DOBJ

rule2_pairs = []
for token in doc:
    children = token.children
    A = "999999"
    B = "999999"
    for child in children :
        if(child.dep_ == "nsubj"):
            A = child.text
        if(child.dep_ == "dobj"):
            M = child.text
    if(A != "999999" and B != "999999"):
        rule2_pairs.append((A, M))   
    
         
            
        
        
rule2_pairs        
        
        
   # if token.dep_ == "amod":
   #     rule1_pairs.append((token.head.text, token.text))

[]

In [52]:
## THIRD RULE OF DEPENDANCY PARSE -
## M - Sentiment modifier || A - Aspect

#Adjectival Complement - A is a child of something with relationship of nsubj, while 
# M is a child of the same something with relationship of acomp

#Assumption - A verb will have only one NSUBJ and DOBJ

rule3_pairs = []
for token in doc:

    children = token.children
    A = "999999"
    B = "999999"
    for child in children :
        if(child.dep_ == "nsubj"):
            A = child.text
          
        if(child.dep_ == "acomp"):
            M = child.text
        
    if(A != "999999" or B != "999999"):
        rule3_pairs.append((A, M)) 


            
        
        
rule3_pairs        
        
        
   # if token.dep_ == "amod":
   #     rule1_pairs.append((token.head.text, token.text))

[('quality', 'wonderful'), ('packaging', 'better'), ('Photos', 'better')]

In [58]:
## FOURTH RULE OF DEPENDANCY PARSE -
## M - Sentiment modifier || A - Aspect

#Adverbial modifier to a passive verb - A is a child of something with relationship of nsubjpass, while 
# M is a child of the same something with relationship of advmod

#Assumption - A verb will have only one NSUBJ and DOBJ

rule4_pairs = []
for token in doc:

    children = token.children
    A = "999999"
    B = "999999"
    for child in children :
        if(child.dep_ == "nsubjpass"):
            A = child.text
          
        if(child.dep_ == "advmod"):
            M = child.text
        
    if(A != "999999" or B != "999999"):
        rule4_pairs.append((A, M)) 


            
        
        
rule4_pairs        
        
        
   # if token.dep_ == "amod":
   #     rule1_pairs.append((token.head.text, token.text))

[]

In [59]:
## FIFTH RULE OF DEPENDANCY PARSE -
## M - Sentiment modifier || A - Aspect

#Complement of a copular verb - A is a child of M with relationship of nsubj, while 
# M has a child with relationship of cop

#Assumption - A verb will have only one NSUBJ and DOBJ

rule5_pairs = []
for token in doc:
    children = token.children
    A = "999999"
    buf_var = "999999"
    for child in children :
        if(child.dep_ == "nsubj"):
            A = child.text
          
        if(child.dep_ == "cop"):
            buf_var = child.text
        
    if(A != "999999" or buf_var != "999999"):
        rule3_pairs.append((A, token.text)) 


            
        
        
rule5_pairs        
        
        
   # if token.dep_ == "amod":
   #     rule1_pairs.append((token.head.text, token.text))

[]