# A graph based approach to opinion mining

Based on [Opinosis](http://kavita-ganesan.com/opinosis), Kavita Ganesan's work on mining aspects from product reviews, this notebook fetches product reviews from the Best Buy API and builds a word adjacency graph of the review corpus in Neo4j. Opinions are then mined from the graph using a simplified version of the Opinosis algorithm.

In [1]:
from py2neo import Graph
import json
import requests
import re, string
from py2neo.packages.httpstream import http
# Raise py2neo's HTTP socket timeout (presumably in seconds -- confirm) so
# that slow Cypher queries are less likely to time out.
http.socket_timeout = 9999


In [7]:
# Best Buy API key. The value here looks like a placeholder; supply a real key
# before running.
API_KEY = "BEST_BUY_API_KEY"
# SKUs of the products analysed in the sections below:
# SKU = "9439005" # Kindle
# SKU = "4642026" # Bose headphones
# SKU = "6422016" # Samsung TV
# SKU = "3656051" # Samsung washing machine
# SKU = "2498029" # Dyson vacuum

# Reviews endpoint template. {sku}, {API_KEY} and {page} are filled in with
# str.format; the query string requests JSON with pageSize=100, sorted by
# comment (sort=comment.asc).
REQUEST_URL = "https://api.bestbuy.com/v1/reviews(sku={sku})?apiKey={API_KEY}&show=comment,id,rating,reviewer.name,sku,submissionTime,title&pageSize=100&page={page}&sort=comment.asc&format=json"

### Connect to Neo4j instance and define Cypher queries

In [3]:
# Connect to Neo4j. No connection details are passed, so py2neo's defaults
# apply (presumably a local server -- confirm for your setup).
graph = Graph()

# Build a word adjacency graph for a comment string.
# Steps performed by the query:
#   1. lower-case {comment} and split it on single spaces;
#   2. drop the words "the", "and", "i", "it", "to";
#   3. for every consecutive pair of remaining words, MERGE both :Word nodes
#      (count = 1 on create, count + 1 on match) and MERGE a NEXT
#      relationship between them whose r.count tracks how often the pair
#      has been seen.
# NOTE(review): a word in the middle of a sentence is merged once as w2 and
# once as w1, so its node count is incremented twice per occurrence. Only
# r.count is read by OPINION_QUERY.
INSERT_QUERY = '''
WITH split(tolower({comment}), " ") AS words
WITH [w in words WHERE NOT w IN ["the","and","i", "it", "to"]] AS text
UNWIND range(0,size(text)-2) AS i
MERGE (w1:Word {name: text[i]})
ON CREATE SET w1.count = 1 ON MATCH SET w1.count = w1.count + 1
MERGE (w2:Word {name: text[i+1]})
ON CREATE SET w2.count = 1 ON MATCH SET w2.count = w2.count + 1
MERGE (w1)-[r:NEXT]->(w2)
  ON CREATE SET r.count = 1
  ON MATCH SET r.count = r.count + 1;
'''

# Rank short word paths:
#   - match every path of 1 to 4 NEXT relationships (2 to 5 words);
#   - total  = sum of r.count over the path's relationships;
#   - weight = total / number of nodes in the path;
#   - return the 10 highest-weighted paths as lists of word names.
OPINION_QUERY = '''
MATCH p=(:Word)-[r:NEXT*1..4]->(:Word) WITH p
WITH reduce(s = 0, x IN relationships(p) | s + x.count) AS total, p
WITH nodes(p) AS text, 1.0*total/size(nodes(p)) AS weight
RETURN extract(x IN text | x.name) AS phrase, weight ORDER BY weight DESC LIMIT 10
'''

In [4]:
# define a regular expression to remove punctuation
# Character class containing every ASCII punctuation mark, escaped so that
# characters such as "]" and "\" are taken literally.
_punctuation_class = '[%s]' % re.escape(string.punctuation)
# Compiled once; regex.sub("", text) strips all punctuation from text.
regex = re.compile(_punctuation_class)

### Fetch comments from Best Buy API and build word adjacency graph

In [5]:
def load_graph(product_sku, pages=5):
    """Fetch Best Buy reviews for a product and add them to the word graph.

    Requests review pages 1..``pages`` from ``REQUEST_URL``, splits every
    review comment into sentences on ".", strips punctuation, and runs
    ``INSERT_QUERY`` once per non-empty sentence.

    Args:
        product_sku: Best Buy SKU substituted into ``REQUEST_URL``.
        pages: Number of result pages to request. Defaults to 5, the value
            that used to be hard-coded.

    Raises:
        requests.HTTPError: If the API answers with an error status.
    """
    for page in range(1, pages + 1):
        response = requests.get(
            REQUEST_URL.format(sku=product_sku, API_KEY=API_KEY, page=str(page)))
        # Surface the HTTP status instead of an opaque JSON/KeyError below.
        response.raise_for_status()
        reviews = response.json().get("reviews") or []
        if not reviews:
            # An empty page is treated as the end of the result set.
            break
        for review in reviews:
            # A missing or null comment is handled as an empty one.
            for sentence in (review.get("comment") or "").split("."):
                # Strip punctuation, then collapse whitespace runs.
                # INSERT_QUERY splits on single spaces, so "a - b" -> "a  b"
                # would otherwise yield an empty-string Word node.
                sentence = " ".join(regex.sub("", sentence).split())
                if not sentence:
                    # Nothing to insert; skip the database round-trip.
                    continue
                graph.cypher.execute(INSERT_QUERY, parameters={'comment': sentence})

#### Query the graph for opinions
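Find word paths of 2-5 words (1-4 `NEXT` hops), ranked by the summed occurrence counts of the `NEXT` relationships along the path divided by the number of words in the path.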

In [6]:
def summarize_opinions():
    """Run ``OPINION_QUERY`` and print each returned phrase with its weight."""
    for row in graph.cypher.execute(OPINION_QUERY):
        phrase_text, weight_text = str(row.phrase), str(row.weight)
        print(" ".join((phrase_text, weight_text)))

### Bose headphones
<img src="img/bose.png" align="right" style="width:20%;">
* "They are great sound quality"
* "Comfortable and the great sound"
* "These headphones great sound quality"


In [162]:
# Wipe the existing graph, then rebuild it for this product and rank phrases.
# The node pattern is parenthesised: the bare `MATCH A` form is deprecated
# Cypher and is rejected by later Neo4j versions.
graph.cypher.execute("MATCH (a) DETACH DELETE a;")
load_graph("4642026")  # Bose headphones
summarize_opinions()

['great', 'sound', 'and', 'they', 'are'] 11.0
['they', 'are', 'great', 'sound', 'quality'] 10.8
['great', 'sound', 'and', 'the', 'sound'] 10.8
['the', 'sound', 'great', 'sound', 'quality'] 10.6
['of', 'the', 'sound', 'quality'] 10.5
['comfortable', 'and', 'the', 'sound', 'quality'] 10.4
['of', 'the', 'sound', 'great', 'sound'] 10.2
['these', 'headphones', 'great', 'sound', 'quality'] 10.2
['and', 'the', 'sound', 'quality'] 10.0
['the', 'sound', 'and', 'they', 'are'] 10.0


### Samsung TV
<img src="img/tv.png" align="left" style="width:20%;">
* "Bought this smart TV for the price"

In [7]:
# Wipe the existing graph, then rebuild it for this product and rank phrases.
# The node pattern is parenthesised: the bare `MATCH A` form is deprecated
# Cypher and is rejected by later Neo4j versions.
graph.cypher.execute("MATCH (a) DETACH DELETE a;")
load_graph("6422016")  # Samsung TV
summarize_opinions()

['this', 'tv', 'for', 'the'] 67.25
['this', 'tv', 'for', 'the', 'tv'] 66.0
['this', 'tv', 'for', 'the', 'price'] 65.6
['bought', 'this', 'tv', 'for', 'the'] 65.0
['this', 'tv', 'for', 'the', 'picture'] 62.8
['got', 'this', 'tv', 'for', 'the'] 62.2
['this', 'tv', 'for', 'the', 'smart'] 61.6
['smart', 'tv', 'for', 'the', 'tv'] 58.6
['this', 'tv', 'for', 'the', 'xbox'] 58.6
['smart', 'tv', 'for', 'the', 'price'] 58.2


### Amazon Kindle
<img src="img/kindle.png" align="right" style="width:20%;">
* "Easy to read"
* "Easy to read in the light"

In [8]:
# Wipe the existing graph, then rebuild it for this product and rank phrases.
# The node pattern is parenthesised: the bare `MATCH A` form is deprecated
# Cypher and is rejected by later Neo4j versions.
graph.cypher.execute("MATCH (a) DETACH DELETE a;")
load_graph("9439005")  # Kindle
summarize_opinions()


['easy', 'to', 'read', 'in'] 76.5
['easy', 'to', 'read', 'in', 'the'] 73.8
['and', 'easy', 'to', 'read'] 73.0
['is', 'easy', 'to', 'read'] 71.5
['very', 'easy', 'to', 'read'] 71.25
['easy', 'to', 'read', 'and'] 70.75
['this', 'is', 'easy', 'to', 'read'] 70.6
['easy', 'to', 'read', 'at'] 70.5
['easy', 'to', 'read', 'the', 'kindle'] 70.4
['easy', 'to', 'read', 'books'] 70.25


### Samsung washer
<img src="img/washer.png" align="left" style="width:20%;">
* "I love this washer"

In [12]:
# Wipe the existing graph, then rebuild it for this product and rank phrases.
# The node pattern is parenthesised: the bare `MATCH A` form is deprecated
# Cypher and is rejected by later Neo4j versions.
graph.cypher.execute("MATCH (a) DETACH DELETE a;")
load_graph("3656051")  # Samsung washing machine
summarize_opinions()


['this', 'washer'] 62.0
['this', 'washer', 'is', 'a'] 59.75
['this', 'washer', 'is'] 59.666666666666664
['this', 'washer', 'and'] 55.333333333333336
['this', 'washer', 'and', 'the'] 52.5
['with', 'this', 'washer', 'is'] 52.5
['i', 'love', 'this', 'washer'] 52.0
['this', 'washer', 'and', 'i'] 51.75
['with', 'this', 'washer'] 51.666666666666664
['this', 'washer', 'is', 'very'] 51.5


### Dyson vacuum
<img src="img/dyson.png" align="right" style="width:20%;">
* "Easy to use this vacuum"

In [21]:
# Wipe the existing graph, then rebuild it for this product and rank phrases.
# The node pattern is parenthesised: the bare `MATCH A` form is deprecated
# Cypher and is rejected by later Neo4j versions.
graph.cypher.execute("MATCH (a) DETACH DELETE a;")
load_graph("2498029")  # Dyson vacuum
summarize_opinions()

['this', 'vacuum', 'is', 'easy', 'to'] 238
['easy', 'to', 'this', 'vacuum', 'is'] 223
['this', 'vacuum', 'and', 'easy', 'to'] 213
['is', 'easy', 'to', 'this', 'vacuum'] 212
['it', 'is', 'easy', 'to', 'use'] 211
['easy', 'to', 'this', 'vacuum', 'cleaner'] 210
['and', 'easy', 'to', 'this', 'vacuum'] 208
['very', 'easy', 'to', 'this', 'vacuum'] 208
['easy', 'to', 'this', 'vacuum', 'i'] 206
['easy', 'to', 'get', 'this', 'vacuum'] 206
