# Neo4j

Nodes:
- Product
    - productID
    - name
    - price
    - brand (not set by the Step 1 loader below, which stores only productID, name and price)

Relations:
- Product -[:ALSO_BOUGHT]- Product   
- Product -[:ALSO_VIEWED]- Product
- Product -[:BOUGHT_TOGETHER]- Product
- Product -[:BUY_AFTER_VIEWING]- Product
    
Steps:
1. Populate the DB with all products
2. Create the relationships

In [1]:
from neo4j.v1 import GraphDatabase, basic_auth

# Connect to the local Neo4j server over Bolt with basic authentication and
# open the single session that the following cells use.
# NOTE(review): the 1.x driver appears to name its TLS switch `encrypted`, so
# `encryption=False` may be silently ignored — confirm against the driver docs.
driver = GraphDatabase.driver("bolt://localhost", auth=basic_auth("neo4j", "amazon"), encryption=False)
session = driver.session()

In [2]:
# Count every relationship exactly once: a directed pattern matches each
# relationship a single time, whereas the undirected form ()-[r]-() matches it
# from both ends and reports twice the real number.
res = session.run("MATCH ()-[r]->() RETURN count(r)")

In [4]:
# Show the first record of the count query without consuming the result.
# The parenthesised form prints the same text on Python 2 and Python 3.
print(res.peek())

KeyboardInterrupt: 

### Step 1: Products
(Products without a title or without related products are not included in the graph.)

In [None]:
import time 
import datetime

def parse_line(line):
    """Parse one metadata line into the parameters of the node-creation query.

    Args:
        line: One line of the metadata file, expected to hold a Python dict
            literal describing a single product.

    Returns:
        A dict with the keys 'asin', 'title' and 'price' ('price' is None when
        the product has none), or an empty dict when the product lacks a
        'title' or a 'related' entry and must be skipped.
    """
    import ast

    # SECURITY: the line comes from an external data file. literal_eval only
    # accepts Python literals, so unlike a bare eval() it cannot execute code
    # embedded in a malformed or malicious line.
    product = ast.literal_eval(line)
    # `in` replaces dict.has_key(), which no longer exists on Python 3.
    if 'title' not in product or 'related' not in product:
        return {}
    return {
        'asin': product['asin'],
        'title': product['title'],
        'price': product.get('price'),
    }

# Step 1 loader: create one Product node for every usable line of the metadata
# file, committing the nodes in batches of 5000 input lines.
start = time.time()
# Guarantee a single node per productID.
session.run("CREATE CONSTRAINT ON (p:Product) ASSERT p.productID IS UNIQUE")

# Loop-invariant statement, built once instead of once per product.
create_query = (
    "CREATE (p:Product {productID:{asin}}) "
    "SET p.name={title}, p.price=toFloat({price})"
)


def _create_products(batch):
    """Create a Product node for every usable entry of `batch` in one transaction."""
    tx = session.begin_transaction()
    for product in batch:
        # parse_line returns an empty dict for products it filters out.
        if 'title' not in product:
            continue
        # Drain the result stream before issuing the next statement.
        for _ in tx.run(create_query, product):
            pass
    tx.commit()


try:
    with open("C:\\dtu\\ctbd\\amazon_dataset\\metadata_no_books.json", "r") as data:
        line_count = 0
        products = []
        for line in data:
            products.append(parse_line(line))
            line_count += 1
            if line_count % 5000 == 0:  # every 5000 lines
                _create_products(products)
                products = []
                if line_count % 60000 == 0:
                    # NOTE(review): this ratio is a percentage only if the file
                    # holds about 6,000,000 lines — confirm.
                    print('%.3f %% completed' % (line_count * 1.0 / 60000,))
        # Write the trailing partial batch; without this the last
        # (line_count % 5000) lines of the file would never reach the database.
        if products:
            _create_products(products)
finally:
    # Release the session even when the load fails part-way.
    session.close()
end = time.time()
print("executed in: %s" % datetime.timedelta(seconds=(end - start)))

### Step 2: Relationships

In [None]:
import time 
import datetime

def parse_line(line):
    """Parse one metadata line into the parameters of the relationship queries.

    Args:
        line: One line of the metadata file, expected to hold a Python dict
            literal describing a single product.

    Returns:
        A dict with the keys 'asin', 'title', 'also_bought', 'also_viewed',
        'bought_together' and 'buy_after_viewing' (each relation key maps to
        the value found under product['related'], or to an empty list when it
        is absent), or an empty dict when the product lacks a 'title' or a
        'related' entry and must be skipped.
    """
    import ast

    # SECURITY: the line comes from an external data file. literal_eval only
    # accepts Python literals, so unlike a bare eval() it cannot execute code
    # embedded in a malformed or malicious line.
    product = ast.literal_eval(line)
    # `in` replaces dict.has_key(), which no longer exists on Python 3.
    if 'title' not in product or 'related' not in product:
        return {}
    related = product['related']
    params = {'asin': product['asin'], 'title': product['title']}
    for key in ('also_bought', 'also_viewed', 'bought_together', 'buy_after_viewing'):
        params[key] = related[key] if key in related else []
    return params

# Step 2 loader: connect every usable product to the products listed in its
# 'related' entry, committing the relationships in batches of 1000 input lines.
start = time.time()
# Step 1 closes the shared session when it finishes, so take a fresh one from
# the driver instead of reusing a closed session.
session = driver.session()

# Parameter keys that hold lists of related product IDs. The relationship type
# is the upper-cased key; it is formatted into the statement text because the
# four statements differ only in that type and in the list parameter they read.
relation_keys = ('also_bought', 'also_viewed', 'bought_together', 'buy_after_viewing')
relation_queries = [
    (key,
     "MATCH (p:Product),(q:Product) "
     "WHERE p.productID = {asin} AND q.productID IN {%s} "
     "CREATE UNIQUE (p)-[:%s]-(q)" % (key, key.upper()))
    for key in relation_keys
]


def _create_relationships(batch):
    """Link every usable product of `batch` to its related products in one transaction."""
    tx = session.begin_transaction()
    for params in batch:
        # parse_line returns an empty dict for products it filters out.
        if 'title' not in params:
            continue
        for key, query in relation_queries:
            # An empty ID list cannot match any node, so skip the round trip.
            if not params.get(key):
                continue
            # Drain the result stream before issuing the next statement.
            for _ in tx.run(query, params):
                pass
    tx.commit()


try:
    with open("C:\\dtu\\ctbd\\amazon_dataset\\metadata_no_books.json", "r") as data:
        line_count = 0
        params_list = []
        for line in data:
            params_list.append(parse_line(line))
            line_count += 1
            if line_count % 1000 == 0:  # every 1000 lines
                _create_relationships(params_list)
                params_list = []
                if line_count % 60000 == 0:
                    # NOTE(review): this ratio is a percentage only if the file
                    # holds about 6,000,000 lines — confirm.
                    print('%.1f %% completed' % (line_count * 1.0 / 60000,))
        # Write the trailing partial batch; without this the last
        # (line_count % 1000) lines of the file would get no relationships.
        if params_list:
            _create_relationships(params_list)
    end = time.time()
finally:
    # Release the session even when the load fails part-way.
    session.close()
print("executed in: %s" % datetime.timedelta(seconds=(end - start)))

### Query
- Product count:
"""MATCH (p:Product) RETURN count(p)"""
- Relationship count (directed pattern, so each relationship is counted once):
"""MATCH ()-[r]->() RETURN count(r)"""
- Diameter of the network (the 4 longest shortest paths):
"""MATCH (p:Product), (q:Product) WHERE id(p) > id(q)
MATCH sp=shortestPath((p)-[:ALSO_VIEWED*]-(q))
RETURN length(sp) AS len, extract(x IN nodes(sp) | x.name) AS path
ORDER BY len DESC LIMIT 4"""
- Degree centrality (number of connections):
"""MATCH (p:Product)
RETURN p.name AS product, size( (p)-[:ALSO_BOUGHT]-() ) AS degree ORDER BY degree DESC LIMIT 5"""
- Shortest path between two products:
"""MATCH (p:Product {name: ""}), (q:Product {name: ""})
MATCH sp=shortestPath((p)-[:ALSO_VIEWED*]-(q))
RETURN sp"""

- Pivotal nodes (a node is pivotal if it lies on all shortest paths between two other nodes):
"""MATCH (a:Character), (b:Character)
MATCH p=allShortestPaths((a)-[:INTERACTS*]-(b)) WITH collect(p) AS paths, a, b
MATCH (c:Character) WHERE all(x IN paths WHERE c IN nodes(x)) AND NOT c IN [a,b]
RETURN a.name, b.name, c.name AS PivotalNode SKIP 490 LIMIT 10"""
To check:
"""MATCH (a:Character {name: "Drogo"}), (b:Character {name: "Ramsay"})
MATCH p=allShortestPaths((a)-[:INTERACTS*]-(b))
RETURN p"""

- Betweenness centrality (information brokers / cluster connectors) [requires the APOC library]:
"""MATCH (c:Character)
WITH collect(c) AS characters
CALL apoc.algo.betweenness(['INTERACTS'], characters, 'BOTH') YIELD node, score
SET node.betweenness = score
RETURN node.name AS name, score ORDER BY score DESC"""
- Closeness centrality (highly connected nodes inside a cluster):
"""MATCH (c:Character)
WITH collect(c) AS characters
CALL apoc.algo.closeness(['INTERACTS'], characters, 'BOTH') YIELD node, score
RETURN node.name AS name, score ORDER BY score DESC"""