# Neo4j

Nodes:
- Product
    - productID
    - name
    - price
    - brand

Relations:
- Product -[:ALSO_BOUGHT]- Product   
- Product -[:ALSO_VIEWED]- Product
- Product -[:ALSO_BOUGHT]- Product
- Product -[:BUY_AFTER_VIEWING]-Product
    
Steps:
1. Populate the DB with all products
2. Create the relationships

In [1]:
from neo4j.v1 import GraphDatabase, basic_auth

driver = GraphDatabase.driver("bolt://localhost", auth=basic_auth("neo4j", "comptools"), encryption=False)
session = driver.session()

### Step 1: Products
(We do not include in the graph the products without a title and/or without related products)

In [None]:
import time 
import datetime

def parse_line(line):
    product = eval(line)
    params = {}
    if (product.has_key('title') and product.has_key('related')):
        params['asin'] = product['asin']
        params['title'] = product['title']
        if product.has_key('price'):
            params['price'] = product['price']
        else:
            params['price'] = None
    return params

start = time.time()
session.run("CREATE CONSTRAINT ON (p:Product) ASSERT p.productID IS UNIQUE")

with open("/Users/lucacambiaghi/amz_dataset/metadata_no_books.json", "r") as data:
    line_count=0
    products = []
    for line in data:
        products.append(parse_line(line))
        line_count = line_count + 1
        if(line_count % 1000 == 0):#every 1000 lines
            with session.new_transaction() as tx:
                for product in products:
                    if not (product.has_key('title')):
                        continue
                    else:
                        create_query="""CREATE (p:Product {productID:{asin}})
                        SET p.name={title}, p.price=toFloat({price})"""
                        tx.run(create_query, params)
                tx.success() # check every 1000 queries
                products = []
            if line_count % 30000 == 0: #every 30000 lines
                print (line_count*1.0/600000), '% completed' # print every 5%
session.close()
end = time.time()
print "executed in:", datetime.timedelta(seconds = (end - start))

### Step 2: Relationships

In [9]:
def parse_line(line):
    product = eval(line)
    params = {}
    if (product.has_key('title') and product.has_key('related')):
        params['asin'] = product['asin']
        params['title'] = product['title']
        for key in ['also_bought', 'also_viewed', 'bought_together', 'buy_after_viewing']:
            if not product['related'].has_key(key):
                params[key] = []
            else:
                params[key] = product['related'][key]
    return params

with open("/Users/lucacambiaghi/amz_dataset/metadata_no_books.json", "r") as data:
    count=0
    for line in data:
        params = parse_line(line)
        if not (params.has_key('title')):
            count +=1
            continue
        create_query = """MATCH (p:Product),(q:Product)
        WHERE p.productID = {asin} AND q.productID IN {also_bought}
        CREATE UNIQUE (p)-[:ALSO_BOUGHT]-(q)
        
        MATCH (p:Product),(q:Product)
        WHERE p.productID = {asin} AND q.productID IN {also_viewed}
        CREATE UNIQUE (p)-[:ALSO_BOUGHT]-(q)
        
        MATCH (p:Product),(q:Product)
        WHERE p.productID = {asin} AND q.productID IN {bought_together}
        CREATE UNIQUE (p)-[:ALSO_BOUGHT]-(q)
        
        MATCH (p:Product),(q:Product)
        WHERE p.productID = {asin} AND q.productID IN {buy_after_viewing}
        CREATE UNIQUE (p)-[:ALSO_BOUGHT]-(q)"""
        session.run(create_query, params)
        count+=1
        if count % 600000 == 0:
            print str(count/600000) + '% completed'
            break
print "done"
session.close()

0% completed
done


### Query
- 