# Project 3, Part 2: Create a graph database in Neo4j for the Movie Lens 

University of California, Berkeley

Master of Information and Data Science (MIDS) program

w205 - Fundamentals of Data Engineering

In [1]:
# import statements 

import neo4j

import csv

import math
import numpy as np
import pandas as pd

import psycopg2

### Supporting Code

In [2]:
driver = neo4j.GraphDatabase.driver(uri="neo4j://neo4j:7687", auth=("neo4j","ucb_mids_w205"))

In [3]:
session = driver.session(database="neo4j")

In [4]:
def my_neo4j_wipe_out_database():
    "wipe out database by deleting all nodes and relationships"
    
    query = "match (node)-[relationship]->() delete node, relationship"
    session.run(query)
    
    query = "match (node) delete node"
    session.run(query)

In [5]:
def my_neo4j_run_query_pandas(query, **kwargs):
    "run a query and return the results in a pandas dataframe"
    
    result = session.run(query, **kwargs)
    
    df = pd.DataFrame([r.values() for r in result], columns=result.keys())
    
    return df

In [6]:
def my_neo4j_number_nodes_relationships():
    "print the number of nodes and relationships"
   
    
    query = """
        match (n) 
        return n.name as node_name, labels(n) as labels
        order by n.name
    """
    
    df = my_neo4j_run_query_pandas(query)
    
    number_nodes = df.shape[0]
    
    
    query = """
        match (n1)-[r]->(n2) 
        return n1.name as node_name_1, labels(n1) as node_1_labels, 
            type(r) as relationship_type, n2.name as node_name_2, labels(n2) as node_2_labels
        order by node_name_1, node_name_2
    """
    
    df = my_neo4j_run_query_pandas(query)
    
    number_relationships = df.shape[0]
    
    print("-------------------------")
    print("  Nodes:", number_nodes)
    print("  Relationships:", number_relationships)
    print("-------------------------")


In [7]:
connection = psycopg2.connect(
    user = "postgres",
    password = "ucb",
    host = "postgres",
    port = "5432",
    database = "postgres"
)

In [8]:
cursor = connection.cursor()

## Clean up Neo4j

In [9]:
# wipe out the database
my_neo4j_wipe_out_database()

In [10]:
# verify number of nodes and relationships
my_neo4j_number_nodes_relationships()

-------------------------
  Nodes: 0
  Relationships: 0
-------------------------


##  Create Nodes
- create node for Movie titles
- create node for Users
- create node for Genres (from a list) 

In [11]:
# define a funciton to create nodes for movieid
# come back later and see if you want nodes to display titles 
# unsure if I can make this scale to different nodes I want to create

def my_neo4j_create_node(movieid):
    "create a node with label movie"
    
    query = """
    
    CREATE (:movie {name: $movieid})
    
    """
    
    session.run(query, movieid=movieid)

In [12]:
connection.rollback()

query = """

select movieid::int
from movies
order by movieid

"""

cursor.execute(query)

connection.rollback()

rows = cursor.fetchall()

for row in rows:
    
    movieid = row[0]
    
    my_neo4j_create_node(movieid)

In [15]:
# define a funciton to create nodes for movie titles
# unsure if I can make this scale to different nodes I want to create

def my_neo4j_create_node_users(userid):
    "create a node with label user"
    
    query = """
    
    CREATE (:user {name: $userid})
    
    """
    
    session.run(query, userid=userid)

In [16]:
connection.rollback()

query = """

select distinct userid::int as userid
from ratings
order by 1

"""

cursor.execute(query)

connection.rollback()

rows = cursor.fetchall()

for row in rows:
    
    userid = row[0]
    
    my_neo4j_create_node_users(userid)

In [28]:
# high_corr_series

## Create Relationships
- Create a relationship between users and movies weighted by rating

In [17]:
def my_neo4j_create_relationship_one_way(userid, movieid, rating):
    "create a relationship one way between users and movieid with a weight of rating"
    
    query = """
    
    MATCH (from:user), 
          (to:movie)
    WHERE from.name = $userid AND to.name = $movieid
    CREATE (from)-[:RATED {rating: $rating}]->(to)
    
    """
    
    # MATCH (u:user)-[r:RATED]->(m:movie) RETURN u, type(r), m
    
    session.run(query, userid=userid, movieid=movieid, rating=rating)

In [18]:
connection.rollback()

query = """

select userid::int,
    movieid::int,
    rating::float
from ratings
where movieid in (5)
order by 1

"""
cursor.execute(query)

connection.rollback()

rows = cursor.fetchall()

for row in rows:
    
    userid = int(row[0])
    movieid = int(row[1]) 
    rating = float(row[2])
    
    my_neo4j_create_relationship_one_way(userid, movieid, rating)

## Possible additions to graph database
- create genre nodes 
- add movie titles to the movie ID nodes 
- create a relationship between genres and movies (pay attention that it is multi-select)
- do something with the timestamps

In [20]:
query = "CALL gds.graph.drop('ds_graph', false)"
session.run(query)

query = "CALL gds.graph.project('ds_graph', 'user', 'movie', {relationshipProperties: 'rating'})"
session.run(query)

<neo4j._sync.work.result.Result at 0x7fb1077fa580>

In [21]:
query = """

CALL gds.graph.project(
    'userMovieGraph', // The name of the projected graph
    'user',          // The node label to include in the projection
    { 
        RATED: { 
            type: 'RATED', 
            orientation: 'UNDIRECTED' // Considering the RATED relationship as undirected for PageRank
        } 
    }
)

"""
session.run(query)

ClientError: {code: Neo.ClientError.Procedure.ProcedureCallFailed} {message: Failed to invoke procedure `gds.graph.project`: Caused by: java.lang.IllegalArgumentException: Invalid relationship projection, one or more relationship types not found: 'movie'}

In [22]:
query = """

CALL gds.pageRank.stream('userMovieGraph',
                         { maxIterations: $max_iterations,
                           dampingFactor: $damping_factor}
                         )
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).name AS name, score as page_rank
ORDER BY page_rank DESC, name ASC

"""

max_iterations = 20
damping_factor = 0.05

my_neo4j_run_query_pandas(query, max_iterations=max_iterations, damping_factor=damping_factor)


Unnamed: 0,name,page_rank
0,1,0.95
1,2,0.95
2,3,0.95
3,4,0.95
4,5,0.95
...,...,...
605,606,0.95
606,607,0.95
607,608,0.95
608,609,0.95
