# Project 3, Part 2: Create a graph database in Neo4j for the MovieLens dataset

University of California, Berkeley

Master of Information and Data Science (MIDS) program

w205 - Fundamentals of Data Engineering

In [1]:
# import statements 

import neo4j

import csv

import math
import numpy as np
import pandas as pd

import psycopg2

### Supporting Code

In [2]:
# Connect to the Neo4j server and open one session on the "neo4j" database;
# the helper functions in this notebook run their queries through `session`.
driver = neo4j.GraphDatabase.driver(
    uri="neo4j://neo4j:7687",
    auth=("neo4j", "ucb_mids_w205"),
)
session = driver.session(database="neo4j")

In [3]:
def my_neo4j_wipe_out_database():
    "wipe out database by deleting all nodes and relationships"

    # DETACH DELETE removes each matched node together with every relationship
    # attached to it, so one statement covers both nodes and relationships.
    query = "match (node) detach delete node"

    # consume() discards the result stream, so the statement has finished
    # by the time this function returns.
    session.run(query).consume()

In [4]:
def my_neo4j_run_query_pandas(query, **kwargs):
    """run a query and return the results in a pandas dataframe

    query    -- query text handed to session.run
    **kwargs -- forwarded unchanged to session.run (the query parameters)

    The dataframe has one row per returned record and one column per result key.
    """

    result = session.run(query, **kwargs)

    # collect the values of every record before asking the result for its keys
    rows = []
    for record in result:
        rows.append(record.values())

    return pd.DataFrame(rows, columns=result.keys())

In [5]:
def my_neo4j_number_nodes_relationships():
    "print the number of nodes and relationships"

    # Count inside the database: each query returns a single row holding the
    # count, instead of returning every node / relationship just to count rows.
    query = """
        match (n)
        return count(n) as number_nodes
    """

    number_nodes = int(my_neo4j_run_query_pandas(query).iloc[0, 0])

    # the directed pattern matches each relationship once
    query = """
        match ()-[r]->()
        return count(r) as number_relationships
    """

    number_relationships = int(my_neo4j_run_query_pandas(query).iloc[0, 0])

    print("-------------------------")
    print("  Nodes:", number_nodes)
    print("  Relationships:", number_relationships)
    print("-------------------------")


In [6]:
# Settings for the Postgres database that the loading cells read from
postgres_connection_settings = dict(
    user="postgres",
    password="ucb",
    host="postgres",
    port="5432",
    database="postgres",
)

# One shared connection and cursor, used by the query cells in this notebook
connection = psycopg2.connect(**postgres_connection_settings)
cursor = connection.cursor()

## Clean up Neo4j

In [7]:
# wipe out the database so the load below starts from an empty graph
# (the create functions use CREATE, so loading twice without wiping would add duplicates)
my_neo4j_wipe_out_database()

# verify number of nodes and relationships (both are expected to be 0 after the wipe)
my_neo4j_number_nodes_relationships()

-------------------------
  Nodes: 0
  Relationships: 0
-------------------------


#  Create Nodes

In [8]:
# define a function to create a node for one movie
# note: genres is stored as given (a pipe separated string)

def my_neo4j_create_node_movies(movieid, title, genres):
    """create a node with label movie

    movieid -- stored as the node's `name` property
    title   -- stored as the node's `title` property
    genres  -- stored as the node's `genres` property
    """

    cypher = """
    
    CREATE (:movie {name: $movieid, genres: $genres, title: $title})
    
    """

    params = {"movieid": movieid, "title": title, "genres": genres}
    session.run(cypher, **params)

In [9]:
# Read the movies to load from the Postgres movies table and create one
# movie node per returned row (the where clause limits this to movie 5).

connection.rollback()

query = """

select movieid::int,
    title,
    genres
from movies
where movieid in (5)
order by 1

"""

cursor.execute(query)

connection.rollback()

rows = cursor.fetchall()

# each row is (movieid, title, genres), in the order of the select list
for movieid, title, genres in rows:
    my_neo4j_create_node_movies(movieid, title, genres)

In [10]:
# define a function to create a node for one user
# the node contains the user id and the user's average rating

def my_neo4j_create_node_users(userid, average_rating):
    """create a node with label user

    userid         -- stored as the node's `name` property
    average_rating -- stored as the node's `average_rating` property
    """

    cypher = """
    
    CREATE (:user {name: $userid, average_rating: $average_rating})
    
    """

    params = {"userid": userid, "average_rating": average_rating}
    session.run(cypher, **params)

In [11]:
# Create one user node per userid found in the Postgres ratings table.
# The select groups by userid, so each user appears once, together with the
# average of that user's ratings cast to an integer (limited to ratings of movie 5).
connection.rollback()

query = """

select userid::int as userid,
    round((avg(rating)::int), 0) as average_rating
from ratings
where movieid in (5)
group by 1

"""

cursor.execute(query)

connection.rollback()

rows = cursor.fetchall()

for row in rows:
    # plain Python ints are what gets passed on as query parameters
    userid, average_rating = int(row[0]), int(row[1])
    my_neo4j_create_node_users(userid, average_rating)

## Optional: create nodes for the high-level genres
These come from the standalone list of genres (one genre per row), not from the pipe-separated genres value stored on each movie.

In [12]:
# # define a function to create nodes for genres

# def my_neo4j_create_node_genres(genres):
#     "create a node with label genres"
    
#     query = """
    
#     CREATE (:genres {name: $genres})
    
#     """
    
#     session.run(query, genres=genres)

In [13]:
# # create genre node based on list of genres not pipe separated variables
# connection.rollback()

# query = """

# select distinct genre as genres
# from genre
# order by 1

# """

# cursor.execute(query)

# connection.rollback()

# rows = cursor.fetchall()

# for row in rows:
        
#     genres = row[0]
    
#     my_neo4j_create_node_genres(genres)

## Create Relationships
- Create a relationship between users and movies weighted by rating

### one way relationship users rating movies 

In [14]:
# This defines a function that creates a one way relationship, used to connect users to movies via ratings
# this is the original graph relationship
def my_neo4j_create_relationship_one_way(userid, movieid, rating):
    """create a relationship one way between users and movieid with a weight of rating

    userid  -- matched against the `name` property of user nodes
    movieid -- matched against the `name` property of movie nodes
    rating  -- stored as the `rating` property of the RATED relationship
    """

    cypher = """
    
    MATCH (from:user), 
          (to:movie)
    WHERE from.name = $userid AND to.name = $movieid
    CREATE (from)-[:RATED {rating: $rating}]->(to)
    
    """

    params = {"userid": userid, "movieid": movieid, "rating": rating}
    session.run(cypher, **params)

In [15]:
# Read the ratings from Postgres (limited to movie 5) and add one
# user -> movie RATED relationship per rating row.
connection.rollback()

query = """

select userid::int,
    movieid::int,
    rating::float
from ratings
where movieid in (5)
order by 1

"""
cursor.execute(query)

connection.rollback()

rows = cursor.fetchall()

for row in rows:
    # columns arrive in select-list order: userid, movieid, rating
    userid, movieid, rating = int(row[0]), int(row[1]), float(row[2])
    my_neo4j_create_relationship_one_way(userid, movieid, rating)

## two way relationships 

In [16]:
# # this creates a two way relationship users rating movies and movies rated by users 
# def my_neo4j_create_relationship_two_way_ratings(userid, movieid, rating):
#     "create relationships two way between between movies and users"
    
#     query = """
    
#     MATCH (from:user), 
#           (to:movie)
#     WHERE from.name = $userid AND to.name = $movieid
#     CREATE (from)-[:RATED {rating: $rating}]->(to),
#            (to)-[:RATEDBY {rating: $rating}]->(from)
    
#     """
    
#     session.run(query, userid=userid, movieid=movieid, rating=rating)

In [17]:
# # this queries the table to fill the two way user <-rating-> movies relationship
# connection.rollback()

# query = """

# select userid::int,
#     movieid::int,
#     rating::float
# from ratings
# where movieid in (5)
# order by 1

# """
# cursor.execute(query)

# connection.rollback()

# rows = cursor.fetchall()

# for row in rows:
    
#     userid = int(row[0])
#     movieid = int(row[1]) 
#     rating = float(row[2])
    
#     my_neo4j_create_relationship_two_way_ratings(userid, movieid, rating)

## Average Rating Attempts

In [18]:
# create a one way relationship from userid1 to userid2 for a pair of users that share an average rating
# define this relationship as similar

def my_neo4j_create_relationship_one_way_avg_rating(userid1, userid2, average_rating):
    """create a relationship one way between two users who have the same average rating

    userid1        -- matched against the `name` property of the source user node
    userid2        -- matched against the `name` property of the target user node
    average_rating -- stored as the `average_rating` property of the SIMILAR relationship

    Nothing is created when either user node does not exist.
    """

    # The pair is selected through $userid1 / $userid2 so that a call only links the
    # requested users. Matching on from.average_rating = to.average_rating alone links
    # every equal-rated pair of users (each user to itself included) again on every call.
    query = """

    MATCH (from:user),
          (to:user)
    WHERE from.name = $userid1 AND to.name = $userid2
    CREATE (from)-[:SIMILAR {average_rating: $average_rating}]->(to)

    """

    session.run(query, userid1=userid1, userid2=userid2, average_rating=average_rating)

In [19]:
# Use the average_ratings table to populate the one way SIMILAR relationships.
# Note: this table is currently limited in order to test quicker (see the create tables notebook).
connection.rollback()

query = """

select userid1,
    userid2,
    average_rating
from average_ratings
order by userid1

"""
cursor.execute(query)

connection.rollback()

rows = cursor.fetchall()

for row in rows:
    # columns arrive in select-list order: userid1, userid2, average_rating
    userid1, userid2, average_rating = int(row[0]), int(row[1]), int(row[2])
    my_neo4j_create_relationship_one_way_avg_rating(userid1, userid2, average_rating)

In [20]:
def my_select_query_pandas(query, rollback_before_flag, rollback_after_flag):
    """function to run a select query and return rows in a pandas dataframe

    query                -- SQL text passed to pd.read_sql_query
    rollback_before_flag -- when true, roll back `connection` before the query runs
    rollback_after_flag  -- when true, roll back `connection` after the query runs

    Returns the dataframe. float64 columns whose non-missing values are all
    whole numbers are converted to the nullable 'Int64' dtype.
    """

    if rollback_before_flag:
        connection.rollback()

    df = pd.read_sql_query(query, connection)

    if rollback_after_flag:
        connection.rollback()

    # fix the float columns that really should be integers
    for column in df:

        if df[column].dtype != "float64":
            continue

        # missing values are ignored when deciding whether the column is integral
        present = df[column].dropna().to_numpy()

        # the whole column is checked at once; the isfinite test keeps a column
        # holding +/-inf as float instead of failing on it
        if np.isfinite(present).all() and (present == np.floor(present)).all():
            df[column] = df[column].astype('Int64')

    return df

In [21]:
# Preview the first rows of average_ratings, ordered by average rating and then by the two user ids
query = """

select *
from average_ratings
order by 3,1,2
limit 40

"""
# my_select_query_pandas runs the query itself (with a rollback before and after),
# so no separate cursor.execute(query) is needed here
my_select_query_pandas(query, rollback_before_flag=True, rollback_after_flag=True)

Unnamed: 0,userid1,userid2,average_rating
0,68,288,2
1,68,414,2
2,68,437,2
3,68,474,2
4,68,483,2
5,68,489,2
6,68,509,2
7,68,590,2
8,68,600,2
9,288,68,2


### Below is my original failed attempt at looping in Python

In [22]:
# # failed attempt at looping python 
# connection.rollback()

# query = """

# select userid::int as userid,
#     round((avg(rating)::int), 0) as average_rating
# from ratings
# group by userid

# """
# cursor.execute(query)

# connection.rollback()

# rows = cursor.fetchall()

# userid2 = 0
# average_rating2 = 'NULL'
# for row in rows:   
#     userid1 = int(row[0])
#     average_rating1 = int(row[1])
#     for row in rows:
#         userid2= int(row[0])
#         if userid1 != userid2: 
#             average_rating2 = int(row[1])
#             if average_rating1 == average_rating2:
#                 my_neo4j_create_relationship_two_way_avg_rating(userid1, userid2)
#     userid2=int(row[0])
#     average_rating2 = int(row[1])

## Optional Genre Relationships

In [23]:
# # attempt to make genres connect to movies 
# # non-functioning 
# def my_neo4j_create_relationship_two_way_genres(movieid, genres):
#     "create relationships two way between between movies and genres"
    
#     query = """
    
#     MATCH (from:movieid), 
#           (to:genres)
#     WHERE from.name = $movieid AND "$genres" CONTAINS to.name
#     CREATE (from)-[:ISGENRE]->(to),
#            (to)-[:CONTAINS]->(from)
    
#     """
    
#     session.run(query, genres=genres, movieid=movieid)

In [24]:
# # attempt to connect genres and movies 
# # non-functioning
# connection.rollback()

# query = """

# select genres,
#     movieid::int
# from movies
# order by 1

# """
# cursor.execute(query)

# connection.rollback()

# rows = cursor.fetchall()

# for row in rows:
    
#     genres = row[0]
#     movieid = int(row[1]) 
    
#     my_neo4j_create_relationship_two_way_genres(movieid, genres)