# Project 3

### Create Tables from Movie Lens Small Data Set, Create Graph Database, Run Algorithms 

University of California Berkeley 

Master of Information and Data Science (MIDS) program

w205 - Fundamentals of Data Engineering

Section 009 

In [1]:
# import statements 

import neo4j

import csv

import math
import numpy as np
import pandas as pd

import psycopg2

In [2]:
# Starter Code 
# function to run a select query and return rows in a pandas dataframe
# pandas puts all numeric values from postgres to float
# if it will fit in an integer, change it to integer
#

def my_select_query_pandas(query, rollback_before_flag, rollback_after_flag):
    "function to run a select query and return rows in a pandas dataframe"
    
    if rollback_before_flag:
        connection.rollback()
    
    df = pd.read_sql_query(query, connection)
    
    if rollback_after_flag:
        connection.rollback()
    
    # fix the float columns that really should be integers
    
    for column in df:
    
        if df[column].dtype == "float64":

            fraction_flag = False

            for value in df[column].values:
                
                if not np.isnan(value):
                    if value - math.floor(value) != 0:
                        fraction_flag = True

            if not fraction_flag:
                df[column] = df[column].astype('Int64')
    
    return(df)

In [3]:
def my_neo4j_run_query_pandas(query, **kwargs):
    "run a query and return the results in a pandas dataframe"
    
    result = session.run(query, **kwargs)
    
    df = pd.DataFrame([r.values() for r in result], columns=result.keys())
    
    return df

In [4]:
def my_neo4j_number_nodes_relationships():
    "print the number of nodes and relationships"
   
    
    query = """
        match (n) 
        return n.name as node_name, labels(n) as labels
        order by n.name
    """
    
    df = my_neo4j_run_query_pandas(query)
    
    number_nodes = df.shape[0]
    
    
    query = """
        match (n1)-[r]->(n2) 
        return n1.name as node_name_1, labels(n1) as node_1_labels, 
            type(r) as relationship_type, n2.name as node_name_2, labels(n2) as node_2_labels
        order by node_name_1, node_name_2
    """
    
    df = my_neo4j_run_query_pandas(query)
    
    number_relationships = df.shape[0]
    
    print("-------------------------")
    print("  Nodes:", number_nodes)
    print("  Relationships:", number_relationships)
    print("-------------------------")


In [5]:
def my_neo4j_wipe_out_database():
    "wipe out database by deleting all nodes and relationships"
    
    query = "match (node)-[relationship]->() delete node, relationship"
    session.run(query)
    
    query = "match (node) delete node"
    session.run(query)

In [6]:
connection = psycopg2.connect(
    user = "postgres",
    password = "ucb",
    host = "postgres",
    port = "5432",
    database = "postgres"
)

In [7]:
cursor = connection.cursor()

In [8]:
# starter code to read a csv file 

def my_read_csv_file(file_name, limit):
    "read the csv file and print only the first limit rows"
    
    csv_file = open(file_name, "r")
    
    csv_data = csv.reader(csv_file)
    
    i = 0
    
    for row in csv_data:
        i += 1
        if i <= limit:
            print(row)
            
    print("\nPrinted ", min(limit, i), "lines of ", i, "total lines.")

## drop tables if exist 

In [9]:
# drop the movies table if it exits 
# for clean up if needed

connection.rollback()

query = """

drop table if exists movies

"""

cursor.execute(query)

connection.commit()

In [10]:
# drop the ratings table if it exits 
# for clean up if needed 

connection.rollback()

query = """

drop table if exists ratings

"""

cursor.execute(query)

connection.commit()

## Create Tables for Movies and Load Data 

the movies table should have the following columns 
- movieid numeric (primary key) 
- title varchar
- genres varchar


In [11]:
# create a table for movies 
# set primary key as movieID
# note that genres is a pipe separated list of genres 

connection.rollback()

query = """

create table movies (
  movieid numeric,
  title varchar,
  genres varchar,
  primary key (movieid)
);

"""

cursor.execute(query)

connection.commit()

In [12]:
# display the first 10 rows of the movies.csv file to check it is as expected 

my_read_csv_file('MovieLens_small/movies.csv', limit=10)

['movieId', 'title', 'genres']
['1', 'Toy Story (1995)', 'Adventure|Animation|Children|Comedy|Fantasy']
['2', 'Jumanji (1995)', 'Adventure|Children|Fantasy']
['3', 'Grumpier Old Men (1995)', 'Comedy|Romance']
['4', 'Waiting to Exhale (1995)', 'Comedy|Drama|Romance']
['5', 'Father of the Bride Part II (1995)', 'Comedy']
['6', 'Heat (1995)', 'Action|Crime|Thriller']
['7', 'Sabrina (1995)', 'Comedy|Romance']
['8', 'Tom and Huck (1995)', 'Adventure|Children']
['9', 'Sudden Death (1995)', 'Action']

Printed  10 lines of  9743 total lines.


In [13]:
# load the csv file movies.csv into the movies database table

connection.rollback()

query = """

copy movies
from '/user/projects/project-3-RebeccaBaugh/code/MovieLens_small/movies.csv' delimiter ',' NULL '' csv header;

"""

cursor.execute(query)

connection.commit()

In [14]:
# verify movies loaded correctly 

rollback_before_flag = True
rollback_after_flag = True

query = """

select *
from movies
order by movieid

"""

my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,movieid,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


## Create Tables for Ratings and Load Data 

the ratings table should have the following columns
- userid numeric
- movieid numeric
- rating numeric
- timestamp timestamp 
- primary key is userid, movieid

In [15]:
# create a table for ratings 
# set primary key as movieid and userid composite key 

connection.rollback()

query = """

create table ratings (
  userid numeric,
  movieid numeric,
  rating float,
  timestamp varchar,
  primary key (userid, movieid)
);

"""

cursor.execute(query)

connection.commit()

In [16]:
# display the first 10 rows of the ratings.csv file to check it is as expected 

my_read_csv_file('MovieLens_small/ratings.csv', limit=10)

['userId', 'movieId', 'rating', 'timestamp']
['1', '1', '4.0', '964982703']
['1', '3', '4.0', '964981247']
['1', '6', '4.0', '964982224']
['1', '47', '5.0', '964983815']
['1', '50', '5.0', '964982931']
['1', '70', '3.0', '964982400']
['1', '101', '5.0', '964980868']
['1', '110', '4.0', '964982176']
['1', '151', '5.0', '964984041']

Printed  10 lines of  100837 total lines.


In [17]:
# load the csv file rating.csv into the ratings database table

connection.rollback()

query = """

copy ratings
from '/user/projects/project-3-RebeccaBaugh/code/MovieLens_small/ratings.csv' delimiter ',' NULL '' csv header;

"""

cursor.execute(query)

connection.commit()

In [18]:
# verify ratings loaded correctly 

rollback_before_flag = True
rollback_after_flag = True

query = """

select *
from ratings
order by userid, movieid

"""

my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,userid,movieid,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [19]:
connection.rollback()
query = """

drop table if exists incommon

"""
cursor.execute(query)
connection.commit()

In [20]:
connection.rollback()
query = """

create table incommon (
  movieid numeric,
  primary key (movieid)
);

"""
cursor.execute(query)
connection.commit()

In [21]:
## Toy Story = 1; Clueless = 39; GoldenEye = 10; City of Lost Children = 29


rollback_before_flag = True
rollback_after_flag = True

query = """

with sample as (
select userid
from ratings
where movieid in (1,10,29,39)
order by random()
limit 100
),
r as (
select *
from ratings
where userid in (select userid from sample)
),
pairspermovie as (
select ra.userid as uid1,
        rb.userid as uid2,
        ra.movieid as mid
from r as ra
join r as rb on ra.movieid = rb.movieid
where ra.userid != rb.userid
)
select uid1,
        uid2,
        count(*) as commonmovies
from pairspermovie
group by 1,2

"""

data = my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

## Note: List of Genres for Nodes
* Action
* Adventure
* Animation
* Children's
* Comedy
* Crime
* Documentary
* Drama
* Fantasy
* Film-Noir
* Horror
* Musical
* Mystery
* Romance
* Sci-Fi
* Thriller
* War
* Western
* (no genres listed)

In [22]:
driver = neo4j.GraphDatabase.driver(uri="neo4j://neo4j:7687", auth=("neo4j","ucb_mids_w205"))

In [23]:
session = driver.session(database="neo4j")

In [24]:
my_neo4j_wipe_out_database()

In [25]:
# Function to create user nodes, using MERGE to avoid duplicates
def create_user_node(userid):
    query = "MERGE (:User {id: $userid, name: $userid})"
    session.run(query, parameters={'userid': userid})

# Function to create relationships with correlation weights
def create_correlation(user1, user2, correlation):
    query = """
    MATCH (a:User {id: $user1}), (b:User {id: $user2})
    MERGE (a)-[r:COMMON_MOVIES]->(b)
    SET r.weight = $correlation
    """
    session.run(query, parameters={'user1': user1, 'user2': user2, 'correlation': correlation})

# Import data from DataFrame
def import_data(df):
    # Ensure user nodes are created or merged first to avoid creating any orphan relationships
    unique_users = set(df['uid1']).union(set(df['uid2']))
    for user in unique_users:
        create_user_node(user)

    # Create or update relationships based on the DataFrame rows
    for _, row in df.iterrows():
        create_correlation(row['uid1'], row['uid2'], row['commonmovies'])

In [26]:
import_data(data)

In [27]:
my_neo4j_number_nodes_relationships()

-------------------------
  Nodes: 92
  Relationships: 8226
-------------------------


# Algorithms 

## Page Rank

In [28]:
query = "CALL gds.graph.drop('ds_graph', false)"
session.run(query)

query = "CALL gds.graph.project('ds_graph', 'User', 'COMMON_MOVIES', {relationshipProperties: 'weight'})"
session.run(query)

<neo4j._sync.work.result.Result at 0x7f4250893d60>

In [29]:
query = """

CALL gds.pageRank.stream('ds_graph',
                         { maxIterations: $max_iterations,
                           dampingFactor: $damping_factor}
                         )
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).name AS name, score as page_rank
ORDER BY page_rank DESC, name ASC

"""

max_iterations = 20
damping_factor = 0.05

my_neo4j_run_query_pandas(query, max_iterations=max_iterations, damping_factor=damping_factor)


Unnamed: 0,name,page_rank
0,1,1.001009
1,19,1.001009
2,32,1.001009
3,43,1.001009
4,45,1.001009
...,...,...
87,26,0.996627
88,485,0.996064
89,35,0.995501
90,291,0.991203


In [30]:
# final_correlations[final_correlations['user1'] == 293]

In [31]:
# final_correlations[final_correlations['user1'] == 590]

## Degree Centrality

In [32]:
query = "CALL gds.graph.drop('ds_graph', false)"
session.run(query)

#query = "CALL gds.graph.project('ds_graph', 'Station', 'TRACK', {relationshipProperties: 'track_miles'})"
query = "CALL gds.graph.project('ds_graph', 'User', 'COMMON_MOVIES', {relationshipProperties: 'weight'})"
session.run(query)


query = """

CALL gds.degree.stream('ds_graph')
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).name AS name, score as degree
ORDER BY degree DESC, name

"""

my_neo4j_run_query_pandas(query)

Unnamed: 0,name,degree
0,1,91.0
1,19,91.0
2,32,91.0
3,43,91.0
4,45,91.0
...,...,...
87,26,84.0
88,485,83.0
89,35,82.0
90,291,74.0


## Triangle Count
Does not work because "correlated with" relationships are not all undirected

## Closeness Centrality

In [33]:
query = "CALL gds.graph.drop('ds_graph', false)"
session.run(query)

query = "CALL gds.graph.project('ds_graph', 'User', 'COMMON_MOVIES', {relationshipProperties: 'weight'})"
session.run(query)

query = """

CALL gds.beta.closeness.stream('ds_graph')
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).name AS name, score as closeness
ORDER BY score DESC

"""

my_neo4j_run_query_pandas(query)


Unnamed: 0,name,closeness
0,1,1.000000
1,517,1.000000
2,522,1.000000
3,19,1.000000
4,32,1.000000
...,...,...
87,26,0.928571
88,485,0.919192
89,35,0.910000
90,291,0.842593


## Wasserman and Faust Centrality

In [34]:
query = "CALL gds.graph.drop('ds_graph', false)"
session.run(query)

query = "CALL gds.graph.project('ds_graph', 'User', 'COMMON_MOVIES', {relationshipProperties: 'weight'})"
session.run(query)

query = """

CALL gds.beta.closeness.stream('ds_graph',
                               {useWassermanFaust: true}
                              )
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).name AS name, score as closeness
ORDER BY score DESC

"""

my_neo4j_run_query_pandas(query)

Unnamed: 0,name,closeness
0,1,1.000000
1,517,1.000000
2,522,1.000000
3,19,1.000000
4,32,1.000000
...,...,...
87,26,0.928571
88,485,0.919192
89,35,0.910000
90,291,0.842593


## Harmonic Centrality

In [35]:
query = "CALL gds.graph.drop('ds_graph', false)"
session.run(query)

query = "CALL gds.graph.project('ds_graph', 'User', 'COMMON_MOVIES', {relationshipProperties: 'weight'})"
session.run(query)

query = """

CALL gds.alpha.closeness.harmonic.stream('ds_graph', {})
YIELD nodeId, centrality
RETURN gds.util.asNode(nodeId).name AS name, centrality as closeness
ORDER BY centrality DESC

"""

my_neo4j_run_query_pandas(query)

Unnamed: 0,name,closeness
0,1,1.000000
1,517,1.000000
2,522,1.000000
3,19,1.000000
4,32,1.000000
...,...,...
87,26,0.961538
88,485,0.956044
89,35,0.950549
90,291,0.906593


## Louvain Modularity

In [36]:
query = "CALL gds.graph.drop('ds_graph', false)"
session.run(query)

query = "CALL gds.graph.project('ds_graph', 'User', 'COMMON_MOVIES', {relationshipProperties: 'weight'})"

session.run(query)

query = """

CALL gds.louvain.stream('ds_graph', {includeIntermediateCommunities: true})
YIELD nodeId, communityId, intermediateCommunityIds
RETURN gds.util.asNode(nodeId).name AS name, communityId as community, intermediateCommunityIds as intermediate_community
ORDER BY community, name ASC

"""

my_neo4j_run_query_pandas(query)


Unnamed: 0,name,community,intermediate_community
0,1,55,[55]
1,23,55,[55]
2,32,55,[55]
3,34,55,[55]
4,43,55,[55]
...,...,...,...
87,485,88,[88]
88,522,88,[88]
89,531,88,[88]
90,557,88,[88]


In [37]:
x = my_neo4j_run_query_pandas(query)
x['community'].value_counts()
## There seem to be only two distinct groups

55    56
88    36
Name: community, dtype: int64