# Project 3

### Create Tables from Movie Lens Small Data Set, Create Graph Database, Run Algorithms 

University of California Berkeley 

Master of Information and Data Science (MIDS) program

w205 - Fundamentals of Data Engineering

Section 009 

In [1]:
# import statements 

import csv
import math
import os

import neo4j
import numpy as np
import pandas as pd
import psycopg2

In [2]:
# Starter Code 
# function to run a select query and return rows in a pandas dataframe
# pandas puts all numeric values from postgres to float
# if it will fit in an integer, change it to integer
#

def my_select_query_pandas(query, rollback_before_flag, rollback_after_flag):
    """Run a select query and return the rows in a pandas dataframe.

    Uses the module-level ``connection``.

    Args:
        query: SQL text passed unchanged to ``pd.read_sql_query``.
        rollback_before_flag: if truthy, ``connection.rollback()`` is called
            before the query runs.
        rollback_after_flag: if truthy, ``connection.rollback()`` is called
            after the query runs.

    Returns:
        The query result. Every float64 column whose non-NaN values are all
        finite whole numbers is converted to the nullable ``Int64`` dtype;
        all other columns are returned as read.
    """

    if rollback_before_flag:
        connection.rollback()

    df = pd.read_sql_query(query, connection)

    if rollback_after_flag:
        connection.rollback()

    # fix the float columns that really should be integers

    for column in df:

        if df[column].dtype == "float64":

            values = df[column].to_numpy()
            present = values[~np.isnan(values)]

            # +/-inf is not a whole number, so such a column stays float
            # (math.floor(inf) raises OverflowError, so it is checked first);
            # a column with no non-NaN values still converts, as before
            is_whole = np.isfinite(present).all() and (present == np.floor(present)).all()

            if is_whole:
                df[column] = df[column].astype('Int64')

    return df

In [3]:
def my_neo4j_run_query_pandas(query, **kwargs):
    """Run a query on the module-level neo4j ``session`` and return the records as a pandas dataframe.

    Keyword arguments are forwarded unchanged to ``session.run``.
    The dataframe has one row per record and one column per result key.
    """

    records = session.run(query, **kwargs)

    # collect the values of every record first, then ask the result for its keys
    rows = []
    for record in records:
        rows.append(record.values())

    return pd.DataFrame(rows, columns=records.keys())

In [4]:
def my_neo4j_number_nodes_relationships():
    """Print the number of nodes and relationships in the database.

    Both numbers are counted by the server with count(), so only one row
    per query is returned instead of one row per node / relationship.
    Runs its queries through my_neo4j_run_query_pandas. Returns nothing.
    """

    query = """
        match (n)
        return count(n) as number_nodes
    """

    number_nodes = int(my_neo4j_run_query_pandas(query).iloc[0, 0])

    # every relationship has exactly one direction, so the directed pattern
    # matches each relationship once
    query = """
        match ()-[r]->()
        return count(r) as number_relationships
    """

    number_relationships = int(my_neo4j_run_query_pandas(query).iloc[0, 0])

    print("-------------------------")
    print("  Nodes:", number_nodes)
    print("  Relationships:", number_relationships)
    print("-------------------------")


In [5]:
def my_neo4j_wipe_out_database():
    """Wipe out the database by deleting all nodes and relationships.

    Uses the module-level neo4j ``session``. A single ``detach delete``
    removes every node together with its relationships. The result is
    consumed so the delete has finished (and any error has surfaced)
    before this function returns.
    """

    query = "match (node) detach delete node"
    session.run(query).consume()

In [6]:
# Open the postgres connection used by the rest of the notebook.
# Each setting can be overridden through an environment variable; the
# defaults are the values this notebook originally hard-coded.
connection = psycopg2.connect(
    user = os.environ.get("POSTGRES_USER", "postgres"),
    password = os.environ.get("POSTGRES_PASSWORD", "ucb"),
    host = os.environ.get("POSTGRES_HOST", "postgres"),
    port = os.environ.get("POSTGRES_PORT", "5432"),
    database = os.environ.get("POSTGRES_DB", "postgres")
)

In [7]:
# cursor on the postgres connection; the cells below use it to execute SQL statements
cursor = connection.cursor()

In [8]:
# starter code to read a csv file 

def my_read_csv_file(file_name, limit):
    """Read the csv file and print only the first limit rows.

    The whole file is still read so the total number of lines can be
    reported in the summary line printed at the end.

    Args:
        file_name: path of the csv file to read.
        limit: maximum number of rows to print (the header row counts as a row).
    """

    i = 0

    # the with block closes the file even if reading or printing fails;
    # newline="" lets the csv module handle line endings itself
    with open(file_name, "r", newline="") as csv_file:

        csv_data = csv.reader(csv_file)

        for row in csv_data:
            i += 1
            if i <= limit:
                print(row)

    print("\nPrinted ", min(limit, i), "lines of ", i, "total lines.")

## drop tables if exist 

In [9]:
# drop the movies table if it exists
# for clean up if needed (the create table statement further down has no "if not exists")

# discard any transaction still open on the connection before starting
connection.rollback()

query = """

drop table if exists movies

"""

cursor.execute(query)

# make the drop permanent
connection.commit()

In [10]:
# drop the ratings table if it exists
# for clean up if needed (the create table statement further down has no "if not exists")

# discard any transaction still open on the connection before starting
connection.rollback()

query = """

drop table if exists ratings

"""

cursor.execute(query)

# make the drop permanent
connection.commit()

## Create Tables for Movies and Load Data 

the movies table should have the following columns 
- movieid numeric (primary key) 
- title varchar
- genres varchar


In [11]:
# create a table for movies
# set primary key as movieid
# note that genres is a pipe separated list of genres, stored as one varchar

# discard any transaction still open on the connection before starting
connection.rollback()

query = """

create table movies (
  movieid numeric,
  title varchar,
  genres varchar,
  primary key (movieid)
);

"""

cursor.execute(query)

# make the new table permanent
connection.commit()

In [12]:
# display the first 10 rows of the movies.csv file to check it is as expected
# (the header line counts as one of the 10 printed rows)

my_read_csv_file('MovieLens_small/movies.csv', limit=10)

['movieId', 'title', 'genres']
['1', 'Toy Story (1995)', 'Adventure|Animation|Children|Comedy|Fantasy']
['2', 'Jumanji (1995)', 'Adventure|Children|Fantasy']
['3', 'Grumpier Old Men (1995)', 'Comedy|Romance']
['4', 'Waiting to Exhale (1995)', 'Comedy|Drama|Romance']
['5', 'Father of the Bride Part II (1995)', 'Comedy']
['6', 'Heat (1995)', 'Action|Crime|Thriller']
['7', 'Sabrina (1995)', 'Comedy|Romance']
['8', 'Tom and Huck (1995)', 'Adventure|Children']
['9', 'Sudden Death (1995)', 'Action']

Printed  10 lines of  9743 total lines.


In [13]:
# load the csv file movies.csv into the movies database table
#
# The file is streamed from the notebook side with COPY ... FROM STDIN, so the
# load uses the same relative path as the preview cell above and no longer
# depends on an absolute path that must exist on the database server.

# discard any transaction still open on the connection before starting
connection.rollback()

query = """

copy movies
from stdin delimiter ',' NULL '' csv header;

"""

# binary mode: the bytes of the file are sent to the server unchanged
with open('MovieLens_small/movies.csv', 'rb') as csv_file:
    cursor.copy_expert(query, csv_file)

# make the loaded rows permanent
connection.commit()

In [14]:
# verify movies loaded correctly
# (select * returns the whole table; the notebook only displays the first and last rows)

# roll back before and after the select so no transaction is left open
rollback_before_flag = True
rollback_after_flag = True

query = """

select *
from movies
order by movieid

"""

my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,movieid,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


## Create Tables for Ratings and Load Data 

the ratings table should have the following columns
- userid numeric
- movieid numeric
- rating numeric
- timestamp timestamp 
- primary key is userid, movieid

In [15]:
# create a table for ratings
# set primary key as the composite key (userid, movieid)
#
# NOTE: rating is declared float and timestamp is declared varchar here, which
# differs from the column list in the markdown above (numeric / timestamp).
# The timestamp values in ratings.csv are plain integers such as 964982703
# (presumably epoch seconds -- TODO confirm), not formatted timestamps.

# discard any transaction still open on the connection before starting
connection.rollback()

query = """

create table ratings (
  userid numeric,
  movieid numeric,
  rating float,
  timestamp varchar,
  primary key (userid, movieid)
);

"""

cursor.execute(query)

# make the new table permanent
connection.commit()

In [16]:
# display the first 10 rows of the ratings.csv file to check it is as expected
# (the header line counts as one of the 10 printed rows)

my_read_csv_file('MovieLens_small/ratings.csv', limit=10)

['userId', 'movieId', 'rating', 'timestamp']
['1', '1', '4.0', '964982703']
['1', '3', '4.0', '964981247']
['1', '6', '4.0', '964982224']
['1', '47', '5.0', '964983815']
['1', '50', '5.0', '964982931']
['1', '70', '3.0', '964982400']
['1', '101', '5.0', '964980868']
['1', '110', '4.0', '964982176']
['1', '151', '5.0', '964984041']

Printed  10 lines of  100837 total lines.


In [17]:
# load the csv file ratings.csv into the ratings database table
#
# The file is streamed from the notebook side with COPY ... FROM STDIN, so the
# load uses the same relative path as the preview cell above and no longer
# depends on an absolute path that must exist on the database server.

# discard any transaction still open on the connection before starting
connection.rollback()

query = """

copy ratings
from stdin delimiter ',' NULL '' csv header;

"""

# binary mode: the bytes of the file are sent to the server unchanged
with open('MovieLens_small/ratings.csv', 'rb') as csv_file:
    cursor.copy_expert(query, csv_file)

# make the loaded rows permanent
connection.commit()

In [18]:
# verify ratings loaded correctly
# (select * returns the whole table; the notebook only displays the first and last rows)
# NOTE: the next cell reuses `query` and both flags defined here

# roll back before and after the select so no transaction is left open
rollback_before_flag = True
rollback_after_flag = True

query = """

select *
from ratings
order by userid, movieid

"""

my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

Unnamed: 0,userid,movieid,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [19]:
# keep the full ratings table in a dataframe for the correlation analysis below
# NOTE: relies on `query` and the rollback flags still holding the values set in the previous cell
df = my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

In [20]:
# distinct rating values present in the data
df['rating'].unique()

array([4. , 5. , 3. , 2. , 1. , 4.5, 3.5, 2.5, 0.5, 1.5])

In [21]:
# ratings of movieid 5 only
# NOTE(review): df_sample is not used again in the visible part of the notebook
df_sample = df[df['movieid'] == 5]

In [22]:
# display the ratings dataframe (the notebook shows only the first and last rows)
df

Unnamed: 0,userid,movieid,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [23]:
# movie x user matrix of ratings: one row per movieid, one column per userid,
# NaN where a user has no rating for a movie
pivot_table = df.pivot_table(index='movieid', columns='userid', values='rating')


In [24]:
# display the movie x user rating matrix
pivot_table

userid,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,,,4.0,,4.5,,,,...,4.0,,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2,,,,,,4.0,,4.0,,,...,,4.0,,5.0,3.5,,,2.0,,
3,4.0,,,,,5.0,,,,,...,,,,,,,,2.0,,
4,,,,,,3.0,,,,,...,,,,,,,,,,
5,,,,,,5.0,,,,,...,,,,3.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,,,,,,,,,,,...,,,,,,,,,,
193583,,,,,,,,,,,...,,,,,,,,,,
193585,,,,,,,,,,,...,,,,,,,,,,
193587,,,,,,,,,,,...,,,,,,,,,,


In [25]:
# pairwise Pearson correlation between the user columns of the rating matrix
# NOTE(review): min_periods is not passed -- presumably pairs of users with only a
# couple of co-rated movies still get a correlation (often exactly +/-1.0);
# confirm this is intended before relying on the 1.0 values
user_correlations = pivot_table.corr(method='pearson')

In [26]:
# display the user x user correlation matrix
user_correlations

userid,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
userid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,,0.079819,0.207983,0.268749,-0.291636,-0.118773,0.469668,0.918559,-0.037987,...,9.157371e-02,-1.597727e-16,-0.061503,-0.407556,-0.164871,0.066378,0.174557,0.268070,-0.175412,-0.032086
2,,1.000000,,,,,-0.991241,,,0.037796,...,-3.873468e-01,,-1.000000,,,0.583333,,-0.125000,,0.623288
3,0.079819,,1.000000,,,,,,,,...,,,0.433200,,,-0.791334,-0.333333,-0.395092,,0.569562
4,0.207983,,,1.000000,-0.336525,0.148498,0.542861,0.117851,,0.485794,...,-2.221127e-01,3.966413e-01,0.090090,-0.080296,0.400124,0.144603,0.116518,-0.170501,-0.277350,-0.043786
5,0.268749,,,-0.336525,1.000000,0.043166,0.158114,0.028347,,-0.777714,...,2.719480e-16,1.533034e-01,0.234743,0.067791,-0.364156,0.244321,0.231080,-0.020546,0.384111,0.040582
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.066378,0.583333,-0.791334,0.144603,0.244321,-0.049192,0.137771,0.253582,0.572700,-0.382955,...,2.904896e-01,1.406134e-01,0.318473,0.682949,0.167062,1.000000,0.114191,0.240842,0.533002,0.389185
607,0.174557,,-0.333333,0.116518,0.231080,0.255639,0.402792,0.251280,,-0.241121,...,6.982411e-01,2.172105e-01,0.192787,0.035806,-0.299641,0.114191,1.000000,0.200814,0.190117,0.106605
608,0.268070,-0.125000,-0.395092,-0.170501,-0.020546,0.125428,0.008081,0.434423,0.336625,-0.571043,...,4.739665e-01,2.976461e-01,0.086423,0.053986,-0.075673,0.240842,0.200814,1.000000,0.488929,0.147606
609,-0.175412,,,-0.277350,0.384111,0.193649,0.420288,0.141860,,,...,1.000000e+00,1.885115e-01,0.343303,0.641624,-0.550000,0.533002,0.190117,0.488929,1.000000,-0.521773


In [27]:
# flatten the correlation matrix into a Series indexed by (userid, userid)
# NOTE: despite the name, no threshold filter has been applied yet at this point
filtered_correlations = user_correlations.stack()

In [28]:
# display the stacked correlations
filtered_correlations

userid  userid
1       1         1.000000
        3         0.079819
        4         0.207983
        5         0.268749
        6        -0.291636
                    ...   
610     606       0.389185
        607       0.106605
        608       0.147606
        609      -0.521773
        610       1.000000
Length: 265417, dtype: float64

In [29]:
# both index levels are called userid after stack(); give them distinct names
filtered_correlations.index = filtered_correlations.index.rename(['userid1', 'userid2'])

In [30]:
# turn the two index levels into ordinary columns
# NOTE(review): overwrites its own input, so running this cell a second time
# would reset the index again -- run it once per kernel session
filtered_correlations = filtered_correlations.reset_index()

In [31]:
# name the three columns: the two users of the pair and their correlation
filtered_correlations.columns = ['user1', 'user2', 'correlation']  # Rename columns directly after resetting index
# keep only the strongly positively correlated pairs (correlation above 0.7)
filtered_correlations = filtered_correlations[filtered_correlations['correlation'] > 0.7]

In [32]:
# drop the self-pairs (a user paired with itself)
final_correlations = filtered_correlations[filtered_correlations['user1'] != filtered_correlations['user2']]

In [33]:
# display the user pairs that will be loaded into the graph
final_correlations

Unnamed: 0,user1,user2,correlation
7,1,9,0.918559
10,1,13,0.878310
46,1,49,0.750000
82,1,90,0.821584
98,1,106,1.000000
...,...,...,...
265302,610,494,0.811761
265352,610,545,1.000000
265382,610,576,1.000000
265384,610,578,0.808224


## Note: List of Genres for Nodes
* Action
* Adventure
* Animation
* Children
* Comedy
* Crime
* Documentary
* Drama
* Fantasy
* Film-Noir
* Horror
* Musical
* Mystery
* Romance
* Sci-Fi
* Thriller
* War
* Western
* (no genres listed)

In [34]:
# Create the neo4j driver.
# Each setting can be overridden through an environment variable; the
# defaults are the values this notebook originally hard-coded.
driver = neo4j.GraphDatabase.driver(
    uri=os.environ.get("NEO4J_URI", "neo4j://neo4j:7687"),
    auth=(
        os.environ.get("NEO4J_USER", "neo4j"),
        os.environ.get("NEO4J_PASSWORD", "ucb_mids_w205"),
    ),
)

In [35]:
# session on the "neo4j" database; the my_neo4j_* helpers and the cells below use it as a global
session = driver.session(database="neo4j")

In [36]:
# remove all existing nodes and relationships before importing
my_neo4j_wipe_out_database()

In [37]:
# Function to create user nodes, using MERGE to avoid duplicates
def create_user_node(userid):
    "merge a User node whose id and name are both userid (MERGE avoids creating a duplicate)"
    cypher = "MERGE (:User {id: $userid, name: $userid})"
    params = {'userid': userid}
    session.run(cypher, parameters=params)

# Function to create relationships with correlation weights
def create_correlation(user1, user2, correlation):
    "merge a CORRELATED_WITH relationship from user1 to user2 and set its weight to correlation"
    # both User nodes must already exist; otherwise the MATCH finds nothing and no relationship is written
    params = {'user1': user1, 'user2': user2, 'correlation': correlation}
    cypher = """
    MATCH (a:User {id: $user1}), (b:User {id: $user2})
    MERGE (a)-[r:CORRELATED_WITH]->(b)
    SET r.weight = $correlation
    """
    session.run(cypher, parameters=params)

# Import data from DataFrame
def import_data(df):
    """Load the users and their correlations from a dataframe into the graph.

    Args:
        df: dataframe with the columns ``user1``, ``user2`` and ``correlation``.
            create_user_node is called once per distinct user id and
            create_correlation once per row.
    """
    # Ensure user nodes are created or merged first to avoid creating any orphan relationships
    unique_users = set(df['user1']).union(set(df['user2']))
    for user in unique_users:
        # plain python int instead of a numpy / pandas scalar
        create_user_node(int(user))

    # Create or update relationships based on the DataFrame rows.
    # itertuples keeps each column's own dtype; iterrows would upcast the
    # integer user ids to float because the row also holds the float correlation.
    columns = df[['user1', 'user2', 'correlation']]
    for user1, user2, correlation in columns.itertuples(index=False, name=None):
        create_correlation(int(user1), int(user2), float(correlation))

In [38]:
# load the filtered user pairs into neo4j (one query per user and per pair)
import_data(final_correlations)

In [39]:
# check how many nodes and relationships the import created
my_neo4j_number_nodes_relationships()

-------------------------
  Nodes: 608
  Relationships: 25838
-------------------------


# Algorithms 

## Page Rank

In [40]:
# drop the 'ds_graph' projection if one is left over from an earlier run
# (second argument false: presumably "do not fail if it is missing" -- TODO confirm against the GDS docs)
query = "CALL gds.graph.drop('ds_graph', false)"
session.run(query)

# project the User nodes and CORRELATED_WITH relationships, with their weight property, as 'ds_graph'
query = "CALL gds.graph.project('ds_graph', 'User', 'CORRELATED_WITH', {relationshipProperties: 'weight'})"
session.run(query)

<neo4j._sync.work.result.Result at 0x7f9362c97b50>

In [41]:
# page rank of every User node in the 'ds_graph' projection, highest first
# NOTE(review): no relationshipWeightProperty is passed, so presumably the
# weight property is not used by this run -- confirm
query = """

CALL gds.pageRank.stream('ds_graph',
                         { maxIterations: $max_iterations,
                           dampingFactor: $damping_factor}
                         )
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).name AS name, score as page_rank
ORDER BY page_rank DESC, name ASC

"""

max_iterations = 20
# NOTE(review): 0.05 is a very low damping factor (0.85 is the commonly used
# value); all scores in the output are close to 1 -- confirm this is intended
damping_factor = 0.05

my_neo4j_run_query_pandas(query, max_iterations=max_iterations, damping_factor=damping_factor)


Unnamed: 0,name,page_rank
0,250,1.103935
1,293,1.099777
2,396,1.094582
3,575,1.093582
4,146,1.091399
...,...,...
603,274,0.957191
604,140,0.956545
605,414,0.956303
606,474,0.953797


In [42]:
# correlations of user 293, one of the highest page ranks in the result above
final_correlations[final_correlations['user1'] == 293]

Unnamed: 0,user1,user2,correlation
127042,293,6,0.707107
127044,293,11,1.000000
127045,293,14,1.000000
127051,293,26,1.000000
127054,293,31,1.000000
...,...,...,...
127316,293,592,0.866025
127317,293,593,0.753778
127318,293,594,1.000000
127322,293,602,0.866025


In [43]:
# correlations of user 590, for comparison with user 293
final_correlations[final_correlations['user1'] == 590]

Unnamed: 0,user1,user2,correlation
254664,590,13,0.78843
254676,590,25,0.793377
254735,590,85,0.714191
254768,590,118,0.718185
254862,590,214,0.825969
254898,590,250,0.716977
254926,590,278,0.743392
254944,590,296,0.732271
254950,590,302,0.766572
255014,590,366,0.762932


## Degree Centrality

In [44]:
# drop the 'ds_graph' projection left by the previous algorithm, then project it again
query = "CALL gds.graph.drop('ds_graph', false)"
session.run(query)

# project the User nodes and CORRELATED_WITH relationships, with their weight property, as 'ds_graph'
query = "CALL gds.graph.project('ds_graph', 'User', 'CORRELATED_WITH', {relationshipProperties: 'weight'})"
session.run(query)


# degree score of every User node in the projection, highest first
query = """

CALL gds.degree.stream('ds_graph')
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).name AS name, score as degree
ORDER BY degree DESC, name

"""

my_neo4j_run_query_pandas(query)

Unnamed: 0,name,degree
0,396,118.0
1,375,109.0
2,422,107.0
3,293,106.0
4,490,105.0
...,...,...
603,251,8.0
604,414,8.0
605,140,5.0
606,599,4.0


## Triangle Count
Triangle count is skipped: the algorithm requires an undirected graph, and the `CORRELATED_WITH` relationships are created and projected here as directed relationships.

## Closeness Centrality

In [45]:
# drop the 'ds_graph' projection left by the previous algorithm, then project it again
query = "CALL gds.graph.drop('ds_graph', false)"
session.run(query)

query = "CALL gds.graph.project('ds_graph', 'User', 'CORRELATED_WITH', {relationshipProperties: 'weight'})"
session.run(query)

# closeness score of every User node in the projection, highest first
query = """

CALL gds.beta.closeness.stream('ds_graph')
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).name AS name, score as closeness
ORDER BY score DESC

"""

my_neo4j_run_query_pandas(query)


Unnamed: 0,name,closeness
0,375,0.546355
1,396,0.545863
2,422,0.544395
3,490,0.543907
4,449,0.538121
...,...,...
603,599,0.393645
604,175,0.384421
605,3,0.380803
606,140,0.374923


## Wasserman and Faust Centrality

In [46]:
# drop the 'ds_graph' projection left by the previous algorithm, then project it again
query = "CALL gds.graph.drop('ds_graph', false)"
session.run(query)

query = "CALL gds.graph.project('ds_graph', 'User', 'CORRELATED_WITH', {relationshipProperties: 'weight'})"
session.run(query)

# closeness again, this time with the useWassermanFaust option switched on
# NOTE(review): the rows displayed below are identical to the plain closeness
# rows in the previous section
query = """

CALL gds.beta.closeness.stream('ds_graph',
                               {useWassermanFaust: true}
                              )
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).name AS name, score as closeness
ORDER BY score DESC

"""

my_neo4j_run_query_pandas(query)

Unnamed: 0,name,closeness
0,375,0.546355
1,396,0.545863
2,422,0.544395
3,490,0.543907
4,449,0.538121
...,...,...
603,599,0.393645
604,175,0.384421
605,3,0.380803
606,140,0.374923


## Harmonic Centrality

In [47]:
# drop the 'ds_graph' projection left by the previous algorithm, then project it again
query = "CALL gds.graph.drop('ds_graph', false)"
session.run(query)

query = "CALL gds.graph.project('ds_graph', 'User', 'CORRELATED_WITH', {relationshipProperties: 'weight'})"
session.run(query)

# harmonic closeness of every User node in the projection, highest first
# (this procedure yields its score in a column called centrality, not score)
query = """

CALL gds.alpha.closeness.harmonic.stream('ds_graph', {})
YIELD nodeId, centrality
RETURN gds.util.asNode(nodeId).name AS name, centrality as closeness
ORDER BY centrality DESC

"""

my_neo4j_run_query_pandas(query)

Unnamed: 0,name,closeness
0,396,0.592806
1,375,0.588138
2,422,0.585942
3,490,0.584569
4,293,0.577430
...,...,...
603,599,0.412136
604,175,0.406370
605,3,0.400604
606,140,0.392092


## Louvain Modularity

In [49]:
# drop the 'ds_graph' projection left by the previous algorithm, then project it again
query = "CALL gds.graph.drop('ds_graph', false)"
session.run(query)

query = """
CALL gds.graph.project('ds_graph', 'User', 'CORRELATED_WITH', {relationshipProperties: 'weight'})
"""

session.run(query)

# Louvain community of every User node, together with the intermediate
# community ids, ordered by community and then by name
query = """

CALL gds.louvain.stream('ds_graph', {includeIntermediateCommunities: true})
YIELD nodeId, communityId, intermediateCommunityIds
RETURN gds.util.asNode(nodeId).name AS name, communityId as community, intermediateCommunityIds as intermediate_community
ORDER BY community, name ASC

"""

my_neo4j_run_query_pandas(query)


Unnamed: 0,name,community,intermediate_community
0,13,312,"[566, 312]"
1,23,312,"[566, 312]"
2,24,312,"[566, 312]"
3,26,312,"[566, 312]"
4,41,312,"[566, 312]"
...,...,...,...
603,600,544,"[553, 544]"
604,606,544,"[553, 544]"
605,607,544,"[553, 544]"
606,608,544,"[544, 544]"
