In [142]:
from typing import TextIO
import csv
import pprint
from secret import NEO4J_PASSWORD

from neo4j import GraphDatabase, Driver

URI = "neo4j://localhost"
AUTH = ("neo4j", NEO4J_PASSWORD)
with GraphDatabase.driver(URI, auth=AUTH) as driver:
    driver.verify_connectivity()


In [143]:
with open(r"../data/ml-100k/u.item") as file:
    for i, item in enumerate(csv.reader(file, delimiter="|")):
        pprint.pprint(item, compact=True)
        if i >= 10:
            break

['1', 'Toy Story (1995)', '01-Jan-1995', '',
 'http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)', '0', '0', '0', '1',
 '1', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0']
['2', 'GoldenEye (1995)', '01-Jan-1995', '',
 'http://us.imdb.com/M/title-exact?GoldenEye%20(1995)', '0', '1', '1', '0', '0',
 '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '1', '0', '0']
['3', 'Four Rooms (1995)', '01-Jan-1995', '',
 'http://us.imdb.com/M/title-exact?Four%20Rooms%20(1995)', '0', '0', '0', '0',
 '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '1', '0', '0']
['4', 'Get Shorty (1995)', '01-Jan-1995', '',
 'http://us.imdb.com/M/title-exact?Get%20Shorty%20(1995)', '0', '1', '0', '0',
 '0', '1', '0', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0']
['5', 'Copycat (1995)', '01-Jan-1995', '',
 'http://us.imdb.com/M/title-exact?Copycat%20(1995)', '0', '0', '0', '0', '0',
 '0', '1', '0', '1', '0', '0', '0', '0', '0', '0', '0', '1', '0', '0']
['6', 'Sh

In [144]:
def open_file(csv_file_path: str) -> TextIO:
    with open(csv_file_path, encoding="utf-8") as f:
        return f

def read_user(csv_file_path):
    out = []
    with open(csv_file_path, encoding="utf-8") as f:
        reader = csv.DictReader(f,fieldnames=["userId", "age", "gender", "occupation", "zipCode"] ,delimiter="|")
        for i, row in enumerate(reader):
            row["userId"] = int(row["userId"])
            out.append(row)
    return out

def populate_user_txn(tx, rows):
    query = """
    UNWIND $users as user
    CREATE (Person:User {
    userId: user.userId,
    age: user.age,
    gender: user.gender,
    occupation: user.occupation,
    zipCode: user.zipCode})
    """
    try:
        tx.run(query, users=rows)
    except Exception as e:
        raise e

def create_users(driver: Driver):
    with driver.session(database="neo4j") as session:
        session.execute_write(
            populate_user_txn,
            read_user(r"../data/ml-100k/u.user")
        )
    return


def populate_occupation_tnx(tx, rows):
    query = """
    UNWIND $occupations as occupation
    CREATE (:Occupation {
    occupation: occupation.occupation,
    })
    """
    try:
        tx.run(query, occupation=rows)
    except Exception as e:
        raise e

def read_items(csv_file_path):
    out = []
    with open(csv_file_path, encoding="iso-8859-1") as f:

        reader = csv.DictReader(f, fieldnames= [
            "movieId", "title", "releaseDate", "videoReleaseDate", "IMDbURL",
            "unknown", "action", "adventure", "animation", "childrens", "comedy",
            "crime", "documentary", "drama", "fantasy", "filmNoir", "horror",
            "musical", "mystery", "romance", "sciFi", "thriller", "war", "western"
        ], delimiter="|")

        base = ["movieId", "title", "releaseDate", "videoReleaseDate", "IMDbURL"]

        genres = ["unknown", "action", "adventure", "animation", "childrens", "comedy",
        "crime", "documentary", "drama", "fantasy", "filmNoir", "horror",
        "musical", "mystery", "romance", "sciFi", "thriller", "war", "western"]

        for i, row in enumerate(reader):
            for genre in genres:
                if row[genre] == '1':
                    row["movieId"] = int(row["movieId"])
                    new_row = {key: value for key, value in row.items() if key in base}| {"genre": genre}
                    out.append(new_row)
    return out


def populate_movies_and_genres_txn(tx, rows):
    query = """
            UNWIND $rows AS row
            MERGE (m:Movie {
                movieId: row.movieId
            })
            SET m.title = row.title,
                m.releaseDate = row.releaseDate,
                m.videoReleaseDate = row.videoReleaseDate,
                m.IMDbURL = row.IMDbURL

            MERGE (g:Genre { name: row.genre })

            MERGE (m)-[:HAS_GENRE]->(g)
        """
    try:
        result = tx.run(query, rows=rows)
        return result
    except Exception as e:
        raise e

def create_movies_and_genres(driver: Driver):
    with driver.session(database="neo4j") as session:
        session.execute_write(
            populate_movies_and_genres_txn,
            read_items(r"../data/ml-100k/u.item")
        )
    return

def read_data(csv_file_path):
    with open(csv_file_path) as f:
        reader = csv.DictReader(f, fieldnames=["userId", "movieId", "rating", "timestamp"], dialect="excel-tab")
        out = []
        for row in reader:
            row["movieId"] = int(row["movieId"])
            row["userId"] = int(row["userId"])
            row["rating"] = float(row["rating"])
            out.append(row)
        return out

def populate_data_txn(tx, rows):
    query = """
    UNWIND $rows AS row

    MATCH (u:User { userId: row.userId })
    MATCH (m:Movie { movieId: row.movieId })

    MERGE (u)-[r:RATED]->(m)
    SET r.rating = row.rating,
        r.timestamp = row.timestamp
    """
    try:
        result = tx.run(query, rows=rows)
        return result
    except Exception as e:
        raise e

def create_data(driver: Driver):
    with driver.session(database="neo4j") as session:
        session.execute_write(
            populate_data_txn,
            read_data(r"../data/ml-100k/u.data")
        )
    return


def create_user_index_txn(tx):
    query = """
    CREATE INDEX user_userId IF NOT EXISTS FOR (u:User) ON (u.userId)
    """
    return tx.run(query)

def create_movie_index_txn(tx):
    query = """
    CREATE INDEX movie_movieId IF NOT EXISTS FOR (m:Movie) ON (m.movieId)
    """
    return tx.run(query)

def create_genre_index_txn(tx):
    query = """
    CREATE INDEX genre_name IF NOT EXISTS FOR (g:Genre) ON (g.name)
    """
    return tx.run(query)


def create_index(driver):
    with driver.session(database="neo4j") as session:
        session.execute_write(
           create_user_index_txn
        )
        session.execute_write(
            create_movie_index_txn
        )

        session.execute_write(
            create_genre_index_txn
        )
    return


In [145]:
driver = GraphDatabase.driver(URI, auth=AUTH)
try:
    create_users(driver)
    create_movies_and_genres(driver)
    create_index(driver)
    create_data(driver)
except Exception as e:
    raise e
finally:
    driver.close()

In [176]:
import pandas as pd

# find an appropriate high rating by looking at 1 standard deviation away from the average.
df = pd.read_csv(r"../data/ml-100k/u.data", dialect="excel-tab", names=["userId", "movieId", "rating", "timestamp"])
ratings: pd.Series = df["rating"]

high_rating = ratings.mean() + ratings.std()

In [177]:
def graph_traversal(tx, userId, rating):
    query = """
    WITH $userId AS targetUser, $rating AS ratingThreshold
    MATCH (target:User {userId: targetUser})-[:RATED]->(m:Movie)
    WITH target, collect(m) AS watched, ratingThreshold
    MATCH (target)-[:RATED]->(common:Movie)<-[:RATED]-(similar:User)
    WHERE target <> similar
    MATCH (similar)-[r:RATED]->(rec:Movie)
    WHERE NOT rec IN watched AND r.rating >= ratingThreshold
    WITH rec, avg(r.rating) AS avgRating
    RETURN rec.title AS movie, avgRating
    ORDER BY avgRating DESC
    LIMIT 10
    """
    result = tx.run(query, userId=userId, rating=rating)
    return [record.data() for record in result]

driver = GraphDatabase.driver(URI, auth=AUTH)
try:
    with driver.session(database="neo4j") as session:
        reccs = session.execute_read(graph_traversal, 13, high_rating)
        for rec in reccs:
            print(rec)
except Exception:
    raise
finally:
    driver.close()

{'movie': 'Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)', 'avgRating': 5.0}
{'movie': 'Richard III (1995)', 'avgRating': 5.0}
{'movie': "Mr. Holland's Opus (1995)", 'avgRating': 5.0}
{'movie': 'French Twist (Gazon maudit) (1995)', 'avgRating': 5.0}
{'movie': 'White Balloon, The (1995)', 'avgRating': 5.0}
{'movie': "Antonia's Line (1995)", 'avgRating': 5.0}
{'movie': 'Angels and Insects (1995)', 'avgRating': 5.0}
{'movie': 'Brothers McMullen, The (1995)', 'avgRating': 5.0}
{'movie': 'Belle de jour (1967)', 'avgRating': 5.0}
{'movie': 'Four Rooms (1995)', 'avgRating': 5.0}


Question 2C: Find all triples of movies that share at least one genre and have been rated above the high-rating threshold by at least one user.

The maximum amount of triangles we could have scales with the amount of movies. We could find that all the movies are highly rated and they all share the same genre, this could mean that we would have $\binom{n}{3} = O(n^3)$ time complexity. This large run-rime makes the above query difficult to scale, so we are limiting our output to 100 triangles.

In [189]:
df = pd.read_csv(r"../data/ml-100k/u.item", encoding="iso-8859-1", delimiter="|",
                 names=["movieId", "title", "releaseDate", "videoReleaseDate", "IMDbURL",
                "unknown", "action", "adventure", "animation", "childrens", "comedy",
                "crime", "documentary", "drama", "fantasy", "filmNoir", "horror",
                "musical", "mystery", "romance", "sciFi", "thriller", "war", "western"])

df["title"].unique().shape

(1664,)

Which means that our maximum possible number of triangles is $\binom{1644}{3} \approx 7.4 \times 10^8$.

In [205]:
# Counting the maximum for our data
def count_movie_triangles(tx, rating_threshold):
    query = """
    WITH $rating_threshold as ratingThreshold
    MATCH (g:Genre)<-[:HAS_GENRE]-(m:Movie)<-[r:RATED]-(:User)
    WHERE r.rating >= ratingThreshold
    WITH g, count(m) AS n
    WHERE n >= 3
    RETURN g.name, n, (n*(n-1)*(n-2))/6 AS possibleTriples;
    """
    result = tx.run(query, rating_threshold=rating_threshold)
    return [record.data() for record in result]


driver = GraphDatabase.driver(URI, auth=AUTH)
try:
    with driver.session(database="neo4j") as session:
        triangles: list[dict] = session.execute_read(count_movie_triangles, high_rating)
        if not triangles:
            print("no matches found")
        total = 0
        for tri in triangles:
            total += tri["possibleTriples"]
        print("total number of possible triangles is: " + format(total, ","))
except Exception:
    raise
finally:
    driver.close()


total number of possible triangles is: 256,090,333,042


In [203]:
def find_movie_triangles(tx, rating_threshold):
    query = """
    WITH $rating_threshold as ratingThreshold
    MATCH (g:Genre)<-[:HAS_GENRE]-(m:Movie)<-[r:RATED]-(:User)
    WHERE r.rating >= ratingThreshold
    WITH g, collect(m) AS movies
    WHERE size(movies) >= 3

    UNWIND range(0, size(movies)-3) AS i
    UNWIND range(i+1, size(movies)-2) AS j
    UNWIND range(j+1, size(movies)-1) AS k

    RETURN
      g.name AS genre,
      movies[i].title AS movie1,
      movies[j].title AS movie2,
      movies[k].title AS movie3
    LIMIT 100
    """
    result = tx.run(query, rating_threshold=rating_threshold)
    return [record.data() for record in result]


driver = GraphDatabase.driver(URI, auth=AUTH)
try:
    with driver.session(database="neo4j") as session:
        triangles: list[dict] = session.execute_read(find_movie_triangles, high_rating)
        if not triangles:
            print("no matches found")
        for tri in triangles:
            pprint.pp(tri)
except Exception:
    raise
finally:
    driver.close()


{'genre': 'action',
 'movie1': 'Empire Strikes Back, The (1980)',
 'movie2': 'Jurassic Park (1993)',
 'movie3': 'Mars Attacks! (1996)'}
{'genre': 'action',
 'movie1': 'Empire Strikes Back, The (1980)',
 'movie2': 'Jurassic Park (1993)',
 'movie3': 'Star Trek: The Wrath of Khan (1982)'}
{'genre': 'action',
 'movie1': 'Empire Strikes Back, The (1980)',
 'movie2': 'Jurassic Park (1993)',
 'movie3': 'Star Wars (1977)'}
{'genre': 'action',
 'movie1': 'Empire Strikes Back, The (1980)',
 'movie2': 'Jurassic Park (1993)',
 'movie3': 'Godfather, The (1972)'}
{'genre': 'action',
 'movie1': 'Empire Strikes Back, The (1980)',
 'movie2': 'Jurassic Park (1993)',
 'movie3': 'Aliens (1986)'}
{'genre': 'action',
 'movie1': 'Empire Strikes Back, The (1980)',
 'movie2': 'Jurassic Park (1993)',
 'movie3': 'Good, The Bad and The Ugly, The (1966)'}
{'genre': 'action',
 'movie1': 'Empire Strikes Back, The (1980)',
 'movie2': 'Jurassic Park (1993)',
 'movie3': 'Terminator, The (1984)'}
{'genre': 'action',
 'm

2D) Compute movie similarity via genre overlap. We want to create or modify edges in the graph to capture similar movies based on shared genres. You can compute a `genreScore` based on the number of common genres between two movies. You need to specify a threshold for `genreScore`. You can run exploratory queries to identify how many movies share 2, 3, 4 or more genres, and pick the threshold based on the percentage of movies sharing genres. You may want want to note that most movies share at elast one genre; thus, having a proper threshold helps to control density and filter out noisy edges. At the end we want the database to have relationships like `(:Title)-[:SIMILAR_TO] {genreScore: ...}]->(:Title)`

In [219]:
def find_pairs(tx):
    query = """
    MATCH (m1:Movie)-[:HAS_GENRE]->(g:Genre)<-[:HAS_GENRE]-(m2:Movie)
    WHERE m1.movieId < m2.movieId
    WITH m1, m2, count(DISTINCT g) AS genreScore
    RETURN genreScore, count(*) AS numMoviePairs
    """
    result = tx.run(query)
    return [record.data() for record in result]

driver = GraphDatabase.driver(URI, auth=AUTH)

def find_genre_percentiles(d):
    try:
        with d.session(database="neo4j") as session:
            pairs: list[dict] = session.execute_read(find_pairs)
            if not pairs:
                print("no matches found")
            df = pd.DataFrame(pairs).sort_values(by=["genreScore"])
            n = df["numMoviePairs"]
            df["percentiles"] = (n/n.sum())*100
            print(df)
    except Exception:
        raise
    finally:
        d.close()

find_genre_percentiles(driver)


   genreScore  numMoviePairs  percentiles
1           1         458002    93.509681
0           2          30230     6.172020
2           3           1512     0.308703
3           4             44     0.008983
4           5              3     0.000613


So from the above we can see that 93.5% of movies share one genre, 6% share 2 and less than a percentage for the rest. So will define `genreScoreThreshold = 2`

In [222]:
def f(tx, genre_score_threshold):
    query = """
    WITH $genreScoreThreshold AS genreScoreThreshold
    MATCH (m1:Movie)-[:HAS_GENRE]->(g:Genre)<-[:HAS_GENRE]-(m2:Movie)
    WHERE m1.movieId < m2.movieId
    WITH m1, m2, count(DISTINCT g) AS genreScore
    WHERE genreScore >= 2
    MERGE (m1)-[s:SIMILAR_TO]->(m2)
    SET s.genreScore = genreScore;
    """
    result = tx.run(query, genreScoreThreshold=genre_score_threshold)
    return [record.data() for record in result]

driver = GraphDatabase.driver(URI, auth=AUTH)
try:
    with driver.session(database="neo4j") as session:
        triangles: list[dict] = session.execute_write(f, genre_score_threshold=2)
        if not triangles:
            print("no matches found")
        for tri in triangles:
            pprint.pp(tri)
except Exception:
    raise
finally:
    driver.close()


no matches found
