In [142]:
from typing import TextIO
import csv
import pprint
from secret import NEO4J_PASSWORD

from neo4j import GraphDatabase, Driver

# Connection URI of the local Neo4j instance; reused by the later cells that open a driver.
URI = "neo4j://localhost"
# (username, password) pair; the password comes from the local `secret` module so it is
# never written into the notebook itself.
AUTH = ("neo4j", NEO4J_PASSWORD)
# Open a short-lived driver only to check that the server can be reached with these
# settings; the context manager closes the driver again when the block ends.
with GraphDatabase.driver(URI, auth=AUTH) as driver:
    driver.verify_connectivity()


In [143]:
# Preview the first 11 raw rows of the pipe-delimited item file.
# The encoding is pinned to ISO-8859-1 (the same encoding read_items uses for this
# file) so the preview does not depend on the platform's default encoding, and
# newline="" is passed because the csv module expects to do its own newline handling.
with open(r"../data/ml-100k/u.item", encoding="iso-8859-1", newline="") as file:
    for i, item in enumerate(csv.reader(file, delimiter="|")):
        pprint.pprint(item, compact=True)
        if i >= 10:
            break

['1', 'Toy Story (1995)', '01-Jan-1995', '',
 'http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)', '0', '0', '0', '1',
 '1', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0']
['2', 'GoldenEye (1995)', '01-Jan-1995', '',
 'http://us.imdb.com/M/title-exact?GoldenEye%20(1995)', '0', '1', '1', '0', '0',
 '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '1', '0', '0']
['3', 'Four Rooms (1995)', '01-Jan-1995', '',
 'http://us.imdb.com/M/title-exact?Four%20Rooms%20(1995)', '0', '0', '0', '0',
 '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '1', '0', '0']
['4', 'Get Shorty (1995)', '01-Jan-1995', '',
 'http://us.imdb.com/M/title-exact?Get%20Shorty%20(1995)', '0', '1', '0', '0',
 '0', '1', '0', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0']
['5', 'Copycat (1995)', '01-Jan-1995', '',
 'http://us.imdb.com/M/title-exact?Copycat%20(1995)', '0', '0', '0', '0', '0',
 '0', '1', '0', '1', '0', '0', '0', '0', '0', '0', '0', '1', '0', '0']
['6', 'Sh

In [144]:
def open_file(csv_file_path: str) -> TextIO:
    """Open ``csv_file_path`` for reading as UTF-8 text and return the open handle.

    The handle is returned *open*: the caller owns it and is responsible for
    closing it (ideally via ``with open_file(path) as f:``). The previous
    implementation returned the handle from inside a ``with`` block, which
    closed the file before the caller could read from it.

    Raises:
        OSError: if the file cannot be opened.
    """
    return open(csv_file_path, encoding="utf-8")

def read_user(csv_file_path):
    """Read a pipe-delimited user file into a list of dicts.

    Each row is keyed by ``userId``, ``age``, ``gender``, ``occupation`` and
    ``zipCode``. Only ``userId`` is converted to ``int``; every other field is
    kept as the string read from the file.

    Raises:
        ValueError: if a ``userId`` value is not an integer literal.
    """
    columns = ["userId", "age", "gender", "occupation", "zipCode"]
    users = []
    with open(csv_file_path, encoding="utf-8") as handle:
        for record in csv.DictReader(handle, fieldnames=columns, delimiter="|"):
            record["userId"] = int(record["userId"])
            users.append(record)
    return users

def populate_user_txn(tx, rows):
    """Transaction function that upserts one ``User`` node per row.

    Args:
        tx: the managed transaction handed in by ``session.execute_write``.
        rows: list of dicts with the keys ``userId``, ``age``, ``gender``,
            ``occupation`` and ``zipCode``.

    Nodes are matched on ``userId`` with MERGE rather than CREATE, so running
    the load a second time updates the existing users instead of duplicating
    them (the same approach the movie loader in this file uses). Any error
    raised by the driver propagates unchanged to the caller.
    """
    query = """
    UNWIND $users AS user
    MERGE (u:User {userId: user.userId})
    SET u.age = user.age,
        u.gender = user.gender,
        u.occupation = user.occupation,
        u.zipCode = user.zipCode
    """
    tx.run(query, users=rows)

def create_users(driver: Driver):
    """Load ``../data/ml-100k/u.user`` and write its rows in one write transaction.

    Opens a session on the ``neo4j`` database, reads the user file with
    ``read_user`` and hands the rows to ``populate_user_txn``. Returns ``None``.
    """
    users_path = r"../data/ml-100k/u.user"
    with driver.session(database="neo4j") as session:
        # The file is read only once the session is open, then written in a
        # single managed transaction.
        user_rows = read_user(users_path)
        session.execute_write(populate_user_txn, user_rows)


def populate_occupation_tnx(tx, rows):
    """Transaction function that creates one ``Occupation`` node per row.

    Args:
        tx: the managed transaction handed in by ``session.execute_write``.
        rows: list of dicts, each carrying an ``occupation`` key.

    Two defects of the earlier version are fixed here: the rows are now bound
    to the ``$occupations`` parameter the query actually reads (they used to be
    passed as ``occupation``), and the trailing comma inside the Cypher map
    literal, which is a syntax error, is gone. Any error raised by the driver
    propagates unchanged to the caller.
    """
    query = """
    UNWIND $occupations AS occupation
    CREATE (:Occupation {
    occupation: occupation.occupation
    })
    """
    tx.run(query, occupations=rows)

def read_items(csv_file_path):
    """Read the pipe-delimited item file and return one dict per (movie, genre) pair.

    The file is decoded as ISO-8859-1. Every row carries five descriptive
    columns followed by nineteen genre flag columns. For each flag equal to
    ``'1'`` one output dict is produced containing the descriptive columns
    (with ``movieId`` converted to ``int``) plus a ``genre`` key naming the
    flag. Rows without any flag set to ``'1'`` contribute nothing.

    Raises:
        ValueError: if a row with at least one genre flag set has a
            ``movieId`` that is not an integer literal.
    """
    base_columns = ["movieId", "title", "releaseDate", "videoReleaseDate", "IMDbURL"]
    genre_columns = [
        "unknown", "action", "adventure", "animation", "childrens", "comedy",
        "crime", "documentary", "drama", "fantasy", "filmNoir", "horror",
        "musical", "mystery", "romance", "sciFi", "thriller", "war", "western",
    ]

    pairs = []
    with open(csv_file_path, encoding="iso-8859-1") as handle:
        reader = csv.DictReader(
            handle, fieldnames=base_columns + genre_columns, delimiter="|"
        )
        for record in reader:
            flagged = [name for name in genre_columns if record[name] == "1"]
            if not flagged:
                continue
            # Conversion happens only for rows that yield output, as before.
            shared = {name: record[name] for name in base_columns}
            shared["movieId"] = int(record["movieId"])
            for name in flagged:
                pairs.append({**shared, "genre": name})
    return pairs


def populate_movies_and_genres_txn(tx, rows):
    """Transaction function that upserts movies, genres and their links.

    Args:
        tx: the managed transaction handed in by ``session.execute_write``.
        rows: list of dicts with ``movieId``, ``title``, ``releaseDate``,
            ``videoReleaseDate``, ``IMDbURL`` and ``genre`` keys.

    For every row the movie is merged on ``movieId`` and its descriptive
    properties are (re)set, the genre is merged on ``name``, and a
    ``HAS_GENRE`` relationship between the two is merged.

    Returns:
        Whatever ``tx.run`` returns for the statement. Driver errors propagate
        unchanged.
    """
    cypher = """
            UNWIND $rows AS row
            MERGE (m:Movie {
                movieId: row.movieId
            })
            SET m.title = row.title,
                m.releaseDate = row.releaseDate,
                m.videoReleaseDate = row.videoReleaseDate,
                m.IMDbURL = row.IMDbURL

            MERGE (g:Genre { name: row.genre })

            MERGE (m)-[:HAS_GENRE]->(g)
        """
    return tx.run(cypher, rows=rows)

def create_movies_and_genres(driver: Driver):
    """Load ``../data/ml-100k/u.item`` and write movies and genres in one write transaction.

    Opens a session on the ``neo4j`` database, reads the item file with
    ``read_items`` and hands the rows to ``populate_movies_and_genres_txn``.
    Returns ``None``.
    """
    items_path = r"../data/ml-100k/u.item"
    with driver.session(database="neo4j") as session:
        movie_genre_rows = read_items(items_path)
        session.execute_write(populate_movies_and_genres_txn, movie_genre_rows)

def read_data(csv_file_path):
    """Read a tab-separated ratings file into a list of dicts.

    Each row is keyed by ``userId``, ``movieId``, ``rating`` and ``timestamp``.
    ``userId`` and ``movieId`` are converted to ``int`` and ``rating`` to
    ``float``; ``timestamp`` stays the string read from the file.

    Raises:
        ValueError: if one of the converted fields is not a valid number.
    """
    columns = ["userId", "movieId", "rating", "timestamp"]
    ratings = []
    with open(csv_file_path) as handle:
        for record in csv.DictReader(handle, fieldnames=columns, dialect="excel-tab"):
            record["movieId"] = int(record["movieId"])
            record["userId"] = int(record["userId"])
            record["rating"] = float(record["rating"])
            ratings.append(record)
    return ratings

def populate_data_txn(tx, rows):
    """Transaction function that upserts ``RATED`` relationships.

    Args:
        tx: the managed transaction handed in by ``session.execute_write``.
        rows: list of dicts with ``userId``, ``movieId``, ``rating`` and
            ``timestamp`` keys.

    For every row the user and the movie are looked up with MATCH, so a row
    whose user or movie does not exist produces no relationship. The
    ``RATED`` relationship is merged per (user, movie) pair and its ``rating``
    and ``timestamp`` properties are (re)set.

    Returns:
        Whatever ``tx.run`` returns for the statement. Driver errors propagate
        unchanged.
    """
    cypher = """
    UNWIND $rows AS row

    MATCH (u:User { userId: row.userId })
    MATCH (m:Movie { movieId: row.movieId })

    MERGE (u)-[r:RATED]->(m)
    SET r.rating = row.rating,
        r.timestamp = row.timestamp
    """
    return tx.run(cypher, rows=rows)

def create_data(driver: Driver):
    """Load ``../data/ml-100k/u.data`` and write the ratings in one write transaction.

    Opens a session on the ``neo4j`` database, reads the ratings file with
    ``read_data`` and hands the rows to ``populate_data_txn``. Returns ``None``.
    """
    ratings_path = r"../data/ml-100k/u.data"
    with driver.session(database="neo4j") as session:
        rating_rows = read_data(ratings_path)
        session.execute_write(populate_data_txn, rating_rows)


def create_user_index_txn(tx):
    """Transaction function that creates the ``user_userId`` index if it is missing.

    The index covers the ``userId`` property of ``User`` nodes. Returns
    whatever ``tx.run`` returns for the statement.
    """
    statement = """
    CREATE INDEX user_userId IF NOT EXISTS FOR (u:User) ON (u.userId)
    """
    outcome = tx.run(statement)
    return outcome

def create_movie_index_txn(tx):
    """Transaction function that creates the ``movie_movieId`` index if it is missing.

    The index covers the ``movieId`` property of ``Movie`` nodes. Returns
    whatever ``tx.run`` returns for the statement.
    """
    statement = """
    CREATE INDEX movie_movieId IF NOT EXISTS FOR (m:Movie) ON (m.movieId)
    """
    outcome = tx.run(statement)
    return outcome
def create_index(driver):
    """Create the user and movie lookup indexes, each in its own write transaction.

    Opens a session on the ``neo4j`` database and runs
    ``create_user_index_txn`` followed by ``create_movie_index_txn``.
    Returns ``None``.
    """
    index_transactions = (create_user_index_txn, create_movie_index_txn)
    with driver.session(database="neo4j") as session:
        for index_txn in index_transactions:
            session.execute_write(index_txn)


In [145]:
# Populate the graph: users, then movies and genres, then the lookup indexes,
# and finally the ratings. The driver is closed when the block exits, whether
# or not one of the steps raised; any exception propagates unchanged.
with GraphDatabase.driver(URI, auth=AUTH) as driver:
    create_users(driver)
    create_movies_and_genres(driver)
    create_index(driver)
    create_data(driver)

In [146]:
def graph_traversal(tx, userId):
    """Transaction function returning up to ten movie recommendations for a user.

    Args:
        tx: the managed transaction handed in by ``session.execute_read``.
        userId: value matched against the ``userId`` property of ``User`` nodes.

    The query collects the movies the target user rated, finds other users who
    rated at least one of the same movies, and from those users' ratings of
    4.0 or higher keeps movies the target has not rated. Results are ordered
    by the average of those ratings, highest first, and limited to ten.

    Returns:
        A list of dicts with the keys ``movie`` and ``avgRating``.
    """
    cypher = """
    WITH $userId AS targetUser, 4.0 AS ratingThreshold
    MATCH (target:User {userId: targetUser})-[:RATED]->(m:Movie)
    WITH target, collect(m) AS watched, ratingThreshold
    MATCH (target)-[:RATED]->(common:Movie)<-[:RATED]-(similar:User)
    WHERE target <> similar
    MATCH (similar)-[r:RATED]->(rec:Movie)
    WHERE NOT rec IN watched AND r.rating >= ratingThreshold
    WITH rec, avg(r.rating) AS avgRating
    RETURN rec.title AS movie, avgRating
    ORDER BY avgRating DESC
    LIMIT 10
    """
    recommendations = []
    for row in tx.run(cypher, userId=userId):
        recommendations.append(row.data())
    return recommendations

# Fetch and print the recommendations for user 13 in a read transaction.
# Both the session and the driver are closed when the block exits; any
# exception propagates unchanged.
with GraphDatabase.driver(URI, auth=AUTH) as driver:
    with driver.session(database="neo4j") as session:
        reccs = session.execute_read(graph_traversal, 13)
        for rec in reccs:
            print(rec)

{'movie': 'Quiet Room, The (1996)', 'avgRating': 5.0}
{'movie': 'Lassie (1994)', 'avgRating': 5.0}
{'movie': 'Incognito (1997)', 'avgRating': 5.0}
{'movie': 'They Made Me a Criminal (1939)', 'avgRating': 5.0}
{'movie': 'Lay of the Land, The (1997)', 'avgRating': 5.0}
{'movie': 'Flintstones, The (1994)', 'avgRating': 5.0}
{'movie': 'Maya Lin: A Strong Clear Vision (1994)', 'avgRating': 5.0}
{'movie': 'Turbo: A Power Rangers Movie (1997)', 'avgRating': 5.0}
{'movie': 'Solo (1996)', 'avgRating': 5.0}
{'movie': 'Rendezvous in Paris (Rendez-vous de Paris, Les) (1995)', 'avgRating': 5.0}


In [149]:
def find_movie_triangles(tx, rating_threshold=4.0):
    """Transaction function finding same-genre movie triples liked by the same users.

    Args:
        tx: the managed transaction handed in by ``session.execute_read``.
        rating_threshold: minimum rating a user must have given to each of the
            three movies for that user to be counted.

    A triple is three movies linked to the same genre, ordered by ascending
    ``movieId`` so each combination is reported once per genre. For every
    triple the query counts the users who rated all three movies at or above
    the threshold and orders the rows by that count, highest first.

    The third movie is matched from ``g`` directly. The earlier pattern walked
    ``(m1)-[:HAS_GENRE]->(g)`` twice inside one MATCH clause; Cypher never
    binds the same relationship twice within a single MATCH, and the loader
    merges only one HAS_GENRE relationship per movie/genre pair, so that
    pattern could not match anything.

    NOTE(review): the result is not limited inside the query and the number of
    candidate triples grows cubically with the number of movies per genre, so
    this can be slow on the full data set.

    Returns:
        A list of dicts with the keys ``genre``, ``movie1``, ``movie2``,
        ``movie3`` and ``numUsers``.
    """
    query = """
    WITH $ratingThreshold AS ratingThreshold
    MATCH (m1:Movie)-[:HAS_GENRE]->(g:Genre)<-[:HAS_GENRE]-(m2:Movie),
          (g)<-[:HAS_GENRE]-(m3:Movie)
    WHERE m1.movieId < m2.movieId AND m2.movieId < m3.movieId
    MATCH (u:User)-[r1:RATED]->(m1),
          (u)-[r2:RATED]->(m2),
          (u)-[r3:RATED]->(m3)
    WHERE r1.rating >= ratingThreshold
      AND r2.rating >= ratingThreshold
      AND r3.rating >= ratingThreshold
    RETURN g.name AS genre,
           m1.title AS movie1,
           m2.title AS movie2,
           m3.title AS movie3,
           count(u) AS numUsers
    ORDER BY numUsers DESC
    """
    result = tx.run(query, ratingThreshold=rating_threshold)
    return [record.data() for record in result]


# Run the triangle query with a rating threshold of 4 and print the first ten
# rows. Both the session and the driver are closed when the block exits; any
# exception propagates unchanged.
with GraphDatabase.driver(URI, auth=AUTH) as driver:
    with driver.session(database="neo4j") as session:
        triangles = session.execute_read(find_movie_triangles, 4)
        for tri in triangles[:10]:
            print(tri)
