# GDMA Project
Author: Julian Schelb (1069967)

In [4]:
from neo4j import GraphDatabase
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

### Connection to the database instance

In [5]:
driver = GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "subatomic-shrank-Respond"))
database_name = "cddb"
session = driver.session(database = database_name)

***

### Task 4: Searching and Ranking

Implement a simple search engine that enables search by artist, album and
song name/title. The results must be ranked based on importance. It is up to
you to come up with how the importance of each result is computed and you
must justify your decision (it goes without saying that you need to come up
with a meaningful definition). However, the importance should ideally take into
account user preferences/likes. As such, this task is split in two parts:

**1. Write a Cypher query that adds a relationship :LIKES between a node with**

label :User and an artist, album, or song. Every user should be identified
just by a numerical userID (no more information is necessary). If a user
already exists in the system, no additional node should be added. After
coming up with the necessary Cypher query, add a significant number of
users and likes.

**2. Implement a simple Python function that has the following arguments:**

- the userID of the user submitting the search (the user ID may not
exist in the database),
- a string that contains one or more keywords for the search, and
- an optional argument that indicates whether the search is on all or
a specific field, i.e., artist, album, song.
The search must return exactly 10 results.

Python must only be used to call the database. You should not write any
code in Python that implements functionality necessary for the task. However,
submitting multiple queries in the same function call is allowed. Also, for this
task of the project, you are not only allowed but also encouraged to use functions
from the GDS library of Neo4j. Hence, before making any decisions, have a
careful look at the available functions. Again, you have to justify the use of any
function that you employ

***

#### Adding Example Users

Function for creating a user node:

In [3]:
def createUser(driver, database_name: str = "cddb", user_id: int = None):
    
    if type(user_id) is not int:
        raise TypeError('User ID must be a Number')
    if user_id < 0:
        raise ValueError('User ID must be a positive Number')
        
        
    query = """
    MERGE (u:User {id:  $user_id})
    RETURN u.id as user_id
    """
    
    with driver.session(database = database_name) as session:
        session.run(query, user_id = user_id)
    

In [4]:
createUser(driver, user_id = 99)

Function for deleting a user node and all relations:

In [5]:
def deleteUser(driver, database_name: str = "cddb", user_id: int = None):
    
    if type(user_id) is not int:
        raise TypeError('User ID must be a Number')
    if user_id < 0:
        raise ValueError('User ID must be a positive Number')
        
        
    query = """
    MATCH (u:User)
    WHERE u.id = $user_id
    DETACH DELETE u
    """
    
    with driver.session(database = database_name) as session:
        session.run(query, user_id = user_id)

In [6]:
deleteUser(driver, user_id = 99)

Function for letting user like a song, album or artist:

In [7]:
def addLike(driver, database_name: str = "cddb", user_id: int = None, node_id: int = None, node_label: str = None):
    
    # Validate user_id
    if type(user_id) is not int:
        raise TypeError('User ID must be a Number')
    if user_id < 0:
        raise ValueError('User ID must be a positive Number')
    
    # Validate node_id
    if type(node_id) is not int:
        raise TypeError('Node ID must be a Number')
    if node_id < 0:
        raise ValueError('Node ID must be a positive Number')
        
    # Validate node_label    
    if node_label not in ["Song", "Album", "Artist"]:
        raise ValueError('Node type must be Song, Album or Artist')
    
    createUser(driver, user_id = user_id)
    
    query = f"""
    MATCH (u:User)
    WHERE u.id = {user_id}
    MATCH (n: {node_label})
    WHERE n.id = {node_id}
    MERGE (u)-[r:LIKES]->(n)
    """
    
    with driver.session(database = database_name) as session:
        session.run(query)
    

In [8]:
addLike(driver, user_id = 99, node_id = 10, node_label = "Song")

Function for creating an example user who listens to a given genre:

In [9]:
def createExampleUser(driver, user_id: int = 1, genre: str = "rock", limit: int = 50):
    
    ####### CREATE USER #######
    
    deleteUser(driver, user_id = user_id)
    createUser(driver, user_id = user_id)
    
    ####### LIKE SONGS #######
    
    query = """
    MATCH (g:Genre)<-[r:BELONGS_TO]-(c:CD)
    MATCH (c)-[r2:CONTAINS]->(s:Song)
    WHERE g.genre = $genre
    WITH DISTINCT s 
    LIMIT $limit
    RETURN s.id as id
    """
    
    with driver.session(database = database_name) as session:
        results = session.run(query, user_id = user_id, genre = genre, limit = limit)
        for row in results:
            addLike(driver, user_id = user_id, node_id = row["id"], node_label = "Song")
            
    ####### LIKE ALBUMS #######
    
    query = """
    MATCH (g:Genre)<-[r:BELONGS_TO]-(c:CD)
    MATCH (c)-[r2:CONTAINS]->(s:Album)
    WHERE g.genre = $genre
    WITH DISTINCT s 
    LIMIT $limit
    RETURN s.id as id
    """
    
    with driver.session(database = database_name) as session:
        results = session.run(query, user_id = user_id, genre = genre, limit = limit)
        for row in results:
            addLike(driver, user_id = user_id, node_id = row["id"], node_label = "Album")
            
    ####### LIKE ARTISTS #######
    
    query = """
    MATCH (g:Genre)<-[r:BELONGS_TO]-(c:CD)
    MATCH (c)-[r2:CONTAINS]->(s:Artist)
    WHERE g.genre = $genre
    WITH DISTINCT s 
    LIMIT $limit
    RETURN s.id as id
    """
    
    with driver.session(database = database_name) as session:
        results = session.run(query, user_id = user_id, genre = genre, limit = limit)
        for row in results:
            addLike(driver, user_id = user_id, node_id = row["id"], node_label = "Artist")
    

In [10]:
createExampleUser(driver, user_id = 99, genre = "rock", limit = 50)

**Adding some Likes to emulate User Preference:**

In [11]:
# User 1 likes "Rock" music
createExampleUser(driver, user_id = 1, genre = "rock", limit = 50)
# User 2 likes "classic" music
createExampleUser(driver, user_id = 2, genre = "classic", limit = 50)
# User 3 likes "pop" music
createExampleUser(driver, user_id = 3, genre = "pop", limit = 50)
# User 4 likes "hip-hop" music
createExampleUser(driver, user_id = 4, genre = "hip-hop", limit = 50)
# User 5 likes "hip-hop" music
createExampleUser(driver, user_id = 5, genre = "hard rock", limit = 50)

***

### Implementation Explained:

This search incorporates nodes corresponding to the input string on the one hand, and the previously liked albums, artists and songs on the other hand. Two separate scores are calculated in a similar manner. Separately calculated Centrality Scores form the basis. Separate graph projections are created for nodes that match the search input and for nodes that have been previously liked. Subsequently, a centrality score is calculated for each node in both. The basic idea is that the most relevant CDs will have a high centrality score. The "Content Match" score and the "User Preference" score are then combined in a weighted manner. The ten CDs with the highest combined score are presented to the user. 

![](Figures/search.png)

***

### Precompute User Preference:

Function for creating graph projection with all nodes liked by the user and CDs connected to those nodes.

In [13]:
def loadUserPreferences(driver, database_name = "cddb", user_id: int = None):
        
    ####### DELETE EXISTING PROJECTION #######
    
    query_delete = """
    CALL gds.graph.drop('searchdomain_preference', false) 
    YIELD graphName 
    RETURN graphName
    """
    
    ####### CREATE NEW PROJECTION #######
    
    query_create = f"""
    // CREATE NEW PROJECTION WITH SEARCH RELEVANT SUB GRAPH
    CALL gds.graph.project.cypher(
      'searchdomain_preference',
      
      ' // Liked Artists, Albums and Songs
        MATCH (u:User)-[:LIKES]->(n) 
        WHERE u.id = {user_id}
            AND (n:Song OR n:Album OR n:Artist) 
        RETURN id(n) AS id, labels(n) AS labels 
        UNION
        // CDs linked to liked Artists, Albums and Songs
        MATCH (u:User)-[:LIKES]->(x)-[:APPEARED_ON]->(n:CD) 
        WHERE u.id = {user_id}
        RETURN id(n) AS id, labels(n) AS labels ',

        'MATCH (u:User)-[:LIKES]->(n)
        WHERE u.id = {user_id}
        AND (n:CD OR n:Song OR n:Album OR n:Artist) 
        MATCH (n)-[r:APPEARED_ON]->(m:CD) 
        RETURN id(n) AS source, id(m) AS target, type(r) AS type' 
    )
    YIELD
      graphName, nodeCount AS nodes, relationshipCount AS rels
    RETURN graphName, nodes, rels
    """

    with driver.session(database = database_name) as session:
        results = session.run(query_delete)
        results = session.run(query_create)


In [14]:
loadUserPreferences(driver, user_id = 1)

Calculate how much a user preferes a CD using the centrality of a CD in the subgraph ob liked nodes by the user as importance measure.

In [15]:
def calcPreferredCD(driver, database_name = "cddb", user_id: int = None):
    
    ####### CHECK IF PROJECTION EXISTS #######
    
    query_check = """
    CALL gds.graph.exists("searchdomain_preference")
    YIELD graphName, exists
    RETURN graphName, exists
    """
    
    ####### DELETE EXISTING RELATIONS #######
    
    query_delete = """
    WITH $user_id as userID 
    MATCH (:User {id: userID})-[r:PREFERRES]->(c:CD) 
    DELETE r
    """
    
    ####### CREATE NEW RELATIONS #######
    
    query_create = """
    WITH $user_id as userId
    CALL gds.eigenvector.stream('searchdomain_preference')
    YIELD nodeId, score
    WITH gds.util.asNode(nodeId).id AS nodeId, score, userId
    MATCH (c:CD {id: nodeId})
    MATCH (u:User {id: userId})
    MERGE (c)<-[r:PREFERRES {score: score}]-(u)
    """

    with driver.session(database = database_name) as session:
        
        # Check if projection exists
        results = session.run(query_check, user_id = user_id)
        graph_exists = bool(results.single()["exists"])

        if graph_exists:
            results = session.run(query_delete, user_id = user_id)
            results = session.run(query_create, user_id = user_id)

In [16]:
calcPreferredCD(driver, user_id = 1)

Calculate how much a user prefers a Genre by counting how many liked nodes are associated with which genre.

In [17]:
def calcPreferredGenre(driver, database_name = "cddb", user_id: int = None):
    
    ####### DELETE EXISTING RELATIONS #######
    
    query_delete = """
    WITH $user_id as userID 
    MATCH (:User {id: userID})-[r:PREFERRES]->(g:Genre) 
    DELETE r
    """
    
    ####### CREATE NEW RELATIONS #######
    
    query_create = """
    WITH $user_id as userID
    
    // Determine total count for normalization
    CALL {
        MATCH (u:User)-[r1:LIKES]->(n:Album)
        MATCH (n)-[r2:APPEARED_ON]-(c:CD)
        MATCH (c)-[:BELONGS_TO]->(g:Genre) 
        WHERE u.id = 1
        RETURN COUNT(DISTINCT c.id) as countTotal
    }
    
    // Determine count per genre
    MATCH (u:User)-[r1:LIKES]->(n:Album)
    MATCH (n)-[r2:APPEARED_ON]-(c:CD)
    MATCH (c)-[:BELONGS_TO]->(g:Genre) 
    WHERE u.id = userID
    WITH g, c, u, countTotal, userID
    WITH 
        DISTINCT g.id as id, g.genre as genre, u, g, 
        count(DISTINCT c.id) as countGenre, countTotal,
        userID
    WITH id, genre, userID, u, g,  countGenre, countTotal, 
    (toFloat(countGenre) / toFloat(countTotal)) as prob
    
    // Create relation between user and genre
    MERGE (u)-[:PREFERRES {score: prob}]->(g)
    RETURN *
    ORDER BY countGenre DESC
    """

    with driver.session(database = database_name) as session:
        results = session.run(query_delete, user_id = user_id)
        results = session.run(query_create, user_id = user_id)

In [18]:
calcPreferredGenre(driver, user_id = 1)

Function for precompute preferences per user:

In [19]:
def processUserPreference(driver, database_name = "cddb"):

    query = """
    MATCH (u:User)
    RETURN u.id as userID
    """

    with driver.session(database = database_name) as session:
        results = session.run(query)
        for row in results:
            user_id = row["userID"]
            print(f"Indexing Preferences for User {user_id}")
            loadUserPreferences(driver, user_id = user_id)
            calcPreferredCD(driver, user_id = user_id)
            calcPreferredGenre(driver, user_id = user_id)

In [20]:
processUserPreference(driver)

Indexing Preferences of User 1
Indexing Preferences of User 99
Indexing Preferences of User 2
Indexing Preferences of User 3
Indexing Preferences of User 4
Indexing Preferences of User 5


***

### Search

Function for finding similar nodes (artists, albums, songs) based on text similarity to the search input. CDs connected to many similar nodes are regarded as more important. This content match score is combined with the user preference score for CDs and genre. Finally, a re-ranked list of relevant CDs is augmented with some meta information and presented to the user. 

In [36]:
def searchFor(driver, database_name = "cddb", user_id = 1, search_input = "", search_mask = "all"):

    query = """// Search input
    WITH
    //1 as userID,
    //'Jimi Hendrix purple haze are you experienced' as searchQuery,
    //'Ludwig van Beethoven Für Elise Symphony' as searchQuery,
    //'all' as searchMask
    $user_id as userID,
    $search_input as searchQuery,
    $search_mask as searchMask
    
    // Find matching artists, songs and albums 
    // based on text similarity
    CALL {
        // Artists
        WITH searchQuery, searchMask
        CALL db.index.fulltext.queryNodes('artists', searchQuery) 
        YIELD node, score
        WHERE searchMask = 'all' or searchMask = 'artists'
        RETURN node, score
        UNION 

        // Songs
        WITH searchQuery, searchMask
        CALL db.index.fulltext.queryNodes('songs', searchQuery) 
        YIELD node, score
        WHERE searchMask = 'all' or searchMask = 'songs'
        RETURN node, score
        UNION 

        // Albums
        WITH searchQuery, searchMask
        CALL db.index.fulltext.queryNodes('albums', searchQuery) 
        YIELD node, score
        WHERE searchMask = 'all' or searchMask = 'albums'
        RETURN node, score
    }
    WITH node, score, userID
    MATCH (node)-[r:APPEARED_ON]->(c:CD)
    WITH DISTINCT c.id as id, c,  sum(score) as score_content, userID

    // Get more information about the CDs 
    MATCH (c)-[:CONTAINS]->(ar:Artist)
    MATCH (c)-[:CONTAINS]->(ab:Album)
    MATCH (c)-[:CONTAINS]->(so:Song)
    MATCH (c)-[:BELONGS_TO]->(ge:Genre)
    OPTIONAL MATCH (c)<-[r:PREFERRES]-(u:User {id: userID})
    OPTIONAL MATCH (ge)<-[r2:PREFERRES]-(u2:User {id: userID})

    // Compile final list of search results
    RETURN 
    id, userID as user, score_content, sum(r.score) as score_pref, r2.score as score_genre ,
    score_content + (sum(r.score)  * score_content) + (COALESCE(r2.score, 0)  * score_content) as score_combined,
    collect(DISTINCT ge.genre) as genres,
    collect(DISTINCT ar.artist) as artists,
    collect(DISTINCT ab.album) as albums, 
    collect(DISTINCT so.song) as songs

    ORDER BY score_combined DESC
    LIMIT 100"""

    results = pd.DataFrame([dict(_) for _ in session.run(query, user_id = user_id, 
                                                         search_input = search_input, 
                                                         search_mask = search_mask)])
    return results
    #dtf_data.head(30)

***

### Demonstrating Search

Exampe Query 1:

In [39]:
results = searchFor(driver, database_name = "cddb", 
                    user_id = 1,
                    search_input = "Jimi Hendrix purple haze are you experienced", 
                    search_mask = "all")
results.head(10)

Unnamed: 0,id,user,score_content,score_pref,score_genre,score_combined,genres,artists,albums,songs
0,677,1,31.538232,0.56632,0.663265,70.317167,[rock],[jimi hendrix],[are you experienced],"[third stone from the sun, remember, fire, can..."
1,35734,1,31.538232,0.55419,0.663265,69.934627,[rock],[jimi hendrix],[are you experienced],"[i don't live today, 51st anniversary, stone f..."
2,136907,1,31.538232,0.55419,0.663265,69.934627,[rock],[jimi hendrix],[are you experienced],"[love or confusion, remember, highway chile, m..."
3,46232,1,26.723665,0.692738,0.663265,62.961037,[rock],[jimi hendrix],[experience hendrix the best of jimi hendrix],"[foxey lady, dolly dagger, bold as love, if 6 ..."
4,162186,1,26.723665,0.658101,0.663265,62.035412,[rock],[jimi hendrix],[experience hendrix: the best of jimi hendrix],"[if six was nine, foxey lady, bold as love, ni..."
5,7923,1,56.690317,0.0,0.071429,60.739626,[blues],[signature licks],[jimi hendrix],"[fire (verse);, foxey lady (solo, slow);, fire..."
6,1936,1,24.346131,0.692738,0.663265,57.35956,[rock],[jimi hendrix],[experience hendrix],"[bold as love, night bird flying, if 6 was 9, ..."
7,33321,1,28.278246,0.21237,0.663265,53.039673,[rock],[jimi hendrix],[astro man(alchemy); - studio outtakes 1966-68],"[purple haze 1 (4);, 51st anniversary (5);, la..."
8,2023,1,26.723665,0.692738,0.071429,47.14499,[blues],[jimi hendrix],[experience hendrix - the best of jimi hendrix],"[bold as love, night bird flying, if 6 was 9, ..."
9,17266,1,20.769874,0.346369,0.663265,41.739848,[rock],[jimi hendrix],[are you experienced],"[3rd stone from the sun, remember, foxy lady, ..."


Example Query 2:

In [40]:
results = searchFor(driver, database_name = "cddb", 
                    user_id = 2,
                    search_input = "Ludwig van Beethoven Für Elise Symphony", 
                    search_mask = "all")
results.head(10)

Unnamed: 0,id,user,score_content,score_pref,score_genre,score_combined,genres,artists,albums,songs
0,140152,2,171.903979,0.0,0.510204,259.610091,[classical],[pollini maurizio],[kaiser 5.2],[erläuterungen von joachim kaiser mit musikbei...
1,117461,2,42.674408,0.090324,0.510204,68.301594,[classical],[ludwig van beethoven],[symphony no. 3 and no. 8],"[beethoven, symphony no. 3 hero, op. 55, beeth..."
2,35385,2,64.986593,0.0,,64.986593,[comedy],[kai + sven],[eins dreissig],"[ein praktikant für alle fälle holzfäller, die..."
3,49068,2,40.887329,0.0,0.510204,61.748212,[classical],[strauss],[alpine symphony op64 - ashkenazy],"[an alpine symphony op64 sonnenuntergang, an a..."
4,128963,2,37.272938,0.0,0.510204,56.289743,[classical],[classical],"[kamien, roger - music an appreciation - 5th e...","[mozart piana concerto in a major, k. 488, fir..."
5,125232,2,35.750519,0.0,0.510204,53.99058,[classical],[orkest van de weense staatsopera olv anton pa...,"[sissi, de muziek van een sprookjes-keizerin]","[leichte kavallerie van frans van suppe, wiene..."
6,48561,2,34.394371,0.0,0.510204,51.942519,[classical],[jan decleir],[peter & de wolf],"[voorstelling van de steeldrum, voorstelling v..."
7,49477,2,34.323475,0.0,0.510204,51.835453,[classical],"[mozart, wolfgang amadeus]",[mozart - complete symphoniesof12 [sir neville...,"[symphony 1 in g, kv 11 ,75b 1. allegro, symph..."
8,44122,2,29.651714,0.21452,0.510204,51.14102,[classical],[mozart],[mozart: salzburg symphonies],"[symphony no 5 in b, k allegro, symphony no 1 ..."
9,112843,2,33.304469,0.0,0.510204,50.296544,[classical],[wilhelm furtwängler],[beethoven furtwängler],[ludwig van beethoven pinao concerto no.5 in e...


In [1]:
from search import Search

**Create Instance of Search Engine:**

In [3]:
searchEngine = Search(url = "bolt://localhost:7687", user= "neo4j", 
                      password = "subatomic-shrank-Respond", database_name = "cddb")

**Search for Jimi Hendrix:**

In [4]:
user_id = 1
search_input = "Jimi Hendrix purple haze are you experienced"
search_mask = "all"

results = searchEngine.searchInGraph(user_id = 1, search_input = search_input, search_mask= search_mask)
results.head(30)

Unnamed: 0,nodeId,count,score_cont,score_pref,artists,albums,songs
0,7923,1,0.265149,0.0,[signature licks],[jimi hendrix],"[fire (verse);, foxey lady (solo, slow);, fire..."
1,2023,2,0.133564,0.034758,[jimi hendrix],[experience hendrix - the best of jimi hendrix],"[bold as love, night bird flying, if 6 was 9, ..."
2,35734,2,0.133564,0.034758,[jimi hendrix],[are you experienced],"[i don't live today, 51st anniversary, stone f..."
3,677,2,0.133564,0.034758,[jimi hendrix],[are you experienced],"[third stone from the sun, remember, fire, can..."
4,46232,2,0.133564,0.034758,[jimi hendrix],[experience hendrix the best of jimi hendrix],"[foxey lady, dolly dagger, bold as love, if 6 ..."
5,136907,2,0.133564,0.034758,[jimi hendrix],[are you experienced],"[love or confusion, remember, highway chile, m..."
6,162186,2,0.133564,0.034758,[jimi hendrix],[experience hendrix: the best of jimi hendrix],"[if six was nine, foxey lady, bold as love, ni..."
7,20222,2,0.133564,0.017759,[jimi hendrix],[best of jimi hendrix],"[blues, blues, free spirit, star spangled bann..."
8,33321,2,0.133564,0.017759,[jimi hendrix],[astro man(alchemy); - studio outtakes 1966-68],"[purple haze 1 (4);, 51st anniversary (5);, la..."
9,30138,2,0.133564,0.017759,[jimi hendrix],[astro man box set],"[can you see me (pre unre);, purple haze (pre ..."


**Search for Beethoven:**

In [5]:
user_id = 2
search_input = "Ludwig van Beethoven Für Elise Symphony"
search_mask = "all"

results = searchEngine.searchInGraph(user_id = 1, search_input = search_input, search_mask= search_mask)
results.head(30)

Unnamed: 0,nodeId,count,score_cont,score_pref,artists,albums,songs
0,140152,1,0.14467,0.0,[pollini maurizio],[kaiser 5.2],[erläuterungen von joachim kaiser mit musikbei...
1,33062,1,0.0,0.255736,[ben harper],[fight for your mind],"[one road to freedom, give a man a home, oppre..."
2,167085,1,0.0,0.238738,[jannedaarc],[arcadia],"[romancã, heavy damage, acid breath, dolls, wi..."
3,28698,1,0.0,0.221739,[ben harper],[fight for your mind],"[oppression, excuse me mr, give a man a home, ..."
4,141988,1,0.108912,0.0,[philip jones bläserensemble],[trumpet voluntary],"[sonata pian' e forte (giovanni gabrieli);, ea..."
5,166362,1,0.0,0.187743,[the coral],[the invisible invasion],"[cripples crown, gina jones, leeslunchboxbyblu..."
6,13220,1,0.0,0.170744,[groundhogs],[who will save the world],"[music is the food of thought, wages of peace,..."
7,11783,1,0.0,0.170744,[groundhogs],[who will save the world],"[wages of peace, the grey maze, body in mind, ..."
8,124071,1,0.073155,0.0,[ludwig van beethoven],[beethoven for meditation],"[piano and wind quintet andante cantabile, sep..."
9,114751,1,0.073155,0.0,[ludwig van beethoven],[beethoven greatest hits],"[choral fantasy conclusion, moonlight sonata a..."
