# GDMA - Assignment 2

Autor: Julian Schelb (1069967)

In [1]:
from neo4j import GraphDatabase
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

### Connection to the database instance

In [2]:
driver = GraphDatabase.driver("bolt://localhost:7687", auth=("neo4j", "subatomic-shrank-Respond"))

### Question 1

Import the input graph into Neo4j.

#### Remove all Nodes

In [122]:
query = """
match (a) delete a
"""

with driver.session() as session:
    result = session.run(query)
    
    
query = """
match (a) -[r] -> () delete a, r
"""

with driver.session() as session:
    result = session.run(query)
    

#### Import Data from CSV File

In [123]:
query = """
LOAD CSV FROM 'file:///edges.csv' AS row FIELDTERMINATOR ';'
WITH row[0] as edgeId, row[1] as sourceId, row[2] as targetId, row[3] as cost1, row[4] as cost2
MERGE (s:GenericNode {id: sourceId})
MERGE (t:GenericNode {id: targetId})
MERGE (s)-[:RELATED_TO {edgeId: toInteger(edgeId), cost1: toInteger(cost1), cost2: toInteger(cost2)}]->(t)   
RETURN s, t
"""

with driver.session() as session:
    result = session.run(query)

### Question 2

Implement approximate distance query processing with landmarks. First,
select X landmarks at random (how many landmarks you select is up to
you, but keep the number low). Assign a label :Landmark to X number of
nodes. Then write the Cypher code required to compute all the distances
from and to the selected landmarks. For the computation of the distances
consider only cost1. Store the distances in relationships between nodes
and landmarks. Then, write the appropriate Cypher code to compute the
approximate distance between any given pair of nodes. The approximate
distance should be the average of the best upper bound and the best lower
bound, with these bounds being computed using the landmarks

#### Remove Landmark Label

In [126]:
query = """
MATCH (a)
REMOVE a :Landmark 
RETURN a
"""

with driver.session() as session:
    result = session.run(query)

#### Randomly assign Landmark Label

In [128]:
query = """
MATCH (a)-[:RELATED_TO]->(t)
WHERE  a <> t
WITH DISTINCT a
ORDER BY rand()
LIMIT 2
SET a :Landmark 
RETURN a.id as landmark_node_id
"""
        
dtf_data = pd.DataFrame([dict(_) for _ in session.run(query)])
dtf_data

Unnamed: 0,landmark_node_id
0,5
1,0


#### Calculate Distance to every Landmark

Drop projection if exists already:

In [133]:
#query = """
#CALL gds.graph.drop('nativeGraph') YIELD graphName;
#"""

#dtf_data = pd.DataFrame([dict(_) for _ in session.run(query)])
#dtf_data

Unnamed: 0,graphName
0,nativeGraph


In [134]:
query = """
CALL gds.graph.list()
"""
        
dtf_data = pd.DataFrame([dict(_) for _ in session.run(query)])
dtf_data

Create graph projection:

In [135]:
query = """
CALL gds.graph.project(
  'nativeGraph',
  'GenericNode',
  {
    RELATED_TO: {
      type: 'RELATED_TO',
      properties: ['edgeId', 'cost1']
    }
  }
)
YIELD graphName
"""

with driver.session() as session:
    result = session.run(query)
    for record in result:
        print(record["graphName"])

nativeGraph


Calculate shortest distances and save them as edges:

In [136]:
query = """
CALL gds.alpha.allShortestPaths.stream('nativeGraph', {
  relationshipWeightProperty: 'cost1'
})
YIELD sourceNodeId, targetNodeId, distance
WITH sourceNodeId, targetNodeId, distance
WHERE gds.util.isFinite(distance) = true

MATCH (source:GenericNode) WHERE id(source) = sourceNodeId
MATCH (target:Landmark) WHERE id(target) = targetNodeId
WITH source, target, distance WHERE source <> target

MERGE (source)-[:SHORTEST_DISTANCE {distance: distance }]->(target)
MERGE (source)<-[:SHORTEST_DISTANCE {distance: distance }]-(target)
RETURN source.id AS source, target.id AS target, distance
ORDER BY source ASC, target ASC, distance DESC
LIMIT 1000
"""
        
dtf_data = pd.DataFrame([dict(_) for _ in session.run(query)])
dtf_data

Unnamed: 0,source,target,distance
0,0,3,3.0
1,0,5,6.0
2,0,6,8.0
3,1,0,5.0
4,1,3,2.0
5,1,5,5.0
6,1,6,7.0
7,2,0,4.0
8,2,3,3.0
9,2,5,6.0


**Choose source and target node id:** 

In [153]:
source_node_id = "1"
source_target_id = "3"

In [154]:
query = """
MATCH (s)-[r:SHORTEST_DISTANCE]->(l:Landmark)-[r2:SHORTEST_DISTANCE]->(t)
WHERE s.id = '{0}' AND t.id = '{1}'
WITH 
//l.id, 
//r.distance, r2.distance, 
min(r.distance + r2.distance) as upper, 
max(abs(r2.distance - r.distance)) as lower
RETURN upper, lower, (upper + lower) / 2 as distance_approx
"""
query = query.format(source_node_id, source_target_id)      
        
dtf_data = pd.DataFrame([dict(_) for _ in session.run(query)])
dtf_data

Unnamed: 0,upper,lower,distance_approx
0,8.0,2.0,5.0


### Question 3

Compute all shortest paths between any pair of nodes. Then print for
each edge (distinguished by edgeId), the number of shortest paths that
contain that edge.

In [143]:
query = """
//MATCH (source: GenericNode {id: "4"}), (target: GenericNode {id: "1"})
MATCH (source: GenericNode), (target: GenericNode)
CALL gds.shortestPath.dijkstra.stream('nativeGraph', {
    sourceNode: source,
    targetNode: target,
    relationshipWeightProperty: 'cost1'
})
YIELD index, sourceNode, targetNode, totalCost, nodeIds, costs, path
WITH path, nodeIds, totalCost
UNWIND 
    range(0, length(path) - 1) as id_pos
    //relationships(path) as rels_in_path
MATCH (n1)-[r:RELATED_TO]->(n2)
WHERE id(n1) = nodeIds[id_pos]
AND id(n2) = nodeIds[id_pos+1]
RETURN r.edgeId as edgeId, count(*) as count
ORDER BY r.edgeId
"""

dtf_data = pd.DataFrame([dict(_) for _ in session.run(query)])
dtf_data

Unnamed: 0,edgeId,count
0,1,1
1,2,6
2,4,5
3,5,1
4,6,1
5,7,1
6,8,3
7,9,3
8,10,6
9,11,5


### Question 4

Implement a scheme to compute all paths between nodes given a constraint
on both costs e.g., find all paths between node 0 and node 6 such that the
cumulative cost1 of the path is less than 16, and the cumulative cost2
of the path is more than 10.


In [140]:
query = """
MATCH p=(s:GenericNode{id:"1"})-[r:RELATED_TO*..7]->(t:GenericNode{id:"6"})
UNWIND r as r_i
WITH p, sum(r_i.cost1) as cost1, sum(r_i.cost2) as cost2
WHERE cost1 < 16 AND cost2 > 10
RETURN [n in nodes(p)| n.id] as path, cost1, cost2
"""

dtf_data = pd.DataFrame([dict(_) for _ in session.run(query)])
dtf_data

Unnamed: 0,path,cost1,cost2
0,"[1, 0, 3, 5, 4, 6]",15,22
1,"[1, 0, 3, 5, 6]",14,21
2,"[1, 3, 4, 7, 4, 6]",15,33
3,"[1, 3, 4, 7, 6]",11,25
4,"[1, 3, 4, 7, 6, 5, 6]",15,41
...,...,...,...
114,"[1, 6, 4, 5, 4, 7, 6]",15,29
115,"[1, 6, 4, 5, 4, 6]",13,24
116,"[1, 6, 4, 5, 4, 6, 7, 6]",15,29
117,"[1, 6, 4, 5, 6]",12,23
