# GDMA - Assignment 4

Author: Julian Schelb (1069967)

In [1]:
import os

import matplotlib.pyplot as plt
import pandas as pd
from neo4j import GraphDatabase

### Connection to the database instance

In [2]:
# Connection settings can be overridden through environment variables so that
# the URI and credentials do not have to be edited into the notebook. The
# fallbacks are the values of the local instance this notebook was written for.
neo4j_uri = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
neo4j_user = os.environ.get("NEO4J_USER", "neo4j")
neo4j_password = os.environ.get("NEO4J_PASSWORD", "subatomic-shrank-Respond")

driver = GraphDatabase.driver(neo4j_uri, auth=(neo4j_user, neo4j_password))

# Notebook-wide session (no database argument given); the first pattern-count
# cells run their queries through it.
session = driver.session()

### Question 1

Import the graph from the attached email-Eu-core.txt file. The file
represents a directed graph. 

Write the necessary Cypher and Python code to compute the significance
of each pattern. Use the configuration model and generate 5 random
networks. All subgraph matching queries must be executed in Neo4j. For
patterns (a) and (b) consider all matches (non-induced subgraphs) while
for pattern (c) consider induced subgraphs.

**Import Data:**

In [21]:
# Load the edge list: each row is split on a single space into a source id and
# a target id. MERGE is used for nodes and relationships, so running the cell
# again does not create duplicates. Ids are stored as read (no conversion).
query = """
LOAD CSV FROM 'file:///email-Eu-core.txt' AS row FIELDTERMINATOR ' '
WITH row[0] as sourceId, row[1] as targetId
MERGE (s:Node {id: sourceId})
MERGE (t:Node {id: targetId})
MERGE (s)-[:RELATED_TO]->(t)   
RETURN s, t
"""

# The short-lived session gets its own name: binding it to `session` would
# replace the notebook-wide session with one that is already closed when the
# pattern-count cells below use it.
with driver.session() as import_session:
    result = import_session.run(query)
    # Drain the result inside the session so an error raised while the import
    # executes surfaces in this cell.
    result.consume()

**Pattern a:**

In [5]:
# Pattern (a): a -> b -> c together with a direct edge a -> c.
# NOTE(review): Cypher does not require a, b and c to be distinct nodes, and
# relationship uniqueness is only enforced within a single MATCH clause, so
# degenerate bindings (e.g. involving self-loops) are presumably part of this
# count — confirm that this is intended.
query = """
MATCH (a:Node)-[r]->(b:Node)-[r2]->(c:Node)
MATCH (a)-[r3]->(c) 
RETURN count(*)
"""

# Use a dedicated, context-managed session instead of the notebook-wide one:
# the cell then works regardless of what earlier cells did to `session`, and
# the connection is released as soon as the rows are materialised.
with driver.session() as pattern_session:
    records = [dict(record) for record in pattern_session.run(query)]

dtf_data = pd.DataFrame(records)
dtf_data

Unnamed: 0,count(*)
0,432159


**Pattern b:**

In [6]:
# Pattern (b): two directed two-step paths from a to d, one via b and one via c.
# NOTE(review): the two paths live in separate MATCH clauses and Cypher only
# enforces relationship uniqueness within one clause, so b and c may bind to
# the same node (both paths identical) — confirm that such matches should be
# counted.
query = """
MATCH (a:Node)-[r]->(b:Node)-[r2]->(d:Node)
MATCH (a)-[r3]->(c:Node)-[r4]->(d)
RETURN count(*)
"""

# Use a dedicated, context-managed session instead of the notebook-wide one:
# the cell then works regardless of what earlier cells did to `session`, and
# the connection is released as soon as the rows are materialised.
with driver.session() as pattern_session:
    records = [dict(record) for record in pattern_session.run(query)]

dtf_data = pd.DataFrame(records)
dtf_data

Unnamed: 0,count(*)
0,21106565


In [7]:
# Pattern (c): a node a with three outgoing edges to b, c and d.
# NOTE(review): each edge is matched in its own MATCH clause, so b, c and d may
# bind to the same node, and nothing here excludes additional edges among the
# matched nodes. The task statement asks for induced subgraphs for pattern (c)
# — confirm whether the query needs extra conditions.
query = """
MATCH (a)
MATCH (a)-[r1]->(b:Node)
MATCH (a)-[r2]->(c:Node)
MATCH (a)-[r3]->(d:Node)
RETURN count(*)
"""

# Use a dedicated, context-managed session instead of the notebook-wide one:
# the cell then works regardless of what earlier cells did to `session`, and
# the connection is released as soon as the rows are materialised.
with driver.session() as pattern_session:
    records = [dict(record) for record in pattern_session.run(query)]

dtf_data = pd.DataFrame(records)
dtf_data

Unnamed: 0,count(*)
0,206182145


**Create Random Graphs:**

* https://neo4j.com/labs/apoc/4.1/overview/apoc.node/apoc.node.degree/
* https://neo4j.com/labs/apoc/4.2/overview/apoc.generate/apoc.generate.simple/

Determining degree distribution of original graph:

In [29]:
# Gather, in one list, the value apoc.node.degree returns for every :Node of
# the imported graph. The list is the input of the random graph generation.
query = """
MATCH (n:Node)
RETURN collect(apoc.node.degree(n)) AS degree_dist
"""

with driver.session(database = "neo4j") as session:
    result = session.run(query)
    rows = list(result)

# Take the list from the last returned row; without any row it stays empty.
degree_dist = rows[-1]["degree_dist"] if rows else []

print(degree_dist[:5])

[72, 160, 279, 61, 75]


Generate Random Graph 1:

In [33]:
database_name = "randomone"

#### Start from an empty database: drop a previous copy, then create it again.
# Each result is consumed explicitly so that a failing statement raises in this
# cell.
for query in (
    "DROP DATABASE $database_name IF EXISTS",
    "CREATE DATABASE $database_name IF NOT EXISTS",
):
    with driver.session() as session:
        session.run(query, database_name = database_name).consume()

#### Generate random graph
# degree_dist is the degree list collected from the original graph; the
# generated nodes get the label "Node" and the relationships the type
# "RELATED_TO", matching the imported data.
query = "CALL apoc.generate.simple($degree_dist, \"Node\", \"RELATED_TO\")"
with driver.session(database = database_name) as session:
    result = session.run(query, degree_dist = degree_dist)
    result.consume()

Generate Random Graph 2:

In [34]:
database_name = "randomtwo"

#### Start from an empty database: drop a previous copy, then create it again.
# Each result is consumed explicitly so that a failing statement raises in this
# cell.
for query in (
    "DROP DATABASE $database_name IF EXISTS",
    "CREATE DATABASE $database_name IF NOT EXISTS",
):
    with driver.session() as session:
        session.run(query, database_name = database_name).consume()

#### Generate random graph
# degree_dist is the degree list collected from the original graph; the
# generated nodes get the label "Node" and the relationships the type
# "RELATED_TO", matching the imported data.
query = "CALL apoc.generate.simple($degree_dist, \"Node\", \"RELATED_TO\")"
with driver.session(database = database_name) as session:
    result = session.run(query, degree_dist = degree_dist)
    result.consume()

Generate Random Graph 3:

In [35]:
database_name = "randomthree"

#### Start from an empty database: drop a previous copy, then create it again.
# Each result is consumed explicitly so that a failing statement raises in this
# cell.
for query in (
    "DROP DATABASE $database_name IF EXISTS",
    "CREATE DATABASE $database_name IF NOT EXISTS",
):
    with driver.session() as session:
        session.run(query, database_name = database_name).consume()

#### Generate random graph
# degree_dist is the degree list collected from the original graph; the
# generated nodes get the label "Node" and the relationships the type
# "RELATED_TO", matching the imported data.
query = "CALL apoc.generate.simple($degree_dist, \"Node\", \"RELATED_TO\")"
with driver.session(database = database_name) as session:
    result = session.run(query, degree_dist = degree_dist)
    result.consume()

Generate Random Graph 4:

In [36]:
database_name = "randomfour"

#### Start from an empty database: drop a previous copy, then create it again.
# Each result is consumed explicitly so that a failing statement raises in this
# cell.
for query in (
    "DROP DATABASE $database_name IF EXISTS",
    "CREATE DATABASE $database_name IF NOT EXISTS",
):
    with driver.session() as session:
        session.run(query, database_name = database_name).consume()

#### Generate random graph
# degree_dist is the degree list collected from the original graph; the
# generated nodes get the label "Node" and the relationships the type
# "RELATED_TO", matching the imported data.
query = "CALL apoc.generate.simple($degree_dist, \"Node\", \"RELATED_TO\")"
with driver.session(database = database_name) as session:
    result = session.run(query, degree_dist = degree_dist)
    result.consume()

Generate Random Graph 5:

In [37]:
database_name = "randomfive"

#### Start from an empty database: drop a previous copy, then create it again.
# Each result is consumed explicitly so that a failing statement raises in this
# cell.
for query in (
    "DROP DATABASE $database_name IF EXISTS",
    "CREATE DATABASE $database_name IF NOT EXISTS",
):
    with driver.session() as session:
        session.run(query, database_name = database_name).consume()

#### Generate random graph
# degree_dist is the degree list collected from the original graph; the
# generated nodes get the label "Node" and the relationships the type
# "RELATED_TO", matching the imported data.
query = "CALL apoc.generate.simple($degree_dist, \"Node\", \"RELATED_TO\")"
with driver.session(database = database_name) as session:
    result = session.run(query, degree_dist = degree_dist)
    result.consume()

***

In [48]:
from statistics import mean, pstdev

**Significance Pattern a:**

In [60]:
# Count query for pattern (a); the identical text is run against the real
# graph and against every generated random graph.
query = """
MATCH (a:Node)-[r]->(b:Node)-[r2]->(c:Node)
MATCH (a)-[r3]->(c) 
RETURN count(*) as count
"""

#################### Real Graph ####################

with driver.session() as session:
    rows = list(session.run(query))

# Value of the last returned row, or None when the query returned nothing.
N_real = rows[-1]["count"] if rows else None

print("Count in real Graph:", N_real)

#################### Random Graph ####################

graphs = ["randomone", "randomtwo",  "randomthree",  "randomfour",  "randomfive"]
N_rand = []

for graph in graphs:
    # One session per random-graph database.
    with driver.session(database = graph) as session:
        rows = list(session.run(query))
    N_rand.append(rows[-1]["count"] if rows else None)

#################### Significance ####################

# Mean and population standard deviation of the random-graph counts.
N_rand_mean = mean(N_rand)
N_rand_stdev = pstdev(N_rand)

print("Counts in random Graph:", N_rand)
print("Mean:", round(N_rand_mean, 2))
print("Stdev:", round(N_rand_stdev, 2))

# Distance of the real count from the random mean, in standard deviations.
Z = (N_real - N_rand_mean) / N_rand_stdev

print("Signficance", round(Z, 2))

Count in real Graph: 432159
Counts in random Graph: [272348, 270810, 274345, 272582, 269799]
Mean: 271976.8
Stdev: 1563.12
Signficance 102.48


***

**Significance Pattern b:**

In [61]:
# Count query for pattern (b); the identical text is run against the real
# graph and against every generated random graph.
query = """
MATCH (a:Node)-[r]->(b:Node)-[r2]->(d:Node)
MATCH (a)-[r3]->(c:Node)-[r4]->(d)
RETURN count(*) as count
"""

#################### Real Graph ####################

with driver.session() as session:
    rows = list(session.run(query))

# Value of the last returned row, or None when the query returned nothing.
N_real = rows[-1]["count"] if rows else None

print("Count in real Graph:", N_real)

#################### Random Graph ####################

graphs = ["randomone", "randomtwo",  "randomthree",  "randomfour",  "randomfive"]
N_rand = []

for graph in graphs:
    # One session per random-graph database.
    with driver.session(database = graph) as session:
        rows = list(session.run(query))
    N_rand.append(rows[-1]["count"] if rows else None)

#################### Significance ####################

# Mean and population standard deviation of the random-graph counts.
N_rand_mean = mean(N_rand)
N_rand_stdev = pstdev(N_rand)

print("Counts in random Graph:", N_rand)
print("Mean:", round(N_rand_mean, 2))
print("Stdev:", round(N_rand_stdev, 2))

# Distance of the real count from the random mean, in standard deviations.
Z = (N_real - N_rand_mean) / N_rand_stdev

print("Signficance", round(Z, 2))

Count in real Graph: 21106592
Counts in random Graph: [16095615, 15967752, 16310247, 16073420, 15751269]
Mean: 16039660.6
Stdev: 182114.33
Signficance 27.82


***

**Significance Pattern c:**

In [62]:
# Count query for pattern (c); the identical text is run against the real
# graph and against every generated random graph.
# NOTE(review): the query contains no condition on edges among b, c and d,
# while the task statement asks for induced subgraphs for pattern (c) —
# confirm.
query = """
MATCH (a)
MATCH (a)-[r1]->(b:Node)
MATCH (a)-[r2]->(c:Node)
MATCH (a)-[r3]->(d:Node)
RETURN count(*) as count
"""

#################### Real Graph ####################

with driver.session() as session:
    rows = list(session.run(query))

# Value of the last returned row, or None when the query returned nothing.
N_real = rows[-1]["count"] if rows else None

print("Count in real Graph:", N_real)

#################### Random Graph ####################

graphs = ["randomone", "randomtwo",  "randomthree",  "randomfour",  "randomfive"]
N_rand = []

for graph in graphs:
    # One session per random-graph database.
    with driver.session(database = graph) as session:
        rows = list(session.run(query))
    N_rand.append(rows[-1]["count"] if rows else None)

#################### Significance ####################

# Mean and population standard deviation of the random-graph counts.
N_rand_mean = mean(N_rand)
N_rand_stdev = pstdev(N_rand)

print("Counts in random Graph:", N_rand)
print("Mean:", round(N_rand_mean, 2))
print("Stdev:", round(N_rand_stdev, 2))

# Distance of the real count from the random mean, in standard deviations.
Z = (N_real - N_rand_mean) / N_rand_stdev

print("Signficance", round(Z, 2))

Count in real Graph: 206182244
Counts in random Graph: [736846445, 741154895, 734797151, 741974711, 746228033]
Mean: 740200247
Stdev: 4020838.56
Signficance -132.81


***

### Question 2

Import the graph from the file karate.csv. Use the Louvain algorithm
to split the input graph into communities. You can find the description of
the user-defined procedure for the Louvain algorithm and other community
detection algorithms in the following link:

https://neo4j.com/docs/graph-data-science/current/algorithms/community/

Consider only the final community structure (disregard any intermediate communities). Then, print for each community (identified by the
communityId yielded by the user-defined procedure) the list of its border
nodes (identified by their nid given in the input file).

In [4]:
# Name of the database used by all Question 2 cells (import, projection,
# Louvain, border-node query).
database_name = "karate"

**Import Data:**

In [64]:
#### Start from an empty database: drop a previous copy, then create it again.
# Each result is consumed explicitly so that a failing statement raises in this
# cell.
for query in (
    "DROP DATABASE $database_name IF EXISTS",
    "CREATE DATABASE $database_name IF NOT EXISTS",
):
    with driver.session() as session:
        session.run(query, database_name = database_name).consume()

#### Import data
# Each row is split on ';' into a source id and a target id. MERGE is used for
# nodes and relationships, so repeated rows do not create duplicates.
query = """
LOAD CSV FROM 'file:///karate.csv' AS row FIELDTERMINATOR ';'
WITH row[0] as sourceId, row[1] as targetId
MERGE (s:Node {id: sourceId})
MERGE (t:Node {id: targetId})
MERGE (s)-[:RELATED_TO]->(t)   
RETURN s, t
"""

with driver.session(database = database_name) as session:
    result = session.run(query)
    # Drain the result inside the session so an import error surfaces here.
    result.consume()

**Determine Communities:**

In [8]:
# Every statement below is consumed inside its session so that a failure in
# any step (drop, projection, Louvain) raises in this cell.

#### Drop existing graph projection if exists
# The second argument (false) is meant to keep the call from failing when the
# projection does not exist yet.
query = """
CALL gds.graph.drop('karateCommunity', false) 
"""

with driver.session(database = database_name) as session:
    session.run(query).consume()

#### Create new graph projection
# RELATED_TO is projected with UNDIRECTED orientation.
query = """
CALL gds.graph.project('karateCommunity', 'Node', {RELATED_TO: { orientation: 'UNDIRECTED'}})
"""

with driver.session(database = database_name) as session:
    session.run(query).consume()

#### Determine communities
# The community id of every node is written to its `community` property,
# which the border-node query reads afterwards.
query = """
CALL gds.louvain.write('karateCommunity', { writeProperty: 'community' })
YIELD communityCount, modularity, modularities
"""

with driver.session(database = database_name) as session:
    result = session.run(query)
    result.consume()

**Print Border Nodes:**

In [12]:
# A border node is a node with at least one RELATED_TO neighbour (in either
# direction) whose `community` value differs from its own. The result has one
# row per community with the distinct ids of its border nodes.
query = """
MATCH (n:Node)
MATCH (n)-[r:RELATED_TO]-(m:Node)
WHERE n.community <> m.community
WITH n.community as community, n.id as nodeId
RETURN DISTINCT community, collect(DISTINCT nodeId) as border_nodes
"""

# The context manager closes the session once the rows are materialised
# (it was previously left open).
with driver.session(database = database_name) as session:
    records = [dict(record) for record in session.run(query)]

dtf_data = pd.DataFrame(records)
# Show every community: .head() would cut the listing off after five rows.
dtf_data

Unnamed: 0,community,border_nodes
0,5,"[1, 2, 3, 10, 14, 20]"
1,4,"[5, 6, 7, 11]"
2,11,"[24, 29, 32, 28]"
3,25,"[9, 30, 31, 33, 34]"
