This notebook implements the graph evolution: it generates synthetic review and affiliation data, connects to the Neo4j database, and finally executes the Cypher commands that update the graph.

In [2]:
import os
import random
import subprocess
import uuid

import pandas as pd
from faker import Faker
from neo4j import GraphDatabase
from tqdm import tqdm

In [3]:
# ENTER NEO4J CREDENTIALS HERE
# Each setting is read from an environment variable when it is set, so credentials and
# machine-specific paths do not have to be committed with the notebook. The literals below
# are only fallbacks and reproduce the previous hard-coded values.
uri = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
username = os.environ.get("NEO4J_USERNAME", "neo4j")
password = os.environ.get("NEO4J_PASSWORD", "12345678")
# Directory the generated CSV files are written to; the Cypher cells load them from there
# through `file:///<name>.csv` URLs.
import_folder = os.environ.get(
    "NEO4J_IMPORT_FOLDER",
    "/opt/homebrew/Cellar/neo4j/5.19.0/libexec/import",  # CHANGE THIS TO YOUR IMPORT FOLDER OF DATABASE
)

In [4]:
# Start neo4j
# Runs the `neo4j start` CLI command and prints its stdout. A non-zero exit status raises
# CalledProcessError (check=True); it is reported but not re-raised so the rest of the
# notebook can still run.
try:
    command = "neo4j start"
    result = subprocess.run(command, shell=True, check=True, capture_output=True, text=True)
    print(result.stdout)
except subprocess.CalledProcessError as e:
    print("Failed to start neo4j - it may already be running")
    # Show what the command actually reported instead of only guessing at the cause.
    details = (e.stderr or e.stdout or "").strip()
    print(f"neo4j start exited with code {e.returncode}: {details}")

Directories in use:
home:         /opt/homebrew/Cellar/neo4j/5.19.0/libexec
config:       /opt/homebrew/Cellar/neo4j/5.19.0/libexec/conf
logs:         /opt/homebrew/var/log/neo4j
plugins:      /opt/homebrew/Cellar/neo4j/5.19.0/libexec/plugins
import:       /opt/homebrew/Cellar/neo4j/5.19.0/libexec/import
data:         /opt/homebrew/var/neo4j/data
certificates: /opt/homebrew/Cellar/neo4j/5.19.0/libexec/certificates
licenses:     /opt/homebrew/Cellar/neo4j/5.19.0/libexec/licenses
run:          /opt/homebrew/Cellar/neo4j/5.19.0/libexec/run
Starting Neo4j.
Started neo4j (pid:14139). It is available at http://localhost:7474
There may be a short delay until the server is ready.



# Generate reviews

In [5]:
# Load the reviewer -> paper relationship rows; one synthetic review is produced per row
reviewed_paper = pd.read_csv('data/r_reviewed_paper.csv')
reviewed_paper.head()

fake = Faker()

# Accumulators: Review node records plus the two edge lists that attach each review
# to its reviewer and to the paper it was written for
reviews, reviewed_by, review_for = [], [], []

# Walk the relationship rows with a progress bar
row_iter = tqdm(reviewed_paper.iterrows(), total=reviewed_paper.shape[0], desc="Creating fake reviews")
for index, row in row_iter:
    # Fresh identifier, random body text and a random verdict for this review
    review_id = str(uuid.uuid4())
    content = fake.text()
    suggested_decision = random.choice(['Accept', 'Reject', 'Revise'])
    reviewer_id, paper_id = row[':START_ID'], row[':END_ID']

    # Review node record
    reviews.append({'Id:ID': review_id, 'content': content, 'suggested_decision': suggested_decision})

    # Edge record: review -> reviewer
    reviewed_by.append({':START_ID': review_id, ':END_ID': reviewer_id})

    # Edge record: review -> paper
    review_for.append({':START_ID': review_id, ':END_ID': paper_id})


# Turn the three record lists into DataFrames
reviews_df, reviewed_by_df, reviewed_for_df = (
    pd.DataFrame(records) for records in (reviews, reviewed_by, review_for)
)

# Target files inside the Neo4j import folder
review_file = f"{import_folder}/reviews.csv"
reviewed_by_file = f"{import_folder}/reviewed_by.csv"
reviewed_for_file = f"{import_folder}/reviewed_for.csv"

# Write each frame to its CSV without the pandas index column
for frame, destination in (
    (reviews_df, review_file),
    (reviewed_by_df, reviewed_by_file),
    (reviewed_for_df, reviewed_for_file),
):
    frame.to_csv(destination, index=False)

print("CSV files for reviews generated.")

Creating fake reviews: 100%|██████████| 15000/15000 [00:00<00:00, 17976.22it/s]


CSV files for reviews generated.


# Generate affiliations

In [6]:
# Number of synthetic organization records to generate with type 'Company'
NUM_COMPANIES = 50
# Number of synthetic organization records to generate with type 'University'
NUM_UNIVERSITIES = 100

In [7]:
# Load the author nodes; each author receives exactly one affiliation below
authors = pd.read_csv('data/authors.csv')

# Generate nodes of universities and companies.
# Every organization gets a fresh UUID and a Faker-generated name; universities reuse the
# company-name generator with a " University" suffix (the leading space keeps the suffix
# from being glued onto the generated name).
organizations = [
    {'Id:ID': str(uuid.uuid4()),
     'name': fake.company(),
     'type': 'Company'} for _ in range(NUM_COMPANIES)
] + [
    {'Id:ID': str(uuid.uuid4()),
     'name': fake.company() + " University",
     'type': 'University'} for _ in range(NUM_UNIVERSITIES)
]

# Assign each author to a random organization / university
affiliations = []
for _, author in tqdm(authors.iterrows(), total=authors.shape[0], desc="Creating affiliations"):
    org = random.choice(organizations)
    # Edge record: author -> organization
    affiliations.append({
        ':START_ID': author['Id:ID'],
        ':END_ID': org['Id:ID']
    })

# Save organizations to CSV
organizations_df = pd.DataFrame(organizations)
organizations_df.to_csv(f"{import_folder}/organizations.csv", index=False)

# Save affiliations to CSV
affiliations_df = pd.DataFrame(affiliations)
affiliations_df.to_csv(f'{import_folder}/r_affiliated_with.csv', index=False)

print("Organizations and affiliations CSV files generated.")

Creating affiliations: 100%|██████████| 100/100 [00:00<00:00, 41319.12it/s]

Organizations and affiliations CSV files generated.





# Update graph

In [8]:
# Establish connection to the Neo4j database
driver = GraphDatabase.driver(uri, auth=(username, password))

# Create query execution function
def execute_query(query):
    """Run one Cypher statement against the database and report the outcome.

    Opens a new session on the module-level ``driver``, runs ``query`` and prints
    "Successfull" when it completes. Any exception is caught and printed rather than
    re-raised, so a failing statement does not stop the notebook.

    Args:
        query: Cypher statement text to execute.

    Returns:
        None. Rows returned by the statement are discarded.
    """
    try:
        with driver.session() as session:
            # Consume the result explicitly so the statement is fully executed, and any
            # error raised while it runs surfaces here, before success is reported.
            session.run(query).consume()
        print("Successfull")
    except Exception as e:
        print("An error occurred:", e)

## Reviews

In [9]:
# Define the Cypher query to create Review nodes.
# reviews.csv is written with the columns `Id:ID`, `content` and `suggested_decision`,
# so those are the properties loaded here. The CSV has no `name` or `type` column;
# reading them yields null and leaves the review text out of the graph.
cypher_query_review_node = """
LOAD CSV WITH HEADERS FROM 'file:///reviews.csv' AS row
CREATE (:REVIEW {Id: row.`Id:ID`, content: row.content, suggested_decision: row.suggested_decision})
"""

execute_query(cypher_query_review_node)

Successfull


In [None]:
# Define the Cypher query that links each review to its reviewer.
# For every row of reviewed_by.csv, the REVIEW node whose `Id` equals `:START_ID` and the
# AUTHOR node whose `Id` equals `:END_ID` are matched, and a relationship is created from
# the review to the author. Rows for which either node is missing create nothing.
# NOTE(review): the relationship type created is REVIEW_AUTHOR, while the CSV file and this
# variable are named "reviewed_by" - confirm which name the graph schema expects.
cypher_query_reviewed_by = """
LOAD CSV WITH HEADERS FROM 'file:///reviewed_by.csv' AS row
MATCH (a:REVIEW {Id: row.`:START_ID`}), (b:AUTHOR {Id: row.`:END_ID`})
CREATE (a)-[:REVIEW_AUTHOR]->(b)
"""

execute_query(cypher_query_reviewed_by)

In [None]:
# Define the Cypher query that links each review to the paper it was written for.
# For every row of reviewed_for.csv, the REVIEW node whose `Id` equals `:START_ID` and the
# PAPER node whose `Id` equals `:END_ID` are matched, and a REVIEWED_FOR relationship is
# created from the review to the paper. Rows for which either node is missing create nothing.
cypher_query_reviewed_for = """
LOAD CSV WITH HEADERS FROM 'file:///reviewed_for.csv' AS row
MATCH (a:REVIEW {Id: row.`:START_ID`}), (b:PAPER {Id: row.`:END_ID`})
CREATE (a)-[:REVIEWED_FOR]->(b)
"""

execute_query(cypher_query_reviewed_for)

Successfull


In [10]:
# Delete the PUBLISHED_IN relationship of papers whose reviews were mostly not "Accept".
# Per paper, the query counts reviews suggesting 'Accept' and reviews suggesting 'Reject' or
# 'Revise'; when the Reject/Revise count is strictly greater than the Accept count (a tie
# keeps the paper), every PUBLISHED_IN relationship from that paper to a JOURNAL is deleted.
# `totalReviews` is computed but not used by the condition.
# The RETURN clause yields paper title / journal name rows, but execute_query does not
# return or print them.
delete_published_in_query = """
MATCH (r:REVIEW)-[:REVIEWED_FOR]->(p:PAPER)
WITH p, 
     SUM(CASE WHEN r.suggested_decision = 'Accept' THEN 1 ELSE 0 END) AS acceptCount,
     SUM(CASE WHEN r.suggested_decision IN ['Reject', 'Revise'] THEN 1 ELSE 0 END) AS rejectOrReviseCount,
     COUNT(r) AS totalReviews
WHERE rejectOrReviseCount > acceptCount
MATCH (p)-[rel:PUBLISHED_IN]->(j:JOURNAL)
DELETE rel
RETURN p.title AS PaperTitle, j.name AS Journal
"""

execute_query(delete_published_in_query)

Successfull


## Affiliations

In [11]:
# Define the Cypher query to create organization nodes.
# Creates one ORGANIZATION node per row of organizations.csv, copying the `Id:ID`, `name`
# and `type` columns into the `Id`, `name` and `type` properties. CREATE is used, so running
# this cell again adds the rows a second time.
cypher_query_organization = """
LOAD CSV WITH HEADERS FROM 'file:///organizations.csv' AS row
CREATE (:ORGANIZATION {Id: row.`Id:ID`, name: row.name, type: row.type})
"""

execute_query(cypher_query_organization)

Successfull


In [12]:
# Define the Cypher query to create affiliation relationships.
# For every row of r_affiliated_with.csv, the AUTHOR node whose `Id` equals `:START_ID` and
# the ORGANIZATION node whose `Id` equals `:END_ID` are matched, and an AFFILIATED_WITH
# relationship is created from the author to the organization. Rows for which either node
# is missing create nothing.
cypher_query_affiliation = """
LOAD CSV WITH HEADERS FROM 'file:///r_affiliated_with.csv' AS row
MATCH (a:AUTHOR {Id: row.`:START_ID`}), (o:ORGANIZATION {Id: row.`:END_ID`})
CREATE (a)-[:AFFILIATED_WITH]->(o);
"""
execute_query(cypher_query_affiliation)

Successfull
