### Setup

In [2]:
# databases
import psycopg2
import sqlparse
import redis
from neo4j import GraphDatabase

# external core
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# import geopandas as gpd

%matplotlib inline
%config InlineBackend.figure_format = 'retina'  # Higher resolution figures

# utility
import re
import time
import csv
import nltk
import getpass
from datetime import datetime
from tqdm.notebook import tqdm

### PostgreSQL Loading

#### Connect to Postgres

**Warning**: Make sure the database `accident_analysis` exists before connecting. If it does not, create it manually with the following query:
```sql
CREATE DATABASE accident_analysis;
```

In [5]:
# Ask for the Postgres password interactively so it is never stored in the notebook.
pg_password = getpass.getpass("Enter Postgres Password: ")

# Connection settings for the `accident_analysis` database on the local server.
pg_settings = {
    "dbname": "accident_analysis",
    "user": "postgres",
    "password": pg_password,
    "host": "localhost",
    "port": "5433",
}

# Shared connection used by the schema and loading cells below.
pg = psycopg2.connect(**pg_settings)

#### Create Schema

In [6]:
# DDL for the three entity tables (accidents, roads, reports) and the two
# junction tables that link accidents / reports to road segments.
# Every statement uses IF NOT EXISTS, so re-running this cell is harmless.
schema = ["""
    CREATE TABLE IF NOT EXISTS accidents (
        report_id VARCHAR PRIMARY KEY,
        date_time TIMESTAMP,
        violation_section VARCHAR,
        violation_type CHAR(2),
        injured INT NOT NULL,
        killed INT NOT NULL,
        hit_run_lvl VARCHAR
    );
    """,
    """
    CREATE TABLE IF NOT EXISTS roads (
        objectid VARCHAR PRIMARY KEY,
        roadsegid VARCHAR,
        roadid VARCHAR,
        road_name VARCHAR,
        speed INT,
        oneway CHAR(1),
        firedriv CHAR(1)
    );
    """,
    """
    CREATE TABLE IF NOT EXISTS reports (
        service_request_id VARCHAR PRIMARY KEY,
        date_requested TIMESTAMP,
        case_age_days INT,
        service_name VARCHAR,
        service_name_detail VARCHAR,
        status VARCHAR,
        street_address VARCHAR,
        council_district INT,
        case_origin VARCHAR
    );
    """,
    """
    CREATE TABLE IF NOT EXISTS accidents_on_road (
        report_id VARCHAR REFERENCES accidents(report_id),
        objectid VARCHAR REFERENCES roads(objectid)
    );
    """,
    """
    CREATE TABLE IF NOT EXISTS reports_on_road (
        service_request_id VARCHAR REFERENCES reports(service_request_id),
        objectid VARCHAR REFERENCES roads(objectid)
    );
    """
]

# `with pg` runs the DDL as one transaction: it commits when the block succeeds
# and rolls back if any statement raises, so a failed CREATE no longer leaves the
# connection stuck in an aborted transaction. It does NOT close the connection.
# `with pg.cursor()` closes the cursor on both the success and the error path.
with pg:
    with pg.cursor() as cur:
        for table_schema in schema:
            cur.execute(table_schema)

#### Load Table

In [7]:
def load_table(table_name, cols=None):
    """Bulk-load ``../data/{table_name}.csv`` into the Postgres table ``table_name``.

    Parameters
    ----------
    table_name : str
        Name of the target table; also the stem of the source CSV file.
    cols : list of str, optional
        CSV columns to load, listed in the same order as the target table's
        columns (COPY matches columns by position, not by name). When given,
        the subset is first written to ``../data/{table_name}_sql.csv`` and that
        file is loaded. When omitted, the source CSV is loaded as-is.

    Notes
    -----
    Uses the module-level connection ``pg``. On failure the error is printed and
    the transaction is rolled back; the exception is not re-raised.
    """
    # Local import keeps this cell self-contained; psycopg2 is already a dependency.
    from psycopg2 import sql

    source_csv = f'../data/{table_name}.csv'
    if cols is None:
        file_to_load = source_csv
    else:
        file_to_load = f'../data/{table_name}_sql.csv'
        column_order = list(cols)
        # read_csv(usecols=...) ignores the order of `usecols` and returns columns
        # in file order, so re-select with [column_order] to line them up with the
        # table. dtype=str stops pandas from rewriting values on the round trip
        # (e.g. an integer column with blanks becoming "3.0").
        subset = pd.read_csv(source_csv, usecols=column_order, dtype=str)[column_order]
        subset.to_csv(file_to_load, index=False)

    # FORMAT csv understands quoted fields (commas inside addresses) and treats
    # empty unquoted fields as NULL; HEADER true skips the header row.
    copy_stmt = sql.SQL("COPY {} FROM STDIN WITH (FORMAT csv, HEADER true)").format(
        sql.Identifier(table_name)
    )

    cur = pg.cursor()
    try:
        # newline='' leaves line endings inside quoted fields untouched.
        with open(file_to_load, 'r', newline='') as f:
            cur.copy_expert(copy_stmt, f)
        pg.commit()
    except Exception as e:
        print('ERROR:', e)
        # Roll back so the connection stays usable for the next load.
        pg.rollback()
    finally:
        cur.close()

In [8]:
# CSV columns to keep, listed in the order of the `accidents` table columns.
accident_columns = [
    'report_id',
    'date_time',
    'violation_section',
    'violation_type',
    'injured',
    'killed',
    'hit_run_lvl',
]
load_table('accidents', cols=accident_columns)

In [9]:
# CSV columns to keep for the `roads` table; the CSV's `rd20full` column is
# passed in the position of the table's `road_name` column.
road_columns = [
    'objectid',
    'roadsegid',
    'roadid',
    'rd20full',
    'speed',
    'oneway',
    'firedriv',
]
load_table('roads', cols=road_columns)

In [10]:
# CSV columns to keep, listed in the order of the `reports` table columns.
report_columns = [
    'service_request_id',
    'date_requested',
    'case_age_days',
    'service_name',
    'service_name_detail',
    'status',
    'street_address',
    'council_district',
    'case_origin',
]
load_table('reports', cols=report_columns)

In [11]:
# The junction CSVs are loaded whole (no column subset), after the entity
# tables they reference.
for junction_table in ('accidents_on_road', 'reports_on_road'):
    load_table(junction_table)

In [None]:
# Close the Postgres connection now that every table is loaded.
# No separate cursor close is needed here: the schema cell and load_table each
# close the cursor they open, and the old `cur.close()` only worked when a
# global `cur` happened to be left over from the schema cell.
pg.close()

### Neo4J Graph Loading

#### Connect to Neo4J

In [2]:
# Connection settings for the local Neo4j instance (Bolt protocol).
uri = "bolt://localhost:7666"
username = "neo4j"
# NOTE(review): hard-coded credential. Consider prompting with getpass, as the
# Postgres cell does, before sharing this notebook.
password = "password"

# One driver per (uri, username, password). A driver owns a connection pool and
# is meant to be created once and shared, not rebuilt for every session.
_NEO4J_DRIVERS = {}

def create_neo4j_session(uri, username, password):
    """Open a Neo4j session for the given connection settings.

    The underlying driver is created on first use and reused by later calls with
    the same settings, so repeated calls no longer pile up unclosed drivers.

    Parameters
    ----------
    uri : str
        Bolt URI of the Neo4j server.
    username, password : str
        Credentials for basic authentication.

    Returns
    -------
    neo4j.Session or None
        A new session, or ``None`` (after printing the error) when the driver or
        session could not be created. The caller is responsible for closing it.
    """
    driver_key = (uri, username, password)
    try:
        driver = _NEO4J_DRIVERS.get(driver_key)
        if driver is None:
            driver = GraphDatabase.driver(uri, auth=(username, password))
            _NEO4J_DRIVERS[driver_key] = driver
        return driver.session()
    except Exception as e:
        print(f"Failed to create Neo4j session: {e}")
        return None

#### Add Constraints

In [3]:
# Uniqueness constraints on the `id` property of each node label. They are
# created before loading so that MERGE lookups by id are index-backed.
accident_constraint = """
CREATE CONSTRAINT accident_id IF NOT EXISTS 
FOR (n:Accident) REQUIRE n.id IS UNIQUE
"""

road_constraint = """
CREATE CONSTRAINT road_id IF NOT EXISTS
FOR (n:Road) REQUIRE n.id IS UNIQUE
"""

report_constraint = """
CREATE CONSTRAINT report_id IF NOT EXISTS
FOR (n:Report) REQUIRE n.id IS UNIQUE
"""

# `with` closes the session even if a statement raises (the driver stays open).
with create_neo4j_session(uri, username, password) as session:
    for constraint_query in (accident_constraint, road_constraint, report_constraint):
        result = session.run(constraint_query)
        # Drain the result so a server-side failure raises here instead of being
        # dropped when the session is closed.
        result.consume()

#### Add Nodes

***Note***: Due to the size of the data, we only load accidents and reports dated 2022-01-01 or later, which we believe is a representative sample of the full data; all roads are loaded.

In [None]:
# insert nodes
# Each query streams a CSV from the Neo4j import directory and MERGEs one node
# per row, committing in batches of 200 rows (CALL { ... } IN TRANSACTIONS).
# Accidents and reports are restricted to rows dated 2022-01-01 or later.
accident_node = """
LOAD CSV WITH HEADERS FROM 'file:///accidents.csv' AS row
WITH row
WHERE date(left(row.date_time, 10)) >= date('2022-01-01')
CALL {
    WITH row
    MERGE (acc:Accident {
        id: row.report_id,
        pt: point({longitude: toFloat(row.x), latitude: toFloat(row.y)})
    })
    RETURN count(*) AS count
}
IN TRANSACTIONS OF 200 ROWS
RETURN count(*) AS totalCount

"""

report_node = """
LOAD CSV WITH HEADERS FROM 'file:///get-it-done-reports.csv' AS row
WITH row
WHERE date(left(row.date_requested, 10)) >= date('2022-01-01')
CALL {
    WITH row
    MERGE (acc:Report {
        id: row.service_request_id,
        pt: point({longitude: toFloat(row.lng), latitude: toFloat(row.lat)}),
        service_name: row.service_name
    })
    RETURN count(*) AS count  
} IN TRANSACTIONS OF 200 ROWS
RETURN count(*) AS totalCount
"""

road_node = """
LOAD CSV WITH HEADERS FROM 'file:///roads.csv' AS row
CALL {
    WITH row
    MERGE (acc:Road {
        id: row.objectid,
        segment_id: row.roadsegid,
        road_id: row.roadid,
        name: row.rd20full
    })
    RETURN count(*) AS count  
} IN TRANSACTIONS OF 200 ROWS
RETURN count(*) AS totalCount
"""

# session.run() executes each query as an auto-commit transaction, which
# CALL { ... } IN TRANSACTIONS requires. `with` closes the session even if a
# load fails part-way (the driver stays open).
with create_neo4j_session(uri, username, password) as session:
    for node_query in (accident_node, report_node, road_node):
        result = session.run(node_query)
        # Drain the result so a failure during the load raises here instead of
        # being dropped when the session is closed.
        result.consume()

#### Add Relationships

In [6]:
### accident - accident relationships
# Link every pair of accidents whose points are at most 500 apart (as measured
# by point.distance) with a WITHIN_RANGE relationship that stores the distance.
# The id(acc1) > id(acc2) filter visits each unordered pair once.
acc_dist_query = """
MATCH (acc1:Accident), (acc2:Accident)
WHERE id(acc1) > id(acc2)
WITH acc1, acc2, point.distance(acc1.pt, acc2.pt) AS pt_dist
WHERE pt_dist <= 500
MERGE (acc1)-[:WITHIN_RANGE {dist: pt_dist}]->(acc2)
"""

# `with` closes the session even if the query raises (the driver stays open).
with create_neo4j_session(uri, username, password) as session:
    result = session.run(acc_dist_query)
    # Drain the result so a server-side failure raises here instead of being
    # dropped when the session is closed.
    result.consume()

In [13]:
### accident - road relationships
# For each row of accidents_on_road.csv, connect the matching Accident and Road
# nodes with a HAPPENS_ON relationship, in batches of 200 rows. Rows whose
# accident or road node does not exist are skipped by the MATCH clauses.
acc_road_rel_query = """
LOAD CSV WITH HEADERS FROM 'file:///accidents_on_road.csv' AS row
CALL {
    WITH row
    MATCH (a:Accident {id: row.report_id})
    MATCH (r:Road {id: row.objectid})
    CALL apoc.merge.relationship(
        a, 'HAPPENS_ON', {}, {}, r, {}
    ) YIELD rel
    RETURN count(*) AS count
    
} IN TRANSACTIONS OF 200 ROWS
RETURN count(*) AS totalCount
"""

# `with` closes the session even if the query raises (the driver stays open).
with create_neo4j_session(uri, username, password) as session:
    result = session.run(acc_road_rel_query)
    # Drain the result so a server-side failure raises here instead of being
    # dropped when the session is closed.
    result.consume()

In [15]:
### report - road relationships
# For each row of reports_on_road.csv, connect the matching Report and Road
# nodes with a REPORTED_ON relationship, in batches of 200 rows. Rows whose
# report or road node does not exist are skipped by the MATCH clauses.
report_road_rel_query = """
LOAD CSV WITH HEADERS FROM 'file:///reports_on_road.csv' AS row
CALL {
    WITH row
    MATCH (a:Report {id: row.service_request_id})
    MATCH (r:Road {id: row.objectid})
    CALL apoc.merge.relationship(
        a, 'REPORTED_ON', {}, {}, r, {}
    ) YIELD rel
    RETURN count(*) AS count
    
} IN TRANSACTIONS OF 200 ROWS
RETURN count(*) AS totalCount
"""

# `with` closes the session even if the query raises (the driver stays open).
with create_neo4j_session(uri, username, password) as session:
    result = session.run(report_road_rel_query)
    # Drain the result so a server-side failure raises here instead of being
    # dropped when the session is closed.
    result.consume()