# Included Modules and Packages

In [None]:
import neo4j

import csv

import math
import numpy as np
import pandas as pd

import psycopg2
from geographiclib.geodesic import Geodesic

# Supporting Code

In [4]:
# Neo4j driver for the server at host "neo4j", port 7687.
driver = neo4j.GraphDatabase.driver(
    uri="neo4j://neo4j:7687",
    auth=("neo4j", "w205"),
)

In [5]:
# Session on the "neo4j" database; my_neo4j_shortest_path runs its queries through it.
session = driver.session(database="neo4j")

In [58]:
def my_neo4j_shortest_path(from_station, to_station):
    """Return the cheapest path between two stations as (total cost, minutes).

    Rebuilds the 'ds_graph' GDS projection (Station nodes, LINK
    relationships, 'weight' relationship property) and runs Dijkstra from
    ``from_station`` to ``to_station`` through the module-level ``session``.

    Args:
        from_station: ``name`` property of the source Station node.
        to_station: ``name`` property of the target Station node.

    Returns:
        A tuple ``(total_cost, minutes)``: the path's total weight truncated
        to an int, and that value divided by 60, rounded to one decimal.

    Raises:
        ValueError: if the query yields no path, for example because a
            station name matches no node.
    """

    # Drop any previous projection; passing `false` keeps the call from
    # failing when 'ds_graph' does not exist yet.
    session.run("CALL gds.graph.drop('ds_graph', false)")

    session.run(
        "CALL gds.graph.create('ds_graph', 'Station', 'LINK', {relationshipProperties: 'weight'})"
    )

    query = """

    MATCH (source:Station {name: $source}), (target:Station {name: $target})
    CALL gds.shortestPath.dijkstra.stream(
        'ds_graph', 
        { sourceNode: source, 
          targetNode: target, 
          relationshipWeightProperty: 'weight'
        }
    )
    YIELD index, sourceNode, targetNode, totalCost, nodeIds, costs, path
    RETURN
        gds.util.asNode(sourceNode).name AS from,
        gds.util.asNode(targetNode).name AS to,
        totalCost,
        [nodeId IN nodeIds | gds.util.asNode(nodeId).name] AS nodes,
        costs
    ORDER BY index

    """

    result = session.run(query, source=from_station, target=to_station)

    # Keep the cost of the last record returned; None means no record at all.
    total_cost = None
    for record in result:
        total_cost = int(record['totalCost'])

    if total_cost is None:
        raise ValueError(
            f"no path found from {from_station!r} to {to_station!r}"
        )

    return total_cost, round(total_cost / 60.0, 1)

In [7]:
def my_calculate_box(point, miles):
    """Return the (left, right, top, bottom) points `miles` away from `point`.

    `point` is indexed as (latitude, longitude). Each returned element is a
    (lat, lon) tuple obtained by solving the direct geodesic problem on the
    WGS84 ellipsoid along azimuths 270, 90, 0 and 180 degrees respectively.
    """

    origin_lat, origin_lon = point[0], point[1]

    # Convert miles to kilometers, then to the meters passed to Direct().
    distance_m = miles * 1.60934 * 1000

    def travel(azimuth):
        # End point reached from the origin along the given azimuth.
        reached = Geodesic.WGS84.Direct(origin_lat, origin_lon, azimuth, distance_m)
        return (reached['lat2'], reached['lon2'])

    return tuple(travel(azimuth) for azimuth in (270, 90, 0, 180))

In [69]:
def my_station_get_zips(station, miles):
    """Return the total population of the zip codes around a station.

    Looks up the station's coordinates in the ``stations`` table, builds a
    bounding box reaching ``miles`` in each compass direction with
    ``my_calculate_box``, and sums ``population`` over every ``zip_codes``
    row whose latitude and longitude fall inside that box.

    Args:
        station: station name matched against ``stations.station``.
        miles: distance from the station to each edge of the box.

    Returns:
        The summed population as a float.

    Raises:
        ValueError: if no row in ``stations`` matches ``station``.
    """

    connection.rollback()

    # Bind the name as a query parameter so a quote inside a station name
    # cannot break or alter the statement.
    cursor.execute(
        "select latitude, longitude from stations where station = %s",
        (station,),
    )
    rows = cursor.fetchall()
    connection.rollback()

    if not rows:
        raise ValueError(f"unknown station: {station!r}")

    # If several rows match, the last one wins.
    latitude, longitude = rows[-1][0], rows[-1][1]

    (left, right, top, bottom) = my_calculate_box((latitude, longitude), miles)

    cursor.execute(
        "select zip, population from zip_codes "
        " where latitude >= %s and latitude <= %s "
        " and longitude >= %s and longitude <= %s "
        " order by 1 ",
        (bottom[0], top[0], left[1], right[1]),
    )
    rows = cursor.fetchall()
    connection.rollback()

    return float(sum(row[1] for row in rows))

In [12]:
# function to run a select query and return rows in a pandas dataframe
# pandas puts all numeric values from postgres to float
# if it will fit in an integer, change it to integer

def my_select_query_pandas(query, rollback_before_flag, rollback_after_flag):
    """Run a select query and return its rows as a pandas data frame.

    float64 columns whose non-null values are all whole numbers that fit in
    a signed 64-bit integer are converted to the nullable 'Int64' dtype;
    nulls in such columns stay missing. Any other column is left as read.

    Args:
        query: SQL text passed to ``pd.read_sql_query``.
        rollback_before_flag: when true, roll back the module-level
            ``connection`` before running the query.
        rollback_after_flag: when true, roll back ``connection`` after the
            query has run.

    Returns:
        The resulting ``pandas.DataFrame``.
    """

    if rollback_before_flag:
        connection.rollback()

    df = pd.read_sql_query(query, connection)

    if rollback_after_flag:
        connection.rollback()

    # Smallest magnitude that no longer fits in a signed 64-bit integer.
    int64_limit = 2.0 ** 63

    # fix the float columns that really should be integers

    for column in df:

        if df[column].dtype == "float64":

            values = df[column].to_numpy()
            present = values[~np.isnan(values)]

            # Infinite or out-of-range values cannot be stored as Int64, so
            # such a column stays float64 instead of raising.
            fits = bool(np.all((present >= -int64_limit) & (present < int64_limit)))
            whole = bool(np.all(present == np.floor(present)))

            if fits and whole:
                df[column] = df[column].astype('Int64')

    return df

In [9]:
# PostgreSQL connection used by the query helpers in this notebook.
connection = psycopg2.connect(
    user="postgres",
    password="ucb",
    host="postgres",
    port="5432",
    database="postgres",
)

In [10]:
# Cursor on the shared connection; my_station_get_zips executes its queries through it.
cursor = connection.cursor()

# Generate Data Frame with All Stations

In [56]:
rollback_before_flag = True
rollback_after_flag = True

query = """

select station
from stations
order by station

"""

df = my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

# Drop the Downtown Berkeley record. Take an explicit copy so the columns
# assigned to df afterwards land on an independent frame rather than on a
# filtered slice, which pandas flags with SettingWithCopyWarning.

df = df[df["station"] != "Downtown Berkeley"].copy()

In [64]:
# Add total cost and minutes to the data frame

total_cost = []
minutes = []

for station in df["station"]:
    # Query once per station and unpack both values: every call drops and
    # rebuilds the graph projection and reruns Dijkstra.
    cost, mins = my_neo4j_shortest_path("depart Downtown Berkeley", "arrive " + station)
    total_cost.append(cost)
    minutes.append(mins)

df["total_cost"] = total_cost
df["total_minutes"] = minutes

df

Unnamed: 0,station,total_cost,total_minutes
0,12th Street,720,12.0
1,16th Street Mission,1860,31.0
2,19th Street,600,10.0
3,24th Street Mission,1980,33.0
4,Antioch,3659,61.0
5,Ashby,180,3.0
6,Balboa Park,2280,38.0
7,Bay Fair,1920,32.0
8,Berryessa,4200,70.0
9,Castro Valley,2214,36.9


In [72]:
# Population returned by my_station_get_zips for a 5-mile box around each
# station, in data frame row order.
pop_5 = [my_station_get_zips(station, 5) for station in df["station"]]

df["pop_5"] = pop_5

In [73]:
# Display the data frame, which now carries the pop_5 column.
df

Unnamed: 0,station,total_cost,total_minutes,pop_5
0,12th Street,720,12.0,510498.0
1,16th Street Mission,1860,31.0,870044.0
2,19th Street,600,10.0,576172.0
3,24th Street Mission,1980,33.0,989138.0
4,Antioch,3659,61.0,152632.0
5,Ashby,180,3.0,487206.0
6,Balboa Park,2280,38.0,936912.0
7,Bay Fair,1920,32.0,457901.0
8,Berryessa,4200,70.0,559010.0
9,Castro Valley,2214,36.9,403433.0
