# Option: Adding Pickup Locations

## Included Modules and Packages

In [122]:
import neo4j

import csv

import math
import numpy as np
import pandas as pd

import psycopg2
from geographiclib.geodesic import Geodesic

## Supporting Code

In [123]:
# Neo4j driver for the graph database; the notebook's session is created from it.
driver = neo4j.GraphDatabase.driver(uri="neo4j://neo4j:7687", auth=("neo4j","w205"))

In [124]:
# Session on the "neo4j" database; my_neo4j_run_query_pandas runs its queries through it.
session = driver.session(database="neo4j")

In [125]:
# Postgres connection shared by the SQL helper functions in this notebook.
# NOTE(review): credentials are hard-coded here; confirm that is acceptable
# for the environment this notebook runs in.
connection = psycopg2.connect(
    user = "postgres",
    password = "ucb",
    host = "postgres",
    port = "5432",
    database = "postgres"
)

In [126]:
# Cursor on the shared Postgres connection; my_station_get_zips executes its queries on it.
cursor = connection.cursor()

In [127]:
# Run a select query against Postgres and return the rows in a pandas dataframe.
# pandas loads every numeric Postgres column as float64, so columns whose
# values are all whole numbers are converted back to integers afterwards.

def my_select_query_pandas(query, rollback_before_flag, rollback_after_flag):
    """Run a select query and return the rows in a pandas dataframe.

    Args:
        query: SQL text passed to ``pd.read_sql_query``.
        rollback_before_flag: when true, roll back the shared connection
            before running the query.
        rollback_after_flag: when true, roll back the shared connection
            after running the query.

    Returns:
        The query result as a dataframe. Every ``float64`` column whose
        non-missing values are all finite whole numbers is converted to the
        nullable ``Int64`` dtype; all other columns are left unchanged.
    """

    if rollback_before_flag:
        connection.rollback()

    df = pd.read_sql_query(query, connection)

    if rollback_after_flag:
        connection.rollback()

    # fix the float columns that really should be integers

    for column in df:

        if df[column].dtype != "float64":
            continue

        # Missing values are ignored when deciding; Int64 can represent them.
        present = df[column].dropna().to_numpy()

        # Infinite values cannot be stored as integers, so such a column stays
        # float instead of raising while it is being checked.
        is_whole = np.isfinite(present).all() and (present == np.floor(present)).all()

        if is_whole:
            df[column] = df[column].astype('Int64')

    return df

In [128]:
def my_calculate_box(point, miles):
    """Return the four points reached by travelling ``miles`` from ``point``.

    Args:
        point: ``(latitude, longitude)`` pair of the starting location.
        miles: distance to travel in each direction.

    Returns:
        A tuple ``(left, right, top, bottom)`` of ``(latitude, longitude)``
        pairs, obtained by solving the WGS84 direct geodesic problem at
        azimuths 270, 90, 0 and 180 degrees respectively.
    """

    # The geodesic solver is called with a distance in meters.
    distance_m = miles * 1.60934 * 1000

    start_lat, start_lon = point[0], point[1]
    wgs84 = Geodesic.WGS84

    def _travel(azimuth):
        # Solve for the destination and keep only its coordinates.
        solution = wgs84.Direct(start_lat, start_lon, azimuth, distance_m)
        return (solution['lat2'], solution['lon2'])

    return (_travel(270), _travel(90), _travel(0), _travel(180))

In [129]:
def my_station_get_zips(station, miles):
    """Return the total population of the zip codes around a station.

    Looks up the station's latitude/longitude in the ``stations`` table,
    builds a box reaching ``miles`` in each compass direction with
    ``my_calculate_box``, and sums ``population`` over every ``zip_codes`` row
    whose latitude and longitude fall inside that box.

    Args:
        station: station name, matched exactly against ``stations.station``.
        miles: distance from the station to each edge of the box.

    Returns:
        The summed population as a float (``0.0`` when no zip code matches).

    Raises:
        ValueError: if no row in ``stations`` matches ``station``.
    """

    connection.rollback()

    # Values are bound as query parameters rather than concatenated into the
    # SQL text, so names containing quotes (e.g. an apostrophe) work and
    # cannot alter the statement.
    cursor.execute(
        "select latitude, longitude from stations where station = %s",
        (station,),
    )
    station_rows = cursor.fetchall()

    connection.rollback()

    if not station_rows:
        raise ValueError(f"station not found in stations table: {station!r}")

    # If several rows match, the last one is used.
    latitude, longitude = station_rows[-1]

    (left, right, top, bottom) = my_calculate_box((latitude, longitude), miles)

    # bottom/top supply the latitude bounds, left/right the longitude bounds.
    cursor.execute(
        "select population from zip_codes"
        " where latitude >= %s and latitude <= %s"
        " and longitude >= %s and longitude <= %s"
        " order by zip",
        (bottom[0], top[0], left[1], right[1]),
    )
    zip_rows = cursor.fetchall()

    connection.rollback()

    return float(sum(row[0] for row in zip_rows))

In [130]:
def cleanse_stations(df):
    """Strip line colors and depart/arrive designations from ``df["name"]``.

    Every whole-word, case-sensitive occurrence of a line color ("blue",
    "green", "orange", "red", "yellow", "gray") or of "depart" / "arrive" is
    removed from the ``name`` column. Whitespace surrounding a removed word is
    left in place, and rows are neither dropped nor de-duplicated.

    Args:
        df: dataframe with a string column ``name``; modified in place.

    Returns:
        The same dataframe object, for call chaining.
    """

    words = ("blue", "green", "orange", "red", "yellow", "gray", "depart", "arrive")
    regex_pattern = r'\b(?:{})\b'.format('|'.join(words))

    # regex=True must be explicit: pandas >= 2.0 defaults to a literal match,
    # which would search for the pattern text itself and remove nothing.
    df["name"] = df["name"].str.replace(regex_pattern, '', regex=True)
    return df

In [131]:
def my_neo4j_run_query_pandas(query, **kwargs):
    """Run a Cypher query on the shared session and return a pandas dataframe.

    Args:
        query: query text passed to ``session.run``.
        **kwargs: query parameters, forwarded to ``session.run`` unchanged.

    Returns:
        A dataframe with one row per returned record; the column names are the
        keys reported by the query result.
    """

    records = session.run(query, **kwargs)

    # Read all records first, then ask the result for its column keys.
    rows = [record.values() for record in records]
    column_names = records.keys()

    return pd.DataFrame(rows, columns=column_names)

## Generate Data Frame for Analysis

In [219]:
# Load every station name from Postgres, ordered by station.
# The connection is rolled back both before and after the read.
rollback_before_flag = rollback_after_flag = True

query = """

select station
from stations
order by station

"""

df = my_select_query_pandas(
    query,
    rollback_before_flag=rollback_before_flag,
    rollback_after_flag=rollback_after_flag,
)

##### Add population within 5 miles of the station. Based on prior analysis, we found that customers who sign up for delivery live within 5 miles.

In [220]:
# Population around each station (5-mile box), in the same order as df's rows.
pop_5 = [my_station_get_zips(station, 5) for station in df["station"]]

df["pop_5"] = pop_5

##### Add degree centrality, which measures the number of incoming and outgoing connections

In [221]:
# Degree centrality for the connected graph: stream the score of every node in
# the 'ds_graph' projection and return it with the node's name, highest degree
# first (ties ordered by name).
# NOTE(review): assumes the 'ds_graph' projection already exists; it is not
# created in this part of the notebook.

query = """

CALL gds.degree.stream('ds_graph')
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).name AS name, score as degree
ORDER BY degree DESC, name

"""

deg_df = my_neo4j_run_query_pandas(query)

In [222]:
# Remove the line and depart / arrive designations

deg_df = cleanse_stations(deg_df)

# Removing a designation may leave stray whitespace around the station name;
# trim it so names from the graph can be matched against the station names

deg_df["name"] = deg_df["name"].str.strip()

# Keep the entry for each station with the maximum degree centrality

deg_df = deg_df.groupby(["name"])["degree"].max()
deg_df = deg_df.to_frame()

# Index df by station (guarded so that re-running this cell does not fail)

if "station" in df.columns:
    df.set_index("station", inplace=True)

# Add degree centrality to df, matched by station name. Matching by row
# position would depend on Postgres and pandas sorting the names identically.

degree_by_station = deg_df["degree"].reindex(df.index)

unmatched = degree_by_station[degree_by_station.isna()].index.tolist()
if unmatched:
    raise ValueError(f"no degree centrality found for stations: {unmatched}")

df["degree_centrality"] = degree_by_station.values

  df["name"] = df["name"].str.replace(regex_pattern, '')


##### Add betweenness centrality, which measures the number of shortest paths that pass through a node (station)

In [223]:
# Betweenness centrality: stream the score of every node in the 'ds_graph'
# projection and return it with the node's name, highest score first.
# NOTE(review): assumes the 'ds_graph' projection already exists; it is not
# created in this part of the notebook.

query = """

CALL gds.betweenness.stream('ds_graph')
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).name AS name, score as betweenness
ORDER BY betweenness DESC

"""

bet_df = my_neo4j_run_query_pandas(query)

In [224]:
# Remove the line and depart / arrive designations

bet_df = cleanse_stations(bet_df)

# Removing a designation may leave stray whitespace around the station name;
# trim it so names from the graph can be matched against df's station index

bet_df["name"] = bet_df["name"].str.strip()

# Keep the entry for each station with the maximum betweenness centrality

bet_df = bet_df.groupby(["name"])["betweenness"].max()
bet_df = bet_df.to_frame()

# Add betweenness centrality to df, matched by station name (df is indexed by
# station at this point). Matching by row position would depend on Postgres
# and pandas sorting the names identically.

betweenness_by_station = bet_df["betweenness"].reindex(df.index)

unmatched = betweenness_by_station[betweenness_by_station.isna()].index.tolist()
if unmatched:
    raise ValueError(f"no betweenness centrality found for stations: {unmatched}")

df["bet_centrality"] = betweenness_by_station.values

  df["name"] = df["name"].str.replace(regex_pattern, '')


##### Add PageRank for each station, which measures the influence of that station in the graph

In [225]:
# PageRank for each station: stream the score of every node in the 'ds_graph'
# projection and return it with the node's name, highest score first (ties
# ordered by name). The iteration cap and damping factor are passed as query
# parameters.
# NOTE(review): assumes the 'ds_graph' projection already exists; it is not
# created in this part of the notebook.

query = """

CALL gds.pageRank.stream('ds_graph',
                         { maxIterations: $max_iterations,
                           dampingFactor: $damping_factor}
                         )
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).name AS name, score as page_rank
ORDER BY page_rank DESC, name ASC

"""

max_iterations = 20
# NOTE(review): 0.05 is far below the commonly used damping factor of 0.85, and
# the resulting scores in the table below are all close to 1.0; confirm this
# value is intended.
damping_factor = 0.05

pr_df = my_neo4j_run_query_pandas(query, max_iterations=max_iterations, damping_factor=damping_factor)

In [227]:
# Remove the line and depart / arrive designations

pr_df = cleanse_stations(pr_df)

# Removing a designation may leave stray whitespace around the station name;
# trim it so names from the graph can be matched against df's station index

pr_df["name"] = pr_df["name"].str.strip()

# Keep the entry for each station with the maximum page rank

pr_df = pr_df.groupby(["name"])["page_rank"].max()
pr_df = pr_df.to_frame()

# Add page rank to df, matched by station name (df is indexed by station at
# this point). Matching by row position would depend on Postgres and pandas
# sorting the names identically.

page_rank_by_station = pr_df["page_rank"].reindex(df.index)

unmatched = page_rank_by_station[page_rank_by_station.isna()].index.tolist()
if unmatched:
    raise ValueError(f"no page rank found for stations: {unmatched}")

df["page_rank"] = page_rank_by_station.values

  df["name"] = df["name"].str.replace(regex_pattern, '')


In [228]:
# Display the per-station table assembled above.
df

station,pop_5,degree_centrality,bet_centrality,page_rank
12th Street,510498.0,5.0,5139.715461,1.006042
16th Street Mission,870044.0,6.0,3010.550494,1.003696
19th Street,576172.0,5.0,4820.250748,1.006131
24th Street Mission,989138.0,6.0,2829.403538,1.003696
Antioch,152632.0,2.0,325.0,1.014835
Ashby,487206.0,4.0,2460.860672,1.009097
Balboa Park,936912.0,6.0,2437.338289,1.005317
Bay Fair,457901.0,5.0,3348.740208,1.013135
Berryessa,559010.0,3.0,179.812881,1.003167
Castro Valley,403433.0,3.0,1589.0,1.024964
