# Neo4j for Identifying Best Pick-Up Locations

## Import Modules and Set Up Functions

In [1]:
import csv

import math
import numpy as np
import pandas as pd

import neo4j
import psycopg2
from geographiclib.geodesic import Geodesic

In [2]:
#
# function to run a select query and return rows in a pandas dataframe
# pandas reads numeric columns from postgres as float
# if every value in a float column is a whole number, convert the column to integer
#

def my_select_query_pandas(query, rollback_before_flag, rollback_after_flag):
    """Run a select query and return the rows in a pandas dataframe.

    The query is run on the module-level ``connection``.

    Args:
        query: SQL select statement to execute.
        rollback_before_flag: when true, roll back the connection before
            running the query.
        rollback_after_flag: when true, roll back the connection after
            running the query.

    Returns:
        The result as a DataFrame. Every float64 column whose non-null
        values are all finite whole numbers is converted to the nullable
        ``Int64`` dtype; other columns are left untouched.
    """

    if rollback_before_flag:
        connection.rollback()

    df = pd.read_sql_query(query, connection)

    if rollback_after_flag:
        connection.rollback()

    # fix the float columns that really should be integers
    for column in df:

        if df[column].dtype != "float64":
            continue

        # nulls are ignored for the check; Int64 can represent them as <NA>
        values = df[column].dropna().to_numpy()

        # +/-inf has no integer representation, so such a column stays float
        # (the previous per-value math.floor check raised OverflowError on it)
        if np.isfinite(values).all() and (values == np.floor(values)).all():
            df[column] = df[column].astype('Int64')

    return df
    

In [3]:
def my_neo4j_run_query_pandas(query, **kwargs):
    """Execute a query on the neo4j session and return the records as a pandas dataframe.

    Keyword arguments are forwarded to ``session.run`` as query parameters.
    """

    records = session.run(query, **kwargs)

    # one list of values per record, in the order the records arrive
    rows = []
    for record in records:
        rows.append(record.values())

    # column names are the keys reported by the result
    return pd.DataFrame(rows, columns=records.keys())

In [4]:
# Open the PostgreSQL connection that the query helpers in this notebook
# use as a module-level global.
# NOTE(review): the credentials are hard-coded; consider reading them from
# the environment or a config file instead.
connection = psycopg2.connect(
    user = "postgres",
    password = "ucb",
    host = "postgres",
    port = "5432",
    database = "postgres"
)

In [5]:
# Cursor on the shared connection; the station helper functions below call
# cursor.execute / cursor.fetchall on it directly.
cursor = connection.cursor()

In [6]:
def my_read_csv_file(file_name, limit):
    """Read the csv file, print only the first limit rows, then a summary line.

    The whole file is still read so that the total number of rows can be
    reported.

    Args:
        file_name: path of the csv file to read.
        limit: maximum number of rows to print.

    Returns:
        None. Each printed row is the list of strings produced by csv.reader.
    """

    total_rows = 0

    # the with-block closes the file even if parsing fails;
    # newline="" is the mode the csv module documents for its input files
    with open(file_name, "r", newline="") as csv_file:
        for total_rows, row in enumerate(csv.reader(csv_file), start=1):
            if total_rows <= limit:
                print(row)

    print("\nPrinted ", min(limit, total_rows), "lines of ", total_rows, "total lines.")

In [7]:
### For Graph Database relationships

## Query each pair of consecutive stations on every BART line, with the travel time between them, and save in a dataframe

# Flags handed to my_select_query_pandas: roll back the connection both
# before and after the query. They are reused by later cells.
rollback_before_flag = True
rollback_after_flag = True

# Self-join of lines on the same line with sequence + 1, i.e. each station
# and the next one on that line, joined to travel_times in either station order.
query = """

select a.line, a.station as from_station, b.station as to_station, t.travel_time
from lines a
  join lines b
    on a.line = b.line and b.sequence = (a.sequence + 1)
  join travel_times t
    on (a.station = t.station_1 and b.station = t.station_2)
        or (a.station = t.station_2 and b.station = t.station_1)
order by line, from_station, to_station

"""

# Result columns: line, from_station, to_station, travel_time
bart_df = my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

In [8]:
def my_calculate_box(point, miles):
    """Return the four points reached by travelling miles from point along the compass axes.

    point is indexed as (latitude, longitude). The result is the tuple
    (left, right, top, bottom), each itself a (latitude, longitude) pair.
    """

    wgs84 = Geodesic.WGS84

    # miles -> kilometers -> meters
    distance_in_meters = miles * 1.60934 * 1000

    def travel(azimuth):
        "solve the direct geodesic problem from point along the given azimuth"
        reached = wgs84.Direct(point[0], point[1], azimuth, distance_in_meters)
        return (reached['lat2'], reached['lon2'])

    # azimuths in degrees: 270 -> left, 90 -> right, 0 -> top, 180 -> bottom
    left, right, top, bottom = [travel(azimuth) for azimuth in (270, 90, 0, 180)]

    return left, right, top, bottom

## Stations Info: Population and Customers Within 1 Mile + Distance to Berkeley Store

In [9]:
def my_station_get_customers(station, miles):
    """Given a station, count the customers in zip codes inside the box around it.

    The box is the one returned by my_calculate_box for the station's
    coordinates and the given distance; a zip code is inside when its
    latitude and longitude fall within the box bounds.

    Args:
        station: station name, matched against stations.station.
        miles: distance passed to my_calculate_box.

    Returns:
        Total number of customers whose zip code is inside the box.

    Raises:
        ValueError: if no row in stations matches the station name.
    """

    connection.rollback()

    # parameterized so a station name containing a quote cannot break the
    # statement or inject SQL
    cursor.execute(
        "select latitude, longitude from stations where station = %s",
        (station,),
    )
    rows = cursor.fetchall()

    connection.rollback()

    if not rows:
        raise ValueError(f"unknown station: {station!r}")

    # if several rows match, the last one is used (same as the previous loop)
    latitude, longitude = rows[-1][0], rows[-1][1]

    (left, right, top, bottom) = my_calculate_box((latitude, longitude), miles)

    # filter rows with WHERE and let the database do the counting; this equals
    # the sum of the per-zip counts the grouped query used to return
    query = """
        select count(c.customer_id)
        from customers c join zip_codes z
          on c.zip = z.zip
        where z.latitude >= %s
          and z.latitude <= %s
          and z.longitude >= %s
          and z.longitude <= %s
    """

    cursor.execute(query, (bottom[0], top[0], left[1], right[1]))
    (total_customers,) = cursor.fetchone()

    connection.rollback()

    return total_customers

In [10]:
def my_station_get_population(station, miles):
    """Given a station, sum the population of the zip codes inside the box around it.

    The box is the one returned by my_calculate_box for the station's
    coordinates and the given distance; a zip code is inside when its
    latitude and longitude fall within the box bounds.

    Args:
        station: station name, matched against stations.station.
        miles: distance passed to my_calculate_box.

    Returns:
        Total population of the matching zip codes (0 when none match).

    Raises:
        ValueError: if no row in stations matches the station name.
    """

    connection.rollback()

    # parameterized so a station name containing a quote cannot break the
    # statement or inject SQL
    cursor.execute(
        "select latitude, longitude from stations where station = %s",
        (station,),
    )
    rows = cursor.fetchall()

    connection.rollback()

    if not rows:
        raise ValueError(f"unknown station: {station!r}")

    # if several rows match, the last one is used (same as the previous loop)
    latitude, longitude = rows[-1][0], rows[-1][1]

    (left, right, top, bottom) = my_calculate_box((latitude, longitude), miles)

    query = """
        select population
        from zip_codes
        where latitude >= %s
          and latitude <= %s
          and longitude >= %s
          and longitude <= %s
    """

    cursor.execute(query, (bottom[0], top[0], left[1], right[1]))
    rows = cursor.fetchall()

    connection.rollback()

    # a NULL population contributes nothing instead of raising TypeError
    total_population = sum(
        population for (population,) in rows if population is not None
    )

    return total_population

In [11]:
# Get stations
# Roll back the connection before issuing the query.
connection.rollback()
    
query = "select station from stations "

# DataFrame with a single `station` column; the two flags are the globals
# defined in the earlier cell that builds bart_df.
stations = my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

In [12]:
# get coordinates of Berkeley store
# Roll back the connection before issuing the query.
connection.rollback()
    
query = "select latitude, longitude from stores where city = 'Berkeley'  "

cursor.execute(query)

connection.rollback()

# NOTE(review): rows are fetched after the rollback; presumably the cursor
# already holds them client-side — confirm against the psycopg2 cursor docs.
coordinates = cursor.fetchall()
# Only the first returned row is used; raises IndexError if no store matches.
lat = float(coordinates[0][0])
lon = float(coordinates[0][1])

In [13]:
## Calculate distance from Berkeley Store
def my_calculate_distance(station, store):
    """Calculate the geodesic distance in miles between a station and a store.

    Args:
        station: station name, matched against stations.station.
        store: (latitude, longitude) of the store.

    Returns:
        Distance in miles between the station's coordinates and the store.

    Raises:
        ValueError: if no row in stations matches the station name.
    """

    geod = Geodesic.WGS84

    connection.rollback()

    # parameterized so a station name containing a quote cannot break the
    # statement or inject SQL
    cursor.execute(
        "select latitude, longitude from stations where station = %s",
        (station,),
    )
    rows = cursor.fetchall()

    connection.rollback()

    if not rows:
        raise ValueError(f"unknown station: {station!r}")

    # if several rows match, the last one is used (same as the previous loop)
    latitude, longitude = rows[-1][0], rows[-1][1]

    g = geod.Inverse(float(latitude), float(longitude), store[0], store[1])

    # s12 is divided by 1000 and scaled by 0.621371 (kilometers -> miles)
    miles = g['s12'] / 1000 * 0.621371

    return miles

In [14]:
# Collect the rows in a plain list and build the dataframe once:
# DataFrame.append copies the whole frame on every call and was removed in
# pandas 2.0.
# The first row is the store itself, so it always gets index 0.
station_records = [
    {'Station' : 'Berkeley Store', 'Population_Within_1_Mile' : 0,
     'Customers_Within_1_Mile': 0, 'Distance_From_Store': 0},
]
for station in stations.station:
    total_population = my_station_get_population(station, 1)
    total_customers = my_station_get_customers(station, 1)
    distance = my_calculate_distance(station, (lat, lon))
    # stations with no population inside the 1 mile box are left out
    if total_population > 0:
        station_records.append({'Station' : station, 'Population_Within_1_Mile' : total_population,
                                'Customers_Within_1_Mile': total_customers,
                                'Distance_From_Store': round(distance,1)})

df = pd.DataFrame(station_records,
                  columns = ['Station', 'Population_Within_1_Mile', 'Customers_Within_1_Mile', 'Distance_From_Store'])

# displayed only: df itself keeps insertion order, which the graph-loading cell relies on
df.sort_values('Population_Within_1_Mile', ascending = False)

Unnamed: 0,Station,Population_Within_1_Mile,Customers_Within_1_Mile,Distance_From_Store
29,Powell Street,140730,333,9.5
17,Glen Park,115068,103,12.7
4,24th Street Mission,108915,133,11.2
6,Balboa Park,106589,76,13.8
27,Pittsburg Center,96081,1,23.1
23,Montgomery Street,85465,274,9.0
11,Downtown Berkeley,82930,634,1.1
34,South Hayward,79235,9,18.9
9,Civic Center,74898,210,9.9
35,Union City,74601,9,22.6


## Neo4j Graph Setup Functions

In [15]:
# Neo4j driver for the database at neo4j:7687.
# NOTE(review): the credentials are hard-coded; consider reading them from
# the environment or a config file instead.
driver = neo4j.GraphDatabase.driver(uri="neo4j://neo4j:7687", auth=("neo4j","w205"))

In [16]:
# Session on the "neo4j" database; the my_neo4j_* helpers use it as a
# module-level global.
session = driver.session(database="neo4j")

In [17]:
def my_neo4j_wipe_out_database():
    """Empty the graph by deleting every relationship and every node."""

    statements = (
        # nodes that start a relationship, together with those relationships
        "match (node)-[relationship]->() delete node, relationship",
        # whatever nodes are left
        "match (node) delete node",
    )

    for statement in statements:
        session.run(statement)

In [18]:
def my_neo4j_run_query_pandas(query, **kwargs):
    """Execute a query on the neo4j session and return the records as a pandas dataframe.

    Keyword arguments are forwarded to ``session.run`` as query parameters.
    This repeats the definition given earlier in the file.
    """

    records = session.run(query, **kwargs)

    # one list of values per record, in the order the records arrive
    rows = []
    for record in records:
        rows.append(record.values())

    # column names are the keys reported by the result
    return pd.DataFrame(rows, columns=records.keys())

In [19]:
def my_neo4j_number_nodes_relationships():
    """Print the number of nodes and relationships in the graph.

    The counting is done by the database, so only one row per query is
    returned instead of every node and relationship.
    """

    node_counts = my_neo4j_run_query_pandas(
        "match (n) return count(n) as number_nodes"
    )

    # count() always yields exactly one row, 0 for an empty graph
    number_nodes = int(node_counts.iloc[0, 0])

    relationship_counts = my_neo4j_run_query_pandas(
        "match ()-[r]->() return count(r) as number_relationships"
    )

    number_relationships = int(relationship_counts.iloc[0, 0])

    print("-------------------------")
    print("  Nodes:", number_nodes)
    print("  Relationships:", number_relationships)
    print("-------------------------")


In [20]:
def my_neo4j_create_node(station_name, name_label):
    """Create a node with label name_label whose name property is station_name."""

    # the label is spliced into the query text; the name travels as a query parameter
    query = f" CREATE (:{name_label} {{name: $station_name}})"

    session.run(query, station_name=station_name)
    

In [21]:
def my_neo4j_create_relationship_one_way(from_station, to_station, to, weight, rel_type):
    """Create one rel_type relationship with a weight from a Station node to a node labelled `to`.

    The source is matched by label Station and name from_station, the
    target by label `to` and name to_station.
    """

    # label and relationship type are spliced into the text;
    # names and weight are passed as query parameters
    query = "".join([
        "\n    \n    MATCH (from:Station), \n          (to:",
        to,
        ")\n    \n    WHERE from.name = $from_station and to.name = $to_station\n",
        "    CREATE (from)-[:",
        rel_type,
        " {weight: $weight}]->(to)",
    ])

    session.run(query, from_station=from_station, to_station=to_station, weight=weight)
    

In [22]:
def my_neo4j_create_relationship_two_way(from_station, to_station, to, weight):
    """Create a pair of LINK relationships, one in each direction, with the same weight.

    The source is matched by label Station and name from_station, the
    target by label `to` and name to_station.
    """

    # the target label is spliced into the text;
    # names and weight are passed as query parameters
    query = "".join([
        "\n    \n    MATCH (from:Station), \n          (to:",
        to,
        ")\n    \n    WHERE from.name = $from_station and to.name = $to_station\n",
        "    CREATE (from)-[:LINK {weight: $weight}]->(to),\n",
        "           (to)-[:LINK {weight: $weight}]->(from)\n    \n    ",
    ])

    session.run(query, from_station=from_station, to_station=to_station, weight=weight)
    

In [23]:
# Start from an empty graph before the nodes and relationships are created below.
my_neo4j_wipe_out_database()

In [24]:
# Create one Station node per row of df and link each row's station to the
# one created just before it. Columns are read by name: positional row[0]
# on a label-indexed Series is deprecated in recent pandas, and the old
# `index > 0` test only worked with a default 0..n-1 index.
prev_station = None

for row in df.itertuples(index=False):
    station = row.Station
    num_population = float(row.Population_Within_1_Mile)
    num_customers = float(row.Customers_Within_1_Mile)
    distance_from_store = row.Distance_From_Store
    # the 0.1 keeps the division defined when a station has no customers
    ratio_dp_c = distance_from_store * num_population / (num_customers + 0.1)

    my_neo4j_create_node(station, "Station")
    if prev_station is not None:
        # the weight is the ratio computed for the current (target) station
        my_neo4j_create_relationship_one_way(prev_station, station, "Station", ratio_dp_c, "LINK")

    prev_station = station

# Add a zero-weight LINK for every from_station -> to_station pair in bart_df.
for from_station, to_station in zip(bart_df["from_station"], bart_df["to_station"]):
    my_neo4j_create_relationship_one_way(from_station, to_station, "Station", 0, "LINK")

## Weighted Degree Centrality

In [25]:
# Drop any graph named 'ds_graph' left over from a previous run.
# NOTE(review): the second argument `false` presumably suppresses the error
# when the graph does not exist — confirm against the GDS documentation.
query = "CALL gds.graph.drop('ds_graph', false)"
session.run(query)

# Create the 'ds_graph' projection from Station nodes and LINK relationships,
# carrying the relationship property 'weight'.
# NOTE(review): the procedure name may differ in newer GDS releases — confirm
# against the installed version.
query = "CALL gds.graph.create('ds_graph', 'Station','LINK', {relationshipProperties: 'weight'})"
session.run(query)

<neo4j.work.result.Result at 0x7f032c68ba60>

In [26]:
# Stream degree centrality over 'ds_graph' using the 'weight' relationship
# property, returning one row per node, highest score first.
query = """

CALL gds.degree.stream(
   'ds_graph',
   { relationshipWeightProperty: 'weight' }
)
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).name AS Station, score AS Degree_Centrality
ORDER BY Degree_Centrality DESC, Station DESC
"""

# Last expression of the cell: the dataframe is displayed, not stored.
my_neo4j_run_query_pandas(query)

Unnamed: 0,Station,Degree_Centrality
0,Bay Fair,11404220.0
1,Orinda,2017701.0
2,Civic Center,257965.4
3,Embarcadero,235241.9
4,South Hayward,185272.8
5,San Leandro,164565.0
6,Downtown Berkeley,120693.9
7,Glen Park,103989.1
8,SFO,81509.1
9,MacArthur,42496.93


## Page Rank (Trials)

In [27]:
# Stream PageRank over 'ds_graph' using the 'weight' relationship property;
# the iteration limit and damping factor are supplied as query parameters.
query = """

CALL gds.pageRank.stream('ds_graph',
                         { maxIterations: $max_iterations,
                           dampingFactor: $damping_factor,
                           relationshipWeightProperty: 'weight'}
                         )
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).name AS Station, score as Page_Rank
ORDER BY Page_Rank DESC, Station ASC

"""

# Values bound to $max_iterations and $damping_factor in the query above.
max_iterations = 20
damping_factor = 0.05

# df_pr = my_neo4j_run_query_pandas(query, max_iterations=max_iterations, damping_factor=damping_factor)[1:]

# [1:] drops the first row of the ordered result before it is displayed.
my_neo4j_run_query_pandas(query, max_iterations=max_iterations, damping_factor=damping_factor)[1:]



Unnamed: 0,Station,Page_Rank
1,Bay Fair,1.0
2,Berryessa,1.0
3,Civic Center,1.0
4,Colma,1.0
5,Downtown Berkeley,1.0
6,Dublin,1.0
7,El Cerrito Plaza,1.0
8,Embarcadero,1.0
9,Fremont,1.0
10,Fruitvale,1.0


In [28]:
# PageRank over 'ds_graph' with the Station node named $source passed in
# sourceNodes; uses the 'weight' relationship property.
query = """

MATCH (siteA:Station {name: $source})
CALL gds.pageRank.stream('ds_graph', {
  maxIterations: $max_iterations,
  dampingFactor: $damping_factor,
  sourceNodes: [siteA],
  relationshipWeightProperty: 'weight'
})
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).name AS name, score as page_rank
ORDER BY score DESC, name ASC

"""

# Values bound to $source, $max_iterations and $damping_factor in the query above.
source = "Berryessa"
max_iterations = 20
damping_factor = 0.85

# Last expression of the cell: the dataframe is displayed, not stored.
my_neo4j_run_query_pandas(query, source=source, max_iterations=max_iterations, damping_factor=damping_factor)


Unnamed: 0,name,page_rank
0,Berryessa,0.15
1,Civic Center,0.1275
2,Colma,0.108375
3,Downtown Berkeley,0.092119
4,Dublin,0.078301
5,El Cerrito Plaza,0.066556
6,Embarcadero,0.056572
7,Fremont,0.048087
8,Fruitvale,0.040874
9,Glen Park,0.034743
