# Option: Using BART to Transport Product

## Included Modules and Packages

In [1]:
import neo4j

import csv

import math
import numpy as np
import pandas as pd

import psycopg2
from geographiclib.geodesic import Geodesic

## Supporting Code

In [2]:
# Neo4j driver; the notebook's `session` is opened from it.
# NOTE(review): the connection URI and credentials are hard-coded here.
driver = neo4j.GraphDatabase.driver(uri="neo4j://neo4j:7687", auth=("neo4j","w205"))

In [3]:
# Module-level session on the "neo4j" database; the Neo4j helper functions
# in this notebook run their queries through this global.
session = driver.session(database="neo4j")

In [4]:
# Module-level PostgreSQL connection used by the SQL helper functions.
# NOTE(review): host, user and password are hard-coded here.
connection = psycopg2.connect(
    user = "postgres",
    password = "ucb",
    host = "postgres",
    port = "5432",
    database = "postgres"
)

In [5]:
# Module-level cursor shared by the station / zip code helper functions.
cursor = connection.cursor()

In [6]:
# NOTE(review): this cell repeats the psycopg2.connect call made earlier in
# the notebook with identical arguments; running it rebinds `connection`
# without closing the previously opened connection object.
connection = psycopg2.connect(
    user = "postgres",
    password = "ucb",
    host = "postgres",
    port = "5432",
    database = "postgres"
)

In [7]:
# Rebind the shared cursor so it belongs to the current `connection` object.
cursor = connection.cursor()

In [8]:
# function to run a select query and return rows in a pandas dataframe
# pandas puts all numeric values from postgres to float
# if it will fit in an integer, change it to integer

def my_select_query_pandas(query, rollback_before_flag, rollback_after_flag):
    """Run a select query and return its rows as a pandas DataFrame.

    The query runs on the module-level ``connection``.  Any ``float64``
    column whose non-missing values are all whole numbers is converted to the
    nullable ``Int64`` dtype, so integer data that pandas widened to float
    reads as integers again (missing values become ``<NA>``).

    Args:
        query: SQL text passed to ``pd.read_sql_query``.
        rollback_before_flag: if true, roll back the connection before the
            query runs.
        rollback_after_flag: if true, roll back the connection after the
            query, even when the query raises.

    Returns:
        The query result as a DataFrame.
    """

    if rollback_before_flag:
        connection.rollback()

    try:
        df = pd.read_sql_query(query, connection)
    finally:
        # Roll back even on failure so a failed query does not leave the
        # shared connection inside an aborted transaction.
        if rollback_after_flag:
            connection.rollback()

    # fix the float columns that really should be integers
    for column in df:

        if df[column].dtype == "float64":

            values = df[column].to_numpy()
            present = values[~np.isnan(values)]

            # Infinite values cannot be represented as integers, so a column
            # containing them stays float instead of raising.
            is_integral = bool(
                np.all(np.isfinite(present))
                and np.all(present == np.floor(present))
            )

            if is_integral:
                df[column] = df[column].astype('Int64')

    return df

In [9]:
def my_calculate_box(point, miles):
    """Return the four edge points of a box centred on ``point``.

    ``point`` is a (latitude, longitude) pair.  Each returned value is the
    (latitude, longitude) reached by travelling ``miles`` from ``point`` along
    the WGS84 geodesic at azimuth 270, 90, 0 and 180 degrees respectively,
    returned in the order (left, right, top, bottom).
    """

    start_lat, start_lon = point[0], point[1]

    # miles -> kilometers -> meters
    distance_in_meters = miles * 1.60934 * 1000

    def _travel(azimuth):
        # Solve the direct geodesic problem for one heading.
        solution = Geodesic.WGS84.Direct(start_lat, start_lon, azimuth, distance_in_meters)
        return (solution['lat2'], solution['lon2'])

    left = _travel(270)
    right = _travel(90)
    top = _travel(0)
    bottom = _travel(180)

    return (left, right, top, bottom)

In [10]:
def my_station_get_zips(station, miles):
    """Return the total population of the zip codes in a box around a station.

    Looks up the station's latitude/longitude in the ``stations`` table,
    builds a box extending ``miles`` in each direction with
    ``my_calculate_box``, and sums ``population`` over every ``zip_codes`` row
    whose latitude/longitude falls inside that box.

    Args:
        station: station name as stored in ``stations.station``.
        miles: distance from the station to each edge of the box.

    Returns:
        The summed population as a float (0.0 when no zip code matches).

    Raises:
        ValueError: if ``station`` is not present in the ``stations`` table.
    """

    connection.rollback()

    # Parameterized so that a station name containing a quote cannot break
    # (or inject into) the SQL statement.
    cursor.execute(
        "select latitude, longitude from stations where station = %s",
        (station,),
    )
    station_rows = cursor.fetchall()

    connection.rollback()

    if not station_rows:
        raise ValueError(f"unknown station: {station!r}")

    # If several rows match, the last one wins (same as the original loop).
    latitude, longitude = station_rows[-1][0], station_rows[-1][1]

    (left, right, top, bottom) = my_calculate_box((latitude, longitude), miles)

    cursor.execute(
        "select zip, population from zip_codes "
        "where latitude >= %s and latitude <= %s "
        "and longitude >= %s and longitude <= %s "
        "order by 1",
        (bottom[0], top[0], left[1], right[1]),
    )
    zip_rows = cursor.fetchall()

    connection.rollback()

    return float(sum(population for _zip_code, population in zip_rows))
    
#     print("\n-------------------------------------------------------------------------------")
#     print("  Total Population: ", f'{total_population:10,}')
#     print("-------------------------------------------------------------------------------")

In [11]:
def cleanse_stations(df):
    """Remove line colors and depart/arrive markers from the ``name`` column.

    Every whole-word occurrence of a line color (blue, green, orange, red,
    yellow, gray) or of "depart" / "arrive" is deleted from ``df["name"]``.
    Matching is case-sensitive, and whitespace around a removed word is left
    in place.  Rows are not de-duplicated.

    Args:
        df: DataFrame with a string column called ``name``; modified in place.

    Returns:
        The same DataFrame object, for convenience.
    """

    words = ["blue", "green", "orange", "red", "yellow", "gray", "depart", "arrive"]
    regex_pattern = r'\b(?:{})\b'.format('|'.join(words))

    # regex=True is required: from pandas 2.0 the default is a literal match,
    # which would silently leave every name unchanged.
    df["name"] = df["name"].str.replace(regex_pattern, '', regex=True)
    return df

In [12]:
def my_neo4j_run_query_pandas(query, **kwargs):
    """Run a Cypher query on the shared session and return a DataFrame.

    Keyword arguments are forwarded to ``session.run`` as query parameters.
    Each result record becomes one row; the result keys become the columns.
    """

    result = session.run(query, **kwargs)

    # Materialize the records first, then read the column names.
    records = [record.values() for record in result]
    column_names = result.keys()

    return pd.DataFrame(records, columns=column_names)

In [124]:
def my_neo4j_shortest_path(from_station, to_station):
    """Return the total cost of the weighted shortest path between two stations.

    On every call the GDS graph projection 'ds_graph' is dropped (if present)
    and re-created from ``Station`` nodes and ``LINK`` relationships with the
    ``weight`` property, then Dijkstra's algorithm is run from
    ``from_station`` to ``to_station`` using that weight.

    Args:
        from_station: ``name`` of the source ``Station`` node.
        to_station: ``name`` of the target ``Station`` node.

    Returns:
        The path's ``totalCost`` truncated to an int.
        NOTE(review): presumably seconds of travel time; confirm against the
        graph's ``weight`` values.

    Raises:
        ValueError: if the query yields no path (for example, an unknown
            station name).
    """

    query = "CALL gds.graph.drop('ds_graph', false)"
    session.run(query)

    query = "CALL gds.graph.create('ds_graph', 'Station', 'LINK', {relationshipProperties: 'weight'})"
    session.run(query)

    query = """

    MATCH (source:Station {name: $source}), (target:Station {name: $target})
    CALL gds.shortestPath.dijkstra.stream(
        'ds_graph', 
        { sourceNode: source, 
          targetNode: target, 
          relationshipWeightProperty: 'weight'
        }
    )
    YIELD index, sourceNode, targetNode, totalCost, nodeIds, costs, path
    RETURN
        gds.util.asNode(sourceNode).name AS from,
        gds.util.asNode(targetNode).name AS to,
        totalCost,
        [nodeId IN nodeIds | gds.util.asNode(nodeId).name] AS nodes,
        costs
    ORDER BY index

    """

    result = session.run(query, source=from_station, target=to_station)

    total_cost = None

    # If several records come back, the last one wins (same as before).
    for record in result:
        total_cost = int(record['totalCost'])

    if total_cost is None:
        raise ValueError(
            f"no path found from {from_station!r} to {to_station!r}"
        )

    return total_cost

In [16]:
def my_station_get_zip_list(station, miles):
    """Return the zip codes that fall in a box around a station.

    Looks up the station's latitude/longitude in the ``stations`` table,
    builds a box extending ``miles`` in each direction with
    ``my_calculate_box``, and collects the ``zip`` of every ``zip_codes`` row
    whose latitude/longitude falls inside that box.

    Args:
        station: station name as stored in ``stations.station``.
        miles: distance from the station to each edge of the box.

    Returns:
        A list of zip values ordered by zip (empty when nothing matches).

    Raises:
        ValueError: if ``station`` is not present in the ``stations`` table.
    """

    connection.rollback()

    # Parameterized so that a station name containing a quote cannot break
    # (or inject into) the SQL statement.
    cursor.execute(
        "select latitude, longitude from stations where station = %s",
        (station,),
    )
    station_rows = cursor.fetchall()

    connection.rollback()

    if not station_rows:
        raise ValueError(f"unknown station: {station!r}")

    # If several rows match, the last one wins (same as the original loop).
    latitude, longitude = station_rows[-1][0], station_rows[-1][1]

    (left, right, top, bottom) = my_calculate_box((latitude, longitude), miles)

    cursor.execute(
        "select zip, population from zip_codes "
        "where latitude >= %s and latitude <= %s "
        "and longitude >= %s and longitude <= %s "
        "order by 1",
        (bottom[0], top[0], left[1], right[1]),
    )
    zip_rows = cursor.fetchall()

    connection.rollback()

    return [row[0] for row in zip_rows]
    
#     print("\n-------------------------------------------------------------------------------")
#     print("  Total Population: ", f'{total_population:10,}')
#     print("-------------------------------------------------------------------------------")

## Where are our current customers?

In [54]:
# Count customers per zip code and index the result by zip so that counts
# can be looked up by zip code later.
rollback_before_flag = rollback_after_flag = True

query = """

select count(customer_id), zip
from customers
group by zip

"""

customer_locs = my_select_query_pandas(
    query,
    rollback_before_flag,
    rollback_after_flag,
).set_index('zip')
customer_locs

Unnamed: 0_level_0,count
zip,Unnamed: 1_level_1
37062,8
94066,9
75023,7
33131,115
76180,1
...,...
33168,61
76012,1
33128,76
94591,5


# Create a population/current-customer table for the zip codes within a 5-mile bounding box of each station.

In [15]:
# Load every station name, alphabetically, into `df`; the following cells
# add one column per metric to this frame.
rollback_before_flag = True
rollback_after_flag = True

query = """

select station
from stations
order by station

"""

df = my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)
df

Unnamed: 0,station
0,12th Street
1,16th Street Mission
2,19th Street
3,24th Street Mission
4,Antioch
5,Ashby
6,Balboa Park
7,Bay Fair
8,Berryessa
9,Castro Valley


In [18]:
# Total population inside the 5-mile box of each station, in row order.
pop_5 = [my_station_get_zips(station, 5) for station in df["station"]]

df["pop_5"] = pop_5
df

Unnamed: 0,station,pop_5
0,12th Street,510498.0
1,16th Street Mission,870044.0
2,19th Street,576172.0
3,24th Street Mission,989138.0
4,Antioch,152632.0
5,Ashby,487206.0
6,Balboa Park,936912.0
7,Bay Fair,457901.0
8,Berryessa,559010.0
9,Castro Valley,403433.0


In [55]:
# Zip codes inside the 5-mile box of each station, one list per row.
zips_5 = [my_station_get_zip_list(station, 5) for station in df["station"]]

df["zips_5"] = zips_5
df

Unnamed: 0,station,pop_5,zips_5
0,12th Street,510498.0,"[94501, 94502, 94601, 94602, 94606, 94607, 946..."
1,16th Street Mission,870044.0,"[94102, 94103, 94104, 94105, 94107, 94108, 941..."
2,19th Street,576172.0,"[94501, 94502, 94563, 94601, 94602, 94603, 946..."
3,24th Street Mission,989138.0,"[94005, 94014, 94015, 94102, 94103, 94104, 941..."
4,Antioch,152632.0,"[94509, 94531, 94561]"
5,Ashby,487206.0,"[94530, 94563, 94602, 94606, 94607, 94608, 946..."
6,Balboa Park,936912.0,"[94005, 94014, 94015, 94080, 94102, 94103, 941..."
7,Bay Fair,457901.0,"[94541, 94542, 94544, 94546, 94577, 94578, 945..."
8,Berryessa,559010.0,"[95050, 95053, 95054, 95110, 95112, 95113, 951..."
9,Castro Valley,403433.0,"[94541, 94542, 94544, 94546, 94552, 94577, 945..."


In [68]:

def get_customers(list):
    """Sum the current-customer counts for a collection of zip codes.

    Each zip code is converted to ``str`` and looked up in the index of the
    module-level ``customer_locs`` frame; its ``count`` value is added to the
    total.  Zip codes that are not in the index contribute nothing.

    Args:
        list: iterable of zip codes (any type whose ``str()`` matches the
            ``customer_locs`` index labels).

    Returns:
        The summed customer count as an int (0 for an empty input).
    """

    customers = 0

    for zipcode in list:

        try:
            # Scalar lookup: avoids int() on a one-element Series, which
            # newer pandas versions deprecate.
            count = customer_locs.loc[str(zipcode), "count"]

        except KeyError:
            # No customers recorded for this zip code.  Only a missing key is
            # skipped, so real errors are no longer hidden.
            continue

        customers += int(count)

    return customers


In [88]:
# Current customers living in each station's 5-mile zip list, then rank the
# stations from most to fewest customers.
df["current_customers_5"] = df['zips_5'].apply(get_customers)
df = df.sort_values(by="current_customers_5", ascending=False)
df

Unnamed: 0,station,pop_5,zips_5,current_customers_5
5,Ashby,487206.0,"[94530, 94563, 94602, 94606, 94607, 94608, 946...",3638
39,Rockridge,536232.0,"[94501, 94516, 94563, 94601, 94602, 94606, 946...",3555
26,MacArthur,524629.0,"[94501, 94563, 94601, 94602, 94606, 94607, 946...",3439
15,Downtown Berkeley,448042.0,"[94530, 94563, 94602, 94607, 94608, 94609, 946...",3430
30,North Berkeley,445973.0,"[94130, 94530, 94602, 94607, 94608, 94609, 946...",3410
2,19th Street,576172.0,"[94501, 94502, 94563, 94601, 94602, 94603, 946...",3389
25,Lake Merritt,542120.0,"[94501, 94502, 94601, 94602, 94603, 94606, 946...",3049
0,12th Street,510498.0,"[94501, 94502, 94601, 94602, 94606, 94607, 946...",3021
49,West Oakland,473317.0,"[94130, 94501, 94502, 94601, 94602, 94606, 946...",2839
18,El Cerrito Plaza,355903.0,"[94530, 94608, 94609, 94618, 94702, 94703, 947...",2657


# Create a population/current-customer table for the zip codes within a 2-mile bounding box of each station.

In [106]:
# Load every station name, alphabetically, into `df2`; the following cells
# add the 2-mile metrics to this frame.
rollback_before_flag = True
rollback_after_flag = True

query = """

select station
from stations
order by station

"""

df2 = my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)
df2

Unnamed: 0,station
0,12th Street
1,16th Street Mission
2,19th Street
3,24th Street Mission
4,Antioch
5,Ashby
6,Balboa Park
7,Bay Fair
8,Berryessa
9,Castro Valley


In [112]:
# Total population inside the 2-mile box of each station, in row order.
pop_2 = [my_station_get_zips(station, 2) for station in df2["station"]]

df2["pop_2"] = pop_2
df2

Unnamed: 0,station,pop_2
0,12th Street,175958.0
1,16th Street Mission,339093.0
2,19th Street,165215.0
3,24th Street Mission,315201.0
4,Antioch,66933.0
5,Ashby,173897.0
6,Balboa Park,253123.0
7,Bay Fair,93041.0
8,Berryessa,197640.0
9,Castro Valley,110328.0


### The Pittsburg BART station has the same zip code as Pittsburg Center. That zip code is missing from the results only because its area is so large that its coordinates lie outside the box formed around the Pittsburg BART station.

### Filled in that value manually.

In [113]:
# Manually set pop_2 for the row whose index label is 34.
# NOTE(review): 34 is a hard-coded row label, presumably the Pittsburg
# station row; confirm. It would target a different station if the station
# list or its ordering changed.
df2.at[34,"pop_2"] = 96081

In [115]:
# Zip codes inside the 2-mile box of each station, one list per row.
zips_2 = [my_station_get_zip_list(station, 2) for station in df2["station"]]

df2["zips_2"] = zips_2
df2

Unnamed: 0,station,pop_2,zips_2
0,12th Street,175958.0,"[94501, 94606, 94607, 94610, 94612]"
1,16th Street Mission,339093.0,"[94102, 94103, 94104, 94105, 94107, 94108, 941..."
2,19th Street,165215.0,"[94606, 94607, 94608, 94609, 94610, 94612]"
3,24th Street Mission,315201.0,"[94102, 94103, 94107, 94110, 94114, 94117, 941..."
4,Antioch,66933.0,[94509]
5,Ashby,173897.0,"[94608, 94609, 94618, 94702, 94703, 94704, 947..."
6,Balboa Park,253123.0,"[94110, 94112, 94127, 94131, 94134]"
7,Bay Fair,93041.0,"[94578, 94579, 94580]"
8,Berryessa,197640.0,"[95110, 95112, 95116, 95131, 95133]"
9,Castro Valley,110328.0,"[94541, 94546]"


In [118]:
# Manually set the zip list for the row whose index label is 34.
# NOTE(review): 34 is a hard-coded row label, presumably the Pittsburg
# station row; confirm. It would target a different station if the station
# list or its ordering changed.
df2.at[34,"zips_2"] = [94565]

In [119]:
# Current customers living in each station's 2-mile zip list, then rank the
# stations from most to fewest customers.
df2["current_customers_2"] = df2['zips_2'].apply(get_customers)
df2 = df2.sort_values(by="current_customers_2", ascending=False)
df2

Unnamed: 0,station,pop_2,zips_2,current_customers_2
5,Ashby,173897.0,"[94608, 94609, 94618, 94702, 94703, 94704, 947...",1384
15,Downtown Berkeley,153753.0,"[94618, 94702, 94703, 94704, 94705, 94706, 947...",1293
39,Rockridge,151725.0,"[94608, 94609, 94618, 94702, 94703, 94704, 94705]",1162
2,19th Street,165215.0,"[94606, 94607, 94608, 94609, 94610, 94612]",1124
30,North Berkeley,134802.0,"[94702, 94703, 94704, 94706, 94707, 94708, 947...",1118
26,MacArthur,143953.0,"[94607, 94608, 94609, 94610, 94612, 94618]",1081
0,12th Street,175958.0,"[94501, 94606, 94607, 94610, 94612]",915
25,Lake Merritt,175958.0,"[94501, 94606, 94607, 94610, 94612]",915
10,Civic Center,347795.0,"[94102, 94103, 94104, 94105, 94107, 94108, 941...",811
37,Powell Street,303556.0,"[94102, 94103, 94104, 94105, 94107, 94108, 941...",759


In [125]:
def dist_from_ashby(station):
    """Return the shortest-path total cost from Ashby to ``station``.

    The path runs from the graph node named "depart Ashby" to the node named
    "arrive <station>", as computed by ``my_neo4j_shortest_path``.
    """

    return my_neo4j_shortest_path("depart Ashby", "arrive " + str(station))


# Shortest-path cost from Ashby to every station, one Neo4j query per row.
# Each call goes through my_neo4j_shortest_path, which drops and re-creates
# the 'ds_graph' projection every time.
df2["travel_time_from_ashby"] = df2["station"].apply(dist_from_ashby)
df2

Unnamed: 0,station,pop_2,zips_2,current_customers_2,travel_time_from_ashby
5,Ashby,173897.0,"[94608, 94609, 94618, 94702, 94703, 94704, 947...",1384,0
15,Downtown Berkeley,153753.0,"[94618, 94702, 94703, 94704, 94705, 94706, 947...",1293,180
39,Rockridge,151725.0,"[94608, 94609, 94618, 94702, 94703, 94704, 94705]",1162,539
2,19th Street,165215.0,"[94606, 94607, 94608, 94609, 94610, 94612]",1124,420
30,North Berkeley,134802.0,"[94702, 94703, 94704, 94706, 94707, 94708, 947...",1118,300
26,MacArthur,143953.0,"[94607, 94608, 94609, 94610, 94612, 94618]",1081,240
0,12th Street,175958.0,"[94501, 94606, 94607, 94610, 94612]",915,540
25,Lake Merritt,175958.0,"[94501, 94606, 94607, 94610, 94612]",915,720
10,Civic Center,347795.0,"[94102, 94103, 94104, 94105, 94107, 94108, 941...",811,1500
37,Powell Street,303556.0,"[94102, 94103, 94104, 94105, 94107, 94108, 941...",759,1440
