# Project 3

### University of California, Berkeley
### Master of Information and Data Science Program (MIDS)
### w205 - Fundamentals of Data Engineering

* Year: 2022
* Semester: Summer
* Section: 02
* Instructor: Korin Reid
* Team Members:
    * team member 1 Iris Lew
    * team member 2 Ivy Chan
    * team member 3 Ghiwa Lamah


# Verify Shortest Paths

In [1]:
import neo4j

import csv

import math
import numpy as np
import pandas as pd

import psycopg2

import json

import gmaps
import gmaps.geojson_geometries

from geographiclib.geodesic import Geodesic

In [2]:
# Create the Neo4j driver for the server at neo4j://neo4j:7687.
# NOTE(review): the username and password are hard-coded in the notebook;
# consider reading them from the environment before sharing this file.
driver = neo4j.GraphDatabase.driver(uri="neo4j://neo4j:7687", auth=("neo4j","w205"))

In [3]:
# Open a session on the "neo4j" database. This module-level name is read
# directly by my_neo4j_shortest_path(), so it must exist before that is called.
session = driver.session(database="neo4j")

In [4]:
def my_neo4j_shortest_path(from_station, to_station):
    """Run Dijkstra between two Station nodes and print every path returned.

    Uses the module-level ``session``. On each call the 'ds_graph' projection
    is dropped and re-created from 'Station' nodes and 'LINK' relationships
    (with the 'weight' property), then the shortest-path query is streamed.

    For each returned path this prints the total cost, the total cost divided
    by 60 (labelled "Minutes"), and one line per node in the form
    ``name, cost of this hop, cumulative cost``.

    from_station / to_station are matched against the ``name`` property of
    Station nodes. Nothing is returned.
    """

    # Start from a fresh projection; the `false` argument is presumably GDS's
    # failIfMissing flag, so a missing graph is not an error -- confirm against
    # the installed GDS version.
    session.run("CALL gds.graph.drop('ds_graph', false)")
    session.run(
        "CALL gds.graph.create('ds_graph', 'Station', 'LINK', {relationshipProperties: 'weight'})"
    )

    dijkstra_cypher = """

    MATCH (source:Station {name: $source}), (target:Station {name: $target})
    CALL gds.shortestPath.dijkstra.stream(
        'ds_graph', 
        { sourceNode: source, 
          targetNode: target, 
          relationshipWeightProperty: 'weight'
        }
    )
    YIELD index, sourceNode, targetNode, totalCost, nodeIds, costs, path
    RETURN
        gds.util.asNode(sourceNode).name AS from,
        gds.util.asNode(targetNode).name AS to,
        totalCost,
        [nodeId IN nodeIds | gds.util.asNode(nodeId).name] AS nodes,
        costs
    ORDER BY index

    """

    for record in session.run(dijkstra_cypher, source=from_station, target=to_station):

        path_total = int(record['totalCost'])

        # Summary banner for this path.
        print("\n--------------------------------")
        print("   Total Cost: ", path_total)
        print("   Minutes: ", round(path_total / 60.0,1))
        print("--------------------------------")

        stop_names = record['nodes']
        cumulative_costs = record['costs']

        # `costs` is cumulative, so each hop's own cost is the difference
        # from the previous node's cumulative value.
        cost_so_far = 0

        for position, stop in enumerate(stop_names):
            cumulative = int(cumulative_costs[position])

            print(stop + ", " + str(cumulative - cost_so_far)  + ", " + str(cumulative))

            cost_so_far = cumulative
    

## Use the function my_neo4j_shortest_path() to check and compare the travel times from Downtown Berkeley to the end stations of all lines. Our main store is located in Berkeley.

In [5]:
# Richmond is an end station of two lines:
#   - Richmond-Millbrae+SFO Line (Red)
#   - Berryessa/North San Jose-Richmond Line (Orange)
my_neo4j_shortest_path('depart Downtown Berkeley', 'arrive Richmond')


--------------------------------
   Total Cost:  780
   Minutes:  13.0
--------------------------------
depart Downtown Berkeley, 0, 0
orange Downtown Berkeley, 0, 0
orange North Berkeley, 120, 120
orange El Cerrito Plaza, 180, 300
orange El Cerrito del Norte, 180, 480
orange Richmond, 300, 780
arrive Richmond, 0, 780


In [6]:
# Antioch is an end station of the Antioch-SFO+Millbrae Line (Yellow)
my_neo4j_shortest_path('depart Downtown Berkeley', 'arrive Antioch')


--------------------------------
   Total Cost:  3659
   Minutes:  61.0
--------------------------------
depart Downtown Berkeley, 0, 0
orange Downtown Berkeley, 0, 0
orange Ashby, 180, 180
orange MacArthur, 240, 420
yellow MacArthur, 59, 479
yellow Rockridge, 240, 719
yellow Orinda, 300, 1019
yellow Lafayette, 300, 1319
yellow Walnut Creek, 300, 1619
yellow Pleasant Hill, 120, 1739
yellow Concord, 360, 2099
yellow North Concord, 180, 2279
yellow Pittsburg, 360, 2639
yellow Pittsburg Center, 600, 3239
yellow Antioch, 420, 3659
arrive Antioch, 0, 3659


In [7]:
# Daly City is listed as an end station for four lines:
#   - Richmond-Millbrae+SFO Line (Red)
#   - Antioch-SFO+Millbrae Line (Yellow)
#   - Dublin/Pleasanton-Daly City Line (Blue)
#   - Berryessa/North San Jose-Daly City Line (Green)
my_neo4j_shortest_path('depart Downtown Berkeley', 'arrive Daly City')


--------------------------------
   Total Cost:  2520
   Minutes:  42.0
--------------------------------
depart Downtown Berkeley, 0, 0
red Downtown Berkeley, 0, 0
red Ashby, 180, 180
red MacArthur, 240, 420
red 19th Street, 180, 600
red 12th Street, 120, 720
red West Oakland, 300, 1020
red Embarcadero, 420, 1440
red Montgomery Street, 60, 1500
red Powell Street, 120, 1620
red Civic Center, 60, 1680
red 16th Street Mission, 180, 1860
red 24th Street Mission, 120, 1980
red Glen Park, 180, 2160
red Balboa Park, 120, 2280
red Daly City, 240, 2520
arrive Daly City, 0, 2520


In [8]:
# Millbrae is an end station of two lines:
#   - Richmond-Millbrae+SFO Line (Red)
#   - Antioch-SFO+Millbrae Line (Yellow)
my_neo4j_shortest_path('depart Downtown Berkeley', 'arrive Millbrae')


--------------------------------
   Total Cost:  3600
   Minutes:  60.0
--------------------------------
depart Downtown Berkeley, 0, 0
red Downtown Berkeley, 0, 0
red Ashby, 180, 180
red MacArthur, 240, 420
red 19th Street, 180, 600
red 12th Street, 120, 720
red West Oakland, 300, 1020
red Embarcadero, 420, 1440
red Montgomery Street, 60, 1500
red Powell Street, 120, 1620
red Civic Center, 60, 1680
red 16th Street Mission, 180, 1860
red 24th Street Mission, 120, 1980
red Glen Park, 180, 2160
red Balboa Park, 120, 2280
red Daly City, 240, 2520
red Colma, 240, 2760
red South San Francisco, 180, 2940
red San Bruno, 240, 3180
red Millbrae, 420, 3600
arrive Millbrae, 0, 3600


In [9]:
# Dublin is an end station of the Dublin/Pleasanton-Daly City Line (Blue)
my_neo4j_shortest_path('depart Downtown Berkeley', 'arrive Dublin')


--------------------------------
   Total Cost:  2994
   Minutes:  49.9
--------------------------------
depart Downtown Berkeley, 0, 0
orange Downtown Berkeley, 0, 0
orange Ashby, 180, 180
orange MacArthur, 240, 420
orange 19th Street, 180, 600
orange 12th Street, 120, 720
orange Lake Merritt, 180, 900
orange Fruitvale, 300, 1200
orange Coliseum, 240, 1440
blue Coliseum, 54, 1494
blue San Leandro, 240, 1734
blue Bay Fair, 240, 1974
blue Castro Valley, 240, 2214
blue West Dublin, 600, 2814
blue Dublin, 180, 2994
arrive Dublin, 0, 2994


In [10]:
# Berryessa is an end station of two lines:
#   - Berryessa/North San Jose-Daly City Line (Green)
#   - Berryessa/North San Jose-Richmond Line (Orange)
my_neo4j_shortest_path('depart Downtown Berkeley', 'arrive Berryessa')


--------------------------------
   Total Cost:  4200
   Minutes:  70.0
--------------------------------
depart Downtown Berkeley, 0, 0
orange Downtown Berkeley, 0, 0
orange Ashby, 180, 180
orange MacArthur, 240, 420
orange 19th Street, 180, 600
orange 12th Street, 120, 720
orange Lake Merritt, 180, 900
orange Fruitvale, 300, 1200
orange Coliseum, 240, 1440
orange San Leandro, 240, 1680
orange Bay Fair, 240, 1920
orange Hayward, 240, 2160
orange South Hayward, 240, 2400
orange Union City, 300, 2700
orange Fremont, 300, 3000
orange Warm Springs, 360, 3360
orange Milpitas, 540, 3900
orange Berryessa, 300, 4200
arrive Berryessa, 0, 4200


In [11]:
# Oakland International Airport (OAK), reached via the Gray line
my_neo4j_shortest_path('depart Downtown Berkeley', 'arrive OAK')


--------------------------------
   Total Cost:  1974
   Minutes:  32.9
--------------------------------
depart Downtown Berkeley, 0, 0
orange Downtown Berkeley, 0, 0
orange Ashby, 180, 180
orange MacArthur, 240, 420
orange 19th Street, 180, 600
orange 12th Street, 120, 720
orange Lake Merritt, 180, 900
orange Fruitvale, 300, 1200
orange Coliseum, 240, 1440
gray Coliseum, 54, 1494
gray OAK, 480, 1974
arrive OAK, 0, 1974


In [12]:
# San Francisco International Airport (SFO), served by the Red and Yellow lines
my_neo4j_shortest_path('depart Downtown Berkeley', 'arrive SFO')


--------------------------------
   Total Cost:  3468
   Minutes:  57.8
--------------------------------
depart Downtown Berkeley, 0, 0
red Downtown Berkeley, 0, 0
red Ashby, 180, 180
red MacArthur, 240, 420
red 19th Street, 180, 600
red 12th Street, 120, 720
red West Oakland, 300, 1020
red Embarcadero, 420, 1440
red Montgomery Street, 60, 1500
red Powell Street, 120, 1620
red Civic Center, 60, 1680
red 16th Street Mission, 180, 1860
red 24th Street Mission, 120, 1980
red Glen Park, 180, 2160
red Balboa Park, 120, 2280
yellow Balboa Park, 48, 2328
yellow Daly City, 240, 2568
yellow Colma, 240, 2808
yellow South San Francisco, 180, 2988
yellow San Bruno, 240, 3228
yellow SFO, 240, 3468
arrive SFO, 0, 3468


## Analysis on Shortest Paths from the Downtown Berkeley Station to all end stations of all BART lines.

We applied Dijkstra's algorithm on Neo4j to identify the shortest paths from the Downtown Berkeley station to the end stations of all BART lines. The end stations include Richmond, Antioch, Daly City, Millbrae, Dublin, Berryessa, Oakland Airport, and San Francisco Airport. The commute time ranges from 13 minutes to 70 minutes.

The shortest trip is from the Downtown Berkeley Station to the Richmond Station, which takes 13 minutes with either the red or the orange line. The longest trip is from the Downtown Berkeley Station to the Berryessa Station, which takes 70 minutes (1 hour and 10 minutes) with the orange line. Coincidentally, neither trip requires a transfer.

Four trips require a transfer: the trip to Antioch transfers at MacArthur (orange to yellow), the trips to Dublin and Oakland Airport transfer at Coliseum (orange to blue and orange to gray, respectively), and the trip to San Francisco Airport transfers at Balboa Park (red to yellow).

As expected, the closer an end station is to the Downtown Berkeley Station, the shorter the commute time.



## Use the my_select_query_pandas() function to run a select query and return the rows in a pandas DataFrame

In [13]:
#
# function to run a select query and return rows in a pandas dataframe
# pandas puts all numeric values from postgres to float
# if it will fit in an integer, change it to integer
#

def my_select_query_pandas(query, rollback_before_flag, rollback_after_flag):
    """Run a select query on the module-level ``connection`` and return a DataFrame.

    Parameters
    ----------
    query : str
        SQL text passed unchanged to ``pd.read_sql_query``.
    rollback_before_flag : bool
        When true, roll the connection back before running the query.
    rollback_after_flag : bool
        When true, roll the connection back after the query has run, including
        when the query raised.

    Returns
    -------
    pandas.DataFrame
        The query result. Every float64 column whose non-missing values are all
        finite whole numbers is converted to the nullable ``Int64`` dtype;
        columns containing a fraction or an infinite value stay float64.

    Raises
    ------
    Whatever ``pd.read_sql_query`` or ``connection.rollback`` raises.
    """

    if rollback_before_flag:
        connection.rollback()

    try:
        df = pd.read_sql_query(query, connection)
    finally:
        # Roll back on failure as well, so a query that errored does not leave
        # the connection mid-transaction for the next cell.
        if rollback_after_flag:
            connection.rollback()

    # fix the float columns that really should be integers

    for column in df:

        if df[column].dtype != "float64":
            continue

        present = df[column].dropna().to_numpy()

        # Vectorised whole-number test over the non-missing values.
        # Infinite values count as non-integral: the previous per-value
        # math.floor() call raised OverflowError on them.
        if np.isfinite(present).all() and (present == np.floor(present)).all():
            df[column] = df[column].astype('Int64')

    return df
    

In [14]:
# Open the PostgreSQL connection that my_select_query_pandas() reads as a
# module-level name.
# NOTE(review): the user and password are hard-coded in the notebook;
# consider reading them from the environment before sharing this file.
connection = psycopg2.connect(
    user = "postgres",
    password = "ucb",
    host = "postgres",
    port = "5432",
    database = "postgres"
)

## Create a cursor for the connection

In [15]:
# Cursor on the PostgreSQL connection.
# NOTE(review): not referenced by any cell in the visible part of this notebook.
cursor = connection.cursor()

## Connect Google Map

In [16]:
# Read the Google Maps API key from a local file so the key itself is not
# stored in the notebook. The context manager closes the file even if the read
# fails, and strip() removes surrounding whitespace such as a trailing newline,
# which would otherwise be passed on as part of the key.
with open('gmap_api_key.txt', 'r') as key_file:
    my_api_key = key_file.read().strip()

gmaps.configure(api_key=my_api_key)

## Berkeley Store latitude & longitude

In [17]:
# Roll the connection back both before and after the query.
rollback_before_flag = True
rollback_after_flag = True

# Fetch every column of the stores rows whose city is 'Berkeley'.
query = """

select * from stores
where city = 'Berkeley';

"""

df = my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)
df

Unnamed: 0,store_id,street,city,state,zip,latitude,longitude
0,1,3000 Telegraph Ave,Berkeley,CA,94705,37.8555,-122.2604


In [18]:
# Roll the connection back both before and after the query.
rollback_before_flag = True
rollback_after_flag = True

# Fetch the distinct zip_codes rows that match the zip of the Berkeley store(s),
# ordered by the first two selected columns.
query = """

select distinct z.*
from stores as s
     join zip_codes as z
         on s.zip = z.zip
where s.city = 'Berkeley'
order by 1,2

"""

df = my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)
df

Unnamed: 0,zip,latitude,longitude,city,state,population,area,density,time_zone
0,94705,37.8652,-122.2382,Berkeley,CA,13365,3.4614,3861.16,America/Los_Angeles


## Basic Simple Map

In [19]:
# Alternative map centre kept for reference:
#sather_gate_berkeley = (37.870260430419115, -122.25950168579497)

# NOTE(review): these coordinates equal the latitude/longitude returned by the
# zip_codes query for 94705, not the stores row for 3000 Telegraph Ave
# (37.8555, -122.2604). Confirm whether the zip centroid is the intended centre.
berkeley_store = (37.8652, -122.2382)

gmaps.figure(center=berkeley_store, zoom_level=9)

Figure(layout=FigureLayout(height='420px'))

## Transit Layer

In [20]:
# Same centre as the basic map, at zoom level 10.
fig = gmaps.figure(center=berkeley_store, zoom_level=10)

# Overlay the gmaps transit layer on the figure.
fig.add_layer(gmaps.transit_layer())

# Last expression in the cell, so the figure is rendered as the cell output.
fig

Figure(layout=FigureLayout(height='420px'))