### Graph Algorithm: Harmonic Centrality

We use the customers table and zip codes table to generate a random location for each ACME Gourmet customer. Then, we use harmonic centrality algorithm, which is a type of closeness centrality algorithm, to figure out which are the top stations to have as a food pick-up that are closest to a majority of the customers.

In [1]:
import neo4j

import csv

import math
import numpy as np
import pandas as pd

import psycopg2
import random
from geographiclib.geodesic import Geodesic

**Setup the connection to the database to get access to the ACME Gourmet tables and BART stations data**

In [2]:
#
# function to run a select query and return rows in a pandas dataframe
# pandas puts all numeric values from postgres to float
# if it will fit in an integer, change it to integer
#

def my_select_query_pandas(query, rollback_before_flag, rollback_after_flag):
    "function to run a select query and return rows in a pandas dataframe"
    
    if rollback_before_flag:
        connection.rollback()
    
    df = pd.read_sql_query(query, connection)
    
    if rollback_after_flag:
        connection.rollback()
    
    # fix the float columns that really should be integers
    
    for column in df:
    
        if df[column].dtype == "float64":

            fraction_flag = False

            for value in df[column].values:
                
                if not np.isnan(value):
                    if value - math.floor(value) != 0:
                        fraction_flag = True

            if not fraction_flag:
                df[column] = df[column].astype('Int64')
    
    return(df)

In [3]:
connection = psycopg2.connect(
    user = "postgres",
    password = "ucb",
    host = "postgres",
    port = "5432",
    database = "postgres"
)

In [4]:
cursor = connection.cursor()

**Get list of all the BART stations**

In [5]:
#Get list of all the BART stations
rollback_before_flag = True
rollback_after_flag = True

query = """
    
select * from stations;

"""
cursor.execute(query)

connection.rollback()

station_df_rows = cursor.fetchall()

station_df = my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)
station_df

Unnamed: 0,station,latitude,longitude,transfer_time
0,12th Street,37.803608,-122.272006,282
1,16th Street Mission,37.764847,-122.420042,287
2,19th Street,37.807869,-122.26898,67
3,24th Street Mission,37.752,-122.4187,277
4,Antioch,37.996281,-121.783404,0
5,Ashby,37.853068,-122.269957,299
6,Balboa Park,37.721667,-122.4475,48
7,Bay Fair,37.697,-122.1265,63
8,Berryessa,37.368361,-121.874655,288
9,Castro Valley,37.690748,-122.075679,0


**Join the customer with the zip codes table to get coordinates for each zipcode**

In [6]:

rollback_before_flag = True
rollback_after_flag = True

query = """
    
select c.customer_id, c.street, c.city, c.state, c.zip,z.latitude, z.longitude,
        c.closest_store_id, c.distance
        from customers c join zip_codes z 
        on c.zip = z.zip where c.state = 'CA';

"""
cursor.execute(query)

connection.rollback()

customer_df_rows = cursor.fetchall()

customer_df = my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)

customer_df

Unnamed: 0,customer_id,street,city,state,zip,latitude,longitude,closest_store_id,distance
0,1,5 Ramsey Place,Oakland,CA,94609,37.8343,-122.2643,1,1
1,2,6 Londonderry Plaza,Oakland,CA,94609,37.8343,-122.2643,1,1
2,3,548 Mcguire Parkway,Oakland,CA,94609,37.8343,-122.2643,1,1
3,4,99 Kennedy Court,Oakland,CA,94609,37.8343,-122.2643,1,1
4,5,51 Mcbride Drive,Oakland,CA,94609,37.8343,-122.2643,1,1
...,...,...,...,...,...,...,...,...,...
8133,8134,331 Sommers Park,San Geronimo,CA,94963,38.0138,-122.6703,1,25
8134,8135,5 Esker Park,San Geronimo,CA,94963,38.0138,-122.6703,1,25
8135,8136,1947 Thackeray Road,San Geronimo,CA,94963,38.0138,-122.6703,1,25
8136,8137,90777 Heath Crossing,San Geronimo,CA,94963,38.0138,-122.6703,1,25


**Generate a random location for each customer**

In [7]:
#Functions to help generate a random location for each customer
def my_calculate_box(point, miles):
    "Given a point and miles, calculate the box in form left, right, top, bottom"
    
    geod = Geodesic.WGS84

    kilometers = miles * 1.60934
    meters = kilometers * 1000

    g = geod.Direct(point[0], point[1], 270, meters)
    left = (g['lat2'], g['lon2'])

    g = geod.Direct(point[0], point[1], 90, meters)
    right = (g['lat2'], g['lon2'])

    g = geod.Direct(point[0], point[1], 0, meters)
    top = (g['lat2'], g['lon2'])

    g = geod.Direct(point[0], point[1], 180, meters)
    bottom = (g['lat2'], g['lon2'])
    
    return(left, right, top, bottom)

def generate_customer_location(row, miles= 3):
    #point = (latitude, longitude)
    point = (row['latitude'], row['longitude'])
    left, right, top, bottom = my_calculate_box(point, miles)
    random_point = (random.uniform( bottom[0], top[0] ), random.uniform( left[1], right[1]))
    return random_point

In [8]:
customer_df['Location'] = customer_df.apply(generate_customer_location, axis = 1)
customer_df

Unnamed: 0,customer_id,street,city,state,zip,latitude,longitude,closest_store_id,distance,Location
0,1,5 Ramsey Place,Oakland,CA,94609,37.8343,-122.2643,1,1,"(37.85139088905951, -122.30743812535414)"
1,2,6 Londonderry Plaza,Oakland,CA,94609,37.8343,-122.2643,1,1,"(37.83059005558272, -122.25539511233912)"
2,3,548 Mcguire Parkway,Oakland,CA,94609,37.8343,-122.2643,1,1,"(37.85864464606292, -122.24750243938001)"
3,4,99 Kennedy Court,Oakland,CA,94609,37.8343,-122.2643,1,1,"(37.84682971461261, -122.237875096205)"
4,5,51 Mcbride Drive,Oakland,CA,94609,37.8343,-122.2643,1,1,"(37.866367896567766, -122.30630049312991)"
...,...,...,...,...,...,...,...,...,...,...
8133,8134,331 Sommers Park,San Geronimo,CA,94963,38.0138,-122.6703,1,25,"(38.05153190607294, -122.67972215490339)"
8134,8135,5 Esker Park,San Geronimo,CA,94963,38.0138,-122.6703,1,25,"(38.03811262045287, -122.64173442894977)"
8135,8136,1947 Thackeray Road,San Geronimo,CA,94963,38.0138,-122.6703,1,25,"(38.05651479855831, -122.67887076468683)"
8136,8137,90777 Heath Crossing,San Geronimo,CA,94963,38.0138,-122.6703,1,25,"(37.991664447295236, -122.66829535993755)"


**Create functions to find the distance from each customer's location to each of the stations, find the closest station, and the distance to the closest station for each customer.**

In [11]:
#  Given two points in (latitude, longitude) format, calculate the distance between them
# in miles. 

def my_calculate_distance(point_1, point_2):
    "Given two points in (latitude, longitude) format, calculate the distance between them in miles"
    
    geod = Geodesic.WGS84

    g = geod.Inverse(point_1[0], point_1[1], point_2[0], point_2[1])
    miles = g['s12'] / 1000 * 0.621371
    
    return miles

def distance_to_station():
    # Create a list to store the results
    distance_data = []

    # Iterate over customers and stores to calculate distances
    for _, customer in customer_df.iterrows():
       
        for _, station in station_df.iterrows():
            
            distance = compute_distance(customer, store)
            distance_data.append({
                'customer_id': customer['customer_id'],
                'store_id': store['store_id'],
                'distance_km': distance
            })

    # Convert the results to a DataFrame
    distance_df = pd.DataFrame(distance_data)

def distances_to_stations(c):
    distances_dict = {}
    
    lats = station_df['latitude']
    longs = station_df['longitude']
    stations = station_df['station']
    
    for i in range(len(stations)):
        point1 = (c['latitude'], c['longitude'])
        point2 = (lats[i],longs[i])
      
        dist = my_calculate_distance(point1, point2)
        
        distances_dict[stations[i]] = dist
    return distances_dict

def find_closest_station(c):
    distances_lst = []
    
    lats = station_df['latitude']
    longs = station_df['longitude']
    stations = station_df['station']
    
    for i in range(len(stations)):
        point1 = c['Location']
        point2 = (lats[i],longs[i])
      
        dist = my_calculate_distance(point1, point2)

        distances_lst.append(dist)
        
    min_ind = distances_lst.index(min(distances_lst))
    closest_station = stations[min_ind]

    return closest_station

def find_closest_station_distance(c):
    distances_lst = []
   
    lats = station_df['latitude']
    longs = station_df['longitude']
    stations = station_df['station']
    
    for i in range(len(stations)):
        point1 = c['Location']
        point2 = (lats[i],longs[i])
      
        dist = my_calculate_distance(point1, point2)
        distances_lst.append(dist)
        

    return min(distances_lst)

In [12]:
customer_df["Closest_Station"] = customer_df.apply(find_closest_station, axis = 1)
customer_df["Distance_to_Closest_Station"] = customer_df.apply(find_closest_station_distance, axis = 1)
customer_df["Distances_to_stations"] = customer_df.apply(distances_to_stations, axis = 1)
customer_df

Unnamed: 0,customer_id,street,city,state,zip,latitude,longitude,closest_store_id,distance,Location,Closest_Station,Distance_to_Closest_Station,Distances_to_stations
0,1,5 Ramsey Place,Oakland,CA,94609,37.8343,-122.2643,1,1,"(37.85139088905951, -122.30743812535414)",Ashby,2.052949,"{'12th Street': 2.1583386063566827, '16th Stre..."
1,2,6 Londonderry Plaza,Oakland,CA,94609,37.8343,-122.2643,1,1,"(37.83059005558272, -122.25539511233912)",MacArthur,0.669436,"{'12th Street': 2.1583386063566827, '16th Stre..."
2,3,548 Mcguire Parkway,Oakland,CA,94609,37.8343,-122.2643,1,1,"(37.85864464606292, -122.24750243938001)",Rockridge,1.010386,"{'12th Street': 2.1583386063566827, '16th Stre..."
3,4,99 Kennedy Court,Oakland,CA,94609,37.8343,-122.2643,1,1,"(37.84682971461261, -122.237875096205)",Rockridge,0.794156,"{'12th Street': 2.1583386063566827, '16th Stre..."
4,5,51 Mcbride Drive,Oakland,CA,94609,37.8343,-122.2643,1,1,"(37.866367896567766, -122.30630049312991)",North Berkeley,1.398841,"{'12th Street': 2.1583386063566827, '16th Stre..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8133,8134,331 Sommers Park,San Geronimo,CA,94963,38.0138,-122.6703,1,25,"(38.05153190607294, -122.67972215490339)",Richmond,19.504532,"{'12th Street': 26.15039154998929, '16th Stree..."
8134,8135,5 Esker Park,San Geronimo,CA,94963,38.0138,-122.6703,1,25,"(38.03811262045287, -122.64173442894977)",Richmond,17.235257,"{'12th Street': 26.15039154998929, '16th Stree..."
8135,8136,1947 Thackeray Road,San Geronimo,CA,94963,38.0138,-122.6703,1,25,"(38.05651479855831, -122.67887076468683)",Richmond,19.603756,"{'12th Street': 26.15039154998929, '16th Stree..."
8136,8137,90777 Heath Crossing,San Geronimo,CA,94963,38.0138,-122.6703,1,25,"(37.991664447295236, -122.66829535993755)",Richmond,17.621787,"{'12th Street': 26.15039154998929, '16th Stree..."


In [13]:
customer_df.shape

(8138, 13)

## Neo4J Graphs and Graphing Algorithms

**Create graph database connecting the customer nodes to the station nodes and adding the distances as the weights for the edges.**

In [14]:
driver = neo4j.GraphDatabase.driver(uri="neo4j://neo4j:7687", auth=("neo4j","ucb_mids_w205"))

In [15]:
session = driver.session(database="neo4j")

In [16]:
def my_neo4j_run_query_pandas(query, **kwargs):
    "run a query and return the results in a pandas dataframe"
    
    result = session.run(query, **kwargs)
    
    df = pd.DataFrame([r.values() for r in result], columns=result.keys())
    
    return df

def my_neo4j_number_nodes_relationships():
    "print the number of nodes and relationships"
   
    
    query = """
        match (n) 
        return n.name as node_name, labels(n) as labels
        order by n.name
    """
    
    df = my_neo4j_run_query_pandas(query)
    
    number_nodes = df.shape[0]
    
    
    query = """
        match (n1)-[r]->(n2) 
        return n1.name as node_name_1, labels(n1) as node_1_labels, 
            type(r) as relationship_type, n2.name as node_name_2, labels(n2) as node_2_labels
        order by node_name_1, node_name_2
    """
    
    df = my_neo4j_run_query_pandas(query)
    
    number_relationships = df.shape[0]
    
    print("-------------------------")
    print("  Nodes:", number_nodes)
    print("  Relationships:", number_relationships)
    print("-------------------------")
    
def my_neo4j_create_station_node(station_name, latitude, longitude):
    "create a node with label Station"
    
    query = """
    
    CREATE (:Station {name: $station_name, latitude: $latitude, longitude: $longitude})
    
    
    """
    
    session.run(query, station_name=station_name, latitude = latitude, longitude=longitude)

    
def my_neo4j_create_customer_node(customer, latitude, longitude):
    "create a node with label Customer"
    
    query = """
    
    CREATE (:Customer {name: $customer, latitude: $latitude, longitude: $longitude})
    
    """
    
    session.run(query, customer=customer, latitude = latitude, longitude = longitude)
    
    
def my_neo4j_create_relationship_customer_to_station(customer, to_station, weight):
    "create a relationship one way between Customer to Station"
    
    query = """
    
    MATCH (from:Customer), 
          (to:Station)
    WHERE from.name = $customer and to.name = $to_station
    CREATE (from)-[:DISTANCE_LINK {weight: $weight}]->(to)
    
    """
    
    session.run(query, customer=customer, to_station=to_station, weight=weight)

def my_neo4j_create_relationship_station_to_customer(from_station, customer, weight):
    "create a relationship one way between Station to Customer"
    
    query = """
    
    MATCH (from:Station), 
          (to:Customer)
    WHERE from.name = $from_station and to.name = $customer
    MERGE (from)-[:DISTANCE_LINK {weight: $weight}]->(to)
    
    """
    
    session.run(query, from_station=from_station, customer=customer, weight=weight)
    
    



In [17]:
def my_neo4j_nodes_relationships():
    "print all the nodes and relationships"
   
    print("-------------------------")
    print("  Nodes:")
    print("-------------------------")
    
    query = """
        match (n) 
        return n.name as node_name, labels(n) as labels
        order by n.name
    """
    
    df = my_neo4j_run_query_pandas(query)
    
    number_nodes = df.shape[0]
    
    display(df)
    
    print("-------------------------")
    print("  Relationships:")
    print("-------------------------")
    
    query = """
        match (n1)-[r]->(n2) 
        return n1.name as node_name_1, labels(n1) as node_1_labels, 
            type(r) as relationship_type, n2.name as node_name_2, labels(n2) as node_2_labels
        order by node_name_1, node_name_2
    """
    
    df = my_neo4j_run_query_pandas(query)
    
    number_relationships = df.shape[0]
    
    display(df)
    
    density = (2 * number_relationships) / (number_nodes * (number_nodes - 1))
    
    print("-------------------------")
    print("  Density:", f'{density:.1f}')
    print("-------------------------")
    

In [18]:
connection = psycopg2.connect(
    user = "postgres",
    password = "ucb",
    host = "postgres",
    port = "5432",
    database = "postgres"
)
cursor = connection.cursor()

In [30]:
def my_neo4j_wipe_out_database():
    "wipe out database by deleting all nodes and relationships"
    
    query = "match (node)-[relationship]->() delete node, relationship"
    session.run(query)
    
    query = "match (node) delete node"
    session.run(query)
    
my_neo4j_wipe_out_database()

In [31]:
my_neo4j_number_nodes_relationships()

-------------------------
  Nodes: 0
  Relationships: 0
-------------------------


**Create the Station nodes**

In [32]:
#Create station nodes
connection.rollback()

query = """

select *
from stations
order by station

"""

cursor.execute(query)

connection.rollback()

rows = cursor.fetchall()

for row in rows:
 
    station = row[0]
    latitude = np.float(row[1])
    longitude = np.float(row[2])
    
    my_neo4j_create_station_node(station,latitude,longitude)
    #my_neo4j_create_station_node('arrive ' + station)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  latitude = np.float(row[1])
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  longitude = np.float(row[2])


In [111]:
customer_df.head()

Unnamed: 0,customer_id,street,city,state,zip,latitude,longitude,closest_store_id,distance,Location,Distances_to_stations
0,1,5 Ramsey Place,Oakland,CA,94609,37.8343,-122.2643,1,1,"(37.79362775231072, -122.2923343548381)","{'12th Street': 2.1583386063566827, '16th Stre..."
1,2,6 Londonderry Plaza,Oakland,CA,94609,37.8343,-122.2643,1,1,"(37.84784724056436, -122.24762796911567)","{'12th Street': 2.1583386063566827, '16th Stre..."
2,3,548 Mcguire Parkway,Oakland,CA,94609,37.8343,-122.2643,1,1,"(37.85848112224604, -122.21211014788798)","{'12th Street': 2.1583386063566827, '16th Stre..."
3,4,99 Kennedy Court,Oakland,CA,94609,37.8343,-122.2643,1,1,"(37.865220266734035, -122.27299579864972)","{'12th Street': 2.1583386063566827, '16th Stre..."
4,5,51 Mcbride Drive,Oakland,CA,94609,37.8343,-122.2643,1,1,"(37.87113529224656, -122.24499348484237)","{'12th Street': 2.1583386063566827, '16th Stre..."


**Create customer nodes and relationship to station nodes**

In [33]:
#Create customer nodes and relationship to station nodes
connection.rollback()


customer_id_lst = customer_df['customer_id']
customer_loc_lst =  customer_df['Location']

distances_to_stations_lst = customer_df['Distances_to_stations']

for c in range(len(customer_id_lst)):
    latitude =  np.float(customer_loc_lst.iloc[c][0])
    longitude = np.float(customer_loc_lst.iloc[c][1])
    customer = customer_id_lst.iloc[c]
    my_neo4j_create_customer_node(customer,latitude,longitude)
    for s in distances_to_stations_lst[c]:
        if distances_to_stations_lst[c][s] <= 5:
            my_neo4j_create_relationship_station_to_customer(s,customer_id_lst.iloc[c],
             distances_to_stations_lst[c][s])


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  latitude =  np.float(customer_loc_lst.iloc[c][0])
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  longitude = np.float(customer_loc_lst.iloc[c][1])


In [23]:
my_neo4j_number_nodes_relationships()

-------------------------
  Nodes: 8188
  Relationships: 4197
-------------------------


### Apply Harmonic Centrality Algorithm

In [24]:
# Harmonic Centrality Algorithm on graph with stations and customers <= 1 miles apart
session.run("CALL gds.graph.drop('station_customer_graph', false) YIELD graphName")

#load graph
query = """
CALL gds.graph.project(
    'station_customer_graph',
    ['Station', 'Customer'],           
    { DISTANCE_LINK: {        
        orientation: 'UNDIRECTED',  
        properties: ['weight']   
    }}
)
"""

session.run(query)

#run harmonic centrality algorithm
query = """CALL gds.closeness.harmonic.stream('station_customer_graph', {})
YIELD nodeId, score
MATCH (n) 
WHERE id(n) = nodeId  
RETURN gds.util.asNode(nodeId).name AS user, score
ORDER BY score DESC"""



centrality_df = my_neo4j_run_query_pandas(query)
centrality_df.head(15)



Unnamed: 0,user,score
0,Downtown Berkeley,0.063637
1,North Berkeley,0.062009
2,312,0.039025
3,313,0.039025
4,314,0.039025
5,315,0.039025
6,316,0.039025
7,317,0.039025
8,318,0.039025
9,319,0.039025


In [29]:
# Harmonic Centrality Algorithm on graph with stations and customers <= 2 miles apart
session.run("CALL gds.graph.drop('station_customer_graph', false) YIELD graphName")

#load graph
query = """
CALL gds.graph.project(
    'station_customer_graph',
    ['Station', 'Customer'],           
    { DISTANCE_LINK: {        
        orientation: 'UNDIRECTED',  
        properties: ['weight']   
    }}
)
"""

session.run(query)

#run harmonic centrality
query = """CALL gds.closeness.harmonic.stream('station_customer_graph', {})
YIELD nodeId, score
MATCH (n) 
WHERE id(n) = nodeId  
RETURN gds.util.asNode(nodeId).name AS user, score
ORDER BY score DESC"""



centrality_df = my_neo4j_run_query_pandas(query)
centrality_df.head(15)



Unnamed: 0,user,score
0,Ashby,0.229986
1,Rockridge,0.204091
2,North Berkeley,0.196542
3,19th Street,0.193932
4,MacArthur,0.188558
5,Lake Merritt,0.178569
6,12th Street,0.178569
7,Downtown Berkeley,0.176779
8,1,0.16237
9,2,0.16237


In [34]:
# Harmonic Centrality Algorithm on graph with stations and customers <= 5 miles apart
session.run("CALL gds.graph.drop('station_customer_graph', false) YIELD graphName")

#load graph
query = """
CALL gds.graph.project(
    'station_customer_graph',
    ['Station', 'Customer'],           
    { DISTANCE_LINK: {        
        orientation: 'UNDIRECTED',  
        properties: ['weight']   
    }}
)
"""

session.run(query)

#run harmonic centrality
query = """CALL gds.closeness.harmonic.stream('station_customer_graph', {})
YIELD nodeId, score
MATCH (n) 
WHERE id(n) = nodeId  
RETURN gds.util.asNode(nodeId).name AS user, score
ORDER BY score DESC"""



centrality_df = my_neo4j_run_query_pandas(query)
centrality_df.head(15)



Unnamed: 0,user,score
0,Rockridge,0.545508
1,MacArthur,0.534447
2,Ashby,0.532159
3,Downtown Berkeley,0.514728
4,12th Street,0.511372
5,19th Street,0.510476
6,Lake Merritt,0.503554
7,West Oakland,0.49747
8,North Berkeley,0.492991
9,El Cerrito Plaza,0.448476


In [35]:
# Degree Centrality on graph with stations and customers <= 5 miles apart
query = """
CALL gds.degree.stream('station_customer_graph')
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).name AS name, score AS degree
ORDER BY degree DESC, name
 """
degree_centrality_df = my_neo4j_run_query_pandas(query)
degree_centrality_df.head(15)

Unnamed: 0,name,degree
0,Rockridge,3366.0
1,Ashby,3254.0
2,MacArthur,3237.0
3,Downtown Berkeley,2988.0
4,12th Street,2954.0
5,19th Street,2943.0
6,Lake Merritt,2858.0
7,North Berkeley,2773.0
8,West Oakland,2489.0
9,El Cerrito Plaza,2288.0


### Additional Data Insights

1. Number of Sales for each city in the bay area
2. Average distance away from the closest bart station within each city
3. Most frequent closest bart station

In [88]:
#1 Number of Sales for each city in the bay area
rollback_before_flag = True
rollback_after_flag = True

query = """
with tbl1 as (select customer_id, count(sale_id) as sales_count from sales group by 1)
select city, sum(sales_count) from tbl1 sa join customers c on sa.customer_id = c.customer_id where state = 'CA'
group by c.city order by 2 desc;

"""

df = my_select_query_pandas(query, rollback_before_flag, rollback_after_flag)
df

Unnamed: 0,city,sum
0,Oakland,110086
1,Berkeley,70488
2,San Francisco,49845
3,Richmond,17520
4,Emeryville,15175
...,...,...
62,Ross,30
63,San Carlos,29
64,Pleasanton,27
65,Canyon,27


In [90]:
#2 Average distance away from the closest bart station within each city
customer_df.groupby('Closest_Station')['Distance_to_Closest_Station'].mean().sort_values(ascending= True).head(50)

Closest_Station
12th Street             0.422102
MacArthur               0.901350
Ashby                   1.059794
19th Street             1.115687
Colma                   1.235279
Powell Street           1.269491
Hayward                 1.400989
North Berkeley          1.434199
Lake Merritt            1.475941
Rockridge               1.491691
Downtown Berkeley       1.501118
Glen Park               1.563950
SFO                     1.675848
El Cerrito Plaza        1.750014
West Oakland            1.815549
24th Street Mission     1.837307
Bay Fair                1.855029
OAK                     1.860322
San Leandro             1.897692
Balboa Park             1.919701
Fruitvale               1.926582
South San Francisco     2.037429
16th Street Mission     2.123284
Fremont                 2.168845
Coliseum                2.176654
Pleasant Hill           2.257058
Orinda                  2.388504
Embarcadero             2.410027
Pittsburg               2.511795
Concord                 2.5

In [89]:
#3 Most frequent closest bart station
customer_df['Closest_Station'].value_counts().head(10)

El Cerrito del Norte    605
Richmond                593
Fruitvale               482
El Cerrito Plaza        455
Rockridge               453
Orinda                  430
West Oakland            361
Embarcadero             335
North Berkeley          330
Downtown Berkeley       298
Name: Closest_Station, dtype: int64

In [92]:
customer_df['city'].value_counts().head(10)

Oakland          1928
San Francisco    1437
Berkeley         1077
Richmond          382
Alameda           304
El Cerrito        258
Walnut Creek      231
Emeryville        230
El Sobrante       224
San Leandro       205
Name: city, dtype: int64