### NYC Citi Bike Share Station Departure and Arrival Imbalance Analysis

In [1]:
import pandas as pd
import folium

#### Data exploration
The NYC bike share program makes its data public; you can download it here:
https://www.citibikenyc.com/system-data. Here we look into the data for September 2019.



In [2]:
# Monthly trip export downloaded from the Citi Bike system-data page;
# the file is expected to sit next to the notebook.
TRIP_DATA_CSV = "201909-citibike-tripdata.csv"

bike_data = pd.read_csv(TRIP_DATA_CSV)

# Show the first few raw records to check the schema.
bike_data.head(3)

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender
0,327,2019-09-01 00:00:01.9580,2019-09-01 00:05:29.3410,3733,Avenue C & E 18 St,40.730563,-73.973984,504,1 Ave & E 16 St,40.732219,-73.981656,39213,Subscriber,1968,1
1,1145,2019-09-01 00:00:04.1430,2019-09-01 00:19:09.8360,3329,Degraw St & Smith St,40.682915,-73.993182,270,Adelphi St & Myrtle Ave,40.693083,-73.971789,21257,Customer,1969,0
2,1293,2019-09-01 00:00:07.3090,2019-09-01 00:21:40.7580,3168,Central Park West & W 85 St,40.784727,-73.969617,423,W 54 St & 9 Ave,40.765849,-73.986905,15242,Customer,1969,0


Simple preprocessing: 1) rename columns; 2) convert time strings into datetime objects; 3) extract the hour of day in which each trip started.

In [3]:
# Map the raw CSV headers to the title-cased names used in the rest of the
# notebook. A single rename with one mapping replaces ten separate in-place
# calls and keeps the full list of renamed columns in one place.
COLUMN_RENAMES = {
    'starttime': 'Start Time',
    'stoptime': 'Stop Time',
    'start station id': 'Start Station ID',
    'start station name': 'Start Station Name',
    'start station latitude': 'Start Station Latitude',
    'start station longitude': 'Start Station Longitude',
    'end station id': 'End Station ID',
    'end station name': 'End Station Name',
    'end station latitude': 'End Station Latitude',
    'end station longitude': 'End Station Longitude',
}
# Keys that are not present are ignored, so re-running this cell is harmless.
bike_data = bike_data.rename(columns=COLUMN_RENAMES)

# Parse the timestamp strings so the datetime accessors are available.
bike_data["Start Time"] = pd.to_datetime(bike_data["Start Time"])
bike_data["Stop Time"] = pd.to_datetime(bike_data["Stop Time"])

# Hour of day in which each trip started. The vectorized `.dt` accessor avoids
# calling a Python lambda once per trip.
bike_data["hour"] = bike_data["Start Time"].dt.hour

To analyze the departure and arrival imbalance, I generate a dataframe that shows the station name, coordinates, hourly departure, and hourly arrival.

In [4]:
def trip_counts_by_hour(hour):
    '''
    Build a per-station table of departures and arrivals for one hour of the day.

    Reads the module-level ``bike_data`` frame and expects the renamed station
    columns and the ``hour`` column to be present. Counts are aggregated over
    every row of ``bike_data`` whose start hour matches.

    Input:
        hour: integer indicating hour of the day, e.g. 16 selects trips that
              started between 16:00 and 16:59
    Output:
        trip_counts_df: Pandas data frame indexed by station ID with the columns
              Name, Latitude, Longitude, Departure Count and Arrival Count.
              Every station with at least one departure or arrival among the
              selected trips gets a row; a missing count is reported as 0.
    '''

    # Station lookup (name and coordinates) built from both ends of every trip,
    # so a station that only appears as a destination still gets a location.
    # Start rows are stacked first, which keeps the values for stations seen as
    # a start identical to a lookup built from start stations alone.
    station_fields = ["Station ID", "Name", "Latitude", "Longitude"]
    trip_ends = []
    for side in ("Start", "End"):
        side_cols = ["{} Station ID".format(side),
                     "{} Station Name".format(side),
                     "{} Station Latitude".format(side),
                     "{} Station Longitude".format(side)]
        renames = dict(zip(side_cols, station_fields))
        trip_ends.append(bike_data[side_cols].rename(columns=renames))
    locations = pd.concat(trip_ends).groupby("Station ID").first()

    # Trips that started in the requested hour.
    # NOTE(review): arrivals are counted from these same trips, i.e. by the hour
    # the trip *started*, not the hour it ended - confirm this is intended.
    hour_df = bike_data[bike_data["hour"] == hour]

    # size() counts rows per station regardless of nulls in any other column.
    departures = hour_df.groupby("Start Station ID").size().rename("Departure Count")
    arrivals = hour_df.groupby("End Station ID").size().rename("Arrival Count")

    # Outer-align the two counts so arrival-only stations are kept, and report
    # a missing side as 0 instead of NaN.
    counts = (pd.concat([departures, arrivals], axis=1)
              .fillna(0)
              .astype(int)
              .sort_index())

    trip_counts_df = counts.join(locations)
    # Keep the index label that earlier versions of this function returned.
    trip_counts_df.index.name = "Start Station ID"

    new_cols = ['Name', 'Latitude', 'Longitude', 'Departure Count', 'Arrival Count']
    return trip_counts_df[new_cols]


# Preview the first rows of the station table for trips starting at 18:00
trip_counts_by_hour(hour=18).head(n=3)

Unnamed: 0_level_0,Name,Latitude,Longitude,Departure Count,Arrival Count
Start Station ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
72,W 52 St & 11 Ave,40.767272,-73.993929,554,579
79,Franklin St & W Broadway,40.719116,-74.006667,293,241
82,St James Pl & Pearl St,40.711174,-74.000165,167,170


In [5]:
def plot_station_counts(trip_counts, scale):
    '''
    Draw one circle per station on a map, sized by its departure/arrival imbalance.

    Input:
        trip_counts: a Pandas dataframe with one row per station and the columns
            Name, Latitude, Longitude, Departure Count and Arrival Count
        scale: a number that divides the absolute net departures to give the
            circle radius; a larger scale draws smaller circles
    Output:
        folium_map: a folium.Map with one CircleMarker per station. Orange
            circles are stations with more departures than arrivals; green
            circles are stations with net arrivals (or an exact balance).
            Clicking a circle shows the station name and its counts.
    '''

    folium_map = folium.Map(location=[40.738, -73.98],
                            zoom_start=12.5,
                            tiles="CartoDB positron",
                            width='70%')

    # A missing count means no trips were recorded on that side, so treat it as
    # zero here rather than relying on the caller to fill it (int(NaN) raises).
    counts = trip_counts.fillna({"Departure Count": 0, "Arrival Count": 0})

    for _, row in counts.iterrows():
        departures = int(row["Departure Count"])
        arrivals = int(row["Arrival Count"])
        net_departures = departures - arrivals

        # Generate popup messages that are shown on click
        popup_text = "{}<br> No. of departures: {}<br> No. of arrivals: {}<br> Net departures: {}"
        popup_text = popup_text.format(row["Name"],
                                       departures, arrivals,
                                       net_departures)

        # Radius is proportional to the size of the imbalance. The sign is
        # carried by the colour, so the radius itself must not be negative.
        radius = abs(net_departures / scale)

        # Choose the color of the marker, showing net departure or net arrival
        if net_departures > 0:  # net departure
            color = "#E37222"
        else:
            color = "#007849"  # net arrival

        # Add the marker for this station to the map
        folium.CircleMarker(location=(row["Latitude"],
                                      row["Longitude"]),
                            radius=radius,
                            color=color,
                            popup=popup_text,
                            fill=True).add_to(folium_map)
    return folium_map

In [6]:
# Station counts for trips starting at 18:00.
trip_counts = trip_counts_by_hour(18)

# Only the count columns should default to zero; a blanket fillna(0) would also
# turn a missing name or coordinate into 0 and put a marker at (0, 0).
trip_counts = trip_counts.fillna({"Departure Count": 0, "Arrival Count": 0})

# A scale of 40 means 40 net trips per unit of circle radius.
plot_station_counts(trip_counts, 40)