# Interactive Maps with Python and Folium
## Using data from NYC CitiBike program



In [1]:
import pandas as pd
import folium
from datetime import datetime
# Dependencies and Setup
import numpy as np
from itertools import combinations
# import matplotlib.pyplot as plt # plotting
import numpy as np # linear algebra
import os # accessing directory structure
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import csv
# Python SQL toolkit and Object Relational Mapper
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func, inspect
from sqlalchemy import Column, Integer, String, Float, Date, Text
from sqlalchemy.ext.declarative import declarative_base
Base = declarative_base() 
from flask import Flask, jsonify
import pymongo
import sqlite3
import os
os.environ["PATH"] += os.pathsep + "."
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import math as math
from pylab import rcParams

%matplotlib inline

First, we pull up a map of New York City and add a circle marker to it.

In [2]:
# Base map of New York City on the dark CartoDB tile set.
folium_map = folium.Map(
    location=[40.738, -73.98],
    zoom_start=13,
    tiles="CartoDB dark_matter",
)

# Add one filled circle marker at the map centre, then display the map
# as the cell output.
folium.CircleMarker(
    location=[40.738, -73.98],
    fill=True,
).add_to(folium_map)
folium_map

## NYC bike trips
Next, we will load some data. The NYC bike share program makes its data public; it can be downloaded here:
https://www.citibikenyc.com/system-data.

We will use pandas to load the data into Python and convert the time strings into datetime objects.
The last line of the cell also adds a column to the table indicating the hour of the day at which each trip started.

In [4]:
from datetime import datetime

# Load the raw trip records and parse the two timestamp columns.
bike_data = pd.read_csv("data/trips_.csv")
bike_data["starttime"] = pd.to_datetime(bike_data["starttime"])
bike_data["stoptime"] = pd.to_datetime(bike_data["stoptime"])
# Hour of day at which each trip started. The vectorised `.dt` accessor
# replaces a per-row Python lambda, which is slow on large trip files.
bike_data["hour"] = bike_data["starttime"].dt.hour

FileNotFoundError: [Errno 2] File b'data/trips_.csv' does not exist: b'data/trips_.csv'

In [None]:
# Preview the first rows of the loaded trip table.
bike_data.head()

### Net Arrivals/Departures
We will explore if there is net migration of bikes from one bike station to another and if this migration depends 
on the time of day. We will create a DataFrame containing the locations of each station.

### Pre-processing data
We'll write a function that does the following:
- generates a DataFrame containing the locations of the stations
- generates a DataFrame containing the number of trips originating at each station
- generates a DataFrame containing the number of trips arriving at each station
- joins the three DataFrames into one

In [None]:
def get_trip_counts_by_hour(selected_hour, trips=None):
    """Count departures and arrivals per station for one hour of the day.

    Parameters
    ----------
    selected_hour : int
        Value of the ``hour`` column to keep.
    trips : pandas.DataFrame, optional
        Trip records with the columns ``hour``, ``start station id``,
        ``end station id``, ``start station latitude``,
        ``start station longitude`` and ``start station name``.
        Defaults to the notebook-level ``bike_data`` frame.

    Returns
    -------
    pandas.DataFrame
        Indexed by station id, with the columns ``Departure Count``,
        ``start station latitude``, ``start station longitude``,
        ``start station name`` and ``Arrival Count``. A station that only
        has departures (or only arrivals) in the selected hour gets a 0 for
        the missing count. Stations that never appear as a start station
        have no known location and are left out.
    """
    if trips is None:
        trips = bike_data

    location_columns = ["start station latitude",
                        "start station longitude",
                        "start station name"]

    # One row of location data per station, taken from its first trip.
    locations = trips.groupby("start station id").first()
    locations = locations.loc[:, location_columns]

    # Keep only the trips that started in the selected hour.
    subset = trips[trips["hour"] == selected_hour]

    # Trips leaving each station (grouped by the station they started at).
    departure_counts = (subset.groupby("start station id")
                              .size()
                              .to_frame("Departure Count"))

    # Trips reaching each station: these must be grouped by the END station.
    # Grouping by the start station here would just repeat the departures.
    arrival_counts = (subset.groupby("end station id")
                            .size()
                            .to_frame("Arrival Count"))

    # The outer join keeps stations that appear on only one side; their
    # missing count is a genuine zero, not an unknown value.
    counts = departure_counts.join(arrival_counts, how="outer")
    counts = counts.fillna(0).astype(int)
    counts.index.name = "start station id"

    # Attach locations, then restore the column order callers rely on.
    trip_counts = counts.join(locations, how="inner")
    trip_counts.index.name = "start station id"
    return trip_counts[["Departure Count"] + location_columns + ["Arrival Count"]]

# Smoke check: show the first rows of the per-station counts for hour 6.
get_trip_counts_by_hour(6).head()

### Select one hour of the day, and count trips with the same departure point.

Now we'll write a function that generates a new folium map and adds circle markers for each station.

In [None]:
def plot_station_counts(trip_counts, radius_scale=20):
    """Draw one circle marker per station showing its net bike flow.

    Parameters
    ----------
    trip_counts : pandas.DataFrame
        One row per station with the columns ``Departure Count``,
        ``Arrival Count``, ``start station name``,
        ``start station latitude`` and ``start station longitude``.
    radius_scale : number, optional
        Divisor that turns the size of the net flow into a marker radius.
        The default of 20 matches the previously hard-coded value.

    Returns
    -------
    folium.Map
        A new map with one marker per row of ``trip_counts``. Stations with
        more departures than arrivals are tangerine; all others are teal.
    """
    # generate a new map
    folium_map = folium.Map(location=[40.738, -73.98],
                            zoom_start=13,
                            tiles="CartoDB dark_matter",
                            width='50%')

    popup_template = ("{}<br> total departures: {}<br> total arrivals: {}"
                      "<br> net departures: {}")

    # for each row in the data, add a circle marker
    for _, row in trip_counts.iterrows():
        departures = row["Departure Count"]
        arrivals = row["Arrival Count"]
        net_departures = departures - arrivals

        # The values are passed in the same order as the labels in the
        # template: departures first, then arrivals.
        popup_text = popup_template.format(row["start station name"],
                                           departures,
                                           arrivals,
                                           net_departures)

        # The radius encodes the SIZE of the net flow; direction is carried
        # by the colour. Without abs(), stations with net arrivals would be
        # given a negative radius.
        radius = abs(net_departures) / radius_scale

        # tangerine for net departures, teal otherwise
        color = "#E37222" if net_departures > 0 else "#0A8A9F"

        # add marker to the map
        folium.CircleMarker(location=(row["start station latitude"],
                                      row["start station longitude"]),
                            radius=radius,
                            color=color,
                            popup=popup_text,
                            fill=True).add_to(folium_map)
    return folium_map

### Showing Real data
We'll make two maps to show the different patterns of bike migration at 9 AM and 6 PM.

In [None]:
# plot net departures at 9AM

trip_counts = get_trip_counts_by_hour(9)
# Keep a reference to the returned map. Without this assignment the save
# cell that follows would write whatever map `folium_map` was bound to by
# an earlier cell, not the 9AM map.
folium_map = plot_station_counts(trip_counts)
folium_map

In [None]:
# Write the map currently bound to `folium_map` to an HTML file.
folium_map.save("net departures at 9AM.html")

In [None]:
# plot net departures at 6PM

# Per-station counts for hour 18, drawn on a fresh map. The map is bound to
# `folium_map` and displayed as the cell output.
trip_counts = get_trip_counts_by_hour(18)
folium_map = plot_station_counts(trip_counts)
folium_map

In [None]:
# Write the map currently bound to `folium_map` to an HTML file.
folium_map.save("net departures at 6PM.html")

In [None]:
# Year against which rider ages are computed.
REFERENCE_YEAR = 2019

# `df` is an alias of `bike_data`, not a copy, so the new column is added to
# `bike_data` as well.
df = bike_data
# The nullable integer dtype keeps riders with a missing birth year as <NA>.
# A plain `astype(int)` raises as soon as one birth year is NaN.
df['Age'] = (REFERENCE_YEAR - df['birth year']).astype('Int64')
# Show a preview instead of dumping the whole trip table.
df.head()

In [None]:
df = bike_data
# The size of each (start station, end station) group is the number of
# trips recorded for that pair.
trips_df = (
    df.groupby(['start station name', 'end station name'])
      .size()
      .reset_index(name='Number of Trips')
)
trips_df

In [None]:
from datetime import datetime

In [None]:
#Ensure data is formatted correctly to avoid errors in the visuals
# The two timestamp conversions repeat the ones done when the data was
# loaded. The remaining columns are converted to the pandas `category` dtype.
# NOTE: every conversion is assigned back onto `bike_data` column by column.
# `df` was bound to `bike_data` without copying in an earlier cell, so the
# new dtypes are also visible through `df`. A later cell uses the `.cat`
# accessor on station names derived from `df`, which needs the category dtype.
bike_data['starttime'] = pd.to_datetime(bike_data['starttime'])
bike_data['stoptime'] = pd.to_datetime(bike_data['stoptime'])
bike_data['start station name'] = bike_data['start station name'].astype('category')
bike_data['end station name'] = bike_data['end station name'].astype('category')
bike_data['usertype'] = bike_data['usertype'].astype('category')
bike_data['gender'] = bike_data['gender'].astype('category')
# Summary statistics rounded to two decimals, shown as the cell output.
round(df.describe(),2)

In [None]:
#Quasi Confirm Hypothesis
# Start and end station of every trip whose `tripduration` is below 90,
# side by side under the labels "First Bike" and "Second Bike".
df_bikenum = (
    df.loc[df['tripduration'] < 90,
           ['start station name', 'end station name']]
      .rename(columns={'start station name': 'First Bike',
                       'end station name': 'Second Bike'})
)
df_bikenum.head(100)

In [None]:
#Clear up environment and drop double count
# Rebinding the name first means the `del` works whether or not the scratch
# frame from the previous cell still exists.
df_bikenum = pd.DataFrame()
del df_bikenum

# Rows treated as double counts: `tripduration` below 90 and identical
# start and end latitude.
is_double_count = (
    (df['tripduration'] < 90)
    & (df['start station latitude'] == df['end station latitude'])
)
# A boolean mask removes exactly the flagged rows. Dropping by index label
# would also remove unflagged rows if index labels were ever duplicated.
df = df.loc[~is_double_count].copy()

df.head()

In [None]:
#Data for Top 10 Stations visual
# `value_counts()` is already sorted from most to least frequent. A bare
# `.head()` keeps only 5 rows, so the 10 is spelled out to match the
# "Top 10" title used in the plot.
start_counts = df['start station name'].value_counts().head(10)

top10 = pd.DataFrame({
    # Plain object labels, so unused station categories (if the column is
    # categorical) do not leak into the plot axis.
    'station': start_counts.index.astype('object'),
    'Number of Starts': start_counts.values,
})
top10

In [None]:
#Plot for Part 1: Top 10 Stations
# Set the default figure size BEFORE the figure is created. Setting it after
# `barplot` only affects later figures, not this one.
rcParams['figure.figsize'] = 12,9
# x and y are passed by keyword: recent seaborn releases no longer accept
# them positionally.
ax = sns.barplot(x='station', y='Number of Starts', data = top10, palette="GnBu_d")
ax.set_title('Top 10 Citi Bike Stations by Number of Starts', fontsize = 12)
ax.set_xticklabels(ax.get_xticklabels(),rotation=40, ha = 'right')
# Write each count in white just below the top of its bar.
for index, row in top10.iterrows():
    ax.text(index,row['Number of Starts']-1000,row['Number of Starts'], 
            color='white', ha="center", fontsize = 10)
plt.show()

In [None]:
# Number of rows per start station id, largest first. The count column is
# named 'Number of Stations'.
top_stations_df = (
    df.groupby(['start station id'])
      .size()
      .reset_index(name='Number of Stations')
      .sort_values('Number of Stations', ascending=False)
)
top_stations_df

In [None]:
#Identify the 10 most popular trips
# Count rows per (start, end) station pair, keep the ten largest, and
# replace the two station columns with a single "A to B" label. The final
# reset_index() keeps the previous row labels in an 'index' column.
trips_df = (
    df.groupby(['start station name', 'end station name'])
      .size()
      .reset_index(name='Number of Trips')
      .sort_values('Number of Trips', ascending=False)
      .head(10)
      .assign(Trips=lambda top: top['start station name'].astype(str)
                                + " to "
                                + top['end station name'].astype(str))
      .drop(columns=['start station name', 'end station name'])
      .reset_index()
)
trips_df.head()

In [None]:
# Horizontal bar chart of the most popular trips. x and y are passed by
# keyword: recent seaborn releases no longer accept them positionally.
ax4 = sns.barplot(x='Number of Trips', y='Trips', data = trips_df,palette="GnBu_d")
ax4.set_title('Most Popular Trips', fontsize = 20)
ax4.set_ylabel("Trips",fontsize=16)
ax4.set_xlabel("Number of Trips",fontsize=16)
# Write each count in white just inside the end of its bar.
for index, row in trips_df.iterrows():
    ax4.text(row['Number of Trips']-250,index,row['Number of Trips'], 
             color='white', ha="center",fontsize = 10)
plt.show()

In [None]:
### Busiest Bike by Times and Minutes Used

# The busiest bikes can be identified with a groupby on `bikeid`:
# the size of each group is the number of times that bike was used.
# A similar groupby that sums the trip durations gives the number of minutes each bike was used.

In [None]:
# Number of rows per bike id, largest first. The count column is named
# 'Number of Bikes'.
bike_use_df = (
    df.groupby(['bikeid'])
      .size()
      .reset_index(name='Number of Bikes')
      .sort_values('Number of Bikes', ascending=False)
)
bike_use_df

In [None]:
#Bike usage based on number of times used
# Count rows per bike id, keep the ten most used bikes, and turn each id
# into a "Bike <id>" text label. The final reset_index() keeps the previous
# row labels in an 'index' column.
bike_use_df = (
    df.groupby(['bikeid'])
      .size()
      .reset_index(name='Number of Times Used')
      .sort_values('Number of Times Used', ascending=False)
      .head(10)
      .assign(bikeid=lambda top: 'Bike ' + top['bikeid'].astype(str))
      .reset_index()
)
bike_use_df.head()

In [None]:
#Visual of most used bike based on Number of Trips
# x and y are passed by keyword: recent seaborn releases no longer accept
# them positionally.
ax8 = sns.barplot(x='Number of Times Used', y='bikeid', data = bike_use_df, palette="GnBu_d")
ax8.set_title('Most Popular Bikes by Number of Times Used')
# Write each count in white just inside the end of its bar.
for index, row in bike_use_df.iterrows():
    ax8.text(row['Number of Times Used']-100,index,row['Number of Times Used'], color='white', ha="center", fontsize =10)
plt.show()