In [25]:
%matplotlib qt
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt

In [95]:
# Read the zipped yellow- and green-cab trip records into two DataFrames
taxi_y, taxi_g = (
    pd.read_csv(archive)
    for archive in ("../data/nyc_yellow_taxi.csv.zip", "../data/nyc_green_taxi.csv.zip")
)

In [96]:
# Keep only trips with a recorded pickup longitude
# (the original note ties the coordinate columns to records "prior to 2016" — TODO confirm)
taxi_y = taxi_y.dropna(subset=['pickup_longitude'])
taxi_g = taxi_g.dropna(subset=['pickup_longitude'])

In [100]:
# Sample datasets (so we don't get crushing memory).
# Each sample is drawn from its own source frame: the yellow sample must come
# from taxi_y, otherwise every "yellow" plot silently shows green-cab data.
# A fixed random_state keeps the samples identical across kernel restarts.
taxi_g_sample = taxi_g.sample(n=1000, random_state=0)
taxi_y_sample = taxi_y.sample(n=1000, random_state=0)

In [103]:
# Rough overview map: yellow-cab pickups, coloured by fare and sized by trip distance
fig, ax = plt.subplots(figsize=(10, 10))
sns.scatterplot(
    data=taxi_y_sample,
    x='pickup_longitude',
    y='pickup_latitude',
    hue='total_amount',
    size='trip_distance',
    ax=ax,
)

# Crop the axes to the longitude/latitude window of interest
ax.set_xlim(-74.1, -73.7)
ax.set_ylim(40.6, 40.9)

(40.6, 40.9)

In [104]:
# Parse the pickup timestamp strings into datetimes and use them as the row index,
# applying the same two steps to both taxi samples
for sample in (taxi_g_sample, taxi_y_sample):
    sample['pickup_datetime'] = pd.to_datetime(sample['pickup_datetime'])
    sample.index = sample['pickup_datetime']

In [105]:
# Tag every trip with its pickup year so pickup regions can be compared over time
for sample in (taxi_g_sample, taxi_y_sample):
    sample['year'] = sample['pickup_datetime'].dt.year

In [108]:
# One scatter panel of pickup locations per year, all cropped to the same window
g = (
    sns.FacetGrid(taxi_y_sample, col="year")
    .map(plt.scatter, "pickup_longitude", "pickup_latitude", alpha=.7)
    .set(xlim=(-74.1, -73.7), ylim=(40.6, 40.9))
)

In [89]:
# Pairwise relationships between the green-cab sample's columns, coloured by year
sns.pairplot(data=taxi_g_sample, hue="year");

In [111]:
from bokeh.io import output_file, output_notebook, show
from bokeh.models import (
  GMapPlot, GMapOptions, ColumnDataSource, Circle, LogColorMapper, BasicTicker, ColorBar,
    DataRange1d, PanTool, WheelZoomTool, BoxSelectTool
)
from bokeh.models.mappers import ColorMapper, LinearColorMapper
from bokeh.palettes import Viridis5

In [270]:
def plot_on_gmaps(dataframe, latitude_attr, longitude_attr, size_attr, color_attr, centre_lat=40.8, centre_lon=-74, zoom=11, title='', api_key=None):
    """Draw the rows of ``dataframe`` as circles on a Google Maps roadmap and show it in the notebook.

    Parameters
    ----------
    dataframe
        Table whose columns are looked up by name (``dataframe[name]``).
    latitude_attr, longitude_attr
        Names of the columns used for the y and x position of each circle.
    size_attr
        Name of the column passed to the glyph as the circle size.
    color_attr
        Name of the column mapped to a fill colour through a linear
        Viridis5 colour mapper; a matching colour bar is added on the right.
    centre_lat, centre_lon, zoom
        Initial centre and zoom level of the map.
    title
        Text of the plot title.
    api_key
        Google Maps API key. When omitted, the ``GOOGLE_API_KEY``
        environment variable is used, so the key never lives in the notebook.

    Raises
    ------
    ValueError
        If no API key is passed and ``GOOGLE_API_KEY`` is not set.
    """
    # Resolve the credential first so a missing key fails fast with a clear message
    if api_key is None:
        api_key = os.environ.get("GOOGLE_API_KEY")
    if not api_key:
        raise ValueError(
            "No Google Maps API key: pass api_key=... or set the GOOGLE_API_KEY environment variable"
        )

    map_options = GMapOptions(lat=centre_lat, lng=centre_lon, map_type="roadmap", zoom=zoom)

    plot = GMapPlot(map_options=map_options)
    plot.api_key = api_key

    # Item access instead of getattr: works for any column name, including
    # ones that clash with DataFrame attributes (e.g. a column called "size")
    source = ColumnDataSource(
        data=dict(
            lat=dataframe[latitude_attr].tolist(),
            lon=dataframe[longitude_attr].tolist(),
            size=dataframe[size_attr].tolist(),
            color=dataframe[color_attr].tolist(),
        )
    )

    # Map the colour column's values onto the Viridis5 palette
    color_mapper = LinearColorMapper(palette=Viridis5)

    circle = Circle(x="lon", y="lat", size="size", fill_color={'field': 'color', 'transform': color_mapper}, fill_alpha=0.5, line_color=None)
    plot.add_glyph(source, circle)

    color_bar = ColorBar(color_mapper=color_mapper, ticker=BasicTicker(),
                         label_standoff=12, border_line_color=None, location=(0,0))
    plot.add_layout(color_bar, 'right')
    plot.title.text = title

    output_notebook()
    plot.add_tools(PanTool(), WheelZoomTool(), BoxSelectTool())
    show(plot)

In [324]:
# Green-cab pickups on the map: dot size = trip distance, colour = pickup year
plot_on_gmaps(
    taxi_g_sample,
    latitude_attr='pickup_latitude',
    longitude_attr='pickup_longitude',
    size_attr='trip_distance',
    color_attr='year',
    centre_lat=40.8,
    centre_lon=-74,
)

## Mapping bikes in NY

In [136]:
# Station lookup table; merged further down on its station_id column
bikes_keys = pd.read_csv(filepath_or_buffer="../data/nyc_bikeshare_key.csv")

In [138]:
import random
f = "../data/nyc_bikeshare.csv"

# Count the lines; the context manager closes the file handle as soon as counting is done
with open(f) as csv_file:
    num_lines = sum(1 for _ in csv_file)
# Sample size - in this case ~1% (at least 1, so that files shorter than
# 100 lines do not ask random.sample for more indices than exist)
size = max(1, int(num_lines / 100))

# The row indices to skip - make sure 0 is not included to keep the header!
skip_idx = random.sample(range(1, num_lines), num_lines - size)

# Read the data, skipping the randomly chosen rows
bikes = pd.read_csv(f, skiprows=skip_idx)

In [142]:
# Work on a random sample of 1000 bike trips
bikes_sample = bikes.sample(1000)

In [190]:
# Attach the station details from the lookup table to every sampled trip,
# matching each trip's start station against the table's station_id
bikes_sample_coords = pd.merge(
    bikes_sample,
    bikes_keys,
    left_on='start_station_id',
    right_on='station_id',
)
bikes_sample_coords.head()

Unnamed: 0,tripduration,starttime,stoptime,start_station_id,end_station_id,bikeid,usertype,birthyear,gender,station_id,station_name,station_latitude,station_longitude
0,1844,07-08-18 17:05:21,07-08-18 17:36:06,3164.0,358.0,19636,1.0,1989.0,1,3164,Columbus Ave & W 72 St,40.777057,-73.978985
1,924,08-12-19 19:04:28,08-12-19 19:19:53,3164.0,3145.0,38552,0.0,1990.0,2,3164,Columbus Ave & W 72 St,40.777057,-73.978985
2,886,09-27-15 09:38:43,09-27-15 09:53:29,3164.0,479.0,22716,1.0,1984.0,2,3164,Columbus Ave & W 72 St,40.777057,-73.978985
3,1143,05-31-19 18:07:17,05-31-19 18:26:20,3164.0,3520.0,30722,1.0,1969.0,0,3164,Columbus Ave & W 72 St,40.777057,-73.978985
4,1253,09-19-14 15:50:47,09-19-14 16:11:40,327.0,529.0,21569,1.0,1979.0,1,327,Vesey Pl & River Terrace,40.715338,-74.016584


In [163]:
def plot_on_gmaps_simple(dataframe, latitude_attr, longitude_attr, centre_lat=40.8, centre_lon=-74, zoom=11, api_key=None):
    """Draw the rows of ``dataframe`` as uniform circles on a Google Maps roadmap and show it in the notebook.

    Parameters
    ----------
    dataframe
        Table whose columns are looked up by name (``dataframe[name]``).
    latitude_attr, longitude_attr
        Names of the columns used for the y and x position of each circle.
    centre_lat, centre_lon, zoom
        Initial centre and zoom level of the map.
    api_key
        Google Maps API key. When omitted, the ``GOOGLE_API_KEY``
        environment variable is used, so the key never lives in the notebook.

    Raises
    ------
    ValueError
        If no API key is passed and ``GOOGLE_API_KEY`` is not set.
    """
    # Resolve the credential first so a missing key fails fast with a clear message
    if api_key is None:
        api_key = os.environ.get("GOOGLE_API_KEY")
    if not api_key:
        raise ValueError(
            "No Google Maps API key: pass api_key=... or set the GOOGLE_API_KEY environment variable"
        )

    map_options = GMapOptions(lat=centre_lat, lng=centre_lon, map_type="roadmap", zoom=zoom)

    plot = GMapPlot(map_options=map_options)
    plot.api_key = api_key

    # Item access instead of getattr: works for any column name, including
    # ones that clash with DataFrame attributes
    source = ColumnDataSource(
        data=dict(
            lat=dataframe[latitude_attr].tolist(),
            lon=dataframe[longitude_attr].tolist(),
        )
    )

    # No colour mapping here: every point gets the same default-styled circle
    circle = Circle(x="lon", y="lat", fill_alpha=0.5, line_color=None)
    plot.add_glyph(source, circle)

    output_notebook()
    plot.add_tools(PanTool(), WheelZoomTool(), BoxSelectTool())
    show(plot)

In [174]:
# Start-station locations of the sampled bike trips
plot_on_gmaps_simple(
    bikes_sample_coords,
    latitude_attr='station_latitude',
    longitude_attr='station_longitude',
    centre_lat=40.8,
    centre_lon=-74,
    zoom=12,
)

In [284]:
# Average every numeric column per station.
# numeric_only=True skips the text columns (e.g. station_name, usertype timestamps as strings),
# which recent pandas versions refuse to average instead of silently dropping.
bikes_sample_coords_sum = bikes_sample_coords.groupby('station_id').mean(numeric_only=True)

station_id
72      1500.00
79      1571.00
83       427.00
116     1001.25
127      408.50
         ...   
3697     903.00
3701    1017.00
3709     465.00
3723     468.00
3798     298.00
Name: tripduration, Length: 403, dtype: float64

In [246]:
# Per-station averages on the map: dot size = mean trip duration, colour = mean usertype
plot_on_gmaps(
    bikes_sample_coords_sum,
    latitude_attr='station_latitude',
    longitude_attr='station_longitude',
    size_attr='tripduration',
    color_attr='usertype',
    centre_lat=40.8,
    centre_lon=-74,
    zoom=12,
)

### Separate by year

In [225]:
# Parse the start-time strings into datetimes so the .dt accessor can be used on them
bikes_sample_coords['starttime'] = bikes_sample_coords['starttime'].pipe(pd.to_datetime)
bikes_sample_coords.head()

Unnamed: 0,tripduration,starttime,stoptime,start_station_id,end_station_id,bikeid,usertype,birthyear,gender,station_id,station_name,station_latitude,station_longitude
0,1844,2018-07-08 17:05:21,07-08-18 17:36:06,3164.0,358.0,19636,1.0,1989.0,1,3164,Columbus Ave & W 72 St,40.777057,-73.978985
1,924,2019-08-12 19:04:28,08-12-19 19:19:53,3164.0,3145.0,38552,0.0,1990.0,2,3164,Columbus Ave & W 72 St,40.777057,-73.978985
2,886,2015-09-27 09:38:43,09-27-15 09:53:29,3164.0,479.0,22716,1.0,1984.0,2,3164,Columbus Ave & W 72 St,40.777057,-73.978985
3,1143,2019-05-31 18:07:17,05-31-19 18:26:20,3164.0,3520.0,30722,1.0,1969.0,0,3164,Columbus Ave & W 72 St,40.777057,-73.978985
4,1253,2014-09-19 15:50:47,09-19-14 16:11:40,327.0,529.0,21569,1.0,1979.0,1,327,Vesey Pl & River Terrace,40.715338,-74.016584


In [243]:
# Calendar year of each trip, stored twice: 'year' is later used as a groupby key,
# while 'year2' stays available as an ordinary column (it is the colour column of the maps)
trip_year = bikes_sample_coords['starttime'].dt.year
bikes_sample_coords['year'] = trip_year
bikes_sample_coords['year2'] = trip_year

In [277]:
# Average every numeric column per (year, start station).
# numeric_only=True skips the text columns, which recent pandas versions
# refuse to average instead of silently dropping.
bikes_sample_coords_years_sum = bikes_sample_coords.groupby(['year', 'start_station_id']).mean(numeric_only=True)

# Rescale the size of the dots: the largest mean trip duration becomes 500
max_tripduration = bikes_sample_coords_years_sum['tripduration'].max()
bikes_sample_coords_years_sum['tripduration'] = bikes_sample_coords_years_sum['tripduration'] / max_tripduration * 500

# Largest rescaled value among the 2016 rows (displayed as the cell output)
bikes_sample_coords_years_sum.loc[2016]['tripduration'].max()

500.0

In [285]:
def remove_outlier(df_in, col_name):
    """Return the rows of ``df_in`` whose ``col_name`` value lies within the 1.5*IQR fences.

    The fences are ``Q1 - 1.5*IQR`` and ``Q3 + 1.5*IQR`` of the column. Values
    exactly on a fence are kept, so a column with zero spread (IQR == 0) is
    returned unchanged rather than being emptied. Rows with a missing value
    in ``col_name`` are dropped.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Input table; it is not modified.
    col_name
        Label of the numeric column to test.

    Returns
    -------
    pandas.DataFrame
        The subset of ``df_in`` inside the fences, in the original row order.
    """
    values = df_in[col_name]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1  # interquartile range
    fence_low = q1 - 1.5 * iqr
    fence_high = q3 + 1.5 * iqr
    # between() is inclusive on both ends; NaN compares False and is filtered out
    return df_in.loc[values.between(fence_low, fence_high)]

In [288]:
# Drop the (year, station) rows whose rescaled trip duration is an IQR outlier
bikes_sample_coords_years_sum_filter = remove_outlier(
    df_in=bikes_sample_coords_years_sum,
    col_name='tripduration',
)

In [289]:
# Plot the outlier-filtered frame: the filtered result computed in the previous
# cell was otherwise never used, and the unfiltered frame was plotted instead
plot_on_gmaps(bikes_sample_coords_years_sum_filter,
              latitude_attr='station_latitude',
              longitude_attr='station_longitude',
              size_attr='tripduration',
              color_attr='year2',
              centre_lat=40.8, centre_lon=-74, zoom=12)

In [309]:
# Draw one map per year, titled with that year.
# The years are taken from the 'year' level of the index instead of a fixed
# range, so a year that is missing from the random sample cannot raise a KeyError.
years_present = bikes_sample_coords_years_sum.index.get_level_values('year').unique()

for year in sorted(years_present):
    plot_on_gmaps(bikes_sample_coords_years_sum.loc[ year, : ],
              latitude_attr='station_latitude',
              longitude_attr='station_longitude',
              size_attr='tripduration',
              color_attr='year2',
              centre_lat=40.73, centre_lon=-74, zoom=12, 
              title=str(int(year)))

In [295]:
# Order the stations by their mean trip duration, ascending,
# so the stations with the longest average rides end up at the bottom
bikes_sample_coords_sum.sort_values(by='tripduration')

Unnamed: 0_level_0,tripduration,start_station_id,end_station_id,bikeid,usertype,birthyear,gender,station_latitude,station_longitude,year,year2
station_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
3094,89.000000,3094.0,3095.0,21325.000000,1.000000,1989.0,1.000000,40.716981,-73.944859,2017.000000,2017.000000
353,100.000000,353.0,3429.0,34372.000000,1.000000,1984.0,2.000000,40.685396,-73.974315,2019.000000,2019.000000
3297,122.000000,3297.0,3404.0,30151.000000,1.000000,1990.0,1.000000,40.668663,-73.979881,2018.000000,2018.000000
3671,131.000000,3671.0,3140.0,26581.000000,1.000000,1956.0,1.000000,40.774779,-73.954275,2019.000000,2019.000000
462,134.000000,462.0,3258.0,25575.000000,1.000000,1976.0,1.000000,40.746920,-74.004519,2018.000000,2018.000000
...,...,...,...,...,...,...,...,...,...,...,...
394,2597.666667,394.0,1785.0,22819.333333,0.666667,1971.5,0.666667,40.725213,-73.977688,2015.333333,2015.333333
3478,3001.000000,3478.0,3478.0,33742.000000,1.000000,1974.0,1.000000,40.657089,-74.008702,2019.000000,2019.000000
3289,3447.000000,3289.0,3293.0,28877.000000,0.000000,,0.000000,40.790179,-73.972889,2017.000000,2017.000000
3303,11710.000000,3303.0,3315.0,24965.000000,1.000000,1964.0,2.000000,40.684989,-73.994403,2016.000000,2016.000000


In [None]:
# The station with the highest ride time is ID 3161: W 76 St & Columbus Ave
# lat: 40.780184
# lon: -73.977285

# The station with the largest number of rides is ID 519: Pershing Square North
# lat: 40.751873
# lon: -73.977706


In [320]:
import math  
def calculateDistancefromCentral(x1,y1,x2=40.780184,y2=-73.977285):
    """Return the straight-line (Euclidean) distance between (x1, y1) and (x2, y2).

    The result is in the same units as the inputs; no geographic projection
    is applied. The default reference point (x2, y2) is 40.780184, -73.977285.
    """
    delta_x = x2 - x1
    delta_y = y2 - y1
    squared_sum = delta_x ** 2 + delta_y ** 2
    return math.sqrt(squared_sum)

In [321]:
# Distance of every (year, station) row from the default reference point,
# computed pairwise from its latitude and longitude columns
bikes_sample_coords_years_sum['distancefromCentre'] = [
    calculateDistancefromCentral(station_lat, station_lon)
    for station_lat, station_lon in zip(
        bikes_sample_coords_years_sum['station_latitude'],
        bikes_sample_coords_years_sum['station_longitude'],
    )
]

In [322]:
bikes_sample_coords_years_sum

Unnamed: 0_level_0,Unnamed: 1_level_0,tripduration,end_station_id,bikeid,usertype,birthyear,gender,station_id,station_latitude,station_longitude,year2,distancefromCentre
year,start_station_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2013,116.0,3.563460,540.0,15218.0,1.0,1977.0,1.0,116.0,40.741776,-74.001497,2013.0,0.045403
2013,127.0,3.813235,382.0,20619.0,1.0,1986.0,2.0,127.0,40.731724,-74.006744,2013.0,0.056712
2013,146.0,13.117361,483.0,17922.0,1.0,1964.5,1.5,146.0,40.716250,-74.009106,2013.0,0.071415
2013,152.0,6.918773,233.0,16999.0,1.0,1965.0,1.0,152.0,40.714740,-74.009106,2013.0,0.072770
2013,174.0,4.620841,335.0,16326.0,0.0,,0.0,174.0,40.738177,-73.977387,2013.0,0.042008
...,...,...,...,...,...,...,...,...,...,...,...,...
2019,3721.0,1.581910,3619.0,16691.0,1.0,1989.0,1.0,3721.0,40.767549,-73.920933,2019.0,0.057751
2019,3723.0,3.896493,217.0,16167.0,0.0,1969.0,0.0,3723.0,40.695317,-73.990157,2019.0,0.085837
2019,3765.0,15.943984,471.0,38424.0,1.0,1967.0,2.0,3765.0,40.709897,-73.940080,2019.0,0.079527
2019,3798.0,2.481100,526.0,38994.0,1.0,1987.0,2.0,3798.0,40.752269,-73.982079,2019.0,0.028324


In [323]:
# Spread of station distances from the reference point, one box per year
sns.boxplot(
    data=bikes_sample_coords_years_sum,
    x='year2',
    y='distancefromCentre',
    palette="GnBu_d",
)

<matplotlib.axes._subplots.AxesSubplot at 0x2bf2f8f2e10>