In [1]:
import pandas as pd
import geopandas as gpd
import folium
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [2]:
# Load the full trip table used by every cell below (presumably the 2018 trip records,
# judging by the file name -- TODO confirm).
# NOTE(review): read_pickle unpickles the file, which can execute arbitrary code;
# only load pickles that come from a trusted source.
df18 = pd.read_pickle("2018_all_data.pkl")

In [None]:
# Fixed-seed random subset of 10,000 rows, used by the histogram, scatter plots and
# correlation heatmap so those plots are reproducible between runs.
sample = df18.sample(n = 10000, random_state = 1)

In [None]:
# Histogram (with KDE overlay) of trip cost over the sampled rows.
# The bin count is 1 + log2(n) with log2(n) truncated to an integer.
hist = sns.histplot(
    x='cost',
    data=sample,
    bins=1 + int(np.log2(len(sample))),
    kde=True,
)
hist.set_title('Trip Cost Distribution')
#plt.savefig('plots/cost_hist_10000_sturges.png')

In [None]:
# Scatter of trip cost against the approximated distance for the sampled rows.
sns.scatterplot(
    x='approx_dist',
    y='cost',
    data=sample,
).set_title('Trip cost vs approximate distance')
#plt.savefig('plots/approx_vs_cost.png')

In [None]:
# Compare the recorded trip distance (x) with the approximated distance (y).
sns.scatterplot(
    x='trip_distance',
    y='approx_dist',
    data=sample,
)
# Dashed red y = x reference line from 0 to 14: points on it have zero approximation error.
plt.plot([0, 14], [0, 14], 'r--')
plt.title('Distance Approximation Error')
#plt.savefig('plots/approx_vs_actual_dist.png')

In [None]:
# Number of rows (trips) in df18 for each value of DOBorough.
dataDO = df18.groupby('DOBorough').size()

In [None]:
# Exclude EWR and Staten Island from the drop-off borough counts.
# errors='ignore' keeps this cell re-runnable: dataDO is overwritten with the result,
# so on a second execution the labels are already gone and a plain drop() would
# raise KeyError.
dataDO = dataDO.drop(['EWR', 'Staten Island'], errors='ignore')

In [None]:
# Pie chart of the share of trips per drop-off borough, saved to plots/.
pie, ax = plt.subplots(figsize=[10,6])
labels = dataDO.keys()
# One explode offset per wedge. Deriving the length from the data (instead of a
# hard-coded 4) keeps pie() from raising if the number of boroughs ever differs.
ax.pie(x=dataDO, explode=[0.1] * len(dataDO), autopct="%.1f%%", pctdistance=1.15)
ax.set_title("Trips per Drop Off Borough", fontsize=14)
ax.legend(labels = labels)
plt.savefig('plots/DOBorough_pie.png')

In [None]:
# Number of rows (trips) in df18 for each value of PUBorough.
dataPU = df18.groupby('PUBorough').size()

In [None]:
# Exclude EWR and Staten Island from the pick-up borough counts.
# errors='ignore' keeps this cell re-runnable: dataPU is overwritten with the result,
# so on a second execution the labels are already gone and a plain drop() would
# raise KeyError.
dataPU = dataPU.drop(['EWR', 'Staten Island'], errors='ignore')

In [None]:
# Pie chart of the share of trips per pick-up borough, saved to plots/.
pie, ax = plt.subplots(figsize=[10,6])
labels = dataPU.keys()
# One explode offset per wedge. Deriving the length from the data (instead of a
# hard-coded 4) keeps pie() from raising if the number of boroughs ever differs.
ax.pie(x=dataPU, explode=[0.1] * len(dataPU), autopct="%.1f%%", pctdistance=1.15)
ax.set_title("Trips per Pick Up Borough", fontsize=14)
ax.legend(labels = labels)
plt.savefig('plots/PUBorough_pie.png')

In [None]:
# Draw 10 trips per pick-up zone; replace=True lets zones with fewer than 10 trips
# still contribute 10 rows. The seed matches the one used for `sample`, so the
# per-zone averages and the maps built from them are reproducible across runs.
geo_sample = df18.groupby('PULocationID').sample(n = 10, replace = True, random_state = 1)

In [None]:
# Read the zone shapefile into a GeoDataFrame; later cells join it to the trip
# sample on its LocationID column and use its geometry for the choropleth maps.
sf = gpd.read_file('zone_shapefiles/fixed_taxi_zones.shp')

In [None]:
# Attach each sampled trip to the shape of its pick-up zone. After the join the
# zone id lives in LocationID, so the duplicate PULocationID key is dropped.
gdf = gpd.GeoDataFrame(
    geo_sample.merge(sf, left_on='PULocationID', right_on='LocationID')
).drop(columns='PULocationID')

In [None]:
# GeoJSON string with one feature per zone (id + geometry only), used as the
# shape layer for the choropleth maps.
geoJSON = (
    gdf[['LocationID', 'geometry']]
    .drop_duplicates(subset='LocationID')
    .to_json()
)

In [None]:
# Per-zone mean of every numeric column. numeric_only=True is required because gdf
# also carries non-numeric columns (at least `geometry`), and pandas >= 2.0 raises
# TypeError instead of silently skipping them when averaging.
means = gdf.groupby('LocationID').mean(numeric_only=True).reset_index()

In [None]:
# Choropleth of the mean sampled trip cost per pick-up zone.
# "Stamen Terrain" is no longer a built-in folium tileset (dropped in folium 0.15
# after Stamen retired its tile hosting), so requesting it by name fails or renders
# a blank basemap. The default OpenStreetMap tiles work across folium versions and
# need no attribution string or API key.
m = folium.Map(location=[40.66, -73.94], tiles="OpenStreetMap", zoom_start=10)

# `columns` is [join key, value to colour by]; `key_on` names the GeoJSON property
# that the join key is matched against.
m.add_child(folium.Choropleth(
    geo_data=geoJSON,
    data = means,
    columns = ["LocationID", "cost"],
    key_on="properties.LocationID",
    fill_color="YlOrBr",
    fill_opacity=0.7,
    line_opacity=0.2,
    name='choropleth',
    legend_name = "Average Trip Cost ($)"
))

#m.save('plots/trip_cost_PULocation.html')
m

In [None]:
# Choropleth of the mean sampled trip distance per pick-up zone.
# "Stamen Terrain" is no longer a built-in folium tileset (dropped in folium 0.15
# after Stamen retired its tile hosting), so requesting it by name fails or renders
# a blank basemap. The default OpenStreetMap tiles work across folium versions and
# need no attribution string or API key.
m = folium.Map(location=[40.66, -73.94], tiles="OpenStreetMap", zoom_start=10)

# `columns` is [join key, value to colour by]; `key_on` names the GeoJSON property
# that the join key is matched against.
m.add_child(folium.Choropleth(
    geo_data=geoJSON,
    data = means,
    columns = ["LocationID", "trip_distance"],
    key_on="properties.LocationID",
    fill_color="YlOrBr",
    fill_opacity=0.7,
    line_opacity=0.2,
    name='choropleth',
    legend_name = "Average Trip Distance (miles)"
))

#m.save('plots/trip_dist_PULocation.html')
m

In [None]:
# Choropleth of the mean PU_crashes value per pick-up zone, saved as an HTML map.
# "Stamen Terrain" is no longer a built-in folium tileset (dropped in folium 0.15
# after Stamen retired its tile hosting), so requesting it by name fails or renders
# a blank basemap. The default OpenStreetMap tiles work across folium versions and
# need no attribution string or API key.
m = folium.Map(location=[40.66, -73.94], tiles="OpenStreetMap", zoom_start=10)

# `columns` is [join key, value to colour by]; `key_on` names the GeoJSON property
# that the join key is matched against.
m.add_child(folium.Choropleth(
    geo_data=geoJSON,
    data = means,
    columns = ["LocationID", "PU_crashes"],
    key_on="properties.LocationID",
    fill_color="YlOrBr",
    fill_opacity=0.7,
    line_opacity=0.2,
    name='choropleth',
    legend_name = "Average Number of Crashes"
))

m.save('plots/PU_crashes.html')
m

In [None]:
# Error of the distance approximation on the per-zone sample:
# abs_dist_err is the magnitude, dist_err keeps the sign (actual minus approximate).
geo_sample['abs_dist_err'] = (geo_sample['trip_distance'] - geo_sample['approx_dist']).abs()
geo_sample['dist_err'] = geo_sample['trip_distance'].sub(geo_sample['approx_dist'])

In [None]:
from folium.plugins import HeatMap
from folium.plugins import FastMarkerCluster

In [None]:
# Load the crash records plotted in the heatmap / marker-cluster cells below; those
# cells read its LATITUDE and LONGITUDE columns.
# NOTE(review): read_pickle unpickles the file, which can execute arbitrary code;
# only load pickles that come from a trusted source.
crashes = pd.read_pickle('crashes_w_zones.pkl')

In [None]:
# Density heatmap of crash locations, saved as an HTML map.
# "Stamen Terrain" is no longer a built-in folium tileset (dropped in folium 0.15
# after Stamen retired its tile hosting); the default OpenStreetMap tiles work
# across folium versions and need no attribution string or API key.
crash_heatmap = folium.Map(location=[40.66, -73.94], tiles="OpenStreetMap", zoom_start=10)
# HeatMap rejects NaN coordinates, so rows missing either value are dropped first
# (a no-op if the pickle is already complete).
crash_heatmap.add_child(HeatMap(crashes[['LATITUDE', 'LONGITUDE']].dropna().values, radius=12.5))
crash_heatmap.save('plots/crash_heatmap.html')
crash_heatmap

In [None]:
# Clustered markers of crash locations.
# "Stamen Terrain" is no longer a built-in folium tileset (dropped in folium 0.15
# after Stamen retired its tile hosting); the default OpenStreetMap tiles work
# across folium versions and need no attribution string or API key.
crash_cluster = folium.Map(location=[40.66, -73.94], tiles="OpenStreetMap", zoom_start=10)
# Rows missing a coordinate cannot be placed on the map, so they are dropped first
# (a no-op if the pickle is already complete).
crash_cluster.add_child(FastMarkerCluster(data=crashes[['LATITUDE', 'LONGITUDE']].dropna().values))
crash_cluster

In [None]:
# Columns included in the correlation heatmap, one per line for easy editing.
interest = [
    'passenger_count',
    'trip_distance',
    'cost',
    'PU_crashes',
    'DO_crashes',
    'approx_dist',
    'DailyAverageDryBulbTemperature',
    'DailyAverageRelativeHumidity',
    'DailyAverageWindSpeed',
    'DailyPrecipitation',
    'DailySnowDepth',
    'DailySnowfall',
    'PU_rides_in_zone',
    'DO_rides_in_zone',
]

In [None]:
# Heatmap of the pairwise correlations between the columns of interest,
# computed on the 10,000-row sample and saved to plots/.
sns.heatmap(data=sample.loc[:, interest].corr())
plt.savefig('plots/corr_heatmap.png')