# Task 2.5 Geospatial Plotting Try 2

In [3]:
# import libraries
import pandas as pd
import os
from keplergl import KeplerGl
from pyproj import CRS
import numpy as np
from matplotlib import pyplot as plt

In [6]:
# import necessary columns from csv file
dtype_mapping = {6: str} # force column as string data types
df = pd.read_csv('NY_Citi_Bike_Weather_Data.csv', usecols=['start_station_name', 'end_station_name', 'start_lat', 'start_lng', 'end_lat', 'end_lng'], dtype=dtype_mapping)

In [8]:
df.dtypes

start_station_name     object
end_station_name       object
start_lat             float64
start_lng             float64
end_lat               float64
end_lng               float64
dtype: object

In [10]:
df.head()

Unnamed: 0,start_station_name,end_station_name,start_lat,start_lng,end_lat,end_lng
0,West End Ave & W 107 St,Mt Morris Park W & W 120 St,40.802117,-73.968181,40.804038,-73.945925
1,4 Ave & 3 St,Boerum Pl\t& Pacific St,40.673746,-73.985649,40.688489,-73.99116
2,1 Ave & E 62 St,5 Ave & E 29 St,40.761227,-73.96094,40.745168,-73.986831
3,2 Ave & E 96 St,5 Ave & E 29 St,40.783964,-73.947167,40.745168,-73.986831
4,6 Ave & W 34 St,5 Ave & E 29 St,40.74964,-73.98805,40.745168,-73.986831


In [12]:
# create a value column and group by start and end station 
df['value'] = 1
df = df.groupby(['start_station_name', 'end_station_name']).agg({
    'value': 'count',
    'start_lat': 'first',
    'start_lng': 'first',
    'end_lat': 'first',
    'end_lng': 'first'
}).reset_index()

In [14]:
df.head()

Unnamed: 0,start_station_name,end_station_name,value,start_lat,start_lng,end_lat,end_lng
0,1 Ave & E 110 St,1 Ave & E 110 St,791,40.792327,-73.9383,40.792327,-73.9383
1,1 Ave & E 110 St,1 Ave & E 18 St,2,40.792327,-73.9383,40.733812,-73.980544
2,1 Ave & E 110 St,1 Ave & E 30 St,4,40.792327,-73.9383,40.741444,-73.975361
3,1 Ave & E 110 St,1 Ave & E 39 St,1,40.792327,-73.9383,40.74714,-73.97113
4,1 Ave & E 110 St,1 Ave & E 44 St,12,40.792327,-73.9383,40.75002,-73.969053


In [16]:
print(df['value'].sum())
print(df.shape)

29768714
(1013422, 7)


#### The numbers no longer match because like values of start and stop stations have now been condensed to a single row.

In [20]:
df['value'].describe().round(2)

count    1013422.00
mean          29.37
std           99.25
min            1.00
25%            1.00
50%            4.00
75%           17.00
max        12041.00
Name: value, dtype: float64

In [22]:
df.rename(columns = {'start_station_name':'start_station','end_station_name' : 'end_station',
                    'value': 'trip_counts', 'start_lat': 'lat_x', 'start_lng': 'lng_x',
                    'end_lat': 'lat_y', 'end_lng': 'lng_y'}, inplace = True)

# Creating a Map

In [34]:
df.head()

Unnamed: 0,start_station,end_station,trip_counts,lat_x,lng_x,lat_y,lng_y
0,1 Ave & E 110 St,1 Ave & E 110 St,791,40.792327,-73.9383,40.792327,-73.9383
1,1 Ave & E 110 St,1 Ave & E 18 St,2,40.792327,-73.9383,40.733812,-73.980544
2,1 Ave & E 110 St,1 Ave & E 30 St,4,40.792327,-73.9383,40.741444,-73.975361
3,1 Ave & E 110 St,1 Ave & E 39 St,1,40.792327,-73.9383,40.74714,-73.97113
4,1 Ave & E 110 St,1 Ave & E 44 St,12,40.792327,-73.9383,40.75002,-73.969053


In [37]:
df.dtypes

start_station     object
end_station       object
trip_counts        int64
lat_x            float64
lng_x            float64
lat_y            float64
lng_y            float64
dtype: object

In [40]:
# create another smaller df for processing reasons
df_sample = df.groupby('start_station', group_keys=False).apply(
    lambda x: x.sample(frac=0.02, random_state=42)
).reset_index(drop=True)

  df_sample = df.groupby('start_station', group_keys=False).apply(


In [43]:
df_sample.shape

(20245, 7)

In [46]:
# make sure all string data is properly handled
for col in df_sample.select_dtypes(include=['object']).columns:
    df_sample[col] = df_sample[col].astype(str)

In [68]:
# save map to an HTML file with data
m = KeplerGl(height=700)
m.add_data(data=df_sample, name="bike_trips")
m.save_to_html(file_name='kepler_map.html', read_only=False)

User Guide: https://docs.kepler.gl/docs/keplergl-jupyter
Map saved to kepler_map.html!


In [70]:
# display HTML file
from IPython.display import IFrame
IFrame(src='kepler_map.html', width=800, height=700)

In [72]:
# point-to-point data so need to configure the map properly
map_1 = KeplerGl(height=700)
map_1.add_data(data=df_sample, name="bike_trips")

# add specific configuration for point-to-point lines
map_config = {
    "version": "v1",
    "config": {
        "visState": {
            "layers": [
                {
                    "id": "line_layer",
                    "type": "line",
                    "config": {
                        "dataId": "bike_trips",
                        "columns": {
                            "lat0": "lat_x",
                            "lng0": "lng_x",
                            "lat1": "lat_y",
                            "lng1": "lng_y"
                        },
                        "isVisible": True
                    }
                }
            ]
        }
    }
}

map_1.config = map_config
map_1.save_to_html(file_name='bike_routes_map.html')

User Guide: https://docs.kepler.gl/docs/keplergl-jupyter
Map saved to bike_routes_map.html!


In [74]:
# display HTML file
IFrame(src='bike_routes_map.html', width=800, height=700)

#### After looking at the map a pattern is clear that the most used routes are in the center and most population dense parts of the city. This makes sense additionally with the landmarks and tourist destinations in the area. If I were to do this exercise again or refine it. I would put the parameters I was looking for in the dataset before mapping it. For instance, filter the top 20 and bottom 20 routes and then map it for a clearer picture instead of random sampling. 

In [84]:
map_1.save_to_html(file_name='bike_routes_map.html')

Map saved to bike_routes_map.html!
