# Advanced Geospatial Plotting

## Import Libraries and Data

In [1]:
# Import libraries
import pandas as pd
import os
from keplergl import KeplerGl
from pyproj import CRS
import numpy as np
from matplotlib import pyplot as plt

In [2]:
# Import data
df = pd.read_csv('Data/Prepared Data/NY_merged_agg.csv', index_col = 0)

In [3]:
# Checking data frame
df.head()

Unnamed: 0,ride_id,rideable_type,start_time,end_time,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,date,avgTemp_C,avgTemp_F,daily_rides,_merge
0,115C78C3039FFA89,electric_bike,2022-01-01 09:21:14,2022-01-01 09:35:46,Essex Light Rail,JC038,Essex Light Rail,JC038,40.712774,-74.036486,40.712774,-74.036486,member,2022-01-01,11.6,52.88,592,both
1,7FFD810CAA7A919E,classic_bike,2022-01-01 02:43:56,2022-01-01 02:43:57,12 St & Sinatra Dr N,HB201,12 St & Sinatra Dr N,HB201,40.750604,-74.02402,40.750604,-74.02402,member,2022-01-01,11.6,52.88,592,both
2,E715E8432031B72C,classic_bike,2022-01-01 02:13:33,2022-01-01 02:18:42,Essex Light Rail,JC038,Washington St,JC098,40.712774,-74.036486,40.724294,-74.035483,member,2022-01-01,11.6,52.88,592,both
3,BF1B7B1E1961A87B,electric_bike,2022-01-01 17:18:46,2022-01-01 18:55:25,Grand St,JC102,W 27 St & 7 Ave,6247.06,40.715178,-74.037683,40.746647,-73.993915,casual,2022-01-01,11.6,52.88,592,both
4,4A01F0E53C6F4386,electric_bike,2022-01-01 11:23:32,2022-01-01 11:29:27,Christ Hospital,JC034,Hoboken Terminal - Hudson St & Hudson Pl,HB101,40.734786,-74.050444,40.735938,-74.030305,member,2022-01-01,11.6,52.88,592,both


## Data Preprocessing

In [4]:
# Create a value column and group by start and end station 
df['value'] = 1
df_group = df.groupby(['start_station_name', 'end_station_name'])['value'].count().reset_index()

In [5]:
# Checking grouped dataframe
df_group

Unnamed: 0,start_station_name,end_station_name,value
0,11 St & Washington St,11 St & Washington St,1132
1,11 St & Washington St,12 Ave & W 40 St,1
2,11 St & Washington St,12 St & Sinatra Dr N,253
3,11 St & Washington St,14 St Ferry - 14 St & Shipyard Ln,395
4,11 St & Washington St,4 St & Grand St,350
...,...,...,...
6948,York St & Marin Blvd,Van Vorst Park,18
6949,York St & Marin Blvd,Warren St,42
6950,York St & Marin Blvd,Washington St,16
6951,York St & Marin Blvd,Willow Ave & 12 St,1


In [6]:
# Checking results of aggregation to make sure nothing was lost
print(df_group['value'].sum())
print(df.shape)

892281
(895485, 19)


#### The sum of the value column does not match the total rows of the dataframe. There are likely missing values in the dataframe that explain this difference.

In [7]:
# Checking for missing values
df.isnull().sum()

ride_id                  0
rideable_type            0
start_time               0
end_time                 0
start_station_name      10
start_station_id        10
end_station_name      3204
end_station_id        3204
start_lat                0
start_lng                0
end_lat               1970
end_lng               1970
member_casual            0
date                     0
avgTemp_C                0
avgTemp_F                0
daily_rides              0
_merge                   0
value                    0
dtype: int64

In [8]:
# Creating new dataframe without missing values
df_nonull = df.dropna()

In [9]:
df_nonull.shape

(892281, 19)

#### After dropping the missing values, the sum of the value column in the grouped dataframe matches the total cleaned rows of the main dataframe.

In [10]:
# Renaming "value" column
df_group.rename(columns={'value':'trips'}, inplace=True)

In [11]:
# Merging latitude and longitude columns with grouped dataframe
df_merge = df_group.merge(df[['start_station_name', 'end_station_name', 'start_lat', 'start_lng', 'end_lat', 'end_lng']].drop_duplicates(), 
                          on=['start_station_name', 'end_station_name'], how='inner', indicator = 'merge')

In [12]:
# Checking merged dataframe
df_merge

Unnamed: 0,start_station_name,end_station_name,trips,start_lat,start_lng,end_lat,end_lng,merge
0,11 St & Washington St,11 St & Washington St,1132,40.749985,-74.027150,40.749985,-74.027150,both
1,11 St & Washington St,11 St & Washington St,1132,40.749985,-74.027150,40.749985,-74.027150,both
2,11 St & Washington St,11 St & Washington St,1132,40.750056,-74.027287,40.749985,-74.027150,both
3,11 St & Washington St,11 St & Washington St,1132,40.749973,-74.027456,40.749985,-74.027150,both
4,11 St & Washington St,11 St & Washington St,1132,40.750037,-74.027265,40.749985,-74.027150,both
...,...,...,...,...,...,...,...,...
120346,York St & Marin Blvd,Warren St,42,40.716627,-74.042243,40.721124,-74.038051,both
120347,York St & Marin Blvd,Warren St,42,40.716657,-74.042267,40.721124,-74.038051,both
120348,York St & Marin Blvd,Washington St,16,40.716615,-74.042412,40.724294,-74.035483,both
120349,York St & Marin Blvd,Willow Ave & 12 St,1,40.716615,-74.042412,40.751867,-74.030377,both


In [13]:
# Checking merge
df_merge['merge'].value_counts(dropna = False)

both          120351
left_only          0
right_only         0
Name: merge, dtype: int64

In [14]:
# Keeping only one instance of each start and end station pair
df_final = df_merge.drop_duplicates(subset=['start_station_name', 'end_station_name'])

In [15]:
# Checking df_final
df_final

Unnamed: 0,start_station_name,end_station_name,trips,start_lat,start_lng,end_lat,end_lng,merge
0,11 St & Washington St,11 St & Washington St,1132,40.749985,-74.027150,40.749985,-74.027150,both
122,11 St & Washington St,12 Ave & W 40 St,1,40.749985,-74.027150,40.760875,-74.002777,both
123,11 St & Washington St,12 St & Sinatra Dr N,253,40.749985,-74.027150,40.750604,-74.024020,both
142,11 St & Washington St,14 St Ferry - 14 St & Shipyard Ln,395,40.749985,-74.027150,40.752961,-74.024353,both
172,11 St & Washington St,4 St & Grand St,350,40.749985,-74.027150,40.742258,-74.035111,both
...,...,...,...,...,...,...,...,...
120343,York St & Marin Blvd,Van Vorst Park,18,40.716615,-74.042412,40.718489,-74.047727,both
120344,York St & Marin Blvd,Warren St,42,40.716615,-74.042412,40.721124,-74.038051,both
120348,York St & Marin Blvd,Washington St,16,40.716615,-74.042412,40.724294,-74.035483,both
120349,York St & Marin Blvd,Willow Ave & 12 St,1,40.716615,-74.042412,40.751867,-74.030377,both


In [16]:
# Dropping "merge" column from dataframe
df_final = df_final.drop(columns=['merge'])

## Plotting the Map

In [17]:
# Initializing KeplerGl instance
m = KeplerGl(height = 700, data={"data_1": df_final})
m

User Guide: https://docs.kepler.gl/docs/keplergl-jupyter


KeplerGl(data={'data_1':            start_station_name                   end_station_name  trips  \
0       11…

In [18]:
# Saving map to html file
m.save_to_html(file_name = 'NY CitiBike Trips Aggregated.html', read_only = False)

Map saved to NY CitiBike Trips Aggregated.html!
