Import necessary packages.

In [44]:
# Dataframes and numerical
import pandas as pd
import numpy as np

# Apache parquet files (to save space)
import pyarrow as pa
import pyarrow.parquet as pq

# Counter to measure progress of long script
from tqdm import tqdm_notebook

Load .parquet files into dataframe.

In [2]:
CB_Data = pq.read_table('CitiBike_data/202106-202205-citibike-tripdata.parquet').to_pandas()
CB_Stations = pq.read_table('CitiBike_data/202206-citibike-stations.parquet').to_pandas()

Check raw data of dataframes.

In [3]:
print(CB_Data.shape)
print(CB_Stations.shape)

(27380897, 18)
(1682, 4)


In [4]:
print(CB_Data.columns)
print(CB_Stations.columns)

Index(['rideable_type', 'started_at', 'ended_at', 'start_station_name',
       'end_station_name', 'start_lat', 'start_lng', 'end_lat', 'end_lng',
       'member_casual', 'year', 'month', 'week_of_year', 'day_of_week',
       'hour_of_day', 'duration_min', 'distance_mi', 'speed_mph'],
      dtype='object')
Index(['lat', 'lng', 'boro', 'hood'], dtype='object')


In [5]:
print(CB_Data.dtypes)
print(CB_Stations.dtypes)

rideable_type                 object
started_at            datetime64[ns]
ended_at              datetime64[ns]
start_station_name            object
end_station_name              object
start_lat                    float64
start_lng                    float64
end_lat                      float64
end_lng                      float64
member_casual                 object
year                           int64
month                          int64
week_of_year                   int64
day_of_week                    int64
hour_of_day                    int64
duration_min                 float64
distance_mi                  float64
speed_mph                    float64
dtype: object
lat     float64
lng     float64
boro     object
hood     object
dtype: object


In [6]:
CB_Data.head()

Unnamed: 0,rideable_type,started_at,ended_at,start_station_name,end_station_name,start_lat,start_lng,end_lat,end_lng,member_casual,year,month,week_of_year,day_of_week,hour_of_day,duration_min,distance_mi,speed_mph
0,Classic Bike,2021-06-01 23:12:34,2021-06-01 23:14:46,Driggs Ave & N 9 St,Bayard St & Leonard St,40.718169,-73.955201,40.719156,-73.948854,Member,2021,6,22,1,23,2.2,0.506033,13.800891
1,Classic Bike,2021-06-16 17:14:56,2021-06-16 17:29:15,Fulton St & Broadway,Mercer St & Spring St,40.711066,-74.009447,40.723627,-73.999496,Casual,2021,6,24,2,17,14.316667,1.553328,6.509873
2,Classic Bike,2021-06-07 19:41:55,2021-06-07 19:51:28,Devoe St & Lorimer St,Manhattan Av & Leonard St,40.713352,-73.949103,40.72084,-73.94844,Casual,2021,6,23,0,19,9.55,0.562419,3.533523
3,Electric Bike,2021-06-17 15:13:15,2021-06-17 15:33:25,Driggs Ave & N 9 St,Greenwich Ave & Charles St,40.718169,-73.955201,40.735238,-74.000271,Member,2021,6,24,3,15,20.166667,4.287591,12.756469
4,Electric Bike,2021-06-18 08:27:03,2021-06-18 08:53:37,Graham Ave & Conselyea St,E 30 St & Park Ave S,40.715143,-73.944507,40.744449,-73.983035,Member,2021,6,24,4,8,26.566667,4.680581,10.570947


In [7]:
CB_Stations.head()

Unnamed: 0,lat,lng,boro,hood
1 Ave & E 110 St,40.792327,-73.9383,Manhattan,East Harlem
1 Ave & E 16 St,40.732219,-73.981655,Manhattan,Stuyvesant Town
1 Ave & E 18 St,40.733812,-73.980544,Manhattan,Stuyvesant Town
1 Ave & E 30 St,40.741444,-73.975361,Manhattan,Gramercy
1 Ave & E 39 St,40.74714,-73.97113,Manhattan,Tudor City


In order to normalize all coordinates of the distinct stations as well as the travel distances, durations, and speeds between the various stations; the averaged latitudes and longitudes of the dataframe **CB_Stations** shall replace those provided in the **CB_Data** one in addition to assigning borough and neighboorhood associations.

In [8]:
CB_Data.index

Int64Index([       0,        1,        2,        3,        4,        5,
                   6,        7,        8,        9,
            ...
            29032973, 29032974, 29032975, 29032976, 29032977, 29032978,
            29032979, 29032980, 29032981, 29032982],
           dtype='int64', length=27380897)

In [17]:
CB_Data.started_at[29032982]

Timestamp('2022-05-09 18:47:28')

In [18]:
CB_Data.tail()

Unnamed: 0,rideable_type,started_at,ended_at,start_station_name,end_station_name,start_lat,start_lng,end_lat,end_lng,member_casual,year,month,week_of_year,day_of_week,hour_of_day,duration_min,distance_mi,speed_mph
29032978,Classic Bike,2022-05-15 07:57:48,2022-05-15 08:12:55,Broadway & W 36 St,West End Ave & W 60 St,40.750977,-73.987654,40.77237,-73.99005,Member,2022,5,19,6,7,15.116667,1.641414,6.514984
29032979,Classic Bike,2022-05-05 18:13:05,2022-05-05 18:20:10,Crescent St & 30 Ave,Vernon Blvd & 31 Ave,40.768692,-73.924957,40.769247,-73.93545,Member,2022,5,18,3,18,7.083333,0.762346,6.457523
29032980,Classic Bike,2022-05-28 00:12:09,2022-05-28 00:30:00,45 Ave & 21 St,Vernon Blvd & 31 Ave,40.747371,-73.947773,40.769247,-73.93545,Member,2022,5,21,5,0,17.85,2.359753,7.931944
29032981,Classic Bike,2022-05-19 13:06:36,2022-05-19 13:18:02,Crescent St & 30 Ave,46 St & 28 Ave,40.768692,-73.924957,40.763328,-73.908782,Member,2022,5,20,3,13,11.433333,1.486219,7.799398
29032982,Classic Bike,2022-05-09 18:47:28,2022-05-09 18:52:38,W 50 St & 9 Ave,West End Ave & W 60 St,40.763605,-73.98918,40.77237,-73.99005,Member,2022,5,19,0,18,5.166667,0.664866,7.721026


In [45]:
# First, initialize new columns in the CB_Data dataframe
CB_Data['start_boro'] = ''
CB_Data['start_hood'] = ''
CB_Data['end_boro'] = ''
CB_Data['end_hood'] = ''

# Set up loop to cycle through start_station_name and end_station_name to match values as described
for i in tqdm_notebook(CB_Data.index):
    CB_Data.start_lat[i] = CB_Stations['lat'][CB_Stations.index == CB_Data.start_station_name[i]].sum()
    CB_Data.start_lng[i] = CB_Stations['lng'][CB_Stations.index == CB_Data.start_station_name[i]].sum()
    CB_Data.end_lat[i] = CB_Stations['lat'][CB_Stations.index == CB_Data.end_station_name[i]].sum()
    CB_Data.end_lng[i] = CB_Stations['lng'][CB_Stations.index == CB_Data.end_station_name[i]].sum()
    CB_Data.start_boro[i] = CB_Stations['boro'][CB_Stations.index == CB_Data.start_station_name[i]].sum()
    CB_Data.start_hood[i] = CB_Stations['hood'][CB_Stations.index == CB_Data.start_station_name[i]].sum()
    CB_Data.end_boro[i] = CB_Stations['boro'][CB_Stations.index == CB_Data.end_station_name[i]].sum()
    CB_Data.end_hood[i] = CB_Stations['hood'][CB_Stations.index == CB_Data.end_station_name[i]].sum()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for i in tqdm_notebook(CB_Data.index):


  0%|          | 0/27380897 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  CB_Data.start_lat[i] = CB_Stations['lat'][CB_Stations.index == CB_Data.start_station_name[i]].sum()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  CB_Data.start_lng[i] = CB_Stations['lng'][CB_Stations.index == CB_Data.start_station_name[i]].sum()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  CB_Data.end_lat[i] = CB_Stations['lat'][CB_Stations.index == CB_Data.end_station_name[i]].sum()
A value is trying to be set on a copy of a slice from a DataFrame

See the cavea

KeyboardInterrupt: 

In [42]:
CB_Stations['lat'][CB_Stations.index == CB_Data.start_station_name[0]].sum()

40.718169860044384

In [26]:
CB_Stations.index

Index(['1 Ave & E 110 St', '1 Ave & E 16 St', '1 Ave & E 18 St',
       '1 Ave & E 30 St', '1 Ave & E 39 St', '1 Ave & E 44 St',
       '1 Ave & E 6 St', '1 Ave & E 62 St', '1 Ave & E 68 St',
       '1 Ave & E 78 St',
       ...
       'Windsor Pl & Howard Pl', 'Withers St & Kingsland Ave',
       'Wolcott St & Dwight St', 'Woodward Ave & Harman St',
       'Wyckoff Av & Jefferson St', 'Wyckoff Av & Stanhope St',
       'Wyckoff Ave & Gates Ave', 'Wyckoff St & Nevins St',
       'Wythe Ave & Metropolitan Ave', 'Yankee Ferry Terminal'],
      dtype='object', length=1682)

In [46]:
CB_Data.head()

Unnamed: 0,rideable_type,started_at,ended_at,start_station_name,end_station_name,start_lat,start_lng,end_lat,end_lng,member_casual,...,week_of_year,day_of_week,hour_of_day,duration_min,distance_mi,speed_mph,start_boro,start_hood,end_boro,end_hood
0,Classic Bike,2021-06-01 23:12:34,2021-06-01 23:14:46,Driggs Ave & N 9 St,Bayard St & Leonard St,40.71817,-73.955201,40.719156,-73.948855,Member,...,22,1,23,2.2,0.506033,13.800891,Brooklyn,Williamsburg,Brooklyn,Greenpoint
1,Classic Bike,2021-06-16 17:14:56,2021-06-16 17:29:15,Fulton St & Broadway,Mercer St & Spring St,40.711066,-74.009447,40.723627,-73.999496,Casual,...,24,2,17,14.316667,1.553328,6.509873,Manhattan,Financial District,Manhattan,SoHo
2,Classic Bike,2021-06-07 19:41:55,2021-06-07 19:51:28,Devoe St & Lorimer St,Manhattan Av & Leonard St,40.713352,-73.949103,40.72084,-73.94844,Casual,...,23,0,19,9.55,0.562419,3.533523,Brooklyn,Williamsburg,Brooklyn,Greenpoint
3,Electric Bike,2021-06-17 15:13:15,2021-06-17 15:33:25,Driggs Ave & N 9 St,Greenwich Ave & Charles St,40.71817,-73.955201,40.735238,-74.000271,Member,...,24,3,15,20.166667,4.287591,12.756469,Brooklyn,Williamsburg,Manhattan,Greenwich Village
4,Electric Bike,2021-06-18 08:27:03,2021-06-18 08:53:37,Graham Ave & Conselyea St,E 30 St & Park Ave S,40.715143,-73.944507,40.744449,-73.983035,Member,...,24,4,8,26.566667,4.680581,10.570947,Brooklyn,Williamsburg,Manhattan,Flatiron District


In [None]:
# df1 merge/join with df2, left join, on index column of cb stations and start/end stations of cb_data