# Goal of the ML project 
- Dataset: citybike sharing system

- Un-Supervisioned Analysis - Descriptive analysis
- Apply techniques:
    - Clustering
    - Outlier analysis or Pattern mining

# Data Selection

## Import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display


## Import Functions

### dslabs_functions

In [2]:
%run "scripts/dslabs_functions.py"

dslabs_functions lodaded


### data functions

In [3]:
%run "scripts/data_functions.py"


dslabs_functions lodaded
data_functions lodaded


## Load Files 

### Weather file - New York
- Weather info, daily and hourly, from NewYork for the year 2024
    - info: rain, precipitation, temperature,...
- This weather info was extracted from OpenMeteo
    - Link here - https://open-meteo.com/en/docs/historical-weather-api#latitude=40.7143&longitude=-74.006&start_date=2019-01-01&end_date=2019-12-31&hourly=temperature_2m,relative_humidity_2m,dew_point_2m,apparent_temperature,precipitation,rain,snowfall,snow_depth,weather_code,pressure_msl,surface_pressure,cloud_cover,cloud_cover_low,cloud_cover_mid,cloud_cover_high,et0_fao_evapotranspiration,vapour_pressure_deficit,wind_speed_10m,wind_speed_100m,wind_direction_10m,wind_direction_100m,wind_gusts_10m,soil_temperature_0_to_7cm,soil_temperature_7_to_28cm,soil_temperature_28_to_100cm,soil_temperature_100_to_255cm,soil_moisture_0_to_7cm,soil_moisture_7_to_28cm,soil_moisture_28_to_100cm,soil_moisture_100_to_255cm&daily=&timezone=America%2FNew_York&models=
    

In [4]:
filepath_weather_ny_h = r'data/open-meteo-new-york-hourly-2024.csv'
#filepath_weather_ny_d = r'data/open-meteo-new-york-daily-2024.csv'

In [5]:
df_weather_ny_h = pd.read_csv(filepath_weather_ny_h,sep=';', na_values="")
#df_weather_ny_d = pd.read_csv(filepath_weather_ny_d, sep =';', na_values="")

display(df_weather_ny_h.head(5))
display(df_weather_ny_h.shape)
#display(df_weather_ny_d.head(5))    
#display(df_weather_ny_d.shape)


Unnamed: 0,time,temperature_2m (°C),relative_humidity_2m (%),dew_point_2m (°C),apparent_temperature (°C),precipitation (mm),rain (mm),snowfall (cm),snow_depth (m),weather_code (wmo code),...,cloud_cover_low (%),cloud_cover_mid (%),cloud_cover_high (%),et0_fao_evapotranspiration (mm),vapour_pressure_deficit (kPa),wind_speed_10m (km/h),wind_speed_100m (km/h),wind_direction_10m (°),wind_direction_100m (°),wind_gusts_10m (km/h)
0,2024-01-01T00:00,1.6,76,-2.3,-2.2,0.0,0.0,0.0,0.0,3,...,94,100,0,0.01,0.17,8.9,20.4,243,256,13.7
1,2024-01-01T01:00,2.6,74,-1.6,-1.4,0.0,0.0,0.0,0.0,3,...,100,100,0,0.01,0.19,11.4,20.9,235,246,18.4
2,2024-01-01T02:00,2.7,74,-1.4,-1.1,0.0,0.0,0.0,0.0,3,...,100,100,0,0.01,0.19,9.7,18.3,239,248,17.6
3,2024-01-01T03:00,2.5,76,-1.3,-1.0,0.0,0.0,0.0,0.0,2,...,27,52,0,0.01,0.18,8.1,16.5,249,259,15.1
4,2024-01-01T04:00,0.5,88,-1.2,-2.9,0.0,0.0,0.0,0.0,3,...,5,90,0,0.0,0.07,7.5,14.5,253,264,12.2


(8784, 23)

### Main file - city bike share system
- Data on each ride in the citibyke share system in New York, between the period of 1/4/24 and /31/6/24.
    - Contains info on:
        - Start date of the trip
        - End date of the trip
        - Start station
        - End station
        - Start longitude and latitude
        - end longitude and latitude
        - rideable_bike - classical vs eletric
        - type of user (member or casual)
            - member: subscribers
                - Anual membership: Users who pay an annual fee to use the Citi Bike system.
                    -> These users are most likely local residents of New York.
            - casual: Non-subscribers
                - "Single Ride" and "Day pass": Users who choose short-term options and do not use Citi Bike as frequently as annual members.
                    -> These users are most likely not locals. They are tourists or visitors who come to New York occasionally for business or personal reasons.
    - Data provided on citybike website
        - link here - https://citibikenyc.com/system-data

In [6]:
filepath = 'data/2024-citibike-tripdata/202404-citibike-tripdata.csv'

test_data = False

# test_data = False
 
# Load the data

if test_data:
    df = pd.read_csv(filepath, na_values="")
    df: Dataframe = df.sample(frac = 0.1,sep = ',', replace = False)
else:
    df = pd.read_csv(filepath, sep =',', na_values="")

display(df)

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,F561526822C9D60B,electric_bike,2024-04-27 13:56:13.940,2024-04-27 14:05:23.629,FDR Drive & E 35 St,6230.04,E 10 St & 2 Ave,5746.02,40.743955,-73.971391,40.729708,-73.986598,member
1,359BAF91507F4998,electric_bike,2024-04-25 15:23:14.529,2024-04-25 15:27:52.895,Forsyth St & Grand St,5382.07,E 10 St & 2 Ave,5746.02,40.717741,-73.993388,40.729708,-73.986598,member
2,AAEE95A1C0106C97,electric_bike,2024-04-06 11:15:18.132,2024-04-06 11:22:10.081,E 20 St & 2 Ave,5971.08,Mott St & Prince St,5561.04,40.735790,-73.981693,40.723180,-73.994800,member
3,95B077C9C619D404,electric_bike,2024-04-06 16:19:25.749,2024-04-06 16:21:43.098,Eastern Pkwy & Washington Ave,3928.08,Eastern Pkwy & Franklin Ave (SW Corner),3919.12,40.671649,-73.963115,40.670529,-73.958222,member
4,1A33C864454C4692,electric_bike,2024-04-10 17:40:14.700,2024-04-10 17:48:11.571,W 27 St & 6 Ave,6215.07,E 25 St & 1 Ave,6004.07,40.745446,-73.990591,40.738177,-73.977387,member
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3217058,BE0F887BE16DFDDA,electric_bike,2024-04-16 07:47:35.966,2024-04-16 07:54:57.079,Atlantic Ave & Furman St,4614.04,Clark St & Henry St,4789.03,40.691736,-74.000174,40.697601,-73.993446,member
3217059,850F33001EDD1AE4,classic_bike,2024-04-10 08:45:16.500,2024-04-10 08:51:55.652,E 63 St & 3 Ave,6830.02,E 44 St & Lexington Ave,6464.09,40.763954,-73.964600,40.752643,-73.974996,member
3217060,CEE1CDE84344E8FF,electric_bike,2024-04-09 17:44:45.398,2024-04-09 18:03:46.293,Atlantic Ave & Furman St,4614.04,E 5 St & Cooper Sq,5712.12,40.691729,-74.000147,40.727690,-73.990993,member
3217061,174E104B28274EE1,classic_bike,2024-04-03 14:42:57.301,2024-04-03 14:51:34.967,W 48 St & Rockefeller Plaza,6626.11,W 20 St & 5 Ave,6098.02,40.757769,-73.979294,40.739730,-73.991040,member


In [27]:
import os
import pandas as pd

# Diretório onde os ficheiros estão armazenados
directory = 'data/2024-citibike-tripdata'

# Lista para armazenar os DataFrames
dfs = []

# Iterar sobre todos os ficheiros no diretório
for filename in os.listdir(directory):
    if filename.endswith(".csv"):
        filepath = os.path.join(directory, filename)
        # Ler o ficheiro e adicioná-lo à lista
        df = pd.read_csv(filepath, sep=',')  # Ajuste 'sep' se necessário
        dfs.append(df)

# Combinar todos os DataFrames em um só
df = pd.concat(dfs, ignore_index=True)

# Salvar o DataFrame combinado em um único ficheiro, se necessário
df.to_csv('data/2024-citibike-tripdata.csv', index=False)

In [7]:
display(df.head())

display(df.shape)

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,F561526822C9D60B,electric_bike,2024-04-27 13:56:13.940,2024-04-27 14:05:23.629,FDR Drive & E 35 St,6230.04,E 10 St & 2 Ave,5746.02,40.743955,-73.971391,40.729708,-73.986598,member
1,359BAF91507F4998,electric_bike,2024-04-25 15:23:14.529,2024-04-25 15:27:52.895,Forsyth St & Grand St,5382.07,E 10 St & 2 Ave,5746.02,40.717741,-73.993388,40.729708,-73.986598,member
2,AAEE95A1C0106C97,electric_bike,2024-04-06 11:15:18.132,2024-04-06 11:22:10.081,E 20 St & 2 Ave,5971.08,Mott St & Prince St,5561.04,40.73579,-73.981693,40.72318,-73.9948,member
3,95B077C9C619D404,electric_bike,2024-04-06 16:19:25.749,2024-04-06 16:21:43.098,Eastern Pkwy & Washington Ave,3928.08,Eastern Pkwy & Franklin Ave (SW Corner),3919.12,40.671649,-73.963115,40.670529,-73.958222,member
4,1A33C864454C4692,electric_bike,2024-04-10 17:40:14.700,2024-04-10 17:48:11.571,W 27 St & 6 Ave,6215.07,E 25 St & 1 Ave,6004.07,40.745446,-73.990591,40.738177,-73.977387,member


(3217063, 13)

## Sampling

In [11]:
test_data = True

# test_data = False
 
# Load the data
filepath = 'data/2024-citibike-tripdata.csv'
if test_data:
    df = pd.read_csv(filepath, na_values="")
    df: DataFrame = df_main.sample(frac = 0.08, replace = False)
else:
    df = pd.read_csv(filepath, na_values="")


NameError: name 'df_main' is not defined

In [52]:
display(df.head())

display(df.shape)

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
2392250,C728387DDECD1AC2,electric_bike,2024-04-24 14:19:58.746,2024-04-24 14:28:09.526,1 Ave & E 6 St,5626.15,E 20 St & FDR Dr,5886.13,40.726291,-73.986199,40.733209,-73.975681,member
2032313,A1D1FE09F828B2DA,classic_bike,2024-04-21 15:06:45.330,2024-04-21 15:09:05.198,Murray St & West St,5329.08,West St & Chambers St,5329.03,40.71503,-74.01246,40.717548,-74.013221,member
2367996,740D668B114A8EBC,electric_bike,2024-04-25 08:10:14.145,2024-04-25 08:13:32.561,Bergen St & Smith St,4446.01,Fulton St & Adams St,4637.06,40.686799,-73.990757,40.692418,-73.989495,member
1498554,B01091CCC040B0D9,classic_bike,2024-04-27 14:58:29.716,2024-04-27 15:09:35.034,S 4 St & Roebling St,5195.06,Driggs Ave & N 9 St,5411.08,40.710709,-73.959724,40.71817,-73.955201,casual
2732486,8CC6D98B34E72A0E,electric_bike,2024-04-09 13:08:29.289,2024-04-09 13:23:03.929,Broadway & W 58 St,6948.1,MacDougal St & Washington Sq,5797.01,40.766367,-73.981773,40.732264,-73.998522,member


(25736, 13)

# Data Cleaning/Transformation

- Feature Engineering steps
    - In this project, we have the main dataset (df) with data for each ride in citybike system. In order to enrich the analysis, we decided to cross join this data with hourly meteorological conditions (from openMeteo), to understand how the weather may influence the rides in citybike.
- Add new features
    -   Spacial:
        - Start station borough (bronx, queens, manathan,...)
        - End station borough (bronx, queens, manathan,...)

    - Time 
        - is_weekend (0 or 1)
        - time_of_day - lunch time, early afternoon, evening..
        - day_of_week - staruday, friday, thursday
        - hour
    - Ride Information:
        - ride_duration - The total duration of the ride in seconds.
        - ride_avg_speed -  The average speed of the ride calculated using the geodesic distance and ride duration (km/h).
    - Weather
        - weather_desc: The weather conditions during the ride (e.g., Overcast, Clear sky, Rainy).
        rain: The amount of rain (in mm) during the ride.
        - temperature: The temperature (in °C) during the ride.
        - .....
- Rename original cols from main df - citi bike
- Remove id columns: make sure to remove id columns from the df

## Add more info on the borough (queens, manhathan, bronx,...) of the stations 

In [29]:
# Get list of distinct values in start_Station_name end_station_name and save them into csv
df_unique_start_station = df["start_station_name"].unique()

df_start_stations = pd.DataFrame(df_unique_start_station, columns = ["start_station_name"]).dropna()

df_start_stations.to_csv('data/df_start_stations.csv', index=False, sep = ';')

display(df_start_stations)

df_unique_end_station = df["end_station_name"].unique()

df_end_stations = pd.DataFrame(df_unique_end_station, columns = ["end_station_name"]).dropna()

df_end_stations.to_csv('data/df_end_stations.csv', index=False, sep = ';')

display(df_end_stations)


Unnamed: 0,start_station_name
0,FDR Drive & E 35 St
1,Forsyth St & Grand St
2,E 20 St & 2 Ave
3,Eastern Pkwy & Washington Ave
4,W 27 St & 6 Ave
...,...
2125,Riverside Dr E & W 155 St
2126,Lab - NYC
2127,Madison St & 10 St
2128,Hilltop


Unnamed: 0,end_station_name
0,E 10 St & 2 Ave
1,Mott St & Prince St
2,Eastern Pkwy & Franklin Ave (SW Corner)
3,E 25 St & 1 Ave
4,Columbia Heights & Cranberry St
...,...
2170,Newark Ave
2171,Faile St & Garrison Ave
2172,JC Medical Center
2173,McGinley Square


#### Get start_station boroughs list from csv file

In [8]:
# Load the data
filepath = 'data/df_start_stations_boroughs.csv'
df_start_stations_boroughs = pd.read_csv(filepath,sep=';', na_values="")

df_start_stations_boroughs = df_start_stations_boroughs.rename(columns = {'start_station_borough':'start_borough'})

display(df_start_stations_boroughs)

Unnamed: 0,start_station_name,start_borough
0,FDR Drive & E 35 St,Manhattan
1,Forsyth St & Grand St,Manhattan
2,E 20 St & 2 Ave,Manhattan
3,Eastern Pkwy & Washington Ave,Brooklyn
4,W 27 St & 6 Ave,Manhattan
...,...,...
2124,Riverside Dr E & W 155 St,Manhattan
2125,Lab - NYC,Manhattan
2126,Madison St & 10 St,Brooklyn
2127,Hilltop,Staten Island


### Get end_station boroughs list from csv file

In [9]:
# Load the data
filepath = 'data/df_end_stations_boroughs.csv'
df_end_stations_boroughs = pd.read_csv(filepath,sep=';', na_values="")

df_end_stations_boroughs.rename(columns = {'end_station_borough':'end_borough'}, inplace = True)

display(df_end_stations_boroughs)

Unnamed: 0,end_station_name,end_borough
0,Prospect Ave & Longwood Ave,Bronx
1,E 161 St & River Ave,Bronx
2,Southern Blvd & Ave St John,Bronx
3,Melrose Ave & E 150 St,Bronx
4,E Burnside Ave & Ryer Ave,Bronx
...,...,...
2169,49 St & 25 Ave,Queens
2170,Corona Ave & 102 St,Queens
2171,30 Ave & 80 St,Queens
2172,Sunken Meadow Comfort Station,Staten Island


### Merge with main df to get boroughs data

In [10]:

df_merged_with_end = df.merge(df_start_stations_boroughs, on = 'start_station_name', how = 'left')

df_merged_boroughs = df_merged_with_end.merge(df_end_stations_boroughs, on = 'end_station_name', how = 'left')

display(df_merged_boroughs)



Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,start_borough,end_borough
0,F561526822C9D60B,electric_bike,2024-04-27 13:56:13.940,2024-04-27 14:05:23.629,FDR Drive & E 35 St,6230.04,E 10 St & 2 Ave,5746.02,40.743955,-73.971391,40.729708,-73.986598,member,Manhattan,Manhattan
1,359BAF91507F4998,electric_bike,2024-04-25 15:23:14.529,2024-04-25 15:27:52.895,Forsyth St & Grand St,5382.07,E 10 St & 2 Ave,5746.02,40.717741,-73.993388,40.729708,-73.986598,member,Manhattan,Manhattan
2,AAEE95A1C0106C97,electric_bike,2024-04-06 11:15:18.132,2024-04-06 11:22:10.081,E 20 St & 2 Ave,5971.08,Mott St & Prince St,5561.04,40.735790,-73.981693,40.723180,-73.994800,member,Manhattan,Manhattan
3,95B077C9C619D404,electric_bike,2024-04-06 16:19:25.749,2024-04-06 16:21:43.098,Eastern Pkwy & Washington Ave,3928.08,Eastern Pkwy & Franklin Ave (SW Corner),3919.12,40.671649,-73.963115,40.670529,-73.958222,member,Brooklyn,Brooklyn
4,1A33C864454C4692,electric_bike,2024-04-10 17:40:14.700,2024-04-10 17:48:11.571,W 27 St & 6 Ave,6215.07,E 25 St & 1 Ave,6004.07,40.745446,-73.990591,40.738177,-73.977387,member,Manhattan,Manhattan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3217058,BE0F887BE16DFDDA,electric_bike,2024-04-16 07:47:35.966,2024-04-16 07:54:57.079,Atlantic Ave & Furman St,4614.04,Clark St & Henry St,4789.03,40.691736,-74.000174,40.697601,-73.993446,member,Brooklyn,Brooklyn
3217059,850F33001EDD1AE4,classic_bike,2024-04-10 08:45:16.500,2024-04-10 08:51:55.652,E 63 St & 3 Ave,6830.02,E 44 St & Lexington Ave,6464.09,40.763954,-73.964600,40.752643,-73.974996,member,Manhattan,Manhattan
3217060,CEE1CDE84344E8FF,electric_bike,2024-04-09 17:44:45.398,2024-04-09 18:03:46.293,Atlantic Ave & Furman St,4614.04,E 5 St & Cooper Sq,5712.12,40.691729,-74.000147,40.727690,-73.990993,member,Brooklyn,Manhattan
3217061,174E104B28274EE1,classic_bike,2024-04-03 14:42:57.301,2024-04-03 14:51:34.967,W 48 St & Rockefeller Plaza,6626.11,W 20 St & 5 Ave,6098.02,40.757769,-73.979294,40.739730,-73.991040,member,,Manhattan


 Check if we have obserrvations with null values for start_station_borough and end_station_borough

In [11]:
# Filtra as linhas onde a coluna 'start_station_borough' ou 'end_station_borough' está vazia (NaN)
df_merged_boroughs_nulls = df_merged_boroughs[df_merged_boroughs["start_borough"].isnull() | df_merged_boroughs["end_borough"].isnull()]

# Exibe o DataFrame com as linhas filtradas
display(df_merged_boroughs_nulls)


Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,start_borough,end_borough
36,E5A3ED9BE3FD7587,electric_bike,2024-04-11 12:37:32.437,2024-04-11 14:46:17.101,W 27 St & 6 Ave,6215.07,,,40.745293,-73.990210,40.820000,-73.960000,member,Manhattan,
39,81444091ACF3B22B,electric_bike,2024-04-25 13:35:26.343,2024-04-25 14:56:20.044,Forsyth St & Grand St,5382.07,,,40.717665,-73.993287,40.760000,-73.980000,casual,Manhattan,
40,974382208A82A0E4,electric_bike,2024-04-25 18:07:33.803,2024-04-25 19:18:29.748,Cleveland Pl & Spring St,5492.05,,,40.725142,-73.998888,40.710000,-73.980000,casual,Manhattan,
41,652155A253C6605D,electric_bike,2024-04-25 00:03:07.359,2024-04-25 01:10:29.453,Eastern Pkwy & Washington Ave,3928.08,,,40.671740,-73.963483,40.680000,-73.950000,casual,Brooklyn,
42,06B12DA029ECE28C,electric_bike,2024-04-25 19:57:47.547,2024-04-25 23:41:41.747,E 40 St & 5 Ave,6474.11,,,40.752147,-73.981002,40.640000,-73.940000,casual,Manhattan,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3217024,014D41C5041A5B44,classic_bike,2024-04-11 21:48:28.758,2024-04-11 21:55:49.231,W 48 St & Rockefeller Plaza,6626.11,E 44 St & Lexington Ave,6464.09,40.757769,-73.979294,40.752643,-73.974996,member,,Manhattan
3217026,F01B7ECE8562D3A5,classic_bike,2024-04-15 18:02:19.066,2024-04-15 18:42:02.681,W 48 St & Rockefeller Plaza,6626.11,Clark St & Henry St,4789.03,40.757769,-73.979294,40.697601,-73.993446,member,,Brooklyn
3217052,981E6F4927784D23,classic_bike,2024-04-10 15:42:12.949,2024-04-10 15:52:15.784,W 48 St & Rockefeller Plaza,6626.11,W 20 St & 5 Ave,6098.02,40.757769,-73.979294,40.739730,-73.991040,member,,Manhattan
3217053,4550E3DECA824108,classic_bike,2024-04-30 14:58:14.296,2024-04-30 15:02:37.302,W 48 St & Rockefeller Plaza,6626.11,E 44 St & Lexington Ave,6464.09,40.757769,-73.979294,40.752643,-73.974996,member,,Manhattan


In [12]:
df_merged_boroughs_nulls = df_merged_boroughs[df_merged_boroughs["start_borough"].isnull() | df_merged_boroughs["end_borough"].isnull()]

display(df_merged_boroughs_nulls)

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,start_borough,end_borough
36,E5A3ED9BE3FD7587,electric_bike,2024-04-11 12:37:32.437,2024-04-11 14:46:17.101,W 27 St & 6 Ave,6215.07,,,40.745293,-73.990210,40.820000,-73.960000,member,Manhattan,
39,81444091ACF3B22B,electric_bike,2024-04-25 13:35:26.343,2024-04-25 14:56:20.044,Forsyth St & Grand St,5382.07,,,40.717665,-73.993287,40.760000,-73.980000,casual,Manhattan,
40,974382208A82A0E4,electric_bike,2024-04-25 18:07:33.803,2024-04-25 19:18:29.748,Cleveland Pl & Spring St,5492.05,,,40.725142,-73.998888,40.710000,-73.980000,casual,Manhattan,
41,652155A253C6605D,electric_bike,2024-04-25 00:03:07.359,2024-04-25 01:10:29.453,Eastern Pkwy & Washington Ave,3928.08,,,40.671740,-73.963483,40.680000,-73.950000,casual,Brooklyn,
42,06B12DA029ECE28C,electric_bike,2024-04-25 19:57:47.547,2024-04-25 23:41:41.747,E 40 St & 5 Ave,6474.11,,,40.752147,-73.981002,40.640000,-73.940000,casual,Manhattan,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3217024,014D41C5041A5B44,classic_bike,2024-04-11 21:48:28.758,2024-04-11 21:55:49.231,W 48 St & Rockefeller Plaza,6626.11,E 44 St & Lexington Ave,6464.09,40.757769,-73.979294,40.752643,-73.974996,member,,Manhattan
3217026,F01B7ECE8562D3A5,classic_bike,2024-04-15 18:02:19.066,2024-04-15 18:42:02.681,W 48 St & Rockefeller Plaza,6626.11,Clark St & Henry St,4789.03,40.757769,-73.979294,40.697601,-73.993446,member,,Brooklyn
3217052,981E6F4927784D23,classic_bike,2024-04-10 15:42:12.949,2024-04-10 15:52:15.784,W 48 St & Rockefeller Plaza,6626.11,W 20 St & 5 Ave,6098.02,40.757769,-73.979294,40.739730,-73.991040,member,,Manhattan
3217053,4550E3DECA824108,classic_bike,2024-04-30 14:58:14.296,2024-04-30 15:02:37.302,W 48 St & Rockefeller Plaza,6626.11,E 44 St & Lexington Ave,6464.09,40.757769,-73.979294,40.752643,-73.974996,member,,Manhattan


- We have 23k of obersvations where we werent able to find the boroughs for the start and end stations. We will drop these observations in next step

### Drop 23k rows with null values for start_station_borough end_station_borough
- Reason of drop: 23k in a dataset of 3millions rows is irrelevant, and that way we can clean out observations that we weren't able to get borough name

In [13]:
df_merged_boroughs = df_merged_boroughs.dropna()
df = df_merged_boroughs

In [14]:
display(df)

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,start_borough,end_borough
0,F561526822C9D60B,electric_bike,2024-04-27 13:56:13.940,2024-04-27 14:05:23.629,FDR Drive & E 35 St,6230.04,E 10 St & 2 Ave,5746.02,40.743955,-73.971391,40.729708,-73.986598,member,Manhattan,Manhattan
1,359BAF91507F4998,electric_bike,2024-04-25 15:23:14.529,2024-04-25 15:27:52.895,Forsyth St & Grand St,5382.07,E 10 St & 2 Ave,5746.02,40.717741,-73.993388,40.729708,-73.986598,member,Manhattan,Manhattan
2,AAEE95A1C0106C97,electric_bike,2024-04-06 11:15:18.132,2024-04-06 11:22:10.081,E 20 St & 2 Ave,5971.08,Mott St & Prince St,5561.04,40.735790,-73.981693,40.723180,-73.994800,member,Manhattan,Manhattan
3,95B077C9C619D404,electric_bike,2024-04-06 16:19:25.749,2024-04-06 16:21:43.098,Eastern Pkwy & Washington Ave,3928.08,Eastern Pkwy & Franklin Ave (SW Corner),3919.12,40.671649,-73.963115,40.670529,-73.958222,member,Brooklyn,Brooklyn
4,1A33C864454C4692,electric_bike,2024-04-10 17:40:14.700,2024-04-10 17:48:11.571,W 27 St & 6 Ave,6215.07,E 25 St & 1 Ave,6004.07,40.745446,-73.990591,40.738177,-73.977387,member,Manhattan,Manhattan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3217057,1E488FC06367D630,classic_bike,2024-04-14 04:18:07.827,2024-04-14 04:19:59.776,Henry St & Middagh St,4861.05,Clark St & Henry St,4789.03,40.700300,-73.991581,40.697601,-73.993446,member,Brooklyn,Brooklyn
3217058,BE0F887BE16DFDDA,electric_bike,2024-04-16 07:47:35.966,2024-04-16 07:54:57.079,Atlantic Ave & Furman St,4614.04,Clark St & Henry St,4789.03,40.691736,-74.000174,40.697601,-73.993446,member,Brooklyn,Brooklyn
3217059,850F33001EDD1AE4,classic_bike,2024-04-10 08:45:16.500,2024-04-10 08:51:55.652,E 63 St & 3 Ave,6830.02,E 44 St & Lexington Ave,6464.09,40.763954,-73.964600,40.752643,-73.974996,member,Manhattan,Manhattan
3217060,CEE1CDE84344E8FF,electric_bike,2024-04-09 17:44:45.398,2024-04-09 18:03:46.293,Atlantic Ave & Furman St,4614.04,E 5 St & Cooper Sq,5712.12,40.691729,-74.000147,40.727690,-73.990993,member,Brooklyn,Manhattan


In [36]:
display(df)
display(df.shape)

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,start_borough,end_borough
0,F561526822C9D60B,electric_bike,2024-04-27 13:56:13.940,2024-04-27 14:05:23.629,FDR Drive & E 35 St,6230.04,E 10 St & 2 Ave,5746.02,40.743955,-73.971391,40.729708,-73.986598,member,Manhattan,Manhattan
1,359BAF91507F4998,electric_bike,2024-04-25 15:23:14.529,2024-04-25 15:27:52.895,Forsyth St & Grand St,5382.07,E 10 St & 2 Ave,5746.02,40.717741,-73.993388,40.729708,-73.986598,member,Manhattan,Manhattan
2,AAEE95A1C0106C97,electric_bike,2024-04-06 11:15:18.132,2024-04-06 11:22:10.081,E 20 St & 2 Ave,5971.08,Mott St & Prince St,5561.04,40.735790,-73.981693,40.723180,-73.994800,member,Manhattan,Manhattan
3,95B077C9C619D404,electric_bike,2024-04-06 16:19:25.749,2024-04-06 16:21:43.098,Eastern Pkwy & Washington Ave,3928.08,Eastern Pkwy & Franklin Ave (SW Corner),3919.12,40.671649,-73.963115,40.670529,-73.958222,member,Brooklyn,Brooklyn
4,1A33C864454C4692,electric_bike,2024-04-10 17:40:14.700,2024-04-10 17:48:11.571,W 27 St & 6 Ave,6215.07,E 25 St & 1 Ave,6004.07,40.745446,-73.990591,40.738177,-73.977387,member,Manhattan,Manhattan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3217057,1E488FC06367D630,classic_bike,2024-04-14 04:18:07.827,2024-04-14 04:19:59.776,Henry St & Middagh St,4861.05,Clark St & Henry St,4789.03,40.700300,-73.991581,40.697601,-73.993446,member,Brooklyn,Brooklyn
3217058,BE0F887BE16DFDDA,electric_bike,2024-04-16 07:47:35.966,2024-04-16 07:54:57.079,Atlantic Ave & Furman St,4614.04,Clark St & Henry St,4789.03,40.691736,-74.000174,40.697601,-73.993446,member,Brooklyn,Brooklyn
3217059,850F33001EDD1AE4,classic_bike,2024-04-10 08:45:16.500,2024-04-10 08:51:55.652,E 63 St & 3 Ave,6830.02,E 44 St & Lexington Ave,6464.09,40.763954,-73.964600,40.752643,-73.974996,member,Manhattan,Manhattan
3217060,CEE1CDE84344E8FF,electric_bike,2024-04-09 17:44:45.398,2024-04-09 18:03:46.293,Atlantic Ave & Furman St,4614.04,E 5 St & Cooper Sq,5712.12,40.691729,-74.000147,40.727690,-73.990993,member,Brooklyn,Manhattan


(3193597, 15)

## Add points of interest (turist spots) near the stations or for each borough
- Check this page. you have infor about land use by each block of the city
    https://zola.planning.nyc.gov/l/zoning-district/C6-2A?search=false#12.59/40.73073/-73.9305

## Add more time variables from data variable - started_at

In [15]:
# Convert started_at to date time column
df['started_at'] = pd.to_datetime(df['started_at'])
display(df.head())
df.dtypes

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,start_borough,end_borough
0,F561526822C9D60B,electric_bike,2024-04-27 13:56:13.940,2024-04-27 14:05:23.629,FDR Drive & E 35 St,6230.04,E 10 St & 2 Ave,5746.02,40.743955,-73.971391,40.729708,-73.986598,member,Manhattan,Manhattan
1,359BAF91507F4998,electric_bike,2024-04-25 15:23:14.529,2024-04-25 15:27:52.895,Forsyth St & Grand St,5382.07,E 10 St & 2 Ave,5746.02,40.717741,-73.993388,40.729708,-73.986598,member,Manhattan,Manhattan
2,AAEE95A1C0106C97,electric_bike,2024-04-06 11:15:18.132,2024-04-06 11:22:10.081,E 20 St & 2 Ave,5971.08,Mott St & Prince St,5561.04,40.73579,-73.981693,40.72318,-73.9948,member,Manhattan,Manhattan
3,95B077C9C619D404,electric_bike,2024-04-06 16:19:25.749,2024-04-06 16:21:43.098,Eastern Pkwy & Washington Ave,3928.08,Eastern Pkwy & Franklin Ave (SW Corner),3919.12,40.671649,-73.963115,40.670529,-73.958222,member,Brooklyn,Brooklyn
4,1A33C864454C4692,electric_bike,2024-04-10 17:40:14.700,2024-04-10 17:48:11.571,W 27 St & 6 Ave,6215.07,E 25 St & 1 Ave,6004.07,40.745446,-73.990591,40.738177,-73.977387,member,Manhattan,Manhattan


ride_id                       object
rideable_type                 object
started_at            datetime64[ns]
ended_at                      object
start_station_name            object
start_station_id              object
end_station_name              object
end_station_id                object
start_lat                    float64
start_lng                    float64
end_lat                      float64
end_lng                      float64
member_casual                 object
start_borough                 object
end_borough                   object
dtype: object

In [17]:
# Split the start_at column to have more information and detail on the:
# - day_of_month
# - day_ok_week 
# - is_weekend
# - time_of_day
# - day
# -hour
#As the dataset only as data from s sepecific month of 2020, we don't need to get the year, quarter and month.

df['day_of_month'] = df['started_at'].dt.day
df['hour'] = pd.to_datetime(df['started_at']).dt.hour
df['day_of_week'] = df['started_at'].dt.day_name()
df['is_weekend'] = df['started_at'].dt.weekday.apply(lambda x: 1 if x>=5 else 0)

#Function to categorize time of day based on hour

def time_of_day(hour):
    if 6 <= hour < 12:
        return 'morning'
    elif 12 <= hour < 17:
        return 'afternoon'
    elif 17 <= hour < 21:
        return 'evening'
    else:
        return 'night'

df['time_of_day'] = df['hour'].apply(time_of_day)


#def time_of_day(hour):
#    if hour >= 6 and hour < 9:
#        return 'early morning'
#    elif hour >= 9 and hour < 12:
#        return 'late morning'
#    elif hour >=12 and hour < 14:
#        return 'lunch time'
#    elif hour >=14 and hour < 17:
#        return 'afternoon'
#    elif hour >=17 and hour < 21:
#        return 'evening'
#    else:
#        return 'night'

display(df.head())

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,start_borough,end_borough,day_of_month,hour,day_of_week,is_weekend,time_of_day
0,F561526822C9D60B,electric_bike,2024-04-27 13:56:13.940,2024-04-27 14:05:23.629,FDR Drive & E 35 St,6230.04,E 10 St & 2 Ave,5746.02,40.743955,-73.971391,40.729708,-73.986598,member,Manhattan,Manhattan,27,13,Saturday,1,afternoon
1,359BAF91507F4998,electric_bike,2024-04-25 15:23:14.529,2024-04-25 15:27:52.895,Forsyth St & Grand St,5382.07,E 10 St & 2 Ave,5746.02,40.717741,-73.993388,40.729708,-73.986598,member,Manhattan,Manhattan,25,15,Thursday,0,afternoon
2,AAEE95A1C0106C97,electric_bike,2024-04-06 11:15:18.132,2024-04-06 11:22:10.081,E 20 St & 2 Ave,5971.08,Mott St & Prince St,5561.04,40.73579,-73.981693,40.72318,-73.9948,member,Manhattan,Manhattan,6,11,Saturday,1,morning
3,95B077C9C619D404,electric_bike,2024-04-06 16:19:25.749,2024-04-06 16:21:43.098,Eastern Pkwy & Washington Ave,3928.08,Eastern Pkwy & Franklin Ave (SW Corner),3919.12,40.671649,-73.963115,40.670529,-73.958222,member,Brooklyn,Brooklyn,6,16,Saturday,1,afternoon
4,1A33C864454C4692,electric_bike,2024-04-10 17:40:14.700,2024-04-10 17:48:11.571,W 27 St & 6 Ave,6215.07,E 25 St & 1 Ave,6004.07,40.745446,-73.990591,40.738177,-73.977387,member,Manhattan,Manhattan,10,17,Wednesday,0,evening


## Add variable of ride duration based on started_at and ended_at date columns
- add column ride_duration, that we can use to diferentiate long vs shorter trips
- column calculated based on started_at and  ended_at date columns
- column in seconds


In [18]:
# Convert started_at to date time column
df['ended_at'] = pd.to_datetime(df['ended_at'])
display(df.head())
df.dtypes


Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,start_borough,end_borough,day_of_month,hour,day_of_week,is_weekend,time_of_day
0,F561526822C9D60B,electric_bike,2024-04-27 13:56:13.940,2024-04-27 14:05:23.629,FDR Drive & E 35 St,6230.04,E 10 St & 2 Ave,5746.02,40.743955,-73.971391,40.729708,-73.986598,member,Manhattan,Manhattan,27,13,Saturday,1,afternoon
1,359BAF91507F4998,electric_bike,2024-04-25 15:23:14.529,2024-04-25 15:27:52.895,Forsyth St & Grand St,5382.07,E 10 St & 2 Ave,5746.02,40.717741,-73.993388,40.729708,-73.986598,member,Manhattan,Manhattan,25,15,Thursday,0,afternoon
2,AAEE95A1C0106C97,electric_bike,2024-04-06 11:15:18.132,2024-04-06 11:22:10.081,E 20 St & 2 Ave,5971.08,Mott St & Prince St,5561.04,40.73579,-73.981693,40.72318,-73.9948,member,Manhattan,Manhattan,6,11,Saturday,1,morning
3,95B077C9C619D404,electric_bike,2024-04-06 16:19:25.749,2024-04-06 16:21:43.098,Eastern Pkwy & Washington Ave,3928.08,Eastern Pkwy & Franklin Ave (SW Corner),3919.12,40.671649,-73.963115,40.670529,-73.958222,member,Brooklyn,Brooklyn,6,16,Saturday,1,afternoon
4,1A33C864454C4692,electric_bike,2024-04-10 17:40:14.700,2024-04-10 17:48:11.571,W 27 St & 6 Ave,6215.07,E 25 St & 1 Ave,6004.07,40.745446,-73.990591,40.738177,-73.977387,member,Manhattan,Manhattan,10,17,Wednesday,0,evening


ride_id                       object
rideable_type                 object
started_at            datetime64[ns]
ended_at              datetime64[ns]
start_station_name            object
start_station_id              object
end_station_name              object
end_station_id                object
start_lat                    float64
start_lng                    float64
end_lat                      float64
end_lng                      float64
member_casual                 object
start_borough                 object
end_borough                   object
day_of_month                   int32
hour                           int32
day_of_week                   object
is_weekend                     int64
time_of_day                   object
dtype: object

In [19]:
df['ride_duration_sec'] = (df['ended_at'] - df['started_at']).dt.seconds

display(df.head())

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,...,end_lng,member_casual,start_borough,end_borough,day_of_month,hour,day_of_week,is_weekend,time_of_day,ride_duration_sec
0,F561526822C9D60B,electric_bike,2024-04-27 13:56:13.940,2024-04-27 14:05:23.629,FDR Drive & E 35 St,6230.04,E 10 St & 2 Ave,5746.02,40.743955,-73.971391,...,-73.986598,member,Manhattan,Manhattan,27,13,Saturday,1,afternoon,549
1,359BAF91507F4998,electric_bike,2024-04-25 15:23:14.529,2024-04-25 15:27:52.895,Forsyth St & Grand St,5382.07,E 10 St & 2 Ave,5746.02,40.717741,-73.993388,...,-73.986598,member,Manhattan,Manhattan,25,15,Thursday,0,afternoon,278
2,AAEE95A1C0106C97,electric_bike,2024-04-06 11:15:18.132,2024-04-06 11:22:10.081,E 20 St & 2 Ave,5971.08,Mott St & Prince St,5561.04,40.73579,-73.981693,...,-73.9948,member,Manhattan,Manhattan,6,11,Saturday,1,morning,411
3,95B077C9C619D404,electric_bike,2024-04-06 16:19:25.749,2024-04-06 16:21:43.098,Eastern Pkwy & Washington Ave,3928.08,Eastern Pkwy & Franklin Ave (SW Corner),3919.12,40.671649,-73.963115,...,-73.958222,member,Brooklyn,Brooklyn,6,16,Saturday,1,afternoon,137
4,1A33C864454C4692,electric_bike,2024-04-10 17:40:14.700,2024-04-10 17:48:11.571,W 27 St & 6 Ave,6215.07,E 25 St & 1 Ave,6004.07,40.745446,-73.990591,...,-73.977387,member,Manhattan,Manhattan,10,17,Wednesday,0,evening,476


In [20]:
df['ride_duration_sec'] = (df['ended_at'] - df['started_at']).dt.total_seconds()
df['ride_duration_min'] = (df['ended_at'] - df['started_at']).dt.total_seconds()/60
display(df.head())


Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,...,member_casual,start_borough,end_borough,day_of_month,hour,day_of_week,is_weekend,time_of_day,ride_duration_sec,ride_duration_min
0,F561526822C9D60B,electric_bike,2024-04-27 13:56:13.940,2024-04-27 14:05:23.629,FDR Drive & E 35 St,6230.04,E 10 St & 2 Ave,5746.02,40.743955,-73.971391,...,member,Manhattan,Manhattan,27,13,Saturday,1,afternoon,549.689,9.161483
1,359BAF91507F4998,electric_bike,2024-04-25 15:23:14.529,2024-04-25 15:27:52.895,Forsyth St & Grand St,5382.07,E 10 St & 2 Ave,5746.02,40.717741,-73.993388,...,member,Manhattan,Manhattan,25,15,Thursday,0,afternoon,278.366,4.639433
2,AAEE95A1C0106C97,electric_bike,2024-04-06 11:15:18.132,2024-04-06 11:22:10.081,E 20 St & 2 Ave,5971.08,Mott St & Prince St,5561.04,40.73579,-73.981693,...,member,Manhattan,Manhattan,6,11,Saturday,1,morning,411.949,6.865817
3,95B077C9C619D404,electric_bike,2024-04-06 16:19:25.749,2024-04-06 16:21:43.098,Eastern Pkwy & Washington Ave,3928.08,Eastern Pkwy & Franklin Ave (SW Corner),3919.12,40.671649,-73.963115,...,member,Brooklyn,Brooklyn,6,16,Saturday,1,afternoon,137.349,2.28915
4,1A33C864454C4692,electric_bike,2024-04-10 17:40:14.700,2024-04-10 17:48:11.571,W 27 St & 6 Ave,6215.07,E 25 St & 1 Ave,6004.07,40.745446,-73.990591,...,member,Manhattan,Manhattan,10,17,Wednesday,0,evening,476.871,7.94785


## Add flag is holiday
- No holiday in newyork in april of 2024, no no need to add this flag

## Add weather data into our dataset

- Weather info, daily and hourly, from NewYork for the year 2024
    - info: rain, precipitation, temperature,...
- This weather info was extracted from OpenMeteo
    - Link here - https://open-meteo.com/en/docs/historical-weather-api#latitude=40.7143&longitude=-74.006&start_date=2019-01-01&end_date=2019-12-31&hourly=temperature_2m,relative_humidity_2m,dew_point_2m,apparent_temperature,precipitation,rain,snowfall,snow_depth,weather_code,pressure_msl,surface_pressure,cloud_cover,cloud_cover_low,cloud_cover_mid,cloud_cover_high,et0_fao_evapotranspiration,vapour_pressure_deficit,wind_speed_10m,wind_speed_100m,wind_direction_10m,wind_direction_100m,wind_gusts_10m,soil_temperature_0_to_7cm,soil_temperature_7_to_28cm,soil_temperature_28_to_100cm,soil_temperature_100_to_255cm,soil_moisture_0_to_7cm,soil_moisture_7_to_28cm,soil_moisture_28_to_100cm,soil_moisture_100_to_255cm&daily=&timezone=America%2FNew_York&models=

Info on dataset columns:
- snowfall = 5 cm (new snow that felt in a specific hour)
- snow_depth = 15 cm (total snow accumulated in the floor in a specific hour)
- rain_sum (mm) = Total sum of liquid rain (excludes snow, hail, etc.).
- precipitation_sum (mm) = Total sum of all precipitation (includes rain, snow, hail, etc.)
- cloud_cover (%): Represents the total percentage of the sky covered by clouds at all altitudes, without distinguishing between low, middle, or high
- cloud_cover_low (%): Represents the percentage of cloud cover specifically at low altitudes, typically below 2 km.- cloud_cover_mid (%) = Percentage of cloud cover at mid altitudes (typically between 2 km and 6 km).
- cloud_cover_high (%) = Percentage of cloud cover at high altitudes (typically above 6 km).
- wind_speed_10m (km/h)= Represents the wind speed measured at 10 meters above ground level, in kilometers per hour (km/h). This is typically used to assess the strength of the wind near the surface.
- weather_code (wmo code)	- World meteorological code for that hour 
    - cloudly, sunny, mist

    

In [21]:
display(df.head())
display(df_weather_ny_h.head())
display(df_weather_ny_h.shape)

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,...,member_casual,start_borough,end_borough,day_of_month,hour,day_of_week,is_weekend,time_of_day,ride_duration_sec,ride_duration_min
0,F561526822C9D60B,electric_bike,2024-04-27 13:56:13.940,2024-04-27 14:05:23.629,FDR Drive & E 35 St,6230.04,E 10 St & 2 Ave,5746.02,40.743955,-73.971391,...,member,Manhattan,Manhattan,27,13,Saturday,1,afternoon,549.689,9.161483
1,359BAF91507F4998,electric_bike,2024-04-25 15:23:14.529,2024-04-25 15:27:52.895,Forsyth St & Grand St,5382.07,E 10 St & 2 Ave,5746.02,40.717741,-73.993388,...,member,Manhattan,Manhattan,25,15,Thursday,0,afternoon,278.366,4.639433
2,AAEE95A1C0106C97,electric_bike,2024-04-06 11:15:18.132,2024-04-06 11:22:10.081,E 20 St & 2 Ave,5971.08,Mott St & Prince St,5561.04,40.73579,-73.981693,...,member,Manhattan,Manhattan,6,11,Saturday,1,morning,411.949,6.865817
3,95B077C9C619D404,electric_bike,2024-04-06 16:19:25.749,2024-04-06 16:21:43.098,Eastern Pkwy & Washington Ave,3928.08,Eastern Pkwy & Franklin Ave (SW Corner),3919.12,40.671649,-73.963115,...,member,Brooklyn,Brooklyn,6,16,Saturday,1,afternoon,137.349,2.28915
4,1A33C864454C4692,electric_bike,2024-04-10 17:40:14.700,2024-04-10 17:48:11.571,W 27 St & 6 Ave,6215.07,E 25 St & 1 Ave,6004.07,40.745446,-73.990591,...,member,Manhattan,Manhattan,10,17,Wednesday,0,evening,476.871,7.94785


Unnamed: 0,time,temperature_2m (°C),relative_humidity_2m (%),dew_point_2m (°C),apparent_temperature (°C),precipitation (mm),rain (mm),snowfall (cm),snow_depth (m),weather_code (wmo code),...,cloud_cover_low (%),cloud_cover_mid (%),cloud_cover_high (%),et0_fao_evapotranspiration (mm),vapour_pressure_deficit (kPa),wind_speed_10m (km/h),wind_speed_100m (km/h),wind_direction_10m (°),wind_direction_100m (°),wind_gusts_10m (km/h)
0,2024-01-01T00:00,1.6,76,-2.3,-2.2,0.0,0.0,0.0,0.0,3,...,94,100,0,0.01,0.17,8.9,20.4,243,256,13.7
1,2024-01-01T01:00,2.6,74,-1.6,-1.4,0.0,0.0,0.0,0.0,3,...,100,100,0,0.01,0.19,11.4,20.9,235,246,18.4
2,2024-01-01T02:00,2.7,74,-1.4,-1.1,0.0,0.0,0.0,0.0,3,...,100,100,0,0.01,0.19,9.7,18.3,239,248,17.6
3,2024-01-01T03:00,2.5,76,-1.3,-1.0,0.0,0.0,0.0,0.0,2,...,27,52,0,0.01,0.18,8.1,16.5,249,259,15.1
4,2024-01-01T04:00,0.5,88,-1.2,-2.9,0.0,0.0,0.0,0.0,3,...,5,90,0,0.0,0.07,7.5,14.5,253,264,12.2


(8784, 23)

In [22]:
#Let's use the hourly dataset to have detailed weather info

#convert time to datetime format
df_weather_ny_h['time'] = pd.to_datetime(df_weather_ny_h['time'])

#drop meteo columns that are not relevant for the analysis
df_weather_ny_h.drop(columns = [
    'apparent_temperature (°C)',
    'relative_humidity_2m (%)',
    'dew_point_2m (°C)',
    'et0_fao_evapotranspiration (mm)',
    'precipitation (mm)',
    'vapour_pressure_deficit (kPa)',
    'wind_speed_100m (km/h)',
    'snow_depth (m)',
    'cloud_cover (%)',
    'cloud_cover_mid (%)',
    'cloud_cover_high (%)',
    'pressure_msl (hPa)',
    'surface_pressure (hPa)',
    'wind_direction_10m (°)',
    'wind_direction_100m (°)',
    'wind_gusts_10m (km/h)'],
inplace=True)

display(df_weather_ny_h)

Unnamed: 0,time,temperature_2m (°C),rain (mm),snowfall (cm),weather_code (wmo code),cloud_cover_low (%),wind_speed_10m (km/h)
0,2024-01-01 00:00:00,1.6,0.0,0.0,3,94,8.9
1,2024-01-01 01:00:00,2.6,0.0,0.0,3,100,11.4
2,2024-01-01 02:00:00,2.7,0.0,0.0,3,100,9.7
3,2024-01-01 03:00:00,2.5,0.0,0.0,2,27,8.1
4,2024-01-01 04:00:00,0.5,0.0,0.0,3,5,7.5
...,...,...,...,...,...,...,...
8779,2024-12-31 19:00:00,8.0,0.0,0.0,3,0,10.7
8780,2024-12-31 20:00:00,7.9,0.0,0.0,3,6,10.8
8781,2024-12-31 21:00:00,7.2,3.1,0.0,63,18,9.7
8782,2024-12-31 22:00:00,7.2,2.8,0.0,63,36,12.2


##### Get list/dictionairy with the description of WMO Weather codes
- source: WMO- world meteorological organization

In [23]:
# Dictionary with WMO weather codes and descriptions
wmo_weather_codes = {
    0: "Clear sky",
    1: "Mainly clear",
    2: "Partly cloudy",
    3: "Overcast",
    45: "Fog",
    48: "Depositing rime fog",
    51: "Drizzle: Light",
    53: "Drizzle: Moderate",
    55: "Drizzle: Dense",
    56: "Freezing drizzle: Light",
    57: "Freezing drizzle: Dense",
    61: "Rain: Slight",
    63: "Rain: Moderate",
    65: "Rain: Heavy",
    66: "Freezing rain: Light",
    67: "Freezing rain: Heavy",
    71: "Snowfall: Slight",
    73: "Snowfall: Moderate",
    75: "Snowfall: Heavy",
    77: "Snow grains",
    80: "Rain showers: Slight",
    81: "Rain showers: Moderate",
    82: "Rain showers: Violent",
    85: "Snow showers: Slight",
    86: "Snow showers: Heavy",
    95: "Thunderstorm: Slight or moderate",
    96: "Thunderstorm with slight hail",
    99: "Thunderstorm with heavy hail"
}

df_wmo_weather_codes =pd.DataFrame(list(wmo_weather_codes.items()), columns = ['wmo_code', 'wmo_weather_desc'])
display(df_wmo_weather_codes)

Unnamed: 0,wmo_code,wmo_weather_desc
0,0,Clear sky
1,1,Mainly clear
2,2,Partly cloudy
3,3,Overcast
4,45,Fog
5,48,Depositing rime fog
6,51,Drizzle: Light
7,53,Drizzle: Moderate
8,55,Drizzle: Dense
9,56,Freezing drizzle: Light


#### Add weather description to the weather dataset
- Cloud Cover
- Overcast → The sky is completely covered with clouds (100% cloud cover).
- Partly cloudy → A mix of clouds and clear sky, typically 30-70% cloud cover.
- Mainly clear → Mostly clear with a few scattered clouds (10-30% cloud cover).
- Clear sky → No significant clouds, nearly 0% cloud cover.
- ❄️ Snowfall
- Snowfall: Slight → Light snowflakes falling, minimal accumulation.
- Snowfall: Moderate → Steady snowfall with noticeable accumulation.
- Snowfall: Heavy → Intense snowfall with rapid accumulation, possibly reducing visibility.
- 🌧 Drizzle (Light, fine rain with small droplets)
- Drizzle: Light → A few small droplets falling intermittently, barely wetting the ground.
- Drizzle: Moderate → Continuous fine rain, making surfaces damp.
- Drizzle: Dense → Heavy drizzle, creating persistent wet conditions, but not forming puddles.
- 🌦 Rain (Heavier precipitation than drizzle)
- Rain: Slight → Light rain with small raindrops and little accumulation.
- Rain: Moderate → Steady rain that wets the ground and can form small puddles.
- Rain: Heavy → Intense rainfall, quickly accumulating, possibly causing water runoff.

In [24]:
df_weather_ny_h = df_weather_ny_h.merge(
    df_wmo_weather_codes,
    left_on='weather_code (wmo code)',
    right_on= 'wmo_code', 
    how = 'left')

df_weather_ny_h.rename(columns={
    'temperature_2m (°C)': 'temperature_2m',
    'rain (mm)': 'rain_mm',
    'snowfall (cm)': 'snowfall_cm',
    'weather_code (wmo code)': 'wmo_weather_code',
    'cloud_cover_low (%)': 'cloud_cover_low_pct',
    'wind_speed_10m (km/h)': 'wind_speed_10m',
}, inplace=True)

df_weather_ny_h.drop(columns = ['wmo_code'], inplace=True)

display(df_weather_ny_h.head(10))

Unnamed: 0,time,temperature_2m,rain_mm,snowfall_cm,wmo_weather_code,cloud_cover_low_pct,wind_speed_10m,wmo_weather_desc
0,2024-01-01 00:00:00,1.6,0.0,0.0,3,94,8.9,Overcast
1,2024-01-01 01:00:00,2.6,0.0,0.0,3,100,11.4,Overcast
2,2024-01-01 02:00:00,2.7,0.0,0.0,3,100,9.7,Overcast
3,2024-01-01 03:00:00,2.5,0.0,0.0,2,27,8.1,Partly cloudy
4,2024-01-01 04:00:00,0.5,0.0,0.0,3,5,7.5,Overcast
5,2024-01-01 05:00:00,-0.0,0.0,0.0,3,99,8.6,Overcast
6,2024-01-01 06:00:00,1.2,0.0,0.0,3,100,3.7,Overcast
7,2024-01-01 07:00:00,1.7,0.0,0.0,3,100,5.0,Overcast
8,2024-01-01 08:00:00,3.0,0.0,0.0,3,100,4.7,Overcast
9,2024-01-01 09:00:00,3.6,0.0,0.0,3,100,5.0,Overcast


In [14]:
df_weather_ny_h_null= df_weather_ny_h['wmo_weather_desc'].isnull().sum()

display(df_weather_ny_h_null)

0

In [15]:
df_weather_ny_h['wmo_weather_desc'].unique()

array(['Overcast', 'Partly cloudy', 'Mainly clear', 'Clear sky',
       'Snowfall: Slight', 'Snowfall: Moderate', 'Snowfall: Heavy',
       'Drizzle: Moderate', 'Drizzle: Dense', 'Drizzle: Light',
       'Rain: Slight', 'Rain: Moderate', 'Rain: Heavy'], dtype=object)

In [134]:
df_weather_ny_h['weather_desc'].value_counts()

weather_desc
Clear sky             3148
Overcast              2951
Mainly clear           829
Drizzle: Light         658
Partly cloudy          552
Drizzle: Moderate      223
Rain: Moderate         115
Rain: Slight           114
Drizzle: Dense          67
Snowfall: Slight        53
Snowfall: Moderate      38
Rain: Heavy             20
Snowfall: Heavy         16
Name: count, dtype: int64

In [158]:
df_april = df_weather_ny_h[df_weather_ny_h['time'].dt.month == 4]  # Filtra apenas abril

df_april['wmo_weather_desc'].value_counts()

wmo_weather_desc
Overcast             297
Clear sky            190
Drizzle: Light        65
Mainly clear          58
Partly cloudy         45
Drizzle: Moderate     33
Rain: Slight          17
Rain: Moderate         8
Drizzle: Dense         7
Name: count, dtype: int64

In [159]:
df_weather_ny_h_rain = df_weather_ny_h[df_weather_ny_h['wmo_weather_desc'].str.contains('Rain')]
display(df_weather_ny_h_rain)

Unnamed: 0,time,temperature_2m,rain_mm,snowfall_cm,wmo_weather_code,cloud_cover_low_pct,wind_speed_10m,wmo_weather_desc
207,2024-01-09 15:00:00,6.0,1.5,0.0,61,100,33.2,Rain: Slight
208,2024-01-09 16:00:00,7.1,1.3,0.0,61,100,35.1,Rain: Slight
210,2024-01-09 18:00:00,9.3,3.6,0.0,63,100,31.3,Rain: Moderate
211,2024-01-09 19:00:00,10.0,3.7,0.0,63,100,32.3,Rain: Moderate
212,2024-01-09 20:00:00,11.1,5.5,0.0,63,100,46.0,Rain: Moderate
...,...,...,...,...,...,...,...,...
8424,2024-12-17 00:00:00,9.4,1.5,0.0,61,100,14.8,Rain: Slight
8738,2024-12-30 02:00:00,12.5,3.4,0.0,63,97,22.7,Rain: Moderate
8781,2024-12-31 21:00:00,7.2,3.1,0.0,63,18,9.7,Rain: Moderate
8782,2024-12-31 22:00:00,7.2,2.8,0.0,63,36,12.2,Rain: Moderate


In [228]:
df_weather_ny_h_snowfall = df_weather_ny_h[df_weather_ny_h['wmo_weather_desc'].str.contains('Snowfall')]
display(df_weather_ny_h_snowfall)

Unnamed: 0,time,temperature_2m,rain_mm,snowfall_cm,wmo_weather_code,cloud_cover_low_pct,wind_speed_10m,wmo_weather_desc
135,2024-01-06 15:00:00,1.9,0.0,0.14,71,23,14.7,Snowfall: Slight
136,2024-01-06 16:00:00,1.6,0.1,0.28,73,100,16.3,Snowfall: Moderate
137,2024-01-06 17:00:00,1.1,0.2,0.70,73,100,17.6,Snowfall: Moderate
138,2024-01-06 18:00:00,0.9,0.4,0.98,75,100,19.2,Snowfall: Heavy
139,2024-01-06 19:00:00,0.8,0.7,1.61,75,100,21.5,Snowfall: Heavy
...,...,...,...,...,...,...,...,...
8527,2024-12-21 07:00:00,-1.7,0.0,0.35,73,73,8.3,Snowfall: Moderate
8528,2024-12-21 08:00:00,-1.3,0.0,0.14,71,9,16.0,Snowfall: Slight
8600,2024-12-24 08:00:00,-3.5,0.0,0.63,73,4,8.2,Snowfall: Moderate
8601,2024-12-24 09:00:00,-3.4,0.0,0.63,73,0,6.5,Snowfall: Moderate


#### Merge weather data with main df city_bike

In [25]:
df.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,...,member_casual,start_borough,end_borough,day_of_month,hour,day_of_week,is_weekend,time_of_day,ride_duration_sec,ride_duration_min
0,F561526822C9D60B,electric_bike,2024-04-27 13:56:13.940,2024-04-27 14:05:23.629,FDR Drive & E 35 St,6230.04,E 10 St & 2 Ave,5746.02,40.743955,-73.971391,...,member,Manhattan,Manhattan,27,13,Saturday,1,afternoon,549.689,9.161483
1,359BAF91507F4998,electric_bike,2024-04-25 15:23:14.529,2024-04-25 15:27:52.895,Forsyth St & Grand St,5382.07,E 10 St & 2 Ave,5746.02,40.717741,-73.993388,...,member,Manhattan,Manhattan,25,15,Thursday,0,afternoon,278.366,4.639433
2,AAEE95A1C0106C97,electric_bike,2024-04-06 11:15:18.132,2024-04-06 11:22:10.081,E 20 St & 2 Ave,5971.08,Mott St & Prince St,5561.04,40.73579,-73.981693,...,member,Manhattan,Manhattan,6,11,Saturday,1,morning,411.949,6.865817
3,95B077C9C619D404,electric_bike,2024-04-06 16:19:25.749,2024-04-06 16:21:43.098,Eastern Pkwy & Washington Ave,3928.08,Eastern Pkwy & Franklin Ave (SW Corner),3919.12,40.671649,-73.963115,...,member,Brooklyn,Brooklyn,6,16,Saturday,1,afternoon,137.349,2.28915
4,1A33C864454C4692,electric_bike,2024-04-10 17:40:14.700,2024-04-10 17:48:11.571,W 27 St & 6 Ave,6215.07,E 25 St & 1 Ave,6004.07,40.745446,-73.990591,...,member,Manhattan,Manhattan,10,17,Wednesday,0,evening,476.871,7.94785


In [26]:
#create a new column in the df dataframe to have date until hour

df['started_at_hour'] = df['started_at'].dt.floor('H')  # Extract the hour part (rounded down)

display(df.head())

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,...,start_borough,end_borough,day_of_month,hour,day_of_week,is_weekend,time_of_day,ride_duration_sec,ride_duration_min,started_at_hour
0,F561526822C9D60B,electric_bike,2024-04-27 13:56:13.940,2024-04-27 14:05:23.629,FDR Drive & E 35 St,6230.04,E 10 St & 2 Ave,5746.02,40.743955,-73.971391,...,Manhattan,Manhattan,27,13,Saturday,1,afternoon,549.689,9.161483,2024-04-27 13:00:00
1,359BAF91507F4998,electric_bike,2024-04-25 15:23:14.529,2024-04-25 15:27:52.895,Forsyth St & Grand St,5382.07,E 10 St & 2 Ave,5746.02,40.717741,-73.993388,...,Manhattan,Manhattan,25,15,Thursday,0,afternoon,278.366,4.639433,2024-04-25 15:00:00
2,AAEE95A1C0106C97,electric_bike,2024-04-06 11:15:18.132,2024-04-06 11:22:10.081,E 20 St & 2 Ave,5971.08,Mott St & Prince St,5561.04,40.73579,-73.981693,...,Manhattan,Manhattan,6,11,Saturday,1,morning,411.949,6.865817,2024-04-06 11:00:00
3,95B077C9C619D404,electric_bike,2024-04-06 16:19:25.749,2024-04-06 16:21:43.098,Eastern Pkwy & Washington Ave,3928.08,Eastern Pkwy & Franklin Ave (SW Corner),3919.12,40.671649,-73.963115,...,Brooklyn,Brooklyn,6,16,Saturday,1,afternoon,137.349,2.28915,2024-04-06 16:00:00
4,1A33C864454C4692,electric_bike,2024-04-10 17:40:14.700,2024-04-10 17:48:11.571,W 27 St & 6 Ave,6215.07,E 25 St & 1 Ave,6004.07,40.745446,-73.990591,...,Manhattan,Manhattan,10,17,Wednesday,0,evening,476.871,7.94785,2024-04-10 17:00:00


In [27]:
#merge the df main with the weather data df on time column
df = df.merge(
    df_weather_ny_h,
    left_on='started_at_hour',
    right_on='time',
    how = 'left'
)

#drop time col from the weather df as is not relevant anymore
df = df.drop(columns=['time','started_at_hour'])

display(df.head())
display(df.shape)

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,...,time_of_day,ride_duration_sec,ride_duration_min,temperature_2m,rain_mm,snowfall_cm,wmo_weather_code,cloud_cover_low_pct,wind_speed_10m,wmo_weather_desc
0,F561526822C9D60B,electric_bike,2024-04-27 13:56:13.940,2024-04-27 14:05:23.629,FDR Drive & E 35 St,6230.04,E 10 St & 2 Ave,5746.02,40.743955,-73.971391,...,afternoon,549.689,9.161483,14.3,0.0,0.0,3,16,25.3,Overcast
1,359BAF91507F4998,electric_bike,2024-04-25 15:23:14.529,2024-04-25 15:27:52.895,Forsyth St & Grand St,5382.07,E 10 St & 2 Ave,5746.02,40.717741,-73.993388,...,afternoon,278.366,4.639433,11.2,0.0,0.0,0,0,13.7,Clear sky
2,AAEE95A1C0106C97,electric_bike,2024-04-06 11:15:18.132,2024-04-06 11:22:10.081,E 20 St & 2 Ave,5971.08,Mott St & Prince St,5561.04,40.73579,-73.981693,...,morning,411.949,6.865817,9.6,0.0,0.0,3,95,27.9,Overcast
3,95B077C9C619D404,electric_bike,2024-04-06 16:19:25.749,2024-04-06 16:21:43.098,Eastern Pkwy & Washington Ave,3928.08,Eastern Pkwy & Franklin Ave (SW Corner),3919.12,40.671649,-73.963115,...,afternoon,137.349,2.28915,10.7,0.0,0.0,3,11,25.4,Overcast
4,1A33C864454C4692,electric_bike,2024-04-10 17:40:14.700,2024-04-10 17:48:11.571,W 27 St & 6 Ave,6215.07,E 25 St & 1 Ave,6004.07,40.745446,-73.990591,...,evening,476.871,7.94785,15.0,0.0,0.0,2,44,10.7,Partly cloudy


(3193597, 29)

- Check if we have any value in df with null value in weather description

In [28]:
df_null_weather_desc = df['wmo_weather_desc'].isnull().sum()
display(df_null_weather_desc)


0

## Add variable of distance between stations (end and start station) to the dataset - app: Open street map
    - With distance, it's also possible to calculate the velocity of the rides (faster, slow,..)
    - Guidelines on how to calculate distance - https://www.askpython.com/python/examples/find-distance-between-two-geo-locations
        - Haversine formula
        - Math module
        - Geodesic distance
        - Great Circle formula
    - for this project, we will use the Haversine formula as it's more simple to calculate ant it's pretty acurate on small distances
    - ride_distance -> in Km

### Calculate haversine distance - example

In [67]:
import haversine as hs   
from haversine import Unit
 
Benavente=(38.9630, -8.6214)
Lisboa=(38.7169, -9.1395)
 
result=hs.haversine(Benavente,Lisboa,unit=Unit.KILOMETERS)
print("The distance calculated is:",result)


The distance calculated is: 52.558405926198816


### Calculate geodesic distance - example

In [148]:
from geopy.distance import geodesic as GD

Benavente=(38.9630, -8.6214)
Lisboa=(38.7169, -9.1395)
 
print("The distance between Benavente and Lisboa is: ", GD(Benavente,Lisboa).km)

The distance between Benavente and Lisboa is:  52.6285438379004


### Final calculation using geodesic distance

In [29]:
import pandas as pd
from haversine import haversine
from geopy.distance import geodesic


## Function to calculate distance with harversine method
#def calculate_distance_hv(row):
#    start = (row['start_lat'], row['start_lng'])
#    end = (row['end_lat'], row['end_lng'])
#    return haversine(start, end)
#

# Function to calculate geodesic distance with geodesic method
def calculate_distance_geo(row):
    start = (row['start_lat'], row['start_lng'])
    end = (row['end_lat'], row['end_lng'])
    return geodesic(start, end).kilometers


# Add column 'riding_distance' to the DataFrame

#df['ride_dist_hv'] = df.apply(calculate_distance_hv, axis=1)
df['ride_distance'] = df.apply(calculate_distance_geo, axis=1)

display(df.head())

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,...,ride_duration_sec,ride_duration_min,temperature_2m,rain_mm,snowfall_cm,wmo_weather_code,cloud_cover_low_pct,wind_speed_10m,wmo_weather_desc,ride_distance
0,F561526822C9D60B,electric_bike,2024-04-27 13:56:13.940,2024-04-27 14:05:23.629,FDR Drive & E 35 St,6230.04,E 10 St & 2 Ave,5746.02,40.743955,-73.971391,...,549.689,9.161483,14.3,0.0,0.0,3,16,25.3,Overcast,2.037884
1,359BAF91507F4998,electric_bike,2024-04-25 15:23:14.529,2024-04-25 15:27:52.895,Forsyth St & Grand St,5382.07,E 10 St & 2 Ave,5746.02,40.717741,-73.993388,...,278.366,4.639433,11.2,0.0,0.0,0,0,13.7,Clear sky,1.44746
2,AAEE95A1C0106C97,electric_bike,2024-04-06 11:15:18.132,2024-04-06 11:22:10.081,E 20 St & 2 Ave,5971.08,Mott St & Prince St,5561.04,40.73579,-73.981693,...,411.949,6.865817,9.6,0.0,0.0,3,95,27.9,Overcast,1.785276
3,95B077C9C619D404,electric_bike,2024-04-06 16:19:25.749,2024-04-06 16:21:43.098,Eastern Pkwy & Washington Ave,3928.08,Eastern Pkwy & Franklin Ave (SW Corner),3919.12,40.671649,-73.963115,...,137.349,2.28915,10.7,0.0,0.0,3,11,25.4,Overcast,0.431973
4,1A33C864454C4692,electric_bike,2024-04-10 17:40:14.700,2024-04-10 17:48:11.571,W 27 St & 6 Ave,6215.07,E 25 St & 1 Ave,6004.07,40.745446,-73.990591,...,476.871,7.94785,15.0,0.0,0.0,2,44,10.7,Partly cloudy,1.376807


## Add average speed for each ride
- Based on distance and ride duration, add average speed/velocity km/h
- ride_avg_speed -> in km/h

In [30]:
#function to calculate the velocity for each ride (km/h)

def calculate_velocity(row):
    distance = row['ride_distance']
    time = row['ride_duration_sec']/3600
    if time >0:
        velocity = distance/time
    else:
        velocity = 0
    return velocity

#def calculate_velocity_hv(row):
#    distance = row['ride_dist_hv']
#    time = row['ride_duration']/3600
#    if time >0:
#        velocity = distance/time
#    else:
#        velocity = 0
#    return velocity


df['ride_avg_speed'] = df.apply(calculate_velocity, axis=1)
#df['ride_avg_speed_hv'] = df.apply(calculate_velocity_hv, axis=1)

display(df.head())
display(df.shape)

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,...,ride_duration_min,temperature_2m,rain_mm,snowfall_cm,wmo_weather_code,cloud_cover_low_pct,wind_speed_10m,wmo_weather_desc,ride_distance,ride_avg_speed
0,F561526822C9D60B,electric_bike,2024-04-27 13:56:13.940,2024-04-27 14:05:23.629,FDR Drive & E 35 St,6230.04,E 10 St & 2 Ave,5746.02,40.743955,-73.971391,...,9.161483,14.3,0.0,0.0,3,16,25.3,Overcast,2.037884,13.346422
1,359BAF91507F4998,electric_bike,2024-04-25 15:23:14.529,2024-04-25 15:27:52.895,Forsyth St & Grand St,5382.07,E 10 St & 2 Ave,5746.02,40.717741,-73.993388,...,4.639433,11.2,0.0,0.0,0,0,13.7,Clear sky,1.44746,18.719442
2,AAEE95A1C0106C97,electric_bike,2024-04-06 11:15:18.132,2024-04-06 11:22:10.081,E 20 St & 2 Ave,5971.08,Mott St & Prince St,5561.04,40.73579,-73.981693,...,6.865817,9.6,0.0,0.0,3,95,27.9,Overcast,1.785276,15.601426
3,95B077C9C619D404,electric_bike,2024-04-06 16:19:25.749,2024-04-06 16:21:43.098,Eastern Pkwy & Washington Ave,3928.08,Eastern Pkwy & Franklin Ave (SW Corner),3919.12,40.671649,-73.963115,...,2.28915,10.7,0.0,0.0,3,11,25.4,Overcast,0.431973,11.322269
4,1A33C864454C4692,electric_bike,2024-04-10 17:40:14.700,2024-04-10 17:48:11.571,W 27 St & 6 Ave,6215.07,E 25 St & 1 Ave,6004.07,40.745446,-73.990591,...,7.94785,15.0,0.0,0.0,2,44,10.7,Partly cloudy,1.376807,10.393806


(3193597, 31)

In [31]:
display(df.shape)
display(df.dtypes)
display(df.columns)

(3193597, 31)

ride_id                        object
rideable_type                  object
started_at             datetime64[ns]
ended_at               datetime64[ns]
start_station_name             object
start_station_id               object
end_station_name               object
end_station_id                 object
start_lat                     float64
start_lng                     float64
end_lat                       float64
end_lng                       float64
member_casual                  object
start_borough                  object
end_borough                    object
day_of_month                    int32
hour                            int32
day_of_week                    object
is_weekend                      int64
time_of_day                    object
ride_duration_sec             float64
ride_duration_min             float64
temperature_2m                float64
rain_mm                       float64
snowfall_cm                   float64
wmo_weather_code                int64
cloud_cover_

Index(['ride_id', 'rideable_type', 'started_at', 'ended_at',
       'start_station_name', 'start_station_id', 'end_station_name',
       'end_station_id', 'start_lat', 'start_lng', 'end_lat', 'end_lng',
       'member_casual', 'start_borough', 'end_borough', 'day_of_month', 'hour',
       'day_of_week', 'is_weekend', 'time_of_day', 'ride_duration_sec',
       'ride_duration_min', 'temperature_2m', 'rain_mm', 'snowfall_cm',
       'wmo_weather_code', 'cloud_cover_low_pct', 'wind_speed_10m',
       'wmo_weather_desc', 'ride_distance', 'ride_avg_speed'],
      dtype='object')

## Rename some original cols from the main df from citi bike

In [32]:
display(df.head())

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,...,ride_duration_min,temperature_2m,rain_mm,snowfall_cm,wmo_weather_code,cloud_cover_low_pct,wind_speed_10m,wmo_weather_desc,ride_distance,ride_avg_speed
0,F561526822C9D60B,electric_bike,2024-04-27 13:56:13.940,2024-04-27 14:05:23.629,FDR Drive & E 35 St,6230.04,E 10 St & 2 Ave,5746.02,40.743955,-73.971391,...,9.161483,14.3,0.0,0.0,3,16,25.3,Overcast,2.037884,13.346422
1,359BAF91507F4998,electric_bike,2024-04-25 15:23:14.529,2024-04-25 15:27:52.895,Forsyth St & Grand St,5382.07,E 10 St & 2 Ave,5746.02,40.717741,-73.993388,...,4.639433,11.2,0.0,0.0,0,0,13.7,Clear sky,1.44746,18.719442
2,AAEE95A1C0106C97,electric_bike,2024-04-06 11:15:18.132,2024-04-06 11:22:10.081,E 20 St & 2 Ave,5971.08,Mott St & Prince St,5561.04,40.73579,-73.981693,...,6.865817,9.6,0.0,0.0,3,95,27.9,Overcast,1.785276,15.601426
3,95B077C9C619D404,electric_bike,2024-04-06 16:19:25.749,2024-04-06 16:21:43.098,Eastern Pkwy & Washington Ave,3928.08,Eastern Pkwy & Franklin Ave (SW Corner),3919.12,40.671649,-73.963115,...,2.28915,10.7,0.0,0.0,3,11,25.4,Overcast,0.431973,11.322269
4,1A33C864454C4692,electric_bike,2024-04-10 17:40:14.700,2024-04-10 17:48:11.571,W 27 St & 6 Ave,6215.07,E 25 St & 1 Ave,6004.07,40.745446,-73.990591,...,7.94785,15.0,0.0,0.0,2,44,10.7,Partly cloudy,1.376807,10.393806


In [265]:
display(df.columns)

Index(['ride_id', 'rideable_type', 'started_at', 'ended_at',
       'start_station_name', 'start_station_id', 'end_station_name',
       'end_station_id', 'start_lat', 'start_lng', 'end_lat', 'end_lng',
       'member_casual', 'start_borough', 'end_borough', 'day_of_month', 'hour',
       'day_of_week', 'is_weekend', 'time_of_day', 'ride_duration_sec',
       'ride_duration_min', 'temperature_2m', 'rain_mm', 'snowfall_cm',
       'wmo_weather_code', 'cloud_cover_low_pct', 'wind_speed_10m',
       'wmo_weather_desc', 'ride_distance', 'ride_avg_speed'],
      dtype='object')

In [33]:
df.rename(columns = 
          {'rideable_type': 'bike_type',
          'started_at':'start_time',
          'ended_at':'end_time',
          'start_station_name':'start_station',
          'end_station_name':'end_station',
          'ride_distance':'ride_distance_km',
          'member_casual':'user_type',
          },
            inplace = True)

display(df.head())

Unnamed: 0,ride_id,bike_type,start_time,end_time,start_station,start_station_id,end_station,end_station_id,start_lat,start_lng,...,ride_duration_min,temperature_2m,rain_mm,snowfall_cm,wmo_weather_code,cloud_cover_low_pct,wind_speed_10m,wmo_weather_desc,ride_distance_km,ride_avg_speed
0,F561526822C9D60B,electric_bike,2024-04-27 13:56:13.940,2024-04-27 14:05:23.629,FDR Drive & E 35 St,6230.04,E 10 St & 2 Ave,5746.02,40.743955,-73.971391,...,9.161483,14.3,0.0,0.0,3,16,25.3,Overcast,2.037884,13.346422
1,359BAF91507F4998,electric_bike,2024-04-25 15:23:14.529,2024-04-25 15:27:52.895,Forsyth St & Grand St,5382.07,E 10 St & 2 Ave,5746.02,40.717741,-73.993388,...,4.639433,11.2,0.0,0.0,0,0,13.7,Clear sky,1.44746,18.719442
2,AAEE95A1C0106C97,electric_bike,2024-04-06 11:15:18.132,2024-04-06 11:22:10.081,E 20 St & 2 Ave,5971.08,Mott St & Prince St,5561.04,40.73579,-73.981693,...,6.865817,9.6,0.0,0.0,3,95,27.9,Overcast,1.785276,15.601426
3,95B077C9C619D404,electric_bike,2024-04-06 16:19:25.749,2024-04-06 16:21:43.098,Eastern Pkwy & Washington Ave,3928.08,Eastern Pkwy & Franklin Ave (SW Corner),3919.12,40.671649,-73.963115,...,2.28915,10.7,0.0,0.0,3,11,25.4,Overcast,0.431973,11.322269
4,1A33C864454C4692,electric_bike,2024-04-10 17:40:14.700,2024-04-10 17:48:11.571,W 27 St & 6 Ave,6215.07,E 25 St & 1 Ave,6004.07,40.745446,-73.990591,...,7.94785,15.0,0.0,0.0,2,44,10.7,Partly cloudy,1.376807,10.393806


In [34]:
display(df.columns)

Index(['ride_id', 'bike_type', 'start_time', 'end_time', 'start_station',
       'start_station_id', 'end_station', 'end_station_id', 'start_lat',
       'start_lng', 'end_lat', 'end_lng', 'user_type', 'start_borough',
       'end_borough', 'day_of_month', 'hour', 'day_of_week', 'is_weekend',
       'time_of_day', 'ride_duration_sec', 'ride_duration_min',
       'temperature_2m', 'rain_mm', 'snowfall_cm', 'wmo_weather_code',
       'cloud_cover_low_pct', 'wind_speed_10m', 'wmo_weather_desc',
       'ride_distance_km', 'ride_avg_speed'],
      dtype='object')

# Data Exploration

## How many rides per day, month, quarter?

## How many rides per bicyle type and by type of user?

# Final df to csv

In [35]:
df.to_csv('data/citi_bike_pre_proc.csv',index=False)

# NOTES / TO DOS

- Sampling on the main file. Or just get data from 3 months of the year like (march, april, may)
- Add boroughs to the city bike data (DONE)
    ask chatgpt to get borough name (queens, manathan, bronx,...) with the station name
- Neste dataset, juntar info de distancias das viagens, e vais ter a possibilidade de ter a velocidade na viagem (vigens mais rápidas ou mais lentas)
    - Para juntar as distancias, tentar obter através da api do Open street map. Api que consegues calcular as distancias entre as distancias da estacao inicial e final (DONE)
        - considerar colunas start_lat, start_lng,	end_lat, end_lng
- REMOVER COLUNAS IDs do dataset antes do data profiling - (DONE)