In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Data Summary

In [2]:
df = pd.read_csv("/Users/ishan/Downloads/202310-citibike-tripdata.csv",parse_dates=['started_at','ended_at'],low_memory=False)

In [3]:
print(df.info())
df.head(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3823673 entries, 0 to 3823672
Data columns (total 13 columns):
 #   Column              Dtype         
---  ------              -----         
 0   ride_id             object        
 1   rideable_type       object        
 2   started_at          datetime64[ns]
 3   ended_at            datetime64[ns]
 4   start_station_name  object        
 5   start_station_id    object        
 6   end_station_name    object        
 7   end_station_id      object        
 8   start_lat           float64       
 9   start_lng           float64       
 10  end_lat             float64       
 11  end_lng             float64       
 12  member_casual       object        
dtypes: datetime64[ns](2), float64(4), object(7)
memory usage: 379.2+ MB
None


Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,F01D2D54E9E60D6E,classic_bike,2023-10-03 02:48:38,2023-10-03 02:48:40,Columbus Pl & Atlantic Ave,4146.02,Columbus Pl & Atlantic Ave,4146.02,40.677223,-73.922792,40.67717,-73.92285,casual
1,CAE4EDBEA07001BD,classic_bike,2023-10-11 16:03:17,2023-10-11 16:45:26,Central Park West & W 85 St,7354.01,Central Park West & W 85 St,7354.01,40.78476,-73.969862,40.78476,-73.969862,casual
2,FDC34BAD31193E07,classic_bike,2023-10-11 19:57:13,2023-10-11 20:20:10,Hicks St & Montague St,4645.09,5 St & 6 Ave,3874.01,40.694974,-73.995936,40.670484,-73.98209,casual
3,DFEA5E65AE91CE2A,classic_bike,2023-10-10 20:18:22,2023-10-10 20:18:37,Atlantic Ave & Furman St,4614.04,Atlantic Ave & Furman St,4614.04,40.691669,-74.000139,40.691652,-73.999979,casual
4,48299D8BE9B55255,classic_bike,2023-10-17 16:26:58,2023-10-17 16:34:27,E 41 St & Madison Ave (SE corner),6432.1,E 58 St & 3 Ave,6762.02,40.751845,-73.979585,40.760958,-73.967245,casual


In [4]:
from tabulate import tabulate
pd.set_option('display.width', 10000)

# Summary of numerical columns
numerical_summary = df.describe()

# Summary of categorical columns
categorical_summary = df.describe(include='object')

# Print formatted numerical columns summary
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
print(numerical_cols)
print("Numerical Columns Summary:\n")
print(tabulate(numerical_summary, headers='keys', tablefmt='grid'))

# Print formatted categorical columns summary
categorical_cols = df.select_dtypes(include=['object', 'category']).columns
print(categorical_cols)
print("\nCategorical Columns Summary:\n")
print(tabulate(categorical_summary, headers='keys', tablefmt='grid'))

Index(['start_lat', 'start_lng', 'end_lat', 'end_lng'], dtype='object')
Numerical Columns Summary:

+-------+-------------------------------+-------------------------------+--------------+---------------+--------------+---------------+
|       | started_at                    | ended_at                      |    start_lat |     start_lng |      end_lat |       end_lng |
| count | 3823673                       | 3823673                       |  3.82367e+06 |   3.82367e+06 |  3.82129e+06 |   3.82129e+06 |
+-------+-------------------------------+-------------------------------+--------------+---------------+--------------+---------------+
| mean  | 2023-10-16 06:55:04.447225600 | 2023-10-16 07:09:51.799132928 | 40.7387      | -73.9727      | 40.7385      | -73.9728      |
+-------+-------------------------------+-------------------------------+--------------+---------------+--------------+---------------+
| min   | 2023-10-01 00:00:00           | 2023-10-01 00:00:17           | 40.6149   

In [5]:
df.nunique().sort_values(ascending=False)

ride_id               3823673
ended_at              1666063
started_at            1661539
start_lat              883455
start_lng              728676
end_station_name         2151
end_lat                  2124
end_lng                  2124
start_station_name       2116
end_station_id           2098
start_station_id         2063
rideable_type               2
member_casual               2
dtype: int64

In [6]:
missing_percentage = df.isnull().mean() * 100
missing_percentage_sorted = missing_percentage.sort_values(ascending=False)
print(missing_percentage_sorted.head(5))

end_station_name      0.392764
end_station_id        0.392764
start_station_name    0.151320
start_station_id      0.151320
end_lat               0.062427
dtype: float64


# Demand

In [19]:
top_stations = df['start_station_name'].value_counts().head(15).index
top_stations

Index(['W 21 St & 6 Ave', 'West St & Chambers St', 'E 41 St & Madison Ave (SE corner)', 'University Pl & E 14 St', 'Broadway & W 58 St', '11 Ave & W 41 St', 'Ave A & E 14 St', '1 Ave & E 68 St', 'Broadway & W 25 St', '7 Ave & Central Park South', '6 Ave & W 33 St', '8 Ave & W 31 St', 'W 31 St & 7 Ave', 'E 17 St & Broadway', 'E 33 St & 1 Ave'], dtype='object', name='start_station_name')

In [20]:
df_filtered = df[df['start_station_name'].isin(top_stations) & 
                 df['started_at'].dt.hour.isin([8,9])]

In [21]:
avg_trips_per_day = df_filtered.groupby('start_station_name').size() / \
      df_filtered['started_at'].dt.date.nunique()

In [31]:
print(avg_trips_per_day.sort_values(ascending=False))


start_station_name
E 41 St & Madison Ave (SE corner)    56.838710
1 Ave & E 68 St                      50.354839
W 21 St & 6 Ave                      50.225806
E 33 St & 1 Ave                      46.677419
Broadway & W 58 St                   46.419355
8 Ave & W 31 St                      45.935484
11 Ave & W 41 St                     45.193548
West St & Chambers St                39.290323
Ave A & E 14 St                      37.032258
W 31 St & 7 Ave                      36.387097
6 Ave & W 33 St                      34.806452
University Pl & E 14 St              34.225806
E 17 St & Broadway                   33.193548
7 Ave & Central Park South           32.548387
Broadway & W 25 St                   29.258065
dtype: float64


In [32]:
df_filtered['trip_duration'] = (df_filtered['ended_at'] - df_filtered['started_at']).dt.total_seconds() / 60  # duration in minutes
avg_trip_time = df_filtered.groupby('start_station_name')['trip_duration'].mean()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['trip_duration'] = (df_filtered['ended_at'] - df_filtered['started_at']).dt.total_seconds() / 60  # duration in minutes


In [34]:
avg_trip_time.sort_values(ascending=False)

start_station_name
7 Ave & Central Park South           18.317988
11 Ave & W 41 St                     15.555068
West St & Chambers St                14.435276
Broadway & W 58 St                   14.312868
8 Ave & W 31 St                      14.308684
1 Ave & E 68 St                      13.171695
W 21 St & 6 Ave                      13.006605
E 33 St & 1 Ave                      12.726422
University Pl & E 14 St              12.409881
W 31 St & 7 Ave                      12.332033
E 41 St & Madison Ave (SE corner)    12.306139
Broadway & W 25 St                   11.548585
Ave A & E 14 St                      11.288545
E 17 St & Broadway                   11.204033
6 Ave & W 33 St                       9.587010
Name: trip_duration, dtype: float64

In [35]:
# Combine the two Series into a DataFrame
combined_df = pd.concat([avg_trips_per_day, avg_trip_time], axis=1)

# Rename the columns for clarity
combined_df.columns = ['Average Trips Per Day', 'Average Trip Duration (Minutes)']

# Optionally, you can sort by one of the columns
combined_df.sort_values(by='Average Trips Per Day', ascending=False, inplace=True)

# Display the combined table
print(combined_df)

                                   Average Trips Per Day  Average Trip Duration (Minutes)
start_station_name                                                                       
E 41 St & Madison Ave (SE corner)              56.838710                        12.306139
1 Ave & E 68 St                                50.354839                        13.171695
W 21 St & 6 Ave                                50.225806                        13.006605
E 33 St & 1 Ave                                46.677419                        12.726422
Broadway & W 58 St                             46.419355                        14.312868
8 Ave & W 31 St                                45.935484                        14.308684
11 Ave & W 41 St                               45.193548                        15.555068
West St & Chambers St                          39.290323                        14.435276
Ave A & E 14 St                                37.032258                        11.288545
W 31 St & 