In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Data Summary

In [2]:
df = pd.read_csv("/Users/ishan/Downloads/202310-citibike-tripdata.csv",parse_dates=['started_at','ended_at'],low_memory=False)

In [3]:
print(df.info())
df.head(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3823673 entries, 0 to 3823672
Data columns (total 13 columns):
 #   Column              Dtype         
---  ------              -----         
 0   ride_id             object        
 1   rideable_type       object        
 2   started_at          datetime64[ns]
 3   ended_at            datetime64[ns]
 4   start_station_name  object        
 5   start_station_id    object        
 6   end_station_name    object        
 7   end_station_id      object        
 8   start_lat           float64       
 9   start_lng           float64       
 10  end_lat             float64       
 11  end_lng             float64       
 12  member_casual       object        
dtypes: datetime64[ns](2), float64(4), object(7)
memory usage: 379.2+ MB
None


Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,F01D2D54E9E60D6E,classic_bike,2023-10-03 02:48:38,2023-10-03 02:48:40,Columbus Pl & Atlantic Ave,4146.02,Columbus Pl & Atlantic Ave,4146.02,40.677223,-73.922792,40.67717,-73.92285,casual
1,CAE4EDBEA07001BD,classic_bike,2023-10-11 16:03:17,2023-10-11 16:45:26,Central Park West & W 85 St,7354.01,Central Park West & W 85 St,7354.01,40.78476,-73.969862,40.78476,-73.969862,casual
2,FDC34BAD31193E07,classic_bike,2023-10-11 19:57:13,2023-10-11 20:20:10,Hicks St & Montague St,4645.09,5 St & 6 Ave,3874.01,40.694974,-73.995936,40.670484,-73.98209,casual
3,DFEA5E65AE91CE2A,classic_bike,2023-10-10 20:18:22,2023-10-10 20:18:37,Atlantic Ave & Furman St,4614.04,Atlantic Ave & Furman St,4614.04,40.691669,-74.000139,40.691652,-73.999979,casual
4,48299D8BE9B55255,classic_bike,2023-10-17 16:26:58,2023-10-17 16:34:27,E 41 St & Madison Ave (SE corner),6432.1,E 58 St & 3 Ave,6762.02,40.751845,-73.979585,40.760958,-73.967245,casual


In [4]:
from tabulate import tabulate
pd.set_option('display.width', 10000)

# Summary of numerical columns
numerical_summary = df.describe()

# Summary of categorical columns
categorical_summary = df.describe(include='object')

# Print formatted numerical columns summary
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
print(numerical_cols)
print("Numerical Columns Summary:\n")
print(tabulate(numerical_summary, headers='keys', tablefmt='grid'))

# Print formatted categorical columns summary
categorical_cols = df.select_dtypes(include=['object', 'category']).columns
print(categorical_cols)
print("\nCategorical Columns Summary:\n")
print(tabulate(categorical_summary, headers='keys', tablefmt='grid'))

Index(['start_lat', 'start_lng', 'end_lat', 'end_lng'], dtype='object')
Numerical Columns Summary:

+-------+-------------------------------+-------------------------------+--------------+---------------+--------------+---------------+
|       | started_at                    | ended_at                      |    start_lat |     start_lng |      end_lat |       end_lng |
| count | 3823673                       | 3823673                       |  3.82367e+06 |   3.82367e+06 |  3.82129e+06 |   3.82129e+06 |
+-------+-------------------------------+-------------------------------+--------------+---------------+--------------+---------------+
| mean  | 2023-10-16 06:55:04.447225600 | 2023-10-16 07:09:51.799132928 | 40.7387      | -73.9727      | 40.7385      | -73.9728      |
+-------+-------------------------------+-------------------------------+--------------+---------------+--------------+---------------+
| min   | 2023-10-01 00:00:00           | 2023-10-01 00:00:17           | 40.6149   

In [5]:
df.nunique().sort_values(ascending=False)

ride_id               3823673
ended_at              1666063
started_at            1661539
start_lat              883455
start_lng              728676
end_station_name         2151
end_lat                  2124
end_lng                  2124
start_station_name       2116
end_station_id           2098
start_station_id         2063
rideable_type               2
member_casual               2
dtype: int64

In [6]:
missing_percentage = df.isnull().mean() * 100
missing_percentage_sorted = missing_percentage.sort_values(ascending=False)
print(missing_percentage_sorted.head(5))

end_station_name      0.392764
end_station_id        0.392764
start_station_name    0.151320
start_station_id      0.151320
end_lat               0.062427
dtype: float64


# Demand

In [7]:
top_stations = df['start_station_name'].value_counts().head(30).index
top_stations

Index(['W 21 St & 6 Ave', 'West St & Chambers St', 'E 41 St & Madison Ave (SE corner)', 'University Pl & E 14 St', 'Broadway & W 58 St', '11 Ave & W 41 St', 'Ave A & E 14 St', '1 Ave & E 68 St', 'Broadway & W 25 St', '7 Ave & Central Park South', '6 Ave & W 33 St', '8 Ave & W 31 St', 'W 31 St & 7 Ave', 'E 17 St & Broadway', 'E 33 St & 1 Ave', 'W 41 St & 8 Ave', '12 Ave & W 40 St', 'Central Park S & 6 Ave', 'W 30 St & 10 Ave', 'Cooper Square & Astor Pl', '2 Ave & E 29 St', 'W 20 St & 10 Ave', 'West St & Liberty St', 'W 22 St & 10 Ave', 'Broadway & E 14 St', '4 Ave & E 12 St', 'Cleveland Pl & Spring St', 'W 13 St & 5 Ave', '6 Ave & W 34 St', '8 Ave & W 33 St'], dtype='object', name='start_station_name')

In [8]:
df_filtered = df[df['start_station_name'].isin(top_stations) & 
                 df['started_at'].dt.hour.isin([8,9,10,11])]

In [9]:
avg_trips_per_day = df_filtered.groupby('start_station_name').size() / \
      df_filtered['started_at'].dt.date.nunique()

In [10]:
print(avg_trips_per_day.sort_values(ascending=False))


start_station_name
W 21 St & 6 Ave                      95.322581
E 41 St & Madison Ave (SE corner)    84.967742
Broadway & W 58 St                   82.096774
12 Ave & W 40 St                     81.064516
1 Ave & E 68 St                      77.548387
11 Ave & W 41 St                     76.419355
University Pl & E 14 St              76.064516
E 33 St & 1 Ave                      75.709677
7 Ave & Central Park South           74.935484
West St & Chambers St                71.645161
W 20 St & 10 Ave                     70.967742
8 Ave & W 31 St                      70.935484
Ave A & E 14 St                      69.354839
2 Ave & E 29 St                      68.129032
W 22 St & 10 Ave                     65.548387
E 17 St & Broadway                   64.612903
W 30 St & 10 Ave                     63.032258
West St & Liberty St                 62.322581
W 31 St & 7 Ave                      61.193548
4 Ave & E 12 St                      60.741935
Broadway & W 25 St                   60.4

In [11]:
df_filtered['trip_duration'] = (df_filtered['ended_at'] - df_filtered['started_at']).dt.total_seconds() / 60  # duration in minutes
avg_trip_time = df_filtered.groupby('start_station_name')['trip_duration'].mean()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['trip_duration'] = (df_filtered['ended_at'] - df_filtered['started_at']).dt.total_seconds() / 60  # duration in minutes


In [12]:
avg_trip_time.sort_values(ascending=False)

start_station_name
Central Park S & 6 Ave               32.758641
7 Ave & Central Park South           29.742072
12 Ave & W 40 St                     21.552507
Broadway & W 58 St                   18.776555
West St & Chambers St                16.244229
8 Ave & W 31 St                      16.032030
8 Ave & W 33 St                      15.840541
11 Ave & W 41 St                     14.812474
West St & Liberty St                 14.678899
1 Ave & E 68 St                      14.050104
6 Ave & W 34 St                      14.041748
W 41 St & 8 Ave                      14.001363
W 20 St & 10 Ave                     13.693076
Broadway & W 25 St                   13.668411
W 31 St & 7 Ave                      13.152618
W 22 St & 10 Ave                     12.683645
E 33 St & 1 Ave                      12.643147
E 41 St & Madison Ave (SE corner)    12.508618
W 30 St & 10 Ave                     12.373968
Cooper Square & Astor Pl             11.909130
W 21 St & 6 Ave                      11.8

In [17]:
# Combine the two Series into a DataFrame
combined_df = pd.concat([avg_trips_per_day, avg_trip_time], axis=1)

# Rename the columns for clarity
combined_df.columns = ['Average Trips Per Day', 'Average Trip Duration (Minutes)']

# Optionally, you can sort by one of the columns
combined_df.sort_values(by='Average Trips Per Day', ascending=False, inplace=True)

locations = [
    "E 17 St & Broadway", "W 21 St & 6 Ave", "West St & Chambers St",
    "7 Ave & Central Park South", "8 Ave & W 31 St", "Central Park S & 6 Ave",
    "University Pl & E 14 St", "W 20 St & 10 Ave", "E 41 St & Madison Ave (SE corner)",
    "West St & Liberty St"
]
filtered_combined_df = combined_df.loc[locations]

# Display the combined table
print(filtered_combined_df)

filtered_combined_df.to_csv('stations_demand_avg_time.csv', index=True)


                                   Average Trips Per Day  Average Trip Duration (Minutes)
start_station_name                                                                       
E 17 St & Broadway                             64.612903                        11.594109
W 21 St & 6 Ave                                95.322581                        11.880333
West St & Chambers St                          71.645161                        16.244229
7 Ave & Central Park South                     74.935484                        29.742072
8 Ave & W 31 St                                70.935484                        16.032030
Central Park S & 6 Ave                         60.225806                        32.758641
University Pl & E 14 St                        76.064516                        10.515720
W 20 St & 10 Ave                               70.967742                        13.693076
E 41 St & Madison Ave (SE corner)              84.967742                        12.508618
West St & 

In [18]:
import folium
from folium.features import DivIcon

# Latitude and Longitude for the updated list of stations
stations = {
    "E 17 St & Broadway": {"lat": 40.737050, "lon": -73.990093},
    "W 21 St & 6 Ave": {"lat": 40.741740, "lon": -73.994156},
    "West St & Chambers St": {"lat": 40.717548, "lon": -74.013221},
    "7 Ave & Central Park South": {"lat": 40.766368, "lon": -73.977688},
    "8 Ave & W 31 St": {"lat": 40.750020, "lon": -73.994760},
    "Central Park S & 6 Ave": {"lat": 40.765909, "lon": -73.976342},
    "University Pl & E 14 St": {"lat": 40.734927, "lon": -73.992005},
    "W 20 St & 10 Ave": {"lat": 40.746745, "lon": -74.007756},
    "E 41 St & Madison Ave (SE corner)": {"lat": 40.752722, "lon": -73.977987},
    "West St & Liberty St": {"lat": 40.711444, "lon": -74.014847}
}

# Create a map
m = folium.Map(location=[40.741895, -73.989308], zoom_start=13)

# Add markers with station names
for station, coord in stations.items():
    folium.Marker([coord['lat'], coord['lon']]).add_to(m)
    folium.map.Marker(
        [coord['lat'], coord['lon']],
        icon=DivIcon(
            icon_size=(150,36),
            icon_anchor=(0,0),
            html=f'<div style="font-size: 12pt">{station}</div>',
            )
        ).add_to(m)

# Display the map
m


In [21]:
import pandas as pd
import numpy as np
from math import radians, cos, sin, asin, sqrt

# Haversine formula to calculate the distance between two lat/lon points
def haversine(lon1, lat1, lon2, lat2):
    # Convert decimal degrees to radians
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])

    # Haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a))
    r = 6371  # Radius of Earth in kilometers
    return c * r

# Stations with their latitude and longitude
stations = {
    "E 17 St & Broadway": {"lat": 40.737050, "lon": -73.990093},
    "W 21 St & 6 Ave": {"lat": 40.741740, "lon": -73.994156},
    "West St & Chambers St": {"lat": 40.717548, "lon": -74.013221},
    "7 Ave & Central Park South": {"lat": 40.766368, "lon": -73.977688},
    "8 Ave & W 31 St": {"lat": 40.750020, "lon": -73.994760},
    "Central Park S & 6 Ave": {"lat": 40.765909, "lon": -73.976342},
    "University Pl & E 14 St": {"lat": 40.734927, "lon": -73.992005},
    "W 20 St & 10 Ave": {"lat": 40.746745, "lon": -74.007756},
    "E 41 St & Madison Ave (SE corner)": {"lat": 40.752722, "lon": -73.977987},
    "West St & Liberty St": {"lat": 40.711444, "lon": -74.014847}
}


# Average biking speed in km/h
average_speed_kmh = 15

# List to store the data
data = []

# Calculate distances and estimated bike time for each station pair
for from_station, from_coords in stations.items():
    for to_station, to_coords in stations.items():
        if from_station != to_station:
            distance = haversine(from_coords['lon'], from_coords['lat'], to_coords['lon'], to_coords['lat'])
            time_minutes = (distance / average_speed_kmh) * 60  # Convert hours to minutes
            data.append({'From': from_station, 'To': to_station, 'Distance (km)': distance, 'Estimated Time (min)': time_minutes})

# Create DataFrame from the list
distances_df = pd.DataFrame(data)

# Display the DataFrame
print(distances_df)

distances_df.to_csv('stations_time.csv', index=True)



                    From                                 To  Distance (km)  Estimated Time (min)
0     E 17 St & Broadway                    W 21 St & 6 Ave       0.623814              2.495255
1     E 17 St & Broadway              West St & Chambers St       2.915602             11.662407
2     E 17 St & Broadway         7 Ave & Central Park South       3.423387             13.693548
3     E 17 St & Broadway                    8 Ave & W 31 St       1.494832              5.979326
4     E 17 St & Broadway             Central Park S & 6 Ave       3.411631             13.646525
..                   ...                                ...            ...                   ...
85  West St & Liberty St                    8 Ave & W 31 St       4.611313             18.445253
86  West St & Liberty St             Central Park S & 6 Ave       6.870386             27.481545
87  West St & Liberty St            University Pl & E 14 St       3.244018             12.976074
88  West St & Liberty St      