# import

In [1]:
import os
import zipfile
import pandas as pd
# import geopandas as gpd
# from shapely.geometry import Point
# print(gpd.__version__)


# data combine
combine data from 2024.1-2024.12

In [2]:
# without checking datetime
# Setting the data folder path
data_folder = 'datasets'
output_file = 'bike_2024_combined.csv'

# Create an empty DataFrame for merging all data
combined_df = pd.DataFrame()
column_standard = None  # Used to standardize field names

# Iterate over all zip files in the destination folder
for file in sorted(os.listdir(data_folder)):
    if file.startswith('JC-2024'):
        if file.endswith('.zip') or file.endswith('.csv.zip'):
            zip_path = os.path.join(data_folder, file)
            print(f"Processing: {file}")

            with zipfile.ZipFile(zip_path, 'r') as z:
                # Fetch the first csv file
                csv_name = [f for f in z.namelist() if f.endswith('.csv')][0]
                with z.open(csv_name) as f:
                    df = pd.read_csv(f)

                    # Initialize standard column names (only the first file is taken as a baseline) Take out the first csv file
                    if column_standard is None:
                        column_standard = df.columns.tolist()
                    else:
                        # Trying to rename inconsistent fields
                        if set(df.columns) != set(column_standard):
                            print(f"Column mismatch in {file}")
                            continue

                    combined_df = pd.concat([combined_df, df], ignore_index=True)

# Save the merged file
combined_df.to_csv(output_file, index=False)
output_file


Processing: JC-202401-citibike-tripdata.csv.zip
Processing: JC-202402-citibike-tripdata.csv.zip
Processing: JC-202403-citibike-tripdata.csv.zip
Processing: JC-202404-citibike-tripdata.csv.zip
Processing: JC-202405-citibike-tripdata.csv.zip
Processing: JC-202406-citibike-tripdata.csv.zip
Processing: JC-202407-citibike-tripdata.csv.zip
Processing: JC-202408-citibike-tripdata.csv.zip
Processing: JC-202409-citibike-tripdata.csv.zip
Processing: JC-202410-citibike-tripdata.csv.zip
Processing: JC-202411-citibike-tripdata.csv.zip
Processing: JC-202412-citibike-tripdata.csv.zip


'bike_2024_combined.csv'

# Data clean

In [3]:
df = pd.read_csv("bike_2024_combined.csv", parse_dates=False) # Avoid pandas automatically parsing dates

df.head()


Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,0744109F13385D1D,electric_bike,2024-01-15 15:18:07,2024-01-15 15:32:44,Morris Canal,JC072,Oakland Ave,JC022,40.712297,-74.038185,40.737604,-74.052478,member
1,B1488BFEF9118000,classic_bike,2024-01-13 15:32:50,2024-01-13 15:36:18,JC Medical Center,JC110,Grove St PATH,JC115,40.715391,-74.049692,40.71941,-74.04309,member
2,95A2FE8E51B4C836,classic_bike,2024-01-19 13:11:00,2024-01-19 13:14:44,Morris Canal,JC072,Exchange Pl,JC116,40.712419,-74.038526,40.716366,-74.034344,member
3,95D9AFF6A1652DC1,classic_bike,2024-01-23 07:03:49,2024-01-23 07:07:11,Morris Canal,JC072,Exchange Pl,JC116,40.712419,-74.038526,40.716366,-74.034344,member
4,5F7408988A83B1B3,classic_bike,2024-01-01 16:46:10,2024-01-01 16:50:31,Morris Canal,JC072,Harborside,JC104,40.712419,-74.038526,40.719252,-74.034234,member


In [4]:
df.dtypes


ride_id                object
rideable_type          object
started_at             object
ended_at               object
start_station_name     object
start_station_id       object
end_station_name       object
end_station_id         object
start_lat             float64
start_lng             float64
end_lat               float64
end_lng               float64
member_casual          object
dtype: object

In [5]:
rows, cols = df.shape
print(f"rows: {rows}, columns: {cols}")


rows: 1052451, columns: 13


## Check whether this station is located in Manhattan
- start_in_manhattan: If or not the start point is in Manhattan.
- end_in_manhattan: whether the end point is in Manhattan 
- out_of_manhattan: At least one end of it isn't in Manhattan.
- A value of 1 means it is in Manhattan, 0 means it is not.

In [6]:
# Define Manhattan bounding box
lat_min, lat_max = 40.700, 40.880
lng_min, lng_max = -74.020, -73.910

# Check if start or end station is in Manhattan
df['start_in_manhattan'] = (
    (df['start_lat'].between(lat_min, lat_max)) &
    (df['start_lng'].between(lng_min, lng_max))
).astype(int)

df['end_in_manhattan'] = (
    (df['end_lat'].between(lat_min, lat_max)) &
    (df['end_lng'].between(lng_min, lng_max))
).astype(int)

df['out_of_manhattan'] = 1 - (
    df['start_in_manhattan'] & df['end_in_manhattan']
)

## Clean started_at and ended at

In [7]:
# convert started_at column to datetime type
df['started_at_parsed'] = pd.to_datetime(df['started_at'], errors='coerce')

# extract year, month, and day from the started_at_parsed column
df['Started_Year'] = df['started_at_parsed'].dt.year
df['Started_Month'] = df['started_at_parsed'].dt.month
df['Started_Day'] = df['started_at_parsed'].dt.day
df['Started_Hour'] = df['started_at_parsed'].dt.hour

# extract season based on the month
df['Started_Season'] = df['Started_Month'].map({
    12: 'Winter', 1: 'Winter', 2: 'Winter',
    3: 'Spring', 4: 'Spring', 5: 'Spring',
    6: 'Summer', 7: 'Summer', 8: 'Summer',
    9: 'Fall', 10: 'Fall', 11: 'Fall'
})

# extract weekday name from the started_at_parsed column
df['Started_Weekday'] = df['started_at_parsed'].dt.day_name()

# df.to_csv('bike_2024_combined_cleaned.csv', index=False)


In [8]:
# convert ended_at column to datetime type
df['ended_at_parsed'] = pd.to_datetime(df['ended_at'], errors='coerce')

# extract year, month, and day from the ended_at_parsed column
df['Ended_Year'] = df['ended_at_parsed'].dt.year
df['Ended_Month'] = df['ended_at_parsed'].dt.month
df['Ended_Day'] = df['ended_at_parsed'].dt.day
df['Ended_Hour'] = df['ended_at_parsed'].dt.hour

# extract season based on the month
df['Ended_Season'] = df['Ended_Month'].map({
    12: 'Winter', 1: 'Winter', 2: 'Winter',
    3: 'Spring', 4: 'Spring', 5: 'Spring',
    6: 'Summer', 7: 'Summer', 8: 'Summer',
    9: 'Fall', 10: 'Fall', 11: 'Fall'
})

# extract weekday name from the ended_at_parsed column
df['Ended_Weekday'] = df['ended_at_parsed'].dt.day_name()


In [9]:
df.dtypes

ride_id                       object
rideable_type                 object
started_at                    object
ended_at                      object
start_station_name            object
start_station_id              object
end_station_name              object
end_station_id                object
start_lat                    float64
start_lng                    float64
end_lat                      float64
end_lng                      float64
member_casual                 object
start_in_manhattan             int64
end_in_manhattan               int64
out_of_manhattan               int64
started_at_parsed     datetime64[ns]
Started_Year                 float64
Started_Month                float64
Started_Day                  float64
Started_Hour                 float64
Started_Season                object
Started_Weekday               object
ended_at_parsed       datetime64[ns]
Ended_Year                   float64
Ended_Month                  float64
Ended_Day                    float64
E

## Clean rideable_type

In [10]:
df['rideable_type'].dropna().unique().tolist()

['electric_bike', 'classic_bike']

## clean station_name and station_id 

### check the exception values of clean start_station_name and start_station_id 

In [11]:
# df['start_station_name'].dropna().unique().tolist()
# sorted(df['start_station_name'].dropna().unique().tolist())


In [12]:
# df['start_station_id'].dropna().unique().tolist()

In [13]:
# --- Step 1: Find station names linked to multiple IDs ---
name_to_ids = df.groupby('start_station_name')['start_station_id'].nunique()
conflict_names = name_to_ids[name_to_ids > 1]

# --- Step 2: Find station IDs linked to multiple names ---
id_to_names = df.groupby('start_station_id')['start_station_name'].nunique()
conflict_ids = id_to_names[id_to_names > 1]

# --- Step 3: Print mismatched name → ID mappings ---
print("Station names linked to multiple IDs:")
print(df[df['start_station_name'].isin(conflict_names.index)][['start_station_name', 'start_station_id']].drop_duplicates())

# --- Step 4: Print mismatched ID → name mappings ---
print("\nStation IDs linked to multiple names:")
print(df[df['start_station_id'].isin(conflict_ids.index)][['start_station_id', 'start_station_name']].drop_duplicates())


Station names linked to multiple IDs:
Empty DataFrame
Columns: [start_station_name, start_station_id]
Index: []

Station IDs linked to multiple names:
Empty DataFrame
Columns: [start_station_id, start_station_name]
Index: []


### check the exception values of end_station_name and end_station_id

In [14]:
# df['end_station_name'].dropna().unique().tolist()

In [15]:
# df['end_station_id'].dropna().unique().tolist()

In [16]:
# --- Step 5: Find end station names linked to multiple IDs ---
end_name_to_ids = df.groupby('end_station_name')['end_station_id'].nunique()
end_conflict_names = end_name_to_ids[end_name_to_ids > 1]

# --- Step 6: Find end station IDs linked to multiple names ---
end_id_to_names = df.groupby('end_station_id')['end_station_name'].nunique()
end_conflict_ids = end_id_to_names[end_id_to_names > 1]

# --- Step 7: Print mismatched end name → ID mappings ---
print("\nEnd station names linked to multiple IDs:")
print(df[df['end_station_name'].isin(end_conflict_names.index)][['end_station_name', 'end_station_id']].drop_duplicates())

# --- Step 8: Print mismatched end ID → name mappings ---
print("\nEnd station IDs linked to multiple names:")
print(df[df['end_station_id'].isin(end_conflict_ids.index)][['end_station_id', 'end_station_name']].drop_duplicates())


End station names linked to multiple IDs:
          end_station_name end_station_id
123816  E 17 St & Broadway        5980.07
180093     W 54 St & 9 Ave        6920.03
671474     W 54 St & 9 Ave        6920.05
822560  E 17 St & Broadway        5980.10

End station IDs linked to multiple names:
       end_station_id           end_station_name
114909        5772.05  Morton St & Washington St
906473        5772.05   Morton St & Greenwich St


### clean those stations

In [17]:
# Convert station IDs to string for safe comparison
df['start_station_id'] = df['start_station_id'].astype(str)
df['end_station_id'] = df['end_station_id'].astype(str)

# Normalize station names for known issues
replace_dict = {
    'W 180 St & Ft Washington Ave': 'W 180 St & Fort Washington Ave',
    'Morton St & Greenwich St': 'Morton St & Washington St',
    'Central Park W & W 76 St': 'Central Park West & W 76 St',
    'Monmouth & 6th': 'Monmouth and 6th'
}
df['start_station_name'] = df['start_station_name'].replace(replace_dict)
df['end_station_name'] = df['end_station_name'].replace(replace_dict)

# For station names mapped to multiple IDs: keep smallest by string order
start_name_to_id = (
    df.groupby('start_station_name')['start_station_id']
    .apply(lambda x: sorted(x.unique())[0])
)
end_name_to_id = (
    df.groupby('end_station_name')['end_station_id']
    .apply(lambda x: sorted(x.unique())[0])
)

df['start_station_id'] = df['start_station_name'].map(start_name_to_id)
df['end_station_id'] = df['end_station_name'].map(end_name_to_id)


### check whether exceptions still exist:

In [18]:
# --- Step 1: Find station names linked to multiple IDs ---
name_to_ids = df.groupby('start_station_name')['start_station_id'].nunique()
conflict_names = name_to_ids[name_to_ids > 1]

# --- Step 2: Find station IDs linked to multiple names ---
id_to_names = df.groupby('start_station_id')['start_station_name'].nunique()
conflict_ids = id_to_names[id_to_names > 1]

# --- Step 3: Print mismatched name → ID mappings ---
print("Station names linked to multiple IDs:")
print(df[df['start_station_name'].isin(conflict_names.index)][['start_station_name', 'start_station_id']].drop_duplicates())

# --- Step 4: Print mismatched ID → name mappings ---
print("\nStation IDs linked to multiple names:")
print(df[df['start_station_id'].isin(conflict_ids.index)][['start_station_id', 'start_station_name']].drop_duplicates())


Station names linked to multiple IDs:
Empty DataFrame
Columns: [start_station_name, start_station_id]
Index: []

Station IDs linked to multiple names:
Empty DataFrame
Columns: [start_station_id, start_station_name]
Index: []


In [19]:
# --- Step 5: Find end station names linked to multiple IDs ---
end_name_to_ids = df.groupby('end_station_name')['end_station_id'].nunique()
end_conflict_names = end_name_to_ids[end_name_to_ids > 1]

# --- Step 6: Find end station IDs linked to multiple names ---
end_id_to_names = df.groupby('end_station_id')['end_station_name'].nunique()
end_conflict_ids = end_id_to_names[end_id_to_names > 1]

# --- Step 7: Print mismatched end name → ID mappings ---
print("\nEnd station names linked to multiple IDs:")
print(df[df['end_station_name'].isin(end_conflict_names.index)][['end_station_name', 'end_station_id']].drop_duplicates())

# --- Step 8: Print mismatched end ID → name mappings ---
print("\nEnd station IDs linked to multiple names:")
print(df[df['end_station_id'].isin(end_conflict_ids.index)][['end_station_id', 'end_station_name']].drop_duplicates())


End station names linked to multiple IDs:
Empty DataFrame
Columns: [end_station_name, end_station_id]
Index: []

End station IDs linked to multiple names:
Empty DataFrame
Columns: [end_station_id, end_station_name]
Index: []


In [20]:
df.dtypes

ride_id                       object
rideable_type                 object
started_at                    object
ended_at                      object
start_station_name            object
start_station_id              object
end_station_name              object
end_station_id                object
start_lat                    float64
start_lng                    float64
end_lat                      float64
end_lng                      float64
member_casual                 object
start_in_manhattan             int64
end_in_manhattan               int64
out_of_manhattan               int64
started_at_parsed     datetime64[ns]
Started_Year                 float64
Started_Month                float64
Started_Day                  float64
Started_Hour                 float64
Started_Season                object
Started_Weekday               object
ended_at_parsed       datetime64[ns]
Ended_Year                   float64
Ended_Month                  float64
Ended_Day                    float64
E

# calculate busyness by station-hour

In [21]:
# 提取起点在曼哈顿的站点
start_stations = df[df['start_in_manhattan'] == 1][['start_station_id', 'start_station_name']]
start_stations = start_stations.rename(columns={
    'start_station_id': 'station_id',
    'start_station_name': 'station_name'
})

# 提取终点在曼哈顿的站点
end_stations = df[df['end_in_manhattan'] == 1][['end_station_id', 'end_station_name']]
end_stations = end_stations.rename(columns={
    'end_station_id': 'station_id',
    'end_station_name': 'station_name'
})

# 合并并去重
stations_in_manhattan = pd.concat([start_stations, end_stations], ignore_index=True)
stations_in_manhattan = stations_in_manhattan.dropna().drop_duplicates().sort_values(by='station_id')

# 打印所有在曼哈顿的唯一站点
print(stations_in_manhattan)

     station_id                     station_name
1       4074.14       Classon Ave & St Marks Ave
1606    4829.01  Columbia Heights & Cranberry St
572     4846.01          South St & Whitehall St
77      4883.03            Broadway & Whipple St
2299    4895.03                Front St & Jay St
...         ...                              ...
501     8381.04          W 181 St & Riverside Dr
1556    8399.06   Fort Washington Ave & W 183 St
238     8715.06              W 218 St & Broadway
448       HB201             12 St & Sinatra Dr N
17        HB611                  4 St & River St

[394 rows x 2 columns]


In [22]:
# Step 1: Filter trips where at least one end is in Manhattan
df_mh = df[(df['start_in_manhattan'] == 1) | (df['end_in_manhattan'] == 1)].copy()

# Step 2: Extract hourly timestamps
df_mh['Started_Timestamp'] = pd.to_datetime(df_mh['started_at_parsed']).dt.floor('h')
df_mh['Ended_Timestamp'] = pd.to_datetime(df_mh['ended_at_parsed']).dt.floor('h')

# Step 3: Count outflow by (start_station_id, Started_Timestamp)
outflow = (
    df_mh.groupby(['start_station_id', 'Started_Timestamp'])
    .size()
    .reset_index(name='outflow')
    .rename(columns={'start_station_id': 'station_id', 'Started_Timestamp': 'timestamp'})
)

# Step 4: Count inflow by (end_station_id, Ended_Timestamp)
inflow = (
    df_mh.groupby(['end_station_id', 'Ended_Timestamp'])
    .size()
    .reset_index(name='inflow')
    .rename(columns={'end_station_id': 'station_id', 'Ended_Timestamp': 'timestamp'})
)

# Step 5: Merge inflow and outflow on (station_id, timestamp)
busyness = pd.merge(outflow, inflow, on=['station_id', 'timestamp'], how='outer').fillna(0)

# Step 6: Optional - convert counts to integers
busyness['inflow'] = busyness['inflow'].astype(int)
busyness['outflow'] = busyness['outflow'].astype(int)

# Step 7: Save if needed
busyness.to_csv("manhattan_station_busyness.csv", index=False)

# Preview
print(busyness.head())


  station_id           timestamp  outflow  inflow
0    3431.02 2024-02-14 09:00:00        0       1
1    4074.14 2024-02-14 08:00:00        1       0
2    4846.01 2024-05-26 13:00:00        0       2
3    4883.03 2024-01-24 17:00:00        0       1
4    4927.04 2024-04-04 17:00:00        0       1


In [23]:
# Filter for trips with either end in Manhattan
df_mh = df[(df['start_in_manhattan'] == 1) | (df['end_in_manhattan'] == 1)].copy()

# Drop rows with missing hour values
df_mh = df_mh.dropna(subset=['Started_Hour', 'Ended_Hour'])

# Ensure hour columns are integer
df_mh['Started_Hour'] = df_mh['Started_Hour'].astype(int)
df_mh['Ended_Hour'] = df_mh['Ended_Hour'].astype(int)

# Group and count outflow
outflow = df_mh[df_mh['start_in_manhattan'] == 1].groupby(
    ['start_station_id', 'Started_Hour']
).size().reset_index(name='outflow').rename(
    columns={'start_station_id': 'station_id', 'Started_Hour': 'hour'}
)

# Group and count inflow
inflow = df_mh[df_mh['end_in_manhattan'] == 1].groupby(
    ['end_station_id', 'Ended_Hour']
).size().reset_index(name='inflow').rename(
    columns={'end_station_id': 'station_id', 'Ended_Hour': 'hour'}
)

# Merge inflow and outflow
busyness = pd.merge(outflow, inflow, on=['station_id', 'hour'], how='outer').fillna(0)
busyness[['inflow', 'outflow']] = busyness[['inflow', 'outflow']].astype(int)
busyness = busyness.sort_values(by=['station_id', 'hour'])

busyness.to_csv("manhattan_station_busyness_hour.csv", index=False)

# Preview
print(busyness.head())


  station_id  hour  outflow  inflow
0    4074.14     8        1       0
1    4846.01    13        0       2
2    4883.03    17        0       1
3    4927.04    17        0       1
4    4939.07    13        0       1


In [24]:
df.to_csv("bike_2024_combined_cleaned.csv", index=False)