In [23]:
import pandas as pd

Read the "bus_clean.csv" file

In [24]:
# Columns that should be parsed into datetime64 while the CSV is read.
bus_date_cols = ["service_date", "scheduled_dt", "actual_dt"]
bus = pd.read_csv("processdata/bus_clean.csv", parse_dates=bus_date_cols)
# Sanity check: (rows, columns) of the loaded table.
print("Loaded bus_clean.csv:", bus.shape)

Loaded bus_clean.csv: (54144994, 13)


In [25]:
# Read one weather file per year, then stack them into a single table
# with a fresh 0..n-1 index.
weather_files = ["rawdata/2023bostonweather.csv", "rawdata/2024bostonweather.csv"]
weather_2023, weather_2024 = (pd.read_csv(path) for path in weather_files)

weather_raw = pd.concat([weather_2023, weather_2024], ignore_index=True)
# Sanity check: (rows, columns) of the combined table.
print("Loaded weather data:", weather_raw.shape)

Loaded weather data: (17544, 22)


Remove useless columns

In [26]:
# Collect every column whose name ends in "_source", plus "wpgt".
cols_to_drop = []
for col in weather_raw.columns:
    if col.endswith("_source"):
        cols_to_drop.append(col)
cols_to_drop.append("wpgt")
# errors="ignore" keeps this cell re-runnable once the columns are already gone.
weather_raw = weather_raw.drop(columns=cols_to_drop, errors="ignore")

Create hour-level timestamp to help merge with "bus_clean.csv"

In [27]:
# Assemble one datetime per row from the separate year/month/day/hour columns.
date_parts = weather_raw[["year", "month", "day", "hour"]]
weather_raw["timestamp_hour"] = pd.to_datetime(date_parts)

Select the columns we want to keep as potential features.

In [28]:
# Weather measurements retained as candidate features.
weather_feature_cols = ["temp", "rhum", "prcp", "wdir", "wspd", "pres", "cldc", "coco"]
# The merge key comes first, followed by the feature columns.
keep_weather_cols = ["timestamp_hour", *weather_feature_cols]
# Take an explicit copy so later edits to `weather` never touch `weather_raw`.
weather = weather_raw.loc[:, keep_weather_cols].copy()

Rename columns

In [29]:
# Short source column name -> descriptive column name.
weather_column_names = dict(
    temp="air_temp_c",
    rhum="rel_humidity_pct",
    prcp="precip_mm",
    wdir="wind_dir_deg",
    wspd="wind_speed_kmh",
    pres="pressure_hpa",
    cldc="cloud_cover",
    coco="weather_condition",
)
weather = weather.rename(columns=weather_column_names)

Map the numeric codes in "weather_condition" to descriptive weather condition names

In [30]:
# Lookup table: numeric weather-condition code -> human-readable label.
weather_condition_map = {
    1: "Clear",
    2: "Fair",
    3: "Cloudy",
    4: "Overcast",
    5: "Fog",
    6: "Freezing Fog",
    7: "Light Rain",
    8: "Rain",
    9: "Heavy Rain",
    10: "Freezing Rain",
    11: "Heavy Freezing Rain",
    12: "Sleet",
    13: "Heavy Sleet",
    14: "Light Snowfall",
    15: "Snowfall",
    16: "Heavy Snowfall",
    17: "Rain Shower",
    18: "Heavy Rain Shower",
    19: "Sleet Shower",
    20: "Heavy Sleet Shower",
    21: "Snow Shower",
    22: "Heavy Snow Shower",
    23: "Lightning",
    24: "Hail",
    25: "Thunderstorm",
    26: "Heavy Thunderstorm",
    27: "Storm"
}

# Apply mapping.
# Map from the untouched numeric codes in weather_raw["coco"] rather than from
# weather["weather_condition"] itself: mapping the column onto itself is not
# idempotent, because on a second run the already-mapped names are not keys of
# the dict and every value becomes NaN. `weather` was built as a copy of
# weather_raw's columns, so both share the same index and the assignment aligns
# row for row. Codes absent from the map (or missing codes) still become NaN.
weather["weather_condition"] = weather_raw["coco"].map(weather_condition_map)

# Check results
print(weather["weather_condition"].head(10))

0    Light Rain
1    Light Rain
2    Light Rain
3    Light Rain
4    Light Rain
5    Light Rain
6    Light Rain
7    Light Rain
8      Overcast
9      Overcast
Name: weather_condition, dtype: object


In [31]:
# Preview the first five rows of the prepared weather table (head() defaults to 5).
weather.head()

Unnamed: 0,timestamp_hour,air_temp_c,rel_humidity_pct,precip_mm,wind_dir_deg,wind_speed_kmh,pressure_hpa,cloud_cover,weather_condition
0,2023-01-01 00:00:00,11.7,100,1.2,210,11.2,1010.4,8,Light Rain
1,2023-01-01 01:00:00,11.1,100,0.3,190,11.2,1008.9,8,Light Rain
2,2023-01-01 02:00:00,11.7,100,0.5,210,13.0,1008.2,8,Light Rain
3,2023-01-01 03:00:00,11.1,100,1.8,200,11.2,1007.2,8,Light Rain
4,2023-01-01 04:00:00,11.1,100,2.3,190,11.2,1006.2,8,Light Rain


In [32]:
# Preview the first five rows of the bus table (head() defaults to 5).
bus.head()

Unnamed: 0,service_date,hour,weekday,is_weekend,route_id,direction_id,stop_id,time_point_order,point_type,scheduled_dt,actual_dt,delay_seconds,delay_minutes
0,2023-01-01,6,6,1,1,Inbound,110,1,Startpoint,2023-01-01 06:05:00,2023-01-01 06:05:04,4.0,0.066667
1,2023-01-01,6,6,1,1,Inbound,67,2,Midpoint,2023-01-01 06:09:00,2023-01-01 06:06:28,-152.0,-2.533333
2,2023-01-01,6,6,1,1,Inbound,72,3,Midpoint,2023-01-01 06:12:00,2023-01-01 06:08:57,-183.0,-3.05
3,2023-01-01,6,6,1,1,Inbound,75,4,Midpoint,2023-01-01 06:15:00,2023-01-01 06:12:41,-139.0,-2.316667
4,2023-01-01,6,6,1,1,Inbound,79,5,Midpoint,2023-01-01 06:19:00,2023-01-01 06:16:35,-145.0,-2.416667


Merge bus_clean table and weather table

In [33]:
# Floor each actual event time to the hour so it matches the hourly weather key.
# Rows whose actual_dt is missing get a missing event_hour and therefore no
# weather match in the left join below.
# NOTE(review): this assumes the weather timestamps and actual_dt are expressed
# in the same time zone — confirm against the weather data source.
bus["event_hour"] = bus["actual_dt"].dt.floor("h")

# Left join keeps every bus row. validate="many_to_one" makes pandas raise a
# MergeError if any timestamp_hour appears more than once in `weather`, instead
# of silently duplicating bus rows.
bus_weather = pd.merge(
    bus,
    weather,
    left_on="event_hour",
    right_on="timestamp_hour",
    how="left",
    validate="many_to_one",
)
# timestamp_hour duplicates event_hour after the join, so drop it.
bus_weather = bus_weather.drop(columns=["timestamp_hour"])

View the new table and save it

In [34]:
# Report the merged table's dimensions, then show its first ten rows as text.
merged_shape = bus_weather.shape
print("Merged shape:", merged_shape)
merged_preview = bus_weather.head(10)
print(merged_preview)

Merged shape: (54144994, 22)
  service_date  hour  weekday  is_weekend route_id direction_id  stop_id  \
0   2023-01-01     6        6           1       01      Inbound      110   
1   2023-01-01     6        6           1       01      Inbound       67   
2   2023-01-01     6        6           1       01      Inbound       72   
3   2023-01-01     6        6           1       01      Inbound       75   
4   2023-01-01     6        6           1       01      Inbound       79   
5   2023-01-01     6        6           1       01      Inbound      187   
6   2023-01-01     6        6           1       01      Inbound       59   
7   2023-01-01     6        6           1       01      Inbound      110   
8   2023-01-01     6        6           1       01      Inbound       67   
9   2023-01-01     6        6           1       01      Inbound       62   

   time_point_order  point_type        scheduled_dt  ... delay_minutes  \
0                 1  Startpoint 2023-01-01 06:05:00  ...    

Check missing values

In [35]:
# Count missing values in each column of the merged table.
missing_per_column = bus_weather.isna().sum()
missing_per_column

service_date               0
hour                       0
weekday                    0
is_weekend                 0
route_id                   0
direction_id               0
stop_id                    0
time_point_order           0
point_type                 0
scheduled_dt               0
actual_dt            4283986
delay_seconds        4283986
delay_minutes        4283986
event_hour           4283986
air_temp_c           4283986
rel_humidity_pct     4283986
precip_mm            7739966
wind_dir_deg         4283986
wind_speed_kmh       4283986
pressure_hpa         4283986
cloud_cover          4283986
weather_condition    4283986
dtype: int64

Drop rows that contain missing values

In [36]:
# Keep only rows with no missing value in any column.
# Note: precip_mm has more missing values than the other columns, so some rows
# are removed for missing precipitation alone.
bus_weather_clean = bus_weather.dropna(axis=0, how="any")
# Sanity check: (rows, columns) remaining after the drop.
clean_shape = bus_weather_clean.shape
print(clean_shape)

(46405028, 22)


In [37]:
# Preview the first ten rows of the cleaned, merged table.
bus_weather_clean.head(n=10)

Unnamed: 0,service_date,hour,weekday,is_weekend,route_id,direction_id,stop_id,time_point_order,point_type,scheduled_dt,...,delay_minutes,event_hour,air_temp_c,rel_humidity_pct,precip_mm,wind_dir_deg,wind_speed_kmh,pressure_hpa,cloud_cover,weather_condition
0,2023-01-01,6,6,1,1,Inbound,110,1,Startpoint,2023-01-01 06:05:00,...,0.066667,2023-01-01 06:00:00,12.2,97.0,0.8,210.0,16.6,1004.1,8.0,Light Rain
1,2023-01-01,6,6,1,1,Inbound,67,2,Midpoint,2023-01-01 06:09:00,...,-2.533333,2023-01-01 06:00:00,12.2,97.0,0.8,210.0,16.6,1004.1,8.0,Light Rain
2,2023-01-01,6,6,1,1,Inbound,72,3,Midpoint,2023-01-01 06:12:00,...,-3.05,2023-01-01 06:00:00,12.2,97.0,0.8,210.0,16.6,1004.1,8.0,Light Rain
3,2023-01-01,6,6,1,1,Inbound,75,4,Midpoint,2023-01-01 06:15:00,...,-2.316667,2023-01-01 06:00:00,12.2,97.0,0.8,210.0,16.6,1004.1,8.0,Light Rain
4,2023-01-01,6,6,1,1,Inbound,79,5,Midpoint,2023-01-01 06:19:00,...,-2.416667,2023-01-01 06:00:00,12.2,97.0,0.8,210.0,16.6,1004.1,8.0,Light Rain
5,2023-01-01,6,6,1,1,Inbound,187,6,Midpoint,2023-01-01 06:21:00,...,0.65,2023-01-01 06:00:00,12.2,97.0,0.8,210.0,16.6,1004.1,8.0,Light Rain
6,2023-01-01,6,6,1,1,Inbound,59,7,Midpoint,2023-01-01 06:25:00,...,-1.066667,2023-01-01 06:00:00,12.2,97.0,0.8,210.0,16.6,1004.1,8.0,Light Rain
7,2023-01-01,6,6,1,1,Inbound,110,1,Startpoint,2023-01-01 06:25:00,...,-0.283333,2023-01-01 06:00:00,12.2,97.0,0.8,210.0,16.6,1004.1,8.0,Light Rain
8,2023-01-01,6,6,1,1,Inbound,67,2,Midpoint,2023-01-01 06:29:00,...,-2.016667,2023-01-01 06:00:00,12.2,97.0,0.8,210.0,16.6,1004.1,8.0,Light Rain
9,2023-01-01,6,6,1,1,Inbound,62,8,Midpoint,2023-01-01 06:29:00,...,-0.516667,2023-01-01 06:00:00,12.2,97.0,0.8,210.0,16.6,1004.1,8.0,Light Rain


In [38]:
# Summary statistics for the numeric and datetime columns of the cleaned table.
clean_summary = bus_weather_clean.describe()
clean_summary

Unnamed: 0,service_date,hour,weekday,is_weekend,stop_id,time_point_order,scheduled_dt,actual_dt,delay_seconds,delay_minutes,event_hour,air_temp_c,rel_humidity_pct,precip_mm,wind_dir_deg,wind_speed_kmh,pressure_hpa,cloud_cover
count,46405028,46405030.0,46405030.0,46405030.0,46405030.0,46405030.0,46405028,46405028,46405030.0,46405030.0,46405028,46405030.0,46405030.0,46405030.0,46405030.0,46405030.0,46405030.0,46405030.0
mean,2024-01-03 12:18:38.215103232,13.07246,2.676608,0.1965832,12664.93,4.353983,2024-01-04 01:52:16.887102976,2024-01-04 01:52:55.204860672,38.31776,0.6386293,2024-01-04 01:22:57.724167680,12.30523,66.86836,0.1404955,198.6976,16.96746,1015.882,4.789645
min,2023-01-01 00:00:00,0.0,0.0,0.0,2.0,1.0,2023-01-01 00:00:00,2023-01-01 00:00:02,-86340.0,-1439.0,2023-01-01 00:00:00,-22.8,13.0,0.0,0.0,0.0,979.0,0.0
25%,2023-07-03 00:00:00,8.0,1.0,0.0,1323.0,2.0,2023-07-03 11:22:00,2023-07-03 11:22:24,26.0,0.4333333,2023-07-03 11:00:00,5.0,53.0,0.0,110.0,11.2,1010.7,2.0
50%,2024-01-05 00:00:00,14.0,3.0,0.0,5098.0,4.0,2024-01-05 08:24:00,2024-01-05 08:24:50,170.0,2.833333,2024-01-05 08:00:00,11.7,68.0,0.0,220.0,16.6,1016.0,5.0
75%,2024-07-05 00:00:00,18.0,4.0,0.0,12003.0,6.0,2024-07-05 22:20:00,2024-07-05 22:19:12.249999872,398.0,6.633333,2024-07-05 22:00:00,20.0,83.0,0.0,290.0,22.3,1021.5,8.0
max,2024-12-31 00:00:00,23.0,6.0,1.0,883321.0,28.0,2024-12-31 23:59:00,2024-12-31 23:59:59,86399.0,1439.983,2024-12-31 23:00:00,36.7,100.0,21.8,360.0,79.6,1047.3,8.0
std,,6.094504,1.864209,0.3974144,21491.26,2.538687,,,4920.911,82.01519,,9.106053,19.56994,0.8193298,103.6883,8.086031,8.492587,2.632038


Save file

In [39]:
# Write the cleaned table to disk without the row index.
output_path = "processdata/bus_weather_clean.csv"
bus_weather_clean.to_csv(output_path, index=False)
print("Saved as bus_weather_clean.csv")

Saved as bus_weather_clean.csv
