# Railway Delays Analysis: Holidays and Weather Impact


In [2]:
# 1. Import Libraries and Load Dataset
#import pandas, load and preview .csv file
import pandas as pd
trains = pd.read_csv('path/to/file')
trains.head()     


Unnamed: 0,date,train_number,train_direction,station_name,station_order,scheduled_arrival_time,scheduled_departure_time,stop_time,actual_arrival_time,actual_departure_time,arrival_delay,departure_delay,wind,weather,temperature,major_holiday
0,2019-10-09,G1,down,Beijingnan Railway Station,1,09:00:00,09:00:00,----,09:00:00,09:00:00,0,0,light winds from the S,sunny,22,False
1,2019-10-09,G1,down,Jinanxi Railway Station,2,10:22:00,10:24:00,2,10:21:00,10:23:00,-1,-1,light winds from the S,light rain,22,False
2,2019-10-09,G1,down,Nanjingnan Railway Station,3,12:24:00,12:26:00,2,12:24:00,12:27:00,0,1,light winds from the SE,cloudy,24,False
3,2019-10-09,G1,down,Shanghaihongqiao Railway Station,4,13:28:00,13:28:00,----,13:24:00,13:24:00,-4,0,light winds from the E,cloudy,24,False
4,2019-10-10,G1,down,Beijingnan Railway Station,1,09:00:00,09:00:00,----,09:00:00,09:00:00,0,0,light winds from the E,cloudy,21,False


In [4]:
# 2. Data Preprocessing and Preparation
# check for missing values
trains.isna().sum()

date                        0
train_number                0
train_direction             0
station_name                0
station_order               0
scheduled_arrival_time      0
scheduled_departure_time    0
stop_time                   0
actual_arrival_time         0
actual_departure_time       0
arrival_delay               0
departure_delay             0
wind                        0
weather                     0
temperature                 0
major_holiday               0
dtype: int64

In [8]:
#As this is quite a large dataset, the following analysis will focus on comparing data from October (2019) to data from December (2019).

#split dataset into two dfs: 9 Oct - 31 Oct and 9 Jan - 31 Jan
oct_mask = (trains['date'] >= '2019-10-09') & (trains['date'] <= '2019-10-31') 
oct_trains = trains[oct_mask]

#drop column train_number from oct_trains
oct_trains = oct_trains.drop(['train_number'], axis = 1)
oct_trains.head()

dec_mask = (trains['date'] >= '2019-12-09') & (trains['date'] <= '2019-12-31') 
dec_trains = trains[dec_mask]

#drop column train_number from dec_trains
dec_trains = dec_trains.drop(['train_number'], axis = 1)
dec_trains.head()

Unnamed: 0,date,train_direction,station_name,station_order,scheduled_arrival_time,scheduled_departure_time,stop_time,actual_arrival_time,actual_departure_time,arrival_delay,departure_delay,wind,weather,temperature,major_holiday
152,2019-12-09,down,Beijingnan Railway Station,1,09:00:00,09:00:00,----,08:59:00,08:59:00,0,-1,light winds from the SW,haze,3,False
153,2019-12-09,down,Jinanxi Railway Station,2,10:22:00,10:24:00,2,10:20:00,10:22:00,-2,-2,light winds from the S,fog,10,False
154,2019-12-09,down,Nanjingnan Railway Station,3,12:24:00,12:26:00,2,12:28:00,12:30:00,4,4,light winds from the SW,sunny,17,False
155,2019-12-09,down,Shanghaihongqiao Railway Station,4,13:28:00,13:28:00,----,13:25:00,13:25:00,-3,0,gentle breeze from the SW,cloudy,15,False
156,2019-12-10,down,Beijingnan Railway Station,1,09:00:00,09:00:00,----,08:59:00,08:59:00,0,-1,moderate breeze from the NW,cloudy,8,False


In [9]:
# 3. Analysis of Train Rides on Holidays vs. Non-Holidays
# Do major holidays have an impact on railway delay times? 
# count number of holidays for oct and dec df
oct_nunique = oct_trains[oct_trains['major_holiday']]['date'].nunique()
oct_nunique
dec_nunique = dec_trains[dec_trains['major_holiday']]['date'].nunique()
dec_nunique

#oct_trains contains 1 major holiday, dec_trains 2 major holidays

2

In [10]:
# train rides on major holidays and train rides on regular days
oct_trains_rides = oct_trains['major_holiday'].value_counts()
oct_trains_rides

# oct_trains contains 536926 trainrides on non major holidays (23 days), averaging to 23,345 a day. 
# On the major holiday, there are 24,019 train rides. 
# The amount of train rides seems to be slightly higher on the holiday.

major_holiday
False    536926
True      24019
Name: count, dtype: int64

In [11]:
dec_trains_rides = dec_trains['major_holiday'].value_counts()
dec_trains_rides

# dec_trains contains 504321 trainrides on non major holidays (23 days), averaging to 21,927 a day. 
# On the 2 major holidays, there are 47,980 train rides, averaging to 23,990 a day.
# Again, the amount of train rides seems to be slightly higher on the holidays

major_holiday
False    504321
True      47980
Name: count, dtype: int64

In [12]:
# Groupby major_holiday and calculate average of arrival_delay and departure_delay

oct_avg_delay = oct_trains.groupby(['major_holiday']).agg({'arrival_delay': 'mean', 'departure_delay': 'mean'})
oct_avg_delay
dec_avg_delay = dec_trains.groupby(['major_holiday']).agg({'arrival_delay': 'mean', 'departure_delay': 'mean'})
dec_avg_delay

avg_delay_combined = pd.concat([oct_avg_delay, dec_avg_delay], keys=['October', 'December'])
avg_delay_combined

# In October, both holiday and non-holiday trains showed similar arrival and departure delay. In December, while both holiday and 
# non-holiday trains experienced increased arrival delays compared to October, the presence of holidays did not significantly 
# change the average delays. This suggests, that other factors may have a more significant impact on delays in October.

Unnamed: 0_level_0,Unnamed: 1_level_0,arrival_delay,departure_delay
Unnamed: 0_level_1,major_holiday,Unnamed: 2_level_1,Unnamed: 3_level_1
October,False,1.210375,-3.363763
October,True,1.124984,-3.363879
December,False,3.733118,-1.903153
December,True,2.833389,-2.753085


In [13]:
# Impact of Weather Conditions on Delays
# How do different weather conditions affect arrival and departure delays in October?
oct_weather_delay = oct_trains.groupby(['weather']).agg({'arrival_delay': 'mean', 'departure_delay': 'mean'}).sort_values('arrival_delay', ascending=False)
oct_weather_delay

# Rainy weather, especially light and moderate rain result in highest delays regarding arrival delays. Interestingly, more severe weather 
# conditions, such as downpour or thundershowers have resulted in less arrival delay. In addition to that, light to moderate rain and haze 
# result in trains arriving ahead of time. 

# Count frequencies of individual types of weather
oct_weather_counts = oct_trains['weather'].value_counts()
oct_weather_counts

weather
cloudy                    223961
sunny                     173362
light rain                 83178
overcast                   65371
showers                     7697
moderate rain               3845
thundershowers              1552
sleet                        929
haze                         559
light to moderate rain       345
downpour                     146
Name: count, dtype: int64

In [33]:
# How do different weather conditions affect arrival and departure delays in December?
dec_weather_delay = dec_trains.groupby(['weather']).agg({'arrival_delay': 'mean', 'departure_delay': 'mean'}).sort_values('arrival_delay', ascending=False)
dec_weather_delay


Unnamed: 0_level_0,arrival_delay,departure_delay
weather,Unnamed: 1_level_1,Unnamed: 2_level_1
moderate rain,12.915179,12.845814
blizzard,8.846154,8.615385
moderate to heavy snow,5.899563,4.938865
light to moderate snow,5.724274,5.221636
light to moderate rain,5.203704,-6.853704
heavy snow,4.909747,3.362816
heavy snow to blizzard,4.605505,4.155963
light rain,3.978945,0.176554
sunny,3.94005,1.118372
moderate snow,3.475309,2.646605


In [21]:
#Calculate average arrival and departure delays of October and December data

oct_mean_arr_delay = oct_weather_delay['arrival_delay'].mean()
oct_mean_dep_delay = oct_weather_delay['departure_delay'].mean()
oct_mean_delay = oct_mean_arr_delay + oct_mean_dep_delay

dec_mean_arr_delay = dec_weather_delay['arrival_delay'].mean()
dec_mean_dep_delay = dec_weather_delay['departure_delay'].mean()
dec_mean_delay = dec_mean_arr_delay + dec_mean_dep_delay
oct_mean_delay, dec_mean_delay

# The data suggest, that in December trains on average experienced more pronounced delays compared to October. 

(-1.0432761473962058, 5.571373647213387)

In [16]:
# Combine overview over delays in October and December

weather_delay_combined = pd.concat([oct_weather_delay, dec_weather_delay], keys = ['October', 'December'])
weather_delay_combined

#Different weather conditions affect arrival and departure delays in both October and December. In October  
#light rain and cloudy weather caused small arrival delays and caused trains to depart earlier as scheduled. 
# In contrast, in December delays were significantly higher across almost all types of weather with moderate rain and blizzards causing 
# the longest delays.

Unnamed: 0_level_0,Unnamed: 1_level_0,arrival_delay,departure_delay
Unnamed: 0_level_1,weather,Unnamed: 2_level_1,Unnamed: 3_level_1
October,light rain,1.455926,-2.728209
October,cloudy,1.442372,-6.341671
October,moderate rain,1.255657,-8.204161
October,sunny,0.974279,-1.174946
October,overcast,0.916997,-0.107418
October,downpour,0.59589,1.226027
October,thundershowers,0.134665,0.673969
October,sleet,0.058127,0.278794
October,showers,-0.067039,0.186566
October,light to moderate rain,-0.307246,-0.005797


In [None]:
# 5. Key Findings and Future Research
# Summary of findings: 
# Holidays had a minor effect on railway delays, the effect of weather conditions on delays was much more pronounced, especially in December.
# A possible area for further research could be to explore seasonal patterns in delay causes by expanding the analysis to cover more 
# months or years. This could reveal for instance whether certain weather conditions consistently lead to delays across seasons. 
# Future research could aim at taking into account more years and analyzing seasonal trends to determine if specific patterns in delays persist over time.