## Analysis of My Alerts

In [1]:
import pandas as pd
import warnings

from bokeh.io import output_notebook
from bokeh.models import HoverTool, ColumnDataSource
from bokeh.palettes import Category10, gray
from bokeh.plotting import Figure, show, figure
from datetime import datetime
from sklearn import tree


from bokeh.sampledata.les_mis import data


In [2]:
# load data
alert_data = pd.read_csv('../data/my_mta_data_for_analysis.csv', encoding='latin1', parse_dates =['time'])

In [3]:
# load visualization tools & settings
output_notebook()
warnings.filterwarnings('ignore')

#### Total Count of Disruption Notices over Three Years

In [4]:
alert_data['estimated'].value_counts()

update                      39826
police                       6714
signal                       4602
sick customer                3253
planned service change       2892
maintenance                  1674
end disruption                988
fire                          830
test                          373
weather                       268
unauthorized person           211
retraction                    206
equipment problem             175
other agencies                 58
accident                       57
flooding                       53
event                          47
traffic                        22
non service                    18
construction                   16
high volume                    13
unplanned service change       10
other                           1
Name: estimated, dtype: int64

For an initial analysis of when the trains are bad, we're going to remove a few classes of alerts:
- 'update' messages on existing disruptions
- 'end disruption' announcements
- 'planned services change' announcements, since those are announced ahead of the disruption

In [5]:
alert_data_ex_end = alert_data[
    (alert_data['estimated'] != 'end disruption') & 
    (alert_data['estimated'] != 'planned service change') &
    (alert_data['estimated'] != 'update')
]

#### Converted to Daily Averages

In [6]:
print('Average Total Disruptions per Day: ', str(round(len(alert_data_ex_end)/(3*365),1)))
print()
print(round(alert_data_ex_end['estimated'].value_counts()/ (3 * 365),1))


Average Total Disruptions per Day:  17.0

police                      6.1
signal                      4.2
sick customer               3.0
maintenance                 1.5
fire                        0.8
test                        0.3
weather                     0.2
unauthorized person         0.2
retraction                  0.2
equipment problem           0.2
other agencies              0.1
accident                    0.1
flooding                    0.0
event                       0.0
traffic                     0.0
non service                 0.0
construction                0.0
high volume                 0.0
unplanned service change    0.0
other                       0.0
Name: estimated, dtype: float64


#### Total Alerts over time

In [7]:
daily_alerts = pd.DataFrame(
    alert_data_ex_end['time'].apply(
        lambda x: x.date()
    ).value_counts())

daily_alerts.sort_index(inplace=True)
daily_alerts.columns = ['alerts']
daily_alerts['rolling average 7'] = daily_alerts['alerts'].rolling(7).mean()

hover = HoverTool(
    tooltips=[
        ("Date", "$x{%F}"),
        ("Alerts", "$y{0f}")
    ],
    formatters={"$x": "datetime"},
    mode="mouse"
)

plot = Figure(x_axis_type="datetime", plot_width=900, tools=[hover])
plot.multi_line(
    xs=[daily_alerts.index]*2, 
    ys=[daily_alerts['alerts'],daily_alerts['rolling average 7']],
    line_color= ['#aaaaaa','#000000'],
    line_width=[1,3]
)

show(plot)

In [8]:
daily_alerts.sort_values(by='alerts', ascending=False).head(20)

Unnamed: 0,alerts,rolling average 7
2016-01-23,55,24.571429
2017-02-09,52,27.285714
2015-03-05,47,23.428571
2015-02-02,47,23.714286
2017-04-04,37,22.142857
2015-03-04,36,19.428571
2016-01-22,36,18.428571
2017-01-25,36,20.142857
2016-09-01,36,18.0
2015-09-10,35,18.142857


##### Notes on some outliers:
-  spike on 2016-01-23, 2016-01-24: [blizzard](https://en.wikipedia.org/wiki/January_2016_United_States_blizzard) big enough to have its own wikipedia entry
- big winter storms on 2015-02-02, 2015-03-05 also caused major disruptions,


In [11]:
# let's exclude bad weather from the analysis: we'll predict that later
daily_alerts = pd.DataFrame(
    alert_data_ex_end[alert_data_ex_end['estimated'] != 'weather']['time'].apply(
        lambda x: x.date()
    ).value_counts())

daily_alerts.sort_index(inplace=True)
daily_alerts.columns = ['alerts']
daily_alerts['rolling average 21'] = daily_alerts['alerts'].rolling(21).mean()

plot = Figure(x_axis_type="datetime", plot_width=900, tools=[hover])
plot.multi_line(
    xs=[daily_alerts.index]*2, 
    ys=[daily_alerts['alerts'],daily_alerts['rolling average 21']],
    line_color= ['#aaaaaa','#000000'],
    line_width=[1,3]
)

show(plot)

In [12]:
# worst days this year, ex Weather
daily_alerts[daily_alerts.index > datetime(2016,12,31).date()].sort_values(by='alerts', ascending=False).head(20)

Unnamed: 0,alerts,rolling average 21
2017-02-09,46,20.761905
2017-04-04,37,19.904762
2017-01-25,36,20.380952
2017-07-10,33,20.761905
2017-08-18,33,19.47619
2017-05-19,32,19.952381
2017-08-02,31,18.142857
2017-03-27,31,17.52381
2017-05-22,31,19.619048
2017-01-23,31,19.809524


The worst day this summer was 2017-07-10: let's see what happened

In [17]:
DATE_TO_MATCH = datetime(2017,8,18).date()

alert_data_ex_end[['title','time']][
    alert_data_ex_end['time'].apply(lambda x: x.date()) == DATE_TO_MATCH
]


Unnamed: 0,title,time
61469,"QNS, F Trains, Mechanical Problems",2017-08-18 00:57:00
61471,"MANH, 1 and 2 Trains, Track Maintenance",2017-08-18 02:47:00
61474,"BX, 2 Trains, Customer Struck by a train",2017-08-18 04:14:00
61475,"BX, 2 Trains, Customer Struck by Trains",2017-08-18 04:20:00
61476,"BX, 2 Trains, Customer Struck by Train",2017-08-18 04:26:00
61477,"MANH, F Trains, Track Maintenance",2017-08-18 04:56:00
61482,"QNS, E Trains, Switch Problems",2017-08-18 06:57:00
61485,"QNS, E & F Trains, Unruly Customer",2017-08-18 08:17:00
61486,"BX, 4 Train, Mechanical Problems",2017-08-18 08:18:00
61489,"QNS, F Train, Signal Problems",2017-08-18 08:50:00


#### But when should I avoid the subway? This doesn't help. 

Let's see if we can start identifying times that the subway is particularly bad

(note: python & ISO use Monday as first day of week by convention)

In [18]:
weekdays = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

alert_data['weekday'] = alert_data['time'].apply(lambda x: weekdays[x.weekday()])
alert_data['time_of_day'] = alert_data['time'].apply(lambda x: x.time())

def bucket_minutes(time, bucket_size = 5):
    minutes_to_subtract = time.minute % bucket_size
    datetime_object = datetime(2017,1,1, time.hour, time.minute - minutes_to_subtract)
    return datetime_object

# hr bucket to start
alert_data['time_bucket'] = alert_data['time'].apply(
    lambda x: ('0' + str(x.hour))[-2:]
)

In [19]:
alert_data_ex_end_and_weather = alert_data[
    (alert_data['estimated'] != 'end disruption') & 
    (alert_data['estimated'] != 'planned service change') &
    (alert_data['estimated'] != 'weather')
]

In [20]:
weekly_heatmap_data = alert_data_ex_end_and_weather[
    ['weekday', 'time_bucket', 'count']
].groupby(
    ['weekday', 'time_bucket']
).count().reset_index()

weekly_heatmap_data['alpha'] = weekly_heatmap_data['count'] / max(weekly_heatmap_data['count'])

In [34]:
# weekly_heatmap = HeatMap(
#     alert_data_ex_end_and_weather, 
#     x='weekday', 
#     y='time_bucket',
#     values='count',
#     palette= gray(40),
#     legend=False
# )

p = figure(
    title='Heat Map of Disruptions',
    x_axis_location="above",
    x_range = weekdays,
    y_range = (23.5,-0.5),
    tools='hover'
)

p.plot_height = 800
p.plot_width = 800
p.grid.grid_line_color = None
p.axis.axis_line_color = None
p.axis.major_tick_line_color = None

p.rect(
    source=weekly_heatmap_data,
    x='weekday',
    y='time_bucket',
    width= 0.99,
    height = 0.95,
    color='#000000',
    alpha='alpha'
)

p.select_one(HoverTool).tooltips = [
    ('time', '@time_bucket'),
    ('disruptions', '@count')
]

show(p)

In [22]:
weekly_heatmap_data.head(20)

Unnamed: 0,weekday,time_bucket,count,alpha
0,Friday,0,260,0.323786
1,Friday,1,127,0.158157
2,Friday,2,256,0.318804
3,Friday,3,141,0.175592
4,Friday,4,200,0.249066
5,Friday,5,176,0.219178
6,Friday,6,359,0.447073
7,Friday,7,453,0.564134
8,Friday,8,648,0.806974
9,Friday,9,480,0.597758


In [23]:
alert_data_ex_end_and_weather[['title','time']].head(40)

#alert_data[['msg','time']].head(40)

Unnamed: 0,title,time
0,"UPDATED: BKLYN, 2,3 & 4 Trains, Sick Customer",2014-09-01 00:10:00
1,"UPDATED: MANH, 1, 2 and 3 Trains, Signal Probl...",2014-09-01 00:26:00
2,"UPDATED: QNS, E and F Trains, Track Maintenance",2014-09-01 00:27:00
3,"UPDATED: QNS, E and F Trains, Track Maintenance",2014-09-01 00:38:00
4,"UPDATED: MANH, 1, 2 and 3 Trains, Signal Probl...",2014-09-01 00:40:00
5,"BX, 2 Trains, Earlier Incident",2014-09-01 03:12:00
6,"BKLYN, 2 Trains, Mechanical Problems",2014-09-01 03:56:00
7,"UPDATED: BKLYN, 2 Trains, Mechanical Problems",2014-09-01 04:04:00
8,"MANH, 6 Trains, Earlier Incident",2014-09-01 08:24:00
9,"QNS, J Trains, Sick Customer",2014-09-01 11:39:00


In [74]:
# try 15m to graph by day
alert_data['time_bucket'] = alert_data['time'].apply(
    lambda x: x.hour + int(x.minute/15)/4
)

alert_data_ex_end_and_weather = alert_data[
    (alert_data['estimated'] != 'end disruption') & 
    (alert_data['estimated'] != 'planned service change') &
    (alert_data['estimated'] != 'update') & 
    (alert_data['estimated'] != 'weather') 
]

weekly_linechart_data = alert_data_ex_end_and_weather[
    ['weekday', 'time_bucket', 'count']
].groupby(
    ['weekday', 'time_bucket']
).count().reset_index()


In [75]:
plot = Figure(plot_width=900, tools=[hover])
plot.multi_line(
    xs=[[x/4 for x in range(24*4)]] * 7, 
    ys=[
        weekly_linechart_data['count'][weekly_linechart_data['weekday'] == 'Monday'],
        weekly_linechart_data['count'][weekly_linechart_data['weekday'] == 'Tuesday'],
        weekly_linechart_data['count'][weekly_linechart_data['weekday'] == 'Wednesday'],
        weekly_linechart_data['count'][weekly_linechart_data['weekday'] == 'Thursday'],
        weekly_linechart_data['count'][weekly_linechart_data['weekday'] == 'Friday'],
        weekly_linechart_data['count'][weekly_linechart_data['weekday'] == 'Saturday'],
        weekly_linechart_data['count'][weekly_linechart_data['weekday'] == 'Sunday']
    ],
    line_color= Category10[7],
    line_width=[1]*7
)

show(plot)

In [69]:
[[x/4 for x in range(3)]] * 7

[[0.0, 0.25, 0.5],
 [0.0, 0.25, 0.5],
 [0.0, 0.25, 0.5],
 [0.0, 0.25, 0.5],
 [0.0, 0.25, 0.5],
 [0.0, 0.25, 0.5],
 [0.0, 0.25, 0.5]]

In [71]:
alert_data_ex_end_and_weather.to_csv('tmp.csv', index=False)