## MTA Self Assessment data



In [1]:
import pandas as pd
import re
import string
import warnings

from bokeh.io import output_notebook
from bokeh.layouts import gridplot
from bokeh.models import HoverTool, ColumnDataSource, FuncTickFormatter, DatetimeTickFormatter
from bokeh.palettes import Category10, gray
from bokeh.plotting import Figure, show, figure
from datetime import datetime, timedelta

In [2]:
# load general variables
subway_lines = 'ABCDEFGJLMNQRSWZ1234567'
subway_colors = {
    'A': '#0039A6',
    'B': '#FF6319',
    'C': '#0039A6',
    'D': '#FF6319',
    'E': '#0039A6',
    'F': '#FF6319',
    'G': '#6CBE45',
    'J': '#996633',
    'L': '#A7A9AC',
    'M': '#FF6319',
    'N': '#FCCC0A',
    'Q': '#FCCC0A',
    'R': '#FCCC0A',
    'S': '#808183',
    'Z': '#996633',
    'W': '#FCCC0A',
    '1': '#EE352E',
    '2': '#EE352E',
    '3': '#EE352E',
    '4': '#00933C',
    '5': '#00933C',
    '6': '#00933C',
    '7': '#B933AD'
}

In [107]:
# load visualization tools & settings
output_notebook()
warnings.filterwarnings('ignore')

 Configure the default output state to generate output in notebook cells
    when :func:`show` is called.

    Args:
        resources (Resource, optional) :
            How and where to load BokehJS from (default: CDN)

        verbose (bool, optional) :
            whether to display detailed BokehJS banner (default: False)

        hide_banner (bool, optional):
            whether to hide the Bokeh banner (default: False)

        load_timeout (int, optional) :
            Timeout in milliseconds when plots assume load timed out (default: 5000)

        notebook_type (string, optional):
            Notebook type (default: jupyter)

    Returns:
        None

    .. note::
        Generally, this should be called at the beginning of an interactive
        session or the top of a script.

    


In [3]:
mta_data = pd.read_csv('../data/MTA_Performance_NYCT.csv', encoding='latin1')

In [4]:
mta_data.head(2)

Unnamed: 0,INDICATOR_SEQ,PARENT_SEQ,AGENCY_NAME,INDICATOR_NAME,DESCRIPTION,CATEGORY,FREQUENCY,DESIRED_CHANGE,INDICATOR_UNIT,DECIMAL_PLACES,PERIOD_YEAR,PERIOD_MONTH,YTD_TARGET,YTD_ACTUAL,MONTHLY_TARGET,MONTHLY_ACTUAL
0,67816,0,NYC Transit,Mean Distance Between Failures - Subways,Average number of miles a subway car travels i...,Service Indicators,M,U,-,0,2008,1,148244.0,148244.0,155000.0,148420.0
1,67816,0,NYC Transit,Mean Distance Between Failures - Subways,Average number of miles a subway car travels i...,Service Indicators,M,U,-,0,2008,2,148476.0,148476.0,155000.0,140993.0


In [5]:
mta_data['date'] = mta_data[
    ['PERIOD_YEAR','PERIOD_MONTH']
].apply(lambda x: datetime(x[0], x[1], 1), axis=1)

In [6]:
mta_data['INDICATOR_NAME'].unique()

array(['Mean Distance Between Failures - Subways',
       'Wait Assessment - Subways (Inactive, Historic Calculations)',
       'Customer Injury Rate - Subways',
       'Mean Distance Between Failures - Staten Island Railway ',
       'On-Time Performance - Staten Island Railway',
       'Employee Lost Time and Restricted Duty Rate ',
       'Total Ridership - Subways',
       'Mean Distance Between Failures - NYCT Bus',
       'Total Paratransit Ridership - NYCT Bus',
       'Customer Accident Injury Rate - NYCT Bus',
       'Total Ridership - NYCT Bus ', 'Elevator Availability - Subways',
       'Escalator Availability - Subways',
       'Collisions with Injury Rate - NYCT Bus',
       '% of Completed Trips - NYCT Bus',
       'East New York Depot - % of Completed Trips',
       'Flatbush Depot - % of Completed Trips',
       'Jackie Gleason Depot - % of Completed Trips',
       'Ulmer Park Depot - % of Completed Trips',
       'Fresh Pond Depot - % of Completed Trips',
       'Yukon

In [7]:
otp_data = mta_data[mta_data['INDICATOR_NAME'].str.contains('OTP')]

In [8]:
# On Time Performance by line

plot_list = []
for d in subway_lines:
    if d != 'J'and d != 'S' and d != 'W':
        _plot = figure(
            plot_width=250, 
            plot_height=150, 
            y_range = (0,100), 
            title=d, 
            y_minor_ticks=2, 
            x_axis_type='datetime')
        _plot.line(
            x=otp_data['date'][otp_data['INDICATOR_NAME'].str.contains(d + ' Line')] ,
            y=otp_data['MONTHLY_ACTUAL'][otp_data['INDICATOR_NAME'].str.contains(d + ' Line')],
            line_color=subway_colors[d],
            line_width=3
        )
        
        _plot.xaxis.formatter=DatetimeTickFormatter(years='%y')
        plot_list.append(_plot)
        
grid = gridplot(plot_list, ncols=5, plot_width=250, plot_height=150)
show(grid)


In [33]:
otp_data['line'] = otp_data['INDICATOR_NAME'].apply(lambda x: x[17:18])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [49]:
# wait assessment
assessment_data = mta_data[mta_data['INDICATOR_NAME'].str.contains('Assessment - ')]
plot_list = []
for d in subway_lines:
    if d != 'J'and d != 'S' and d != 'W':
        _plot = figure(
            plot_width=250, 
            plot_height=150, 
            y_range = (50,100), 
            title=d, 
            y_minor_ticks=2, 
            x_axis_type='datetime')
        _plot.line(
            x=assessment_data['date'][assessment_data['INDICATOR_NAME'].str.contains(d + ' Line')] ,
            y=assessment_data['MONTHLY_ACTUAL'][assessment_data['INDICATOR_NAME'].str.contains(d + ' Line')],
            line_color=subway_colors[d],
            line_width=3
        )
        
        _plot.xaxis.formatter=DatetimeTickFormatter(years='%y')
        plot_list.append(_plot)
        
grid = gridplot(plot_list, ncols=5, plot_width=250, plot_height=150)
show(grid)

### Let's look at On-Time Performance vs count of disruptions as a sanity check

In [12]:
# performance stats vs alerts
alert_data = pd.read_csv('../data/my_mta_data_for_analysis.csv')
alert_data.head(2)

Unnamed: 0.1,Unnamed: 0,count,hex_x,title,body,msg,system,time,hex_y,planned work,...,unauthorized person,event,sick customer,update,non service,construction,weather,other agencies,other,estimated
0,190003,182417,2c891,"UPDATED: BKLYN, 2,3 & 4 Trains, Sick Customer",Following an earlier incident at Ber...,"Following an earlier incident at Bergen St, 2...",NYC,2014-09-01 00:10:00,2c891,False,...,False,False,True,True,False,False,False,False,False,end disruption
1,190006,182420,2c894,"UPDATED: MANH, 1, 2 and 3 Trains, Signal Probl...","b/d 1, 2 and 3 trains are running wi...","b/d 1, 2 and 3 trains are running with delays...",NYC,2014-09-01 00:26:00,2c894,False,...,False,False,False,True,False,False,False,False,False,signal


In [13]:
# Find subway lines mentioned
regex = re.compile('[%s]' % re.escape(string.punctuation))
def extract_subway_lines(txt, subway_lines=subway_lines):
    cleaned_txt = regex.sub(' ', txt)
    split_txt = cleaned_txt.split()
    if 'All' in split_txt:
        return [s for s in subway_lines]
    else:
        return [s for s in split_txt if s in subway_lines]
    
alert_data['lines'] = alert_data['title'].apply(
    lambda x: extract_subway_lines(x)
)

In [15]:
# create monthly field
alert_data['month'] = alert_data['time'].apply(lambda x: datetime(pd.to_datetime(x).year, pd.to_datetime(x).month, 1))

In [76]:
# alerts by line by month
disruption_by_line = []

def generate_disruption_list(disruption, lines, month, disruption_by_line= disruption_by_line):
    for line in lines:
        disruption_by_line.append({"line": line, "disruption": disruption, "count": 1, "month" : month})
    return None

alert_data_ex = alert_data[
    (~alert_data['estimated'].isin(['planned service change','non service','retraction'])) 
    & (~alert_data['update'])
]

alert_data_ex[['estimated', 'lines', 'month']].apply(
    lambda x: generate_disruption_list(x[0],x[1],x[2]), axis=1
)

disruption_df = pd.DataFrame(disruption_by_line)
disruption_df.head()


Unnamed: 0,count,disruption,line,month
0,1,equipment problem,2,2014-09-01
1,1,sick customer,J,2014-09-01
2,1,signal,N,2014-09-01
3,1,event,2,2014-09-01
4,1,event,3,2014-09-01


In [77]:
disruption_by_line_by_month = disruption_df.drop(
    ['disruption'], 
    axis=1
).groupby(['line','month']).count().reset_index()

In [78]:
alerts_vs_otp = disruption_by_line_by_month.merge(
    right=otp_data[['line', 'date', 'MONTHLY_ACTUAL']], 
    left_on=['month', 'line'],
    right_on=['date', 'line'],
    how = 'inner')

In [106]:
# include updates, avg corr = -.31
# exclude updates, avg corr = -.38
# include updates and end-of-disruption = -.39
# exclude updates but include end-of-disruption = ~.48 (ex R, -.52)
plot_list = []
avg_corr = []
for d in subway_lines:
    if d != 'Z'and d != 'S' and d != 'W':
        
        _otp = alerts_vs_otp[alerts_vs_otp['line'] == d][['count', 'MONTHLY_ACTUAL']]
        print(d + ': ' + str(round(_otp.corr()['count'][1],5)))
        avg_corr.append(_otp.corr()['count'][1])
        
        _plot = figure(
            plot_width=200, 
            plot_height=200, 
            x_range = (0,100),
            y_range = (25,100), 
            title=d, 
            y_minor_ticks=2)
        _plot.scatter(
            x=_otp['count'],
            y=_otp['MONTHLY_ACTUAL'],
            line_color=subway_colors[d],
            line_width=3
        )
        
        plot_list.append(_plot)

print('Mean Correlation: ' + str(round(sum(avg_corr)/len(avg_corr),4)))
grid = gridplot(plot_list, ncols=5, plot_width=200, plot_height=200)
show(grid)

A: -0.52958
B: -0.52542
C: -0.38095
D: -0.54041
E: -0.56095
F: -0.81515
G: -0.21825
J: -0.67211
L: -0.64016
M: -0.70634
N: -0.38051
Q: -0.34638
R: 0.3781
1: -0.64634
2: -0.52945
3: -0.57132
4: -0.64114
5: -0.60167
6: -0.06261
7: -0.57662
Mean Correlation: -0.4784


In [103]:
def find_trainline(cell, line):
    try:
        cell.index(line)
        return True
    except ValueError:
        return False

r_line_alerts = alert_data['msg'][alert_data['lines'].apply(lambda x: find_trainline(x, 'R'))]

In [104]:
r_line_alerts[200:300].apply(lambda x: print(x))

b/d D,  N,  Q & R delays and service changes,  due to track maintenance at DeKalb Av. Details at www.mta.info.
b/d D,  N,  Q & R delays and service changes,  due to track maintenance at DeKalb Av. Details at www.mta.info.
b/d D,  N,  Q & R delays and service changes,  due to track maintenance at DeKalb Av. Details at www.mta.info.
Following an earlier incident at DeKalb Av,  D,  N,  Q & R train service has resumed with residual delays.
b/d,  D,  N & R delayed,  s/b,  some N terminating at Canal St,  some R terminating at Whitehall St,  track maintenance at DeKalb Av. Allow additional travel time.
b/d,  D,  N & R delayed,  s/b,  some N terminating at Canal St,  some R terminating at Whitehall St,  track maintenance at DeKalb Av. Allow additional travel time.
b/d,  D,  N & R delayed,  s/b,  some N terminating at Canal St,  some R terminating at Whitehall St,  track maintenance at DeKalb Av. Allow additional travel time.
Following an earlier incident at DeKalb Av,  D,  N and R train servi

2474    None
2475    None
2482    None
2484    None
2501    None
2502    None
2503    None
2510    None
2525    None
2528    None
2544    None
2546    None
2547    None
2595    None
2596    None
2637    None
2640    None
2645    None
2646    None
2660    None
2661    None
2662    None
2665    None
2667    None
2669    None
2740    None
2741    None
2746    None
2748    None
2751    None
        ... 
3264    None
3265    None
3269    None
3270    None
3273    None
3321    None
3324    None
3335    None
3336    None
3337    None
3340    None
3345    None
3346    None
3348    None
3354    None
3356    None
3369    None
3370    None
3371    None
3372    None
3373    None
3407    None
3409    None
3445    None
3452    None
3453    None
3477    None
3481    None
3484    None
3487    None
Name: msg, Length: 100, dtype: object