In [2]:
import pandas as pd
import os
os.chdir("/media/seconddrive/mta_stationing_problem")
from pyspark import SparkContext,SparkConf
from pyspark.sql import SQLContext
from pyspark.sql import Row, SparkSession
from pyspark.sql.types import IntegerType
from pyspark.sql import functions as F
from pyspark import SparkConf
import pandas as pd
import pickle
import numpy as np
from tqdm import tqdm

In [3]:
# Single-machine Spark session that uses every local core.
# The SQL session time zone and the driver JVM time zone are both set to UTC.
spark = (
    SparkSession.builder
    .master("local[*]")
    .config("spark.sql.session.timeZone", "UTC")
    .config('spark.driver.extraJavaOptions', '-Duser.timezone=UTC')
    .config('spark.driver.memory', '2g')
    .config('spark.executor.memory', '2g')
    .config("spark.ui.showConsoleProgress", "false")
    .appName("app")
    .getOrCreate()
)

# Only surface Spark log messages at ERROR level.
spark.sparkContext.setLogLevel("ERROR")

22/08/19 14:56:25 WARN Utils: Your hostname, scope-vanderbilt resolves to a loopback address: 127.0.1.1; using 10.2.218.69 instead (on interface enp8s0)
22/08/19 14:56:25 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/08/19 14:56:25 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### Merging apc data to service disruption data

In [None]:
# Inclusive date window (YYYY-MM-DD) applied to the DATE column of the
# service disruption records when they are loaded.
APC_START = '2020-01-01'
APC_END   = '2022-04-06'

In [None]:
# Get APC data
f = os.path.join('data', 'processed', 'apc_weather_gtfs.parquet')
apcdata = spark.read.load(f)

# A (trip_id, transit_date, overload_id) group is discarded entirely when any of its
# rows has a negative or missing load. A left-anti join keeps exactly the rows of
# `apcdata` whose key is absent from `todelete`, without a helper marker column.
trip_keys = ['trip_id', 'transit_date', 'overload_id']
todelete = apcdata.filter('(load < 0) OR (load IS NULL)').select(*trip_keys).distinct()
apcdataafternegdelete = apcdata.join(todelete, on=trip_keys, how='left_anti')

get_columns = ['trip_id', 'transit_date', 'arrival_time', 'arrival_time_str', 'vehicle_id',
               'block_abbr', 'stop_sequence', 'stop_id_original', 'overload_id']
get_str = ", ".join(get_columns)
apcdataafternegdelete.createOrReplaceTempView("apc")

# Column subset only. Ordering is applied once, at the very end, because any earlier
# sort is superseded by the final orderBy.
query = f"""
SELECT {get_str}
FROM apc
"""
apcdata = spark.sql(query)
# Rows without an arrival time cannot take part in the time-based matching.
apcdata = apcdata.dropna(subset=['arrival_time'])
# Replace remaining nulls with -1 (a numeric fill value only affects numeric columns).
apcdata = apcdata.na.fill(value=-1)
apcdata = apcdata.orderBy("arrival_time")

In [None]:
# apcdata.groupBy().agg(F.count(F.when(F.col("overload_id")>0, True))).collect()[0][0]
# apcdata.groupBy().agg(F.count(F.when(F.col("overload_id")>0, True))).show()
# +------------------------------------------------+
# |count(CASE WHEN (overload_id > 0) THEN true END)|
# +------------------------------------------------+
# |                                           14371|
# +------------------------------------------------+


In [None]:
# apcdata.groupBy('overload_id').count().show()
# +-----------+--------+
# |overload_id|   count|
# +-----------+--------+
# |          1|    9295|
# |          4|      47|
# |          2|    4702|
# |          0|15792788|
# |          3|     327|
# +-----------+--------+


In [None]:
# Get service disruption dataset
fp = os.path.join('data', 'others', 'Service Disruptions_07_2019_08_2022.csv')
disruptions_df = pd.read_csv(fp)

# Build the combined timestamp from the raw DATE/TIME strings before those two
# columns are parsed in place. Unparseable values become NaT (errors='coerce').
disruptions_df['DATETIME'] = pd.to_datetime(disruptions_df['DATE'] + ' ' + disruptions_df['TIME'],
                                            format='%m/%d/%y %H:%M:%S', errors='coerce')
disruptions_df['DATE'] = pd.to_datetime(disruptions_df['DATE'], format='%m/%d/%y', errors='coerce')
disruptions_df['TIME'] = pd.to_datetime(disruptions_df['TIME'], format='%H:%M:%S', errors='coerce')
disruptions_df = disruptions_df[(disruptions_df['DATE'] >= APC_START) & (disruptions_df['DATE'] <= APC_END)]

# A disruption whose timestamp could not be parsed cannot be matched to APC data:
# pd.merge_asof rejects null values in the left key. Drop such rows here.
disruptions_df = disruptions_df.dropna(subset=['DATETIME'])

# Remove weather related disruptions
disruptions_df = disruptions_df[(disruptions_df['REASON'] != 'Weather')].sort_values(by=['DATETIME']).reset_index(drop=True)
print('Shape:', disruptions_df.shape)
disruptions_df = disruptions_df.drop(columns=['COMMENTS'])
disruptions_df['BLOCK'] = disruptions_df['BLOCK'].astype('int32')

# Convert to spark dataframe for merging
disruptions_sp = spark.createDataFrame(disruptions_df)
disruptions_sp = disruptions_sp.withColumn("BLOCK", F.col("BLOCK").cast(IntegerType()))

In [None]:
def asof_join_backward(l, r):
    """Attach to each disruption the latest APC row at or before it.

    Parameters
    ----------
    l : pandas.DataFrame
        Disruption rows; must contain `DATETIME` and `BLOCK`.
    r : pandas.DataFrame
        APC rows; must contain `arrival_time` and `block_abbr`.

    Returns
    -------
    pandas.DataFrame
        One row per row of `l` (ordered by `DATETIME`) with the columns of the
        matched `r` row appended. APC columns are missing when no row of the same
        block has an `arrival_time` within 30 days before `DATETIME`.
    """
    # merge_asof requires both key columns to be sorted. The row order of the frames
    # handed over by a Spark cogroup is not guaranteed, so sort here (stable sort
    # keeps the relative order of tied timestamps).
    left_sorted = l.sort_values('DATETIME', kind='mergesort')
    right_sorted = r.sort_values('arrival_time', kind='mergesort')
    return pd.merge_asof(left_sorted, right_sorted, left_on='DATETIME', right_on='arrival_time',
                         direction="backward", left_by="BLOCK", right_by="block_abbr",
                         tolerance=pd.Timedelta('30d'))

# Pair the disruptions of each BLOCK with the APC rows of the same block_abbr and run
# the backward as-of match on every pair. The DDL string declares the columns of the
# frame returned by asof_join_backward.
identified_disruptions_backward = (
    disruptions_sp.groupby("BLOCK")
    .cogroup(apcdata.groupby("block_abbr"))
    .applyInPandas(
        asof_join_backward,
        schema="""DATE timestamp, TIME timestamp, BLOCK int, REASON string, START_STOP_ABBR string, 
                                                     START_STOP_NAME string, START_STOP_LATITUDE double, START_STOP_LONGITUDE double, DATETIME timestamp,
                                                     trip_id string, transit_date timestamp, arrival_time timestamp, arrival_time_str string, vehicle_id string,
                                                     block_abbr int, stop_sequence int, stop_id_original string, overload_id int""",
    )
)

def asof_join_forward(l, r):
    """Attach to each disruption the earliest APC row at or after it.

    Parameters
    ----------
    l : pandas.DataFrame
        Disruption rows; must contain `DATETIME` and `BLOCK`.
    r : pandas.DataFrame
        APC rows; must contain `arrival_time` and `block_abbr`.

    Returns
    -------
    pandas.DataFrame
        One row per row of `l` (ordered by `DATETIME`) with the columns of the
        matched `r` row appended. APC columns are missing when no row of the same
        block has an `arrival_time` within 30 days after `DATETIME`.
    """
    # merge_asof requires both key columns to be sorted. The row order of the frames
    # handed over by a Spark cogroup is not guaranteed, so sort here (stable sort
    # keeps the relative order of tied timestamps).
    left_sorted = l.sort_values('DATETIME', kind='mergesort')
    right_sorted = r.sort_values('arrival_time', kind='mergesort')
    return pd.merge_asof(left_sorted, right_sorted, left_on='DATETIME', right_on='arrival_time',
                         direction="forward", left_by="BLOCK", right_by="block_abbr",
                         tolerance=pd.Timedelta('30d'))

# Same pairing as the backward match, but each disruption is matched to the next APC
# row of its block. The DDL string declares the columns returned by asof_join_forward.
identified_disruptions_forward = (
    disruptions_sp.groupby("BLOCK")
    .cogroup(apcdata.groupby("block_abbr"))
    .applyInPandas(
        asof_join_forward,
        schema="""DATE timestamp, TIME timestamp, BLOCK int, REASON string, START_STOP_ABBR string, 
                                                     START_STOP_NAME string, START_STOP_LATITUDE double, START_STOP_LONGITUDE double, DATETIME timestamp,
                                                     trip_id string, transit_date timestamp, arrival_time timestamp, arrival_time_str string, vehicle_id string,
                                                     block_abbr int, stop_sequence int, stop_id_original string, overload_id int""",
    )
)

In [None]:
# Collect both as-of join results from Spark into pandas DataFrames.
identified_disruptions_backward_df = identified_disruptions_backward.toPandas()
identified_disruptions_forward_df  = identified_disruptions_forward.toPandas()

In [None]:
# Columns kept from each as-of join result: the disruption fields first,
# followed by the fields of the matched APC row.
cols = ['DATETIME', 'BLOCK', 'REASON', 'START_STOP_ABBR', 
        'trip_id', 'transit_date', 'vehicle_id', 'block_abbr', 'arrival_time', 'stop_sequence', 'stop_id_original', 'overload_id']

In [None]:
# Restrict both match results to the shared column subset.
identified_disruptions_backward_df = identified_disruptions_backward_df.loc[:, cols]
identified_disruptions_forward_df = identified_disruptions_forward_df.loc[:, cols]

# Line up the previous-stop and next-stop match of every disruption side by side
# (APC columns get the _prev / _next suffix), then discard disruptions for which
# neither direction found a trip.
identified_disruptions_df = pd.merge(
    identified_disruptions_backward_df,
    identified_disruptions_forward_df,
    on=['DATETIME', 'BLOCK', 'REASON', 'START_STOP_ABBR'],
    how='outer',
    suffixes=('_prev', '_next'),
).dropna(subset=['trip_id_prev', 'trip_id_next'], how='all')

I think many of the service disruptions here occur at the ends of their trips, so they have no effect on the succeeding trips.  
Is there no need to send an overload bus because another bus already in service can be used to cover? (Does this happen?) 

In [None]:
# fp = os.path.join('data', 'processed', 'matched_service_disruptions.pkl')
# identified_disruptions_df.to_pickle(fp)

In [None]:
# Load the matched disruptions from the pickle file under data/processed.
# Unpickling can execute code, so only read files produced by this project.
fp = os.path.join('data', 'processed', 'matched_service_disruptions.pkl')
identified_disruptions_df = pd.read_pickle(fp)

### Filtering out events where the past stop, next stop and reported disruption all happened on different days

In [None]:
# Compare calendar days as midnight-normalised datetime64 values instead of mixing
# `datetime.date` objects (from `.dt.date`) with timestamp columns: recent pandas
# treats a Timestamp and a `datetime.date` as never equal, which would make every
# `!=` below True.
disruption_day = identified_disruptions_df['DATETIME'].dt.normalize()
prev_day = pd.to_datetime(identified_disruptions_df['transit_date_prev']).dt.normalize()
next_day = pd.to_datetime(identified_disruptions_df['transit_date_next']).dt.normalize()

# Disruption, previous stop and next stop all fall on three different days.
not_same_day = identified_disruptions_df[(disruption_day != prev_day) &
                                         (disruption_day != next_day) &
                                         (prev_day != next_day)].sort_values(by=['DATETIME'])

In [None]:
# not_same_day[not_same_day['vehicle_id_prev'] == not_same_day['vehicle_id_next']].head()
# Show the first row, then the (rows, columns) size of not_same_day.
display(not_same_day.head(1))
not_same_day.shape

In [None]:
# Checking specific disruptions
def show_slice(disruptions_df, idx):
    """Display one matched disruption and return the APC rows surrounding it.

    The row labelled `idx` in `disruptions_df` is displayed, then the global Spark
    frame `apcdata` is filtered to the disruption's block and to the window from
    7 hours before `arrival_time_prev` to 7 hours after `arrival_time_next`.
    A missing bound falls back to the disruption's own `DATETIME`.

    Returns the filtered APC rows as a pandas DataFrame.
    """
    display(disruptions_df.loc[[idx]])
    record = disruptions_df.loc[idx]
    event_time = record['DATETIME']
    window_start = record['arrival_time_prev'] - pd.Timedelta('7h')
    window_end = record['arrival_time_next'] + pd.Timedelta('7h')
    block = int(record['BLOCK'])
    # NaT arithmetic stays NaT, so a missing prev/next match shows up here.
    window_start = event_time if pd.isnull(window_start) else window_start
    window_end = event_time if pd.isnull(window_end) else window_end
    print(window_start, window_end)
    in_window = apcdata.filter(F.col("arrival_time").between(window_start, window_end))
    on_block = in_window.where(in_window.block_abbr == block)
    return on_block.toPandas()

> Just load all overload data in APC to memory and work on that

In [None]:
# Keep only the APC rows with overload_id > 0 and pull them into a pandas DataFrame.
overload_apc = apcdata.where(F.col("overload_id") > 0)
overload_df = overload_apc.toPandas()

In [None]:
# For every not-same-day disruption, count the overload APC rows of the same block
# that fall between one hour before the previous stop and one hour after the next stop.
results = {}
# The context manager closes the progress bar even if an iteration raises.
with tqdm(total=len(not_same_day)) as pbar:
    for k, disruption in not_same_day.iterrows():
        start_time = disruption['arrival_time_prev'] - pd.Timedelta('1h')
        end_time   = disruption['arrival_time_next'] + pd.Timedelta('1h')
        block      = int(disruption['BLOCK'])
        # A NaT bound makes the comparison False for every row, giving a count of 0.
        in_window = ((overload_df['block_abbr'] == block) &
                     (overload_df['arrival_time'] >= start_time) &
                     (overload_df['arrival_time'] <= end_time))
        window_df = overload_df[in_window]
        pbar.update(1)
        pbar.set_description(f"Processing {k}:{len(window_df[window_df['overload_id'] > 0])}")
        results[k] = len(window_df)

In [None]:
# Disruption reported and vehicle was changed between service days (counted as overload)
# One row per processed disruption: its index label and the number of overload rows found.
data = {'id': [key for key in results], 'overloads': [results[key] for key in results]}
tdf = pd.DataFrame(data)
tdf.loc[tdf['overloads'] > 0].shape

In [None]:
# Disruption reported but either no APC data or no trip for that same block for the vehicle until a few days after
# (rows of tdf whose overload count is zero; .shape gives how many there are).
tdf[tdf['overloads'] == 0].shape

In [None]:
# Compare calendar days as midnight-normalised datetime64 values instead of mixing
# `datetime.date` objects (from `.dt.date`) with timestamp columns: recent pandas
# treats a Timestamp and a `datetime.date` as never equal, which would leave
# same_day empty.
disruption_day = identified_disruptions_df['DATETIME'].dt.normalize()
prev_day = pd.to_datetime(identified_disruptions_df['transit_date_prev']).dt.normalize()
next_day = pd.to_datetime(identified_disruptions_df['transit_date_next']).dt.normalize()

# Disruption, previous stop and next stop all fall on the same day.
same_day = identified_disruptions_df[(disruption_day == prev_day) & (disruption_day == next_day)]
same_day.shape

## Overload ID types
* 0: Service bus
* 1: Dispatched for emergency [9295]
* 2: [4702]
* 3: I think used when vehicle is still not able to ply the service
* 4: Used in one trip only `230326` `2020-09-26`

Identify how many overload buses were dispatched in the middle of a trip vs just from a depot

In [None]:
# NOTE(review): the ids in `tdf` come from the loop over `not_same_day`; looking them
# up in `same_day` only works if `results` was recomputed for same_day — confirm.
overloaded_ids = tdf.loc[tdf['overloads'] > 0, 'id']
same_day_overloaded = same_day.loc[overloaded_ids]

# Build the mask on the subset itself; a mask built on the full `same_day` frame
# would have to be reindexed by pandas to fit the subset.
a = same_day_overloaded[(same_day_overloaded['stop_sequence_prev'] == 1) |
                        (same_day_overloaded['stop_sequence_next'] == 1)]
# Disruptions with overloads where neither neighbouring stop is the first stop of a trip.
same_day_overloaded.drop(a.index)

In [None]:
from matplotlib.ticker import MaxNLocator
import matplotlib.pyplot as plt
# Bar chart of how often each REASON occurs among the disruptions that had overloads.
reason_counts = same_day.loc[tdf[tdf['overloads'] > 0].id]['REASON'].value_counts()
ax = reason_counts.plot.bar(rot=45)
# Counts are whole numbers, so keep the y ticks on integers.
ax.yaxis.set_major_locator(MaxNLocator(integer=True))
plt.xticks(ha='right')

In [None]:
# Disruptions with overloads whose previous and next stop were served by different vehicles.
# The mask is built on the subset itself so pandas does not have to reindex it.
same_day_overloaded = same_day.loc[tdf.loc[tdf['overloads'] > 0, 'id']]
same_day_overloaded[same_day_overloaded['vehicle_id_prev'] != same_day_overloaded['vehicle_id_next']].shape

Here is the APC data for the trip(s) above. The vehicle started the trip but, due to disruptions, had to be covered by another vehicle.  
Is vehicle `1913` a special overload vehicle?
* The answer is no, so certain in-service vehicles can be used to cover for other vehicles.

## Checking cases where apc data was initially not null and then null records started showing

In [3]:
# Reload the matched disruptions from the pickle file and preview the first row.
fp = os.path.join('data', 'processed', 'matched_service_disruptions.pkl')
identified_disruptions_df = pd.read_pickle(fp)
identified_disruptions_df.head(1)

Unnamed: 0,DATETIME,BLOCK,REASON,START_STOP_ABBR,trip_id_prev,transit_date_prev,vehicle_id_prev,block_abbr_prev,arrival_time_prev,stop_sequence_prev,stop_id_original_prev,overload_id_prev,trip_id_next,transit_date_next,vehicle_id_next,block_abbr_next,arrival_time_next,stop_sequence_next,stop_id_original_next,overload_id_next
0,2020-04-23 20:38:00,1700,Accident,33AJOHNM,218432,2020-04-21,1907,1700.0,2020-04-21 22:28:58,22.0,33AJOHNM,0.0,210626,2020-04-26,725,1700.0,2020-04-26 06:06:50,1.0,MCC5_8,0.0
1,2020-05-22 05:45:00,1700,Operator,LPSCI,218087,2020-05-20,1807,1700.0,2020-05-20 22:02:38,29.0,11ACHUNF,0.0,219747,2020-05-23,1815,1700.0,2020-05-23 06:10:52,1.0,MCC5_8,0.0
2,2020-06-20 06:15:00,1700,Operator,MCC5_8,222167,2020-06-16,1902,1700.0,2020-06-16 17:05:47,18.0,MCC5_11,0.0,221010,2020-06-21,1800,1700.0,2020-06-21 05:36:10,1.0,LPSCI,0.0
3,2020-08-03 16:26:00,1700,Mechanical,JEF9AWN,220929,2020-08-01,1821,1700.0,2020-08-01 21:38:12,29.0,LPSCI,0.0,226816,2020-08-09,728,1700.0,2020-08-09 06:05:02,1.0,MCC5_8,0.0
4,2020-08-18 15:15:00,1700,Passenger Incident,MCC5_11,226899,2020-08-18,1815,1700.0,2020-08-18 15:07:14,18.0,MCC5_11,0.0,225657,2020-08-18,1815,1700.0,2020-08-18 15:43:02,1.0,MCC5_11,0.0


In [4]:
# Compare calendar days as midnight-normalised datetime64 values instead of mixing
# `datetime.date` objects (from `.dt.date`) with timestamp columns: recent pandas
# treats a Timestamp and a `datetime.date` as never equal, which would make every
# `!=` below True.
disruption_day = identified_disruptions_df['DATETIME'].dt.normalize()
prev_day = pd.to_datetime(identified_disruptions_df['transit_date_prev']).dt.normalize()
next_day = pd.to_datetime(identified_disruptions_df['transit_date_next']).dt.normalize()

# Disruption, previous stop and next stop all fall on three different days.
not_same_day = identified_disruptions_df[(disruption_day != prev_day) &
                                         (disruption_day != next_day) &
                                         (prev_day != next_day)].sort_values(by=['DATETIME'])
# not_same_day.head()

In [5]:
# Compare calendar days as midnight-normalised datetime64 values instead of mixing
# `datetime.date` objects (from `.dt.date`) with timestamp columns: recent pandas
# treats a Timestamp and a `datetime.date` as never equal, which would leave
# same_day empty.
disruption_day = identified_disruptions_df['DATETIME'].dt.normalize()
prev_day = pd.to_datetime(identified_disruptions_df['transit_date_prev']).dt.normalize()
next_day = pd.to_datetime(identified_disruptions_df['transit_date_next']).dt.normalize()

# Disruption, previous stop and next stop all fall on the same day.
same_day = identified_disruptions_df[(disruption_day == prev_day) & (disruption_day == next_day)]
# same_day[['DATETIME', 'BLOCK', 'trip_id_prev', 'trip_id_next']]
same_day.sort_values(by=['DATETIME']).tail(1)

Unnamed: 0,DATETIME,BLOCK,REASON,START_STOP_ABBR,trip_id_prev,transit_date_prev,vehicle_id_prev,block_abbr_prev,arrival_time_prev,stop_sequence_prev,stop_id_original_prev,overload_id_prev,trip_id_next,transit_date_next,vehicle_id_next,block_abbr_next,arrival_time_next,stop_sequence_next,stop_id_original_next,overload_id_next
2330,2022-04-05 18:47:00,1801,Accident,BNA,265153,2022-04-05,1830,1801.0,2022-04-05 18:43:14,30.0,DONSHASM,0.0,265174,2022-04-05,1830,1801.0,2022-04-05 19:23:30,2.0,KOR4AVWN,0.0
2331,2022-04-05 19:47:00,1801,Accident,BNA,265154,2022-04-05,1830,1801.0,2022-04-05 19:46:06,18.0,ELMMASEN,0.0,265154,2022-04-05,1830,1801.0,2022-04-05 19:47:04,19.0,ELMELMEF,0.0
778,2022-04-06 07:30:00,2210,Passenger Incident,MCC5_10,270577,2022-04-06,1913,2210.0,2022-04-06 06:40:50,19.0,25THYDNM,0.0,270583,2022-04-06,1913,2210.0,2022-04-06 09:22:38,1.0,MCC5_10,0.0
615,2022-04-06 16:17:00,704,Mechanical,21ACAPSN,268918,2022-04-06,721,704.0,2022-04-06 13:00:17,23.0,MCC5_9,0.0,268882,2022-04-06,721,704.0,2022-04-06 17:25:06,1.0,MCC5_9,0.0
2659,2022-04-06 22:15:00,5502,Accident,MCC4_15,267987,2022-04-06,124,5502.0,2022-04-06 22:04:59,1.0,MCC4_15,0.0,267987,2022-04-06,124,5502.0,2022-04-06 23:08:07,39.0,HICHICNN,0.0


In [7]:
# Get APC data
f = os.path.join("data", "apc", "cleaned-wego-daily.apc.parquet")
# f = os.path.join('data', 'processed', 'apc_weather_gtfs.parquet')
apcdata = spark.read.load(f)
get_columns = ['trip_id', 'transit_date', 'arrival_time', 'arrival_time_str', 'vehicle_id',
               'block_abbr', 'stop_sequence', 'stop_id_original', 'overload_id']
get_str = ", ".join(get_columns)
apcdata.createOrReplaceTempView("apc")

# Column subset only. Rows with a null arrival_time are kept in this version.
# Ordering is applied once below; an ORDER BY in the query would be superseded by it.
query = f"""
SELECT {get_str}
FROM apc
"""
apcdata = spark.sql(query)
apcdata = apcdata.orderBy("arrival_time")

In [15]:
def get_sdf_status(datetime, block, show_df=False):
    """Classify where APC arrival times are missing on the trip hit by a disruption.

    The global Spark frame `apcdata` is filtered to the disruption's date and block.
    Trips are ranked by their last recorded `arrival_time`; the first trip whose
    last arrival is at or after `datetime` is the "initial" trip and the one after
    it the "next" trip. The null / non-null runs of `arrival_time` are then examined
    on the initial trip's rows with `overload_id == 0`.

    Parameters
    ----------
    datetime : str
        Disruption timestamp as 'YYYY-MM-DD HH:MM:SS'; the part before the first
        space is compared with `transit_date`. (The name shadows the stdlib module
        inside this function; it is kept for existing callers.)
    block : int
        Value compared with `block_abbr`.
    show_df : bool, default False
        When True, display the rows of the selected trip(s).

    Returns
    -------
    int
        -1 when no trip qualifies. Otherwise a base code
        (1 = no null, 2 = all null, 3 = last run is null, 4 = null only in the
        middle, 5 = first run is null and last run is not) multiplied by 10 when
        the initial trip has more than one distinct `overload_id`, and by 10 again
        when the next trip does.
    """
    date = datetime.split(' ')[0]

    specific_apcdata = apcdata.where(F.col("transit_date") == date)
    specific_apcdata = specific_apcdata.where(specific_apcdata.block_abbr == block)
    sdf = specific_apcdata.toPandas()

    trip_ends = sdf.groupby(['trip_id']).agg('last').sort_values('arrival_time')
    # Include next trip in case its the overload
    trip_ids = trip_ends[trip_ends['arrival_time'] >= datetime].index.tolist()[0:2]
    if len(trip_ids) == 0:
        return -1

    # All stops of the selected trips are kept, ordered by stop_sequence within each trip.
    new_sdf = pd.concat(
        [sdf[sdf['trip_id'] == trip_id].sort_values(by=['stop_sequence']) for trip_id in trip_ids]
    ).reset_index(drop=True)
    # Plain assignment instead of a chained `fillna(..., inplace=True)`, which is not
    # guaranteed to write back into the frame.
    new_sdf['arrival_time_str'] = new_sdf['arrival_time_str'].fillna(value='NULL')
    new_sdf['is_null'] = pd.isnull(new_sdf['arrival_time'])

    # NOTE(review): both multipliers are 10, so a code of 10..50 does not tell which of
    # the two trips carried several overload ids, and 100..500 means both did. Confirm
    # this matches how the codes are labelled downstream.
    sametrip_overload_multiplier = 1
    if len(new_sdf[(new_sdf['trip_id'] == trip_ids[0])]['overload_id'].unique()) > 1:
        sametrip_overload_multiplier = 10

    nexttrip_overload_multiplier = 1
    if len(trip_ids) > 1:
        if len(new_sdf[(new_sdf['trip_id'] == trip_ids[1])]['overload_id'].unique()) > 1:
            nexttrip_overload_multiplier = 10

    if show_df:
        with pd.option_context("display.max_rows", None):
            display(new_sdf)

    # Work on initial trip only and no overload
    first_trip = new_sdf[(new_sdf['trip_id'] == trip_ids[0]) & (new_sdf['overload_id'] == 0)]
    null_flags = [bool(flag) for flag in first_trip['is_null']]

    # Collapse consecutive equal flags into one entry per run. This replaces iterating a
    # groupby keyed by a one-element list, whose keys are 1-tuples on pandas >= 2 and
    # therefore cannot be compared with the integer run numbers.
    run_flags = [flag for pos, flag in enumerate(null_flags)
                 if pos == 0 or flag != null_flags[pos - 1]]

    all_valid = not any(null_flags)   # also True when no rows are left
    all_null = all(null_flags)
    first_null = run_flags[0] if run_flags else False
    mid_null = any(run_flags[1:-1])
    latest_null = run_flags[-1] if run_flags else False

    multiplier = nexttrip_overload_multiplier * sametrip_overload_multiplier
    if all_valid:
        return 1 * multiplier
    if all_null:
        return 2 * multiplier
    if latest_null:
        return 3 * multiplier
    if not first_null and mid_null:
        return 4 * multiplier
    # Only patterns that start with a null run and end with a valid one remain here.
    # This includes the pattern with a further null run in the middle, which previously
    # fell through and returned None.
    return 5 * multiplier


### Uncomment when generating results again

In [None]:
# import swifter
# sd = same_day
# sd['status'] = sd.swifter.apply(lambda x: get_sdf_status(x['DATETIME'].strftime('%Y-%m-%d %H:%M:%S'), x['BLOCK']), axis=1)
# fp = os.path.join('data', 'processed', 'same_day_same_trip_status.pkl')
# sd.to_pickle(fp)

In [None]:
# Load the stored same-day disruptions together with their computed `status` column.
fp = os.path.join('data', 'processed', 'same_day_same_trip_status.pkl')
sd = pd.read_pickle(fp)

In [None]:
# Spot-check get_sdf_status on hand-picked rows of same_day.
# Alternative selections that were used before:
# sd = same_day[same_day['trip_id_next'] != same_day['trip_id_prev']]
# sd = sd[sd['status'] == -1]
# for k, row in sd[0:1].iterrows():
statuses = []
for k, row in same_day.loc[[458]].iterrows():
    datetime = row['DATETIME'].strftime('%Y-%m-%d %H:%M:%S')
    block = row['BLOCK']
    status = get_sdf_status(datetime, block)
    print(datetime, k, status)
    statuses.append(status)
statuses

In [None]:
fp = os.path.join('data', 'processed', 'same_day_same_trip_status.pkl')
sd = pd.read_pickle(fp)

# A missing status is treated as code 5.
sd.loc[sd['status'].isna(), 'status'] = 5

# merged_status strips the overload multiplier: 1/10/100 -> 1, 2/20/200 -> 2, ...
for base in (1, 2, 3, 4, 5):
    sd.loc[sd['status'].isin([base, base * 10, base * 100]), 'merged_status'] = base

# type records the magnitude of the code. `isin` is a method and must be called with
# a list; subscripting it (`isin[...]`) raises TypeError.
for scale, label in ((1, "no_overload"), (10, "nexttrip_overload"), (100, "sametrip_overload")):
    codes = [base * scale for base in (1, 2, 3, 4, 5)]
    sd.loc[sd['status'].isin(codes), 'type'] = label

In [None]:
# Count, per disruption REASON, how many disruptions were handled with an overload
# on the next trip versus on the same trip.
overload_sd = sd[sd['type'] != 'no_overload']
df_pivot = pd.pivot_table(overload_sd, values='status', index='REASON', columns='type',
                          aggfunc='count').fillna(0).sort_values('nexttrip_overload', ascending=False)
display(df_pivot)
ax = df_pivot.plot(kind='bar', rot=45, stacked=True)
ax.yaxis.set_major_locator(MaxNLocator(integer=True))
plt.xticks(ha='right')
# The x axis carries the REASON categories, so label it accordingly.
ax.set_xlabel('Disruption reason')
ax.set_ylabel('Count')
ax.set_title('Handling disruptions')

fp = os.path.join('plots', 'handling_disruptions_REASONS.png')
plt.savefig(fp, dpi=300, bbox_inches='tight')

In [None]:
# Disruption counts per merged status code, split by overload type.
status_counts = pd.pivot_table(sd, values='status', index='merged_status', columns='type', aggfunc='count')
status_counts = status_counts.fillna(0)
df_pivot = status_counts[['no_overload', 'sametrip_overload', 'nexttrip_overload']].sort_values('no_overload', ascending=False)

ax = df_pivot.plot.bar(rot=0, stacked=True)
ax.yaxis.set_major_locator(MaxNLocator(integer=True))
# Status code k (1-based) maps to labels[k - 1].
labels = ['no null', 'all null', 'null end', 'null mid', 'null start']
tick_labels = []
for code in df_pivot.index:
    tick_labels.append(labels[int(code) - 1])
ax.set_xticklabels(tick_labels)
ax.set_xlabel('Trip status')
ax.set_ylabel('Count')
ax.set_title('Handling disruptions')

fp = os.path.join('plots', 'handling_disruptions.png')
plt.savefig(fp, dpi=300, bbox_inches='tight')

### Handling not same day data

In [None]:
# sd = not_same_day
# sd['status'] = sd.swifter.apply(lambda x: get_sdf_status(x['DATETIME'].strftime('%Y-%m-%d %H:%M:%S'), x['BLOCK']), axis=1)
# fp = os.path.join('data', 'processed', 'not_same_day_same_trip_status.pkl')
# sd.to_pickle(fp)

In [8]:
# Load the stored statuses for the not-same-day disruptions ...
fp = os.path.join('data', 'processed', 'not_same_day_same_trip_status.pkl')
nsd = pd.read_pickle(fp)

# ... and for the same-day disruptions.
fp = os.path.join('data', 'processed', 'same_day_same_trip_status.pkl')
sd = pd.read_pickle(fp)

In [9]:
# Reload the full set of matched disruptions.
fp = os.path.join('data', 'processed', 'matched_service_disruptions.pkl')
identified_disruptions_df = pd.read_pickle(fp)

In [10]:
# Stack same-day and not-same-day disruptions and restore the original row order.
# NOTE(review): `all_disruptions` is assigned again from same_day_same_trip_status.pkl
# in a later cell, which discards this concatenation — confirm which one is intended.
all_disruptions = pd.concat([sd, nsd]).sort_index()

# Status:
* 1: no nat
* 2: all nat
* 3: end nat
* 4: mid nat
* 5: start nat

In [11]:
fp = os.path.join('data', 'processed', 'same_day_same_trip_status.pkl')
# NOTE(review): this re-read replaces the `all_disruptions` built with
# pd.concat([sd, nsd]) by the same-day statuses only — confirm that is intended.
all_disruptions = pd.read_pickle(fp)

# A missing status is treated as code 5.
all_disruptions.loc[all_disruptions['status'].isna(), 'status'] = 5

# merged_status strips the overload multiplier: 1/10/100 -> 1, 2/20/200 -> 2, ...
for base in (1, 2, 3, 4, 5):
    all_disruptions.loc[all_disruptions['status'].isin([base, base * 10, base * 100]), 'merged_status'] = base

# type records the magnitude of the code. `isin` is a method and must be called with
# a list; subscripting it (`isin[...]`) raises TypeError.
for scale, label in ((1, "no_overload"), (10, "nexttrip_overload"), (100, "sametrip_overload")):
    codes = [base * scale for base in (1, 2, 3, 4, 5)]
    all_disruptions.loc[all_disruptions['status'].isin(codes), 'type'] = label
all_disruptions.type.unique()

array([nan, 'no_overload', 'nexttrip_overload', 'sametrip_overload'],
      dtype=object)

In [18]:
# Spot-check get_sdf_status (with the trip rows displayed) on hand-picked rows.
# Alternative selections that were used before:
# sd = same_day[same_day['trip_id_next'] != same_day['trip_id_prev']]
# sd = sd[sd['status'] == -1]
# for k, row in sd[0:1].iterrows():
statuses = []
for k, row in all_disruptions.loc[[1594]].iterrows():
    datetime = row['DATETIME'].strftime('%Y-%m-%d %H:%M:%S')
    block = row['BLOCK']
    status = get_sdf_status(datetime, block, show_df=True)
    print(datetime, k, status)
    statuses.append(status)
statuses

Unnamed: 0,trip_id,transit_date,arrival_time,arrival_time_str,vehicle_id,block_abbr,stop_sequence,stop_id_original,overload_id,is_null
0,261365,2022-02-06,2022-02-06 10:45:10,10:45:10,1816,301,1,MCC5_5,0,False
1,261365,2022-02-06,NaT,,1816,301,2,6AVDEASN,0,True
2,261365,2022-02-06,NaT,,1816,301,3,6AVCHUSN,0,True
3,261365,2022-02-06,NaT,,1816,301,4,8ABROSN,0,True
4,261365,2022-02-06,NaT,,1816,301,5,BRO9AWF,0,True
5,261365,2022-02-06,NaT,,1816,301,6,BRO12WN,0,True
6,261365,2022-02-06,NaT,,1816,301,7,WES17AWN,0,True
7,261365,2022-02-06,NaT,,1816,301,8,WES20AWN,0,True
8,261365,2022-02-06,NaT,,1816,301,9,WES21AWN,0,True
9,261365,2022-02-06,NaT,,1816,301,10,WES23AWN,0,True


2022-02-06 11:08:00 1594 300


[300]

In [None]:
# Disruption counts per merged status code, split by overload type.
status_counts = pd.pivot_table(all_disruptions, values='status', index='merged_status', columns='type', aggfunc='count')
status_counts = status_counts.fillna(0)
df_pivot = status_counts[['no_overload', 'sametrip_overload', 'nexttrip_overload']].sort_values('no_overload', ascending=False)

ax = df_pivot.plot.bar(rot=0, stacked=True, color=['#1f77b4', '#ff7f0e',  '#2ca02c'])
ax.yaxis.set_major_locator(MaxNLocator(integer=True))
# Status code k (1-based) maps to labels[k - 1].
labels = ['no null', 'all null', 'null end', 'null mid', 'null start']
tick_labels = []
for code in df_pivot.index:
    tick_labels.append(labels[int(code) - 1])
ax.set_xticklabels(tick_labels)
ax.set_xlabel('Trip status')
ax.set_ylabel('Count')
ax.set_title('Handling disruptions')

fp = os.path.join('plots', 'nsd_handling_disruptions.png')
plt.savefig(fp, dpi=300, bbox_inches='tight')

In [None]:
# Count, per disruption REASON, how many disruptions were handled with an overload
# on the next trip versus on the same trip.
overload_disruptions = all_disruptions[all_disruptions['type'] != 'no_overload']
df_pivot = pd.pivot_table(overload_disruptions, values='status', index='REASON', columns='type',
                          aggfunc='count').fillna(0).sort_values('nexttrip_overload', ascending=False)
display(df_pivot)
ax = df_pivot.plot(kind='bar', rot=45, stacked=True, color=['#2ca02c', '#ff7f0e'])
ax.yaxis.set_major_locator(MaxNLocator(integer=True))
plt.xticks(ha='right')
# The x axis carries the REASON categories, so label it accordingly.
ax.set_xlabel('Disruption reason')
ax.set_ylabel('Count')
ax.set_title('Handling disruptions')

# NOTE(review): this path is also written by the REASON plot built from `sd` earlier
# in the notebook, so that figure is overwritten here — confirm the intended file name.
fp = os.path.join('plots', 'handling_disruptions_REASONS.png')
plt.savefig(fp, dpi=300, bbox_inches='tight')


# Service disruption analysis