# Imports

In [209]:
import os
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import collections
import networkx as nx

import pdb

# Configure pandas in one call: silence the chained-assignment warning and
# widen the display limits so large frames print without truncation.
pd.set_option(
    "mode.chained_assignment", None,
    "display.max_rows", 500,
    "display.max_columns", 500,
    "display.width", 1000,
)

# Switch to the raildelays directory and echo the resulting working directory.
os.chdir("/home/jacobheglund/dev/raildelays")
print(os.getcwd())

/home/jacobheglund/dev/raildelays


# Train-Based Node Formulation
Note: this formulation is not used in the paper.

In [83]:
def create_nodes(df, time_delta=20, stop_delta=10):
    """Cluster the "starting" rows of ``df`` into nodes of the train network graph.

    Only rows whose ``arrival_sched`` equals ``"starting"`` are considered.
    Nodes are built greedily: the first row that is still unassigned seeds a
    new node, and every other unassigned row joins that node when it

    * has the same ``OD`` value as the seed,
    * has a different ``date`` than the seed,
    * has a scheduled departure within ``time_delta`` minutes of the seed's, and
    * has a ``stops_in_journey`` within ``stop_delta`` of the seed's.

    Rows are compared against the seed only, never against other members of
    the node. Departure times are compared as plain times of day, so two
    departures on either side of midnight are not considered close.

    Args:
        df: DataFrame with at least the columns ``arrival_sched``,
            ``departure_sched`` (parseable with the ``%H%M`` format), ``OD``,
            ``date`` and ``stops_in_journey``. It is not modified.
        time_delta: Maximum difference, in minutes, between the scheduled
            departure of a row and of the node seed (inclusive).
        stop_delta: Maximum difference in ``stops_in_journey`` between a row
            and the node seed (inclusive).

    Returns:
        A new DataFrame holding the starting rows grouped node by node (seed
        first, then its members in their original order) with an added
        integer ``node_idx`` column. The index is a fresh ``0..n-1`` range.
        If ``df`` has no starting rows the result is empty but still has the
        ``node_idx`` column.
    """
    # Re-index right after filtering so positional and label-based access
    # agree from the very first pass.
    remaining = df.loc[df["arrival_sched"] == "starting"].reset_index(drop=True)
    # if node_idx == -1, the row is not yet associated with a node
    remaining["node_idx"] = -1

    # Parse every scheduled departure once instead of once per comparison.
    depart = pd.to_datetime(remaining["departure_sched"], format="%H%M")
    time_radius = pd.Timedelta(minutes=time_delta)

    clusters = []
    node_count = 0
    #TODO make this go to 0
    while len(remaining) > 0:
        # the first unassigned row seeds the current node
        seed_od = remaining["OD"].iloc[0]
        seed_date = remaining["date"].iloc[0]
        seed_stops = remaining["stops_in_journey"].iloc[0]
        seed_depart = depart.iloc[0]

        stops = remaining["stops_in_journey"]
        member = (
            # same origin destination pair
            (remaining["OD"] == seed_od)
            # occur on different days
            & (remaining["date"] != seed_date)
            # similar scheduled initial departure time
            & (depart >= seed_depart - time_radius)
            & (depart <= seed_depart + time_radius)
            # similar number of stops
            & (stops >= seed_stops - stop_delta)
            & (stops <= seed_stops + stop_delta)
        )
        # The seed shares its own date, so it fails the "different days" test;
        # include it explicitly so each pass consumes at least one row.
        member.iloc[0] = True

        cluster = remaining.loc[member].copy()
        cluster["node_idx"] = node_count
        clusters.append(cluster)

        # remove rows that are assigned to nodes
        unassigned = ~member
        remaining = remaining.loc[unassigned]
        depart = depart.loc[unassigned]
        node_count += 1

    # TODO a route is completely characterized by its "starting" row,
    ## remove all rows except starting rows and assign node to the rest of the data using RID
    # TODO reconstruct the full data with node index using the returned frame, df, and RID
    if not clusters:
        return remaining
    return pd.concat(clusters, ignore_index=True)

In [84]:
# Cluster the "starting" rows of df into nodes using create_nodes' default thresholds.
df_nodes = create_nodes(df)

In [85]:
# Store the scheduled departure as text so it can be glued onto a date string.
df_nodes["departure_sched"] = df_nodes["departure_sched"].astype(str)
# Prefix every departure with the same placeholder date and parse the result
# into full timestamps.
df_nodes["departure_sched_datetime"] = pd.to_datetime(
    ("1900-01-01" + " ") + df_nodes["departure_sched"]
)

In [86]:
# Report how many distinct node indices were produced.
print("Number of Nodes: ", np.unique(df_nodes["node_idx"]).size)
# how many unique routes do we get using this formulation?
## around 50-70 depending on stop_delta and time_delta


Number of Nodes:  55


In [None]:
# NOTE: `curr_depart` is only assigned inside the plotting loop of a later cell,
# so this cell depends on that cell having been run first.
np.unique(curr_depart)

In [None]:
# Scatter plot of scheduled departure vs. number of stops, one colour per node.

# Node indices are 0-based, so the node count is the largest index + 1.
# Using the largest index itself would leave the last node out of every
# `range(n_nodes)` loop.
n_nodes = max(df_nodes["node_idx"]) + 1
plot = plt.figure(figsize=(16, 9))
# NOTE: despite its name, `fig` holds what plt.subplot() returns (the axes);
# the figure itself is `plot`.
fig = plt.subplot()
# plt.get_cmap works on old and new matplotlib; cm.get_cmap was removed in 3.9.
cmap = plt.get_cmap("plasma")
# NOTE: this overwrites the column in place with integers, so the x axis shows
# the raw integer value of each scheduled departure.
df_nodes["departure_sched"] = df_nodes["departure_sched"].astype("int")
xticks = []

for i in range(n_nodes):
    # position of this node on the colormap, in [0, 1)
    color_idx = i / n_nodes

    rows = df_nodes.loc[df_nodes["node_idx"] == i]
    curr_depart = rows["departure_sched"]
    curr_stops = rows["stops_in_journey"]
    plt.scatter(curr_depart, curr_stops, c=[cmap(color_idx)], s=50)

#     center_x = (np.ptp(curr_depart) / 2) + min(curr_depart)
#     center_y = (np.ptp(curr_stops) / 2) + min(curr_stops)
#     rx = max(2*(max(curr_depart) - center_x), 5)
#     ry = max(2*(max(curr_stops) - center_y), 3)

#     ellipse = matplotlib.patches.Ellipse((center_x, center_y), rx, ry, alpha = 0.25)
#     ellipse.set_facecolor(cmap(color_idx))
#     fig.add_artist(ellipse)

plt.xlabel("Initial Departure Time")
plt.ylabel("Total Number of Stops")
plt.show()

In [None]:
# how many times does each route run using this formulation?
# Node indices are 0-based, so the number of nodes is the largest index + 1.
# The count is derived here so that the last node is always included,
# whatever value `n_nodes` currently holds.
node_total = int(df_nodes["node_idx"].max()) + 1
nodes_x = np.arange(0, node_total, 1)
# number of rows (runs) assigned to each node, in node-index order
runs_per_node = [int((df_nodes["node_idx"] == node).sum()) for node in nodes_x]

# NOTE: plt.subplots returns a (figure, axes) pair, so `fig` is that pair.
fig = plt.subplots(figsize=(16, 9))
plt.xlabel("Node Index")
plt.ylabel("Runs per Node")
plt.title("Runs per Node during Data Period")
plt.plot(nodes_x, runs_per_node)
plt.show()


In [None]:
# each unique route certainly has a "starting" station, so reduce the problem space by only considering these rows
# .copy() makes df_test an independent frame, so adding a column below never
# writes into a slice of df.
# NOTE(review): this filters on "arrival_actual" while create_nodes filters on
# "arrival_sched" -- confirm both columns mark the first stop with "starting".
df_test = df.loc[df["arrival_actual"] == "starting"].copy()
# Route key: origin and destination station codes concatenated. Built from
# df_test itself, so no index alignment against the full df is needed.
df_test["OD"] = df_test["station_origin"] + df_test["station_destination"]

# number of starting rows
print(len(df_test))

# number of distinct RIDs in the full data
unique_rid = np.unique(df["RID"])
n_unique_rid = len(unique_rid)
print(n_unique_rid)



In [None]:
# Narrow the starting rows down to one origin/destination pair ("BANPAD") and
# one journey length (22 stops).
unique_stops = np.unique(df["stops_in_journey"])
df_od = df_test[df_test["OD"] == "BANPAD"]
df_stops = df_od[df_od["stops_in_journey"] == 22]

# n_unique_stops = len(unique_stops)
# print(unique_stops)
# print(n_unique_stops)


# get the unique set of dates for "close" initial departure times, check if there are any matches
# (if so, then the routes are unique and deserve to be separate nodes on the graph)
# TODO how to come up with the candidate set of "close" initial departure times?
## probably check if they're within 10 minutes radius of each other?
dep_list = ["0604", "0608", "0609"]
# dep_list = ['0604' '0608' '0609' '0625' '0703' '0728' '0935']

# Collect, for every candidate departure time, the distinct dates it ran on.
date_list = []
for dep_time in dep_list:
    df_tmp = df_stops[df_stops["departure_sched"] == dep_time]
    dates = np.unique(df_tmp["date"]).tolist()
    date_list.extend(dates)

# No date appears twice, i.e. no two candidate times ever ran on the same day.
if len(set(date_list)) == len(date_list):
    print("they're all unique")
    # these times should be considered the same route

# unique_dep = np.unique(df_stops["departure_sched"])
# n_unique_dep = len(unique_dep)
# print(n_unique_dep)
# print(unique_dep)



# df_date = df_stops.loc[df_stops["date"] == "2016-03-03"]
# print(len(df_date))

# df_start = df_stops.loc[df_stops["departure_sched"] == "0625"]
# print(len(df_start))
# print(len(np.unique(df_start["date"])))

# certainly, trains departing their first station at 0604 and 0625 are distinct routes because they both
# run on the same day and could even cause delays for each other

# however, are 0604, 0608, 0609 ever run on the same day?  if not, the scheduled departure was changed for
# this route, but it's still considered the same route

# what is a good cutoff time to consider two routes different based on their initial departure?
# probably 10 minutes or so, because the same station isn't going to have the exact same route leave 10 minutes apart;
# that would just be wasteful unless there's an extremely large amount of traffic

# # this route is the first of the day to run through this particular set of stops
# # what about 0935? this is the second route of the day to run through this particular set of stops

In [None]:
# Count the rows of df_stops scheduled to depart at "0609".
# NOTE(review): the frame is named df_604 but the filter selects "0609" --
# confirm which departure time was intended.
df_604 = df_stops[df_stops["departure_sched"] == "0609"]
print(df_604.shape[0])


In [None]:
# pd pivot ?
# date_list = ["2016-01-03", "2016-01-04","2016-01-05", "2016-01-06", "2016-01-07", "2016-01-08", "2016-01-09", "2016-01-10", "2016-01-11", "2016-01-12"]
# for i in date_list:
#     df_test = df.loc[df["date"] == i]
    

# # date_list = ["2016-01-05"]
# for i in date_list:
#     df_test = df.loc[df["date"] == i]
#     unique_rid = np.unique(df_test["RID"])
#     unique_id = np.unique(df_test["train_id"])
# #     print(unique_rid)
#     print(len(unique_rid) == len(unique_id))
# #     print(unique_id)
# #     print(len(unique_id))


# RID gives a unique route for each day, but it is not clear if this transfers between days
# i.e. will day_1_train_id_1 == day_2_train_id_1 for the same OD stations?
# how many examples of the same route being run do we have during the data period t?


# # see if train ids repeat within the dataset
# #TODO i'm sure the result of this means something, but what?
# for i in df["train_id"]:
#     df_test = df.loc[df["train_id"] == i]
#     unique_routes = np.unique(df_test["RID"])
#     if len(unique_routes) > 1:
#         print(len(unique_routes))
    
#     print(len(df_test))
        
#         print(df.loc[df["train_id"] == i]["departure_sched"])

# train_id_list = ["1279959"]
# for i in train_id_list:
#     df_test = df.loc[df["train_id"] == i]
# #     print(len(df_test))
    


In [None]:
# # number of unique routes are run each day of the year (i.e. with unique station_origin and station_destination)
# datetime_start = min(df["arrival_actual_datetime"])
# datetime_end = max(df["arrival_actual_datetime"])
# num_days = (datetime_end - datetime_start).days
# day_arr = np.arange(start=0, stop=num_days, step=1)
# train_arr = np.zeros(num_days)

# for i in range(num_days):
#     curr_year = (datetime_start + pd.Timedelta(i, unit="d")).year
#     curr_month = "{:02d}".format((datetime_start + pd.Timedelta(i, unit="d")).month)
#     curr_day = "{:02d}".format((datetime_start + pd.Timedelta(i, unit="d")).day)
#     curr_date = "{}-{}-{}".format(curr_year, curr_month, curr_day)
#     df_tmp = df.loc[df["date"] == curr_date]
    
#     # 



In [None]:
# number of trains running each day of the year
# Series.min()/max() skip missing timestamps, whereas the builtin min()/max()
# can return NaT when one is present.
datetime_start = df["arrival_actual_datetime"].min()
datetime_end = df["arrival_actual_datetime"].max()
# Count calendar days from the first to the last date inclusive. Subtracting
# the raw timestamps floors partial days and leaves out the final day.
first_day = datetime_start.normalize()
num_days = (datetime_end.normalize() - first_day).days + 1
day_arr = np.arange(start=0, stop=num_days, step=1)
train_arr = np.zeros(num_days)

for i in range(num_days):
    # df["date"] is compared against a "YYYY-MM-DD" string
    curr_date = (first_day + pd.Timedelta(i, unit="d")).strftime("%Y-%m-%d")
    df_tmp = df.loc[df["date"] == curr_date]

    # the RIDs of the particular trains running that day
    trains_running = np.unique(df_tmp["RID"])
    train_arr[i] = len(trains_running)
    



In [None]:
# # the RID consists of the date, followed by another number.  
# # What does that other number signify?  I don't think it is a unique identifier for the particular train
# # df["RID"] = df["RID"].astype(str)
# # df["train_id"] = df["RID"].str.slice(8, -1)
# # num_trains = len(np.unique(df["train_id"]))
# # num_trains

# # it looks like there is a weird data imbalance, each train 
# plt.plot(day_arr, train_arr)
# plt.xlabel("Day of Year")
# plt.ylabel("Number of Unique Trains")
# plt.show()

# #TODO remove the days when 0 trains are running


In [None]:
# make histogram of arrival delay
# 100 bins are requested; `hist` keeps whatever plt.hist returns
hist = plt.hist(df["arrival_delay_minutes"], bins=100)
plt.show()

In [None]:
# Empirical CDF (cumulative distribution function) of the arrival delay:
# sorted delays on the x axis, fraction of observations below each on the y axis.
x = np.sort(np.array(df["arrival_delay_minutes"]))
N = x.shape[0]
# ranks 0 .. N-1 scaled by N, so y runs from 0 up to (N-1)/N
y = np.arange(N) / float(N)
plt.plot(x, y)
plt.show()