In [1]:
import pandas as pd
import pickle
from hdfs3 import HDFileSystem

In [2]:
hdfs = HDFileSystem(host='hdfs://iccluster044.iccluster.epfl.ch', port=8020, user='ebouille')
def read_csv(path, parts=None, **kwargs):
    dfs = []
    file_paths = [file_path for file_path in hdfs.ls(path) if file_path.endswith('.csv')]
    if parts:
        file_paths = file_paths[:parts]
    for file_path in file_paths:
        with hdfs.open(file_path) as file:
            dfs.append(pd.read_csv(file, **kwargs))
    return pd.concat(dfs)

In [3]:
connections = read_csv('/user/datavirus/connections.csv')
connections['start_time'] = ('2019-05-06 ' + connections['start_time']).astype('datetime64[ns]')
connections['stop_time'] = ('2019-05-06 ' + connections['stop_time']).astype('datetime64[ns]')
connections['delay_probability'] = connections['delay_probability'].round(3)
connections['delay_parameter'] = connections['delay_parameter'].round(4)

In [4]:
# recover missing/incomplete line_text using other columns

route_type_str = {
    102: 'IC',
    103: 'IR',
    106: 'R',
    400: 'S',
    700: '',
    900: '',
}

digits = tuple(map(str, range(10)))

def line_text(row):
    line_nr = ''.join([c for c in row['trip_id'].split('-')[1] if c in digits])
    route_text = route_type_str[row['route_type']]
    if row['transport_type'] != 'zug':
        if row['line_text']:
            return row['line_text']
    elif row['line_text']:
        line_text = str(row['line_text'])
        has_line_nr = len([c for c in line_text if c in digits]) > 0
        if has_line_nr:
            return line_text
        else:
            return line_text + line_nr
    return route_text + line_nr  

In [5]:
# update line_text
connections['line_text'] = connections['line_text'].fillna('')
connections['line_text'] = connections.apply(line_text, axis=1)

In [6]:
# sort connections in order required by connection scan algorithm
connections = connections.sort_values(['stop_time', 'start_time', 'stop_sequence'], ascending=False)

# retain only needed columns
connections = connections.loc[:, ['start_id', 'start_time', 'trip_id', 'transport_type', 'line_text', 'stop_time', 'stop_id', 'delay_probability', 'delay_parameter']]

# convert times to seconds
connections['start_time'] = connections['start_time'].astype('int') // 10**9
connections['stop_time'] = connections['stop_time'].astype('int') // 10**9

connections = connections.reset_index(drop=True)

In [7]:
with open('../data/connections.pickle', 'wb') as file:
    pickle.dump(connections, file)

# Prepare Stations

In [8]:
connections_stations = set(connections['start_id']).union(connections['stop_id'])

In [9]:
stations = pd.read_csv('../data/stations.csv')
stations.columns = ['station_id', 'lat', 'lon', 'height', 'station_name', 'distance_from_zurich']
stations = stations.set_index('station_id', drop=False)
stations = stations.drop(['distance_from_zurich', 'height'], axis=1)
stations = stations.loc[stations['station_id'].isin(connections_stations)]
stations.head(5)

Unnamed: 0_level_0,station_id,lat,lon,station_name
station_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
8502186,8502186,8.398942,47.393407,Dietikon Stoffelbach
8502187,8502187,8.377032,47.36474,Rudolfstetten Hofacker
8502188,8502188,8.354599,47.355907,Zufikon Hammergut
8502208,8502208,8.589802,47.258748,Horgen Oberdorf
8502209,8502209,8.577633,47.276724,Oberrieden Dorf


In [10]:
with open('../data/stations.pickle', 'wb') as file:
    pickle.dump(stations, file)

# Prepare Footpaths

In [11]:
footpaths = pd.read_csv('../data/footpaths.csv')
footpaths.columns = ['start_id', 'start_lat', 'start_lon', 'start_height', 'stop_id', 'stop_lat', 'stop_lon', 'stop_height', 'height_difference', 'distance', 'speed', 'time']
footpaths['time'] = footpaths['time'].astype(int)

# remove cycles
footpaths = footpaths[footpaths['start_id'] != footpaths['stop_id']]

# remove unused stations
footpaths = footpaths[
    footpaths['start_id'].isin(connections_stations)
    & footpaths['stop_id'].isin(connections_stations)
]

# retain only required columns
footpaths = footpaths.loc[:, ['start_id', 'stop_id', 'time']]

# create dictionary
footpaths = {stop_id: list(sorted(zip(*row), key=lambda x: x[1])) for stop_id, row in footpaths.groupby('stop_id').agg(list).iterrows()}

In [12]:
with open('../data/footpaths.pickle', 'wb') as file:
    pickle.dump(footpaths, file)