In [1]:
import pandas as pd
import numpy as np

In [2]:
def load_day(day, data_dir='data'):
    """Load one day of the January 2013 SIRI records into a DataFrame.

    Reads ``<data_dir>/siri.201301DD.csv`` (a CSV without a header row),
    names its 15 columns and normalises a few of them:

    * ``time_frame`` is parsed as a date column;
    * missing ``line_id`` / ``stop_id`` values become 0 so that both columns
      can be stored as ``int32``;
    * ``timestamp`` is converted from integer microseconds since the epoch
      to pandas datetimes.

    Parameters
    ----------
    day : int
        Day of the month; zero-padded to two digits in the file name.
    data_dir : str, optional
        Directory that holds the daily CSV files. Defaults to ``'data'``,
        the location the notebook has always read from.

    Returns
    -------
    pandas.DataFrame
        One row per record, with the columns listed in ``header`` below.
    """
    header = ['timestamp', 'line_id', 'direction', 'jrny_patt_id', 'time_frame', 'journey_id', 'operator',
              'congestion', 'lon', 'lat', 'delay', 'block_id', 'vehicle_id', 'stop_id', 'at_stop']
    # NOTE(review): int8 only holds -128..127. If 'delay' is expressed in
    # seconds, larger values would not fit -- TODO confirm against the data.
    types = {'timestamp': np.int64,
             'journey_id': np.int32,
             'congestion': np.int8,
             'lon': np.float64,
             'lat': np.float64,
             'delay': np.int8,
             'vehicle_id': np.int32,
             'at_stop': np.int8}
    file_name = '{0}/siri.201301{1:02d}.csv'.format(data_dir, day)

    # `infer_datetime_format` is deliberately not passed: it is deprecated
    # since pandas 2.0 and does not change the parsed values.
    df = pd.read_csv(file_name, header=None, names=header, dtype=types, parse_dates=['time_frame'])

    # line_id / stop_id contain gaps, so they cannot be read as integers
    # directly; fill the gaps with 0 first, then downcast.
    nullable_int_columns = ['line_id', 'stop_id']
    df = df.fillna(value={column: 0 for column in nullable_int_columns})
    for column in nullable_int_columns:
        df[column] = df[column].astype(np.int32)

    df['timestamp'] = pd.to_datetime(df['timestamp'], unit='us')
    return df


In [23]:
def haversine_np(lon1, lat1, lon2, lat2):
    """
    Great-circle distance between two sets of points on the earth,
    computed with the haversine formula.

    Coordinates are given in decimal degrees. Array arguments are expected
    to have equal length; the distance is computed element by element.
    The result is in metres, based on an earth radius of 6,372,000 m.

    Adapted from: https://stackoverflow.com/questions/29545704/fast-haversine-approximation-python-pandas#29546836
    """
    earth_radius_m = 6372000.0

    # Work in radians from here on.
    lon1, lat1, lon2, lat2 = (np.radians(angle) for angle in (lon1, lat1, lon2, lat2))

    half_dlat = (lat2 - lat1) / 2.0
    half_dlon = (lon2 - lon1) / 2.0

    # Haversine of the central angle between the two points.
    hav = np.sin(half_dlat)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(half_dlon)**2

    # atan2 form of the inverse haversine; mathematically the same angle as
    # 2 * arcsin(sqrt(hav)).
    central_angle = 2 * np.arctan2(np.sqrt(hav), np.sqrt(1.0 - hav))
    return earth_radius_m * central_angle

In [4]:
def calculate_durations(data_frame, vehicle_id):
    """Seconds elapsed between consecutive records of one vehicle.

    Rows are taken in the order they appear in ``data_frame``; no sorting
    is done here (presumably the frame is already chronological -- verify
    against the caller).

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Must contain a ``vehicle_id`` column and a datetime ``timestamp``
        column.
    vehicle_id : int
        Vehicle whose rows are selected.

    Returns
    -------
    numpy.ndarray
        Float array with exactly one entry per selected row. The first
        entry is 0; entry ``i`` is ``timestamp[i] - timestamp[i-1]`` in
        seconds. Empty when the vehicle has no rows, so the result can
        always be assigned back onto the same selection.
    """
    ts = data_frame.loc[data_frame['vehicle_id'] == vehicle_id, 'timestamp'].values
    dt = np.zeros(len(ts))
    if len(ts) > 1:
        # Dividing a timedelta array by one second yields float seconds.
        dt[1:] = np.diff(ts) / np.timedelta64(1, 's')
    return dt

In [5]:
def calculate_distances(data_frame, vehicle_id):
    """Distance travelled between consecutive records of one vehicle.

    Rows are taken in the order they appear in ``data_frame``; distances
    come from ``haversine_np`` applied to each row and the row before it.

    Parameters
    ----------
    data_frame : pandas.DataFrame
        Must contain ``vehicle_id``, ``lat`` and ``lon`` columns.
    vehicle_id : int
        Vehicle whose rows are selected.

    Returns
    -------
    numpy.ndarray
        Float array with exactly one entry per selected row. The first
        entry is 0; entry ``i`` is the distance from row ``i-1`` to row
        ``i``. Empty when the vehicle has no rows, so the result can always
        be assigned back onto the same selection.
    """
    rows = data_frame[data_frame['vehicle_id'] == vehicle_id]
    lat = rows['lat'].values
    lon = rows['lon'].values
    dx = np.zeros(len(lat))
    if len(lat) > 1:
        dx[1:] = haversine_np(lon[1:], lat[1:], lon[:-1], lat[:-1])
    return dx

In [6]:
def filter_columns(df):
    """Return ``df`` restricted to the columns used by the analysis, in a fixed order."""
    keep = (
        'timestamp', 'direction', 'journey_id', 'congestion',
        'lon', 'lat', 'delay', 'vehicle_id', 'stop_id', 'at_stop',
    )
    return df[list(keep)]

In [7]:
# Load the first three days of the month for a quick look at the raw data.
d1, d2, d3 = (load_day(day_number) for day_number in (1, 2, 3))

In [8]:
# Missing values per column on day 1. line_id and stop_id read 0 here
# because load_day has already filled their gaps with 0.
d1.isnull().sum()

timestamp            0
line_id              0
direction            0
jrny_patt_id    121407
time_frame           0
journey_id           0
operator             0
congestion           0
lon                  0
lat                  0
delay                0
block_id             0
vehicle_id           0
stop_id              0
at_stop              0
dtype: int64

In [9]:
# Missing values per column on day 2 (line_id / stop_id gaps were already
# filled with 0 by load_day).
d2.isnull().sum()

timestamp            0
line_id              0
direction            0
jrny_patt_id    252724
time_frame           0
journey_id           0
operator             0
congestion           0
lon                  0
lat                  0
delay                0
block_id             0
vehicle_id           0
stop_id              0
at_stop              0
dtype: int64

In [10]:
# Missing values per column on day 3 (line_id / stop_id gaps were already
# filled with 0 by load_day).
d3.isnull().sum()

timestamp            0
line_id              0
direction            0
jrny_patt_id    255489
time_frame           0
journey_id           0
operator             0
congestion           0
lon                  0
lat                  0
delay                0
block_id             0
vehicle_id           0
stop_id              0
at_stop              0
dtype: int64

In [11]:
# Distinct journey ids seen on each of the three sample days.
journeys1, journeys2, journeys3 = (
    frame['journey_id'].unique() for frame in (d1, d2, d3)
)

In [12]:
# Number of distinct journey ids on day 1.
len(journeys1)

5119

In [13]:
# Number of distinct journey ids on day 2.
len(journeys2)

7432

In [14]:
# Smallest and largest journey id on day 1.
# This cell previously paired journeys1.min() with journeys2.max(), so the
# recorded output is stale until the cell is re-run.
print(journeys1.min(), journeys1.max())

258 991819


In [15]:
# Smallest and largest journey id on day 2.
print(journeys2.min(), journeys2.max())

1 991819


In [16]:
# Smallest and largest journey id on day 3.
print(journeys3.min(), journeys3.max())

1 998867


In [17]:
# Number of distinct journey ids on day 3, shown as the array's shape.
journeys3.shape

(7301,)

In [18]:
# NOTE(review): `journeys` is never assigned in the cells shown (only
# journeys1, journeys2 and journeys3 are), so this cell raises NameError.
# Looks like a leftover -- confirm which array was meant or drop the cell.
journeys

NameError: name 'journeys' is not defined

In [None]:
# Day-3 rows for journey id 258, the smallest journey id seen on day 1.
d3[d3['journey_id'] == 258]

In [19]:
# Load all 31 days of January 2013, keep only the analysis columns, and add
# zero-initialised dt / dx / speed columns that a later cell fills in.
#
# The per-day frames are collected in a list and concatenated once.
# Growing the frame with DataFrame.append inside the loop re-copies the
# accumulated rows on every iteration, and append was removed in pandas 2.0.
# `.assign` returns a new frame, so no columns are set on the slice returned
# by filter_columns.
daily_frames = [
    filter_columns(load_day(day_of_month)).assign(dt=0.0, dx=0.0, speed=0.0)
    for day_of_month in range(1, 32)
]

# As with the former append loop, each day's own row index is kept, so index
# labels repeat across days.
days = pd.concat(daily_frames)

# Drop the per-day frames so only the combined copy stays in memory.
del daily_frames
    

In [20]:
# Column dtypes, row count and memory footprint of the combined month.
days.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44455133 entries, 0 to 603920
Data columns (total 13 columns):
timestamp     datetime64[ns]
direction     int64
journey_id    int32
congestion    int8
lon           float64
lat           float64
delay         int8
vehicle_id    int32
stop_id       int32
at_stop       int8
dt            float64
dx            float64
speed         float64
dtypes: datetime64[ns](1), float64(5), int32(3), int64(1), int8(3)
memory usage: 3.3 GB


In [21]:
# Distinct vehicle ids present in the combined month.
vehicles = days.loc[:, 'vehicle_id'].unique()

In [24]:
# Per-record elapsed time (dt, seconds), distance (dx, metres) and speed
# (km/h), each measured against the same vehicle's previous record.
#
# One grouped shift looks up every row's predecessor within its vehicle.
# This replaces a Python loop that rebuilt a full-length boolean mask three
# times per vehicle and was too slow to finish on the whole month.
MS_TO_KMH = 3.6              # metres/second -> kilometres/hour
MAX_VALID_SPEED_KMH = 100.0  # faster points are treated as invalid

by_vehicle = days.groupby('vehicle_id', sort=False)
previous = by_vehicle[['timestamp', 'lon', 'lat']].shift()

# A vehicle's first record has no predecessor; its dt and dx stay 0.
is_first = by_vehicle.cumcount().values == 0

# Plain numpy arrays are used throughout so that values are written back by
# position: the combined frame's index labels repeat from day to day.
elapsed_s = (days['timestamp'].values - previous['timestamp'].values) / np.timedelta64(1, 's')
dt = np.where(is_first, 0.0, elapsed_s)

travelled_m = haversine_np(days['lon'].values, days['lat'].values,
                           previous['lon'].values, previous['lat'].values)
dx = np.where(is_first, 0.0, travelled_m)

# Speed is only defined where time actually passed; elsewhere it stays 0.
speed = np.zeros(len(days))
moving = dt > 0
speed[moving] = dx[moving] / dt[moving] * MS_TO_KMH

days['dt'] = dt
days['dx'] = dx
days['speed'] = speed

# Filter invalid points (speeds over 100 km/h)
days = days[days['speed'] < MAX_VALID_SPEED_KMH]


KeyboardInterrupt: 

In [None]:
# Preview the first 10 rows of the combined frame.
days.head(10)

In [None]:
# First 20 records of vehicle 33491, to spot-check the derived columns.
days[days['vehicle_id']==33491].head(20)

In [None]:
# How many distinct vehicles appear in the data
len(days['vehicle_id'].unique())

In [None]:
# How many distinct stops appear in the data
len(days['stop_id'].unique())

In [None]:
# Write the combined month to a single CSV; index=False leaves the row
# index out of the file.
days.to_csv("data/201301.csv", index=False)