In [1]:
import numpy as np
import pandas as pd

from geo.geomath import vec_haversine

In [2]:
# Load the raw vehicle records from the Parquet extract into a DataFrame.
df = pd.read_parquet(path="data/sir010113-310113.parquet")

In [11]:
# Order the records vehicle by vehicle, and chronologically within each vehicle.
# Later cells that compare consecutive rows depend on this ordering.
df = df.sort_values(['VehicleID', 'Timestamp'], ascending=True)

In [4]:
# Number of distinct line IDs, reported as the shape of the unique-value array.
pd.unique(df['LineID']).shape

(67,)

In [5]:
# Number of distinct vehicle IDs, reported as the shape of the unique-value array.
pd.unique(df['VehicleID']).shape

(911,)

In [6]:
# Number of distinct stop IDs, reported as the shape of the unique-value array.
pd.unique(df['StopID']).shape

(4728,)

In [12]:
# Number of distinct journey IDs, ignoring missing values.
pd.unique(df['JourneyID'].dropna()).shape

(18614,)

In [7]:
# Number of distinct pattern IDs, ignoring missing values.
pd.unique(df['PatternID'].dropna()).shape

(465,)

In [9]:
# Per-column count of missing values.
df.isna().sum()

Timestamp           0
LineID              0
Direction           0
PatternID     6649920
TimeFrame           0
JourneyID           0
Operator            0
Congestion          0
Lon                 0
Lat                 0
Delay               0
BlockID             0
VehicleID           0
StopID              0
AtStop              0
Hour                0
dtype: int64

In [10]:
# (rows, columns) of the loaded frame.
df.shape

(44455133, 16)

In [18]:
# Preview the first 20 records in (VehicleID, Timestamp) order.
# df was already sorted by these keys in the earlier sort cell, so sorting
# every row again just to display 20 of them is redundant work.
df.head(20)

Unnamed: 0,Timestamp,LineID,Direction,PatternID,TimeFrame,JourneyID,Operator,Congestion,Lon,Lat,Delay,BlockID,VehicleID,StopID,AtStop,Hour
828520,2013-01-02 06:47:34,44,0,,2013-01-02,3997,D2,0,-6.23685,53.307999,0,44005,28047,0,0,6
829863,2013-01-02 06:48:27,44,0,,2013-01-02,3997,D2,0,-6.239767,53.305183,0,44005,28047,0,0,6
830298,2013-01-02 06:48:48,44,0,,2013-01-02,3997,D2,0,-6.239767,53.305183,0,44005,28047,0,0,6
830481,2013-01-02 06:48:56,44,0,,2013-01-02,3997,D2,0,-6.239767,53.305183,0,44005,28047,0,0,6
830851,2013-01-02 06:49:06,44,0,,2013-01-02,3997,D2,0,-6.24565,53.304966,0,44005,28047,0,0,6
831694,2013-01-02 06:49:47,44,0,,2013-01-02,3997,D2,0,-6.2458,53.300999,0,44005,28047,0,0,6
831907,2013-01-02 06:49:57,44,0,,2013-01-02,3997,D2,0,-6.2458,53.300999,0,44005,28047,0,0,6
832331,2013-01-02 06:50:07,44,0,,2013-01-02,3997,D2,0,-6.2465,53.298668,0,44005,28047,0,0,6
833230,2013-01-02 06:50:46,44,0,,2013-01-02,3997,D2,0,-6.245917,53.293999,0,44005,28047,0,0,6
833551,2013-01-02 06:50:58,44,0,,2013-01-02,4260,D2,0,-6.245917,53.293999,0,44005,28047,2825,0,6


In [14]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,LineID,Direction,JourneyID,Congestion,Lon,Lat,Delay,BlockID,VehicleID,StopID,AtStop,Hour
count,44455130.0,44455133.0,44455130.0,44455130.0,44455130.0,44455130.0,44455130.0,44455130.0,44455130.0,44455130.0,44455130.0,44455130.0
mean,77.96518,0.0,9417.119,0.0116326,-6.272803,53.34513,-0.2668891,109243.8,35429.25,2634.347,0.2335773,13.88823
std,114.6405,0.0,61598.42,0.1072254,0.08389026,0.05488355,65.85882,192124.4,3281.093,2236.428,0.4231063,4.921609
min,0.0,0.0,1.0,0.0,-6.617517,53.06802,-128.0,390.0,28047.0,0.0,0.0,0.0
25%,25.0,0.0,2536.0,0.0,-6.30865,53.32005,-48.0,16020.0,33308.0,756.0,0.0,10.0
50%,40.0,0.0,4718.0,0.0,-6.2616,53.34645,0.0,40205.0,33525.0,2030.0,0.0,14.0
75%,83.0,0.0,6769.0,0.0,-6.233166,53.37533,47.0,84004.0,38025.0,4384.0,0.0,18.0
max,747.0,0.0,999856.0,1.0,-6.052917,53.60873,127.0,835002.0,43078.0,7552.0,1.0,23.0


In [17]:
# List the distinct line IDs in order of first appearance.
pd.unique(df['LineID'])

array([747,  27,  40,   7,  56,  25,   4, 272,  83, 130,  13,  46,  66,
        41,  65,  15, 151,  39, 122,  16, 150,  33, 145,  29, 123,  37,
       271,  67,  14,  38,  84,  18,  31,  59, 171,   9, 332,  11,  42,
        54,  49, 120, 104,  44,  75,  68,  63, 140,   1, 102,  76, 451,
        79, 238, 220,  17,  53, 331,   0, 114, 142,  51,  32, 111, 116,
       118,  86])

## Calculate the Time and Distance Between Consecutive Points
The distance between consecutive points will first be approximated by the geodesic distance using the [haversine formula](https://en.wikipedia.org/wiki/Haversine_formula). See also [here](https://www.movable-type.co.uk/scripts/latlong.html).

Calculations are performed on a per-vehicle basis, with each vehicle's records sorted in ascending order by `Timestamp`.

In [12]:
# Convenience alias for the vehicle-ID column of df.
# NOTE(review): should be taken after df has been sorted by (VehicleID, Timestamp) -- confirm execution order.
vehicles = df['VehicleID']

In [49]:
# Placeholder for the distance travelled since the previous record; the real
# values are assigned once the haversine distances have been computed.
df['dx'] = 0.0

# Elapsed time since the previous record of the *same* vehicle, in seconds.
# A plain diff() over the whole frame would subtract the last timestamp of one
# vehicle from the first timestamp of the next, so every row that starts a new
# vehicle (including the very first row of the frame) gets dt = 0 instead.
# NOTE(review): assumes df is sorted by (VehicleID, Timestamp) and that
# Timestamp holds integer microseconds -- confirm.
same_vehicle = df['VehicleID'].eq(df['VehicleID'].shift())
df['dt'] = df['Timestamp'].diff().where(same_vehicle, 0.0) / 1000000

In [50]:
# Quick look at the first five rows to check the new dx / dt columns.
df.head(n=5)

Unnamed: 0,Timestamp,LineID,Direction,PatternID,TimeFrame,JourneyID,Operator,Congestion,Lon,Lat,...,VehicleID,StopID,AtStop,DateTime,Day,Hour,Minute,Dist,dx,dt
828520,1357109254000000,44,0,,2013-01-02,3997,D2,0,-6.23685,53.307999,...,28047,0,0,2013-01-02 06:47:34,2,6,47,0.0,0.0,0.0
829863,1357109307000000,44,0,,2013-01-02,3997,D2,0,-6.239767,53.305183,...,28047,0,0,2013-01-02 06:48:27,2,6,48,368.253645,0.0,53.0
830298,1357109328000000,44,0,,2013-01-02,3997,D2,0,-6.239767,53.305183,...,28047,0,0,2013-01-02 06:48:48,2,6,48,0.0,0.0,21.0
830481,1357109336000000,44,0,,2013-01-02,3997,D2,0,-6.239767,53.305183,...,28047,0,0,2013-01-02 06:48:56,2,6,48,0.0,0.0,8.0
830851,1357109346000000,44,0,,2013-01-02,3997,D2,0,-6.24565,53.304966,...,28047,0,0,2013-01-02 06:49:06,2,6,49,391.639886,0.0,10.0


In [51]:
# Pair every record with its successor: the "0" arrays hold rows 0..n-2 and the
# "1" arrays hold rows 1..n-1, so element i describes the step from row i to row i+1.
# Pairs that straddle two different vehicles are included here as well.
lat0 = df['Lat'].iloc[:-1].to_numpy()
lat1 = df['Lat'].iloc[1:].to_numpy()
lon0 = df['Lon'].iloc[:-1].to_numpy()
lon1 = df['Lon'].iloc[1:].to_numpy()

# Distance for each consecutive pair, computed by the project's haversine helper.
dist = vec_haversine(lat0, lon0, lat1, lon1)

In [52]:
# Sanity check: one distance per consecutive pair of rows.
dist.shape

(44455132,)

In [53]:
# All four coordinate arrays must have the same length for the pairwise
# distance call. The original listed lon1 twice and never checked lon0.
lat0.shape, lon0.shape, lat1.shape, lon1.shape

((44455132,), (44455132,), (44455132,), (44455132,))

In [54]:
# Re-check the length of the distance array before aligning it with df.
dist.shape

(44455132,)

In [55]:
# (rows, columns) of df, for comparison with the length of the distance array.
df.shape

(44455133, 22)

In [56]:
# Length of the "successor" longitude array.
lon1.shape

(44455132,)

In [57]:
# Prepend a zero so the distance array lines up with df: row i holds the
# distance travelled from row i-1, and the first row has no predecessor.
df['dx'] = np.insert(dist, 0, 0.0)

# Zero out dx/dt on the first record of every vehicle, where the "previous"
# row belongs to a different vehicle. The original test `diff() == 1` only
# caught boundaries where the vehicle ID rose by exactly 1, although IDs are
# not contiguous. Comparing with the shifted column flags every change and
# also the very first row. The mask is built from df itself, so it always
# matches the frame's current row order.
starts_new_vehicle = df['VehicleID'].ne(df['VehicleID'].shift())
df.loc[starts_new_vehicle, ['dx', 'dt']] = 0.0

In [58]:
# Distinct vehicle IDs in order of first appearance in df.
unique_vehicles = pd.unique(df['VehicleID'])

In [59]:
# Spot-check a single vehicle (the one at position 10 of unique_vehicles) to
# eyeball its first 20 dx / dt values.
df.loc[df['VehicleID'].eq(unique_vehicles[10])].head(20)

Unnamed: 0,Timestamp,LineID,Direction,PatternID,TimeFrame,JourneyID,Operator,Congestion,Lon,Lat,...,VehicleID,StopID,AtStop,DateTime,Day,Hour,Minute,Dist,dx,dt
9032078,1357579721000000,15,0,,2013-01-07,2769,RD,0,-6.246883,53.348049,...,33007,0,0,2013-01-07 17:28:41,7,17,28,0.0,0.0,0.0
9032341,1357579727000000,15,0,,2013-01-07,2769,RD,0,-6.24675,53.348034,...,33007,0,0,2013-01-07 17:28:47,7,17,28,8.984467,8.984467,6.0
9032947,1357579741000000,15,0,,2013-01-07,2769,RD,0,-6.24675,53.348034,...,33007,0,0,2013-01-07 17:29:01,7,17,29,0.0,0.0,14.0
9033251,1357579748000000,15,0,,2013-01-07,2769,RD,0,-6.24675,53.348034,...,33007,0,0,2013-01-07 17:29:08,7,17,29,0.0,0.0,7.0
9034662,1357579787000000,15,0,,2013-01-07,2769,RD,0,-6.242517,53.347782,...,33007,0,0,2013-01-07 17:29:47,7,17,29,282.373178,282.373178,39.0
9035256,1357579803000000,15,0,,2013-01-07,2769,RD,0,-6.242517,53.347782,...,33007,7077,0,2013-01-07 17:30:03,7,17,30,0.0,0.0,16.0
9035508,1357579808000000,15,0,,2013-01-07,2769,RD,0,-6.24155,53.347717,...,33007,7077,0,2013-01-07 17:30:08,7,17,30,64.593711,64.593711,5.0
9036190,1357579828000000,15,0,,2013-01-07,2769,RD,0,-6.241583,53.347698,...,33007,7077,0,2013-01-07 17:30:28,7,17,30,3.043318,3.043318,20.0
9036855,1357579847000000,15,0,,2013-01-07,2769,RD,0,-6.241617,53.347668,...,33007,7077,0,2013-01-07 17:30:47,7,17,30,4.027575,4.027575,19.0
9037424,1357579863000000,15,0,,2013-01-07,2769,RD,0,-6.241617,53.347668,...,33007,7077,0,2013-01-07 17:31:03,7,17,31,0.0,0.0,16.0
