In [1]:
import numpy as np
import pandas as pd
import ipywidgets as widgets
import osmnx as ox
from ipywidgets import interact, interact_manual
from tqdm import tqdm
import folium
import multiprocessing
import collections

from geo.geomath import vec_haversine, num_haversine
from geo.df import DataCleaner
from par.allel import parallel_process

In [15]:
# Load the raw records from the parquet file into a single DataFrame.
df = pd.read_parquet("data/sir010113-310113.parquet")

In [16]:
# Project helper; its calculate_anomalies / remove_anomaly methods are used by
# remove_speed_anomalies further down.
cleaner = DataCleaner()

In [None]:
# Inspect the cleaner's lat_col attribute
# (presumably the name of the latitude column — TODO confirm).
cleaner.lat_col

In [17]:
# The VehicleID column of every record.
vehicles = df['VehicleID']

In [18]:
# Distinct vehicle identifiers found in the data.
unique_vehicles = df['VehicleID'].unique()

In [None]:
def calculate_anomalies(df, max_speed):
    """Derive the dt, dx and v columns and pick out the over-speed rows.

    The frame is passed through ``calculate_dt``, ``calculate_dx`` and
    ``calculate_speed`` in that order; these helpers are not defined in the
    visible cells, so their exact semantics must be checked where they live.

    Parameters
    ----------
    df : pandas.DataFrame
        Records with 'Timestamp', 'Lat' and 'Lon' columns.
    max_speed : float
        Rows whose 'v' value is strictly greater than this are anomalies.

    Returns
    -------
    tuple
        ``(frame_with_dt_dx_v, anomalous_rows)``.
    """
    with_dt = calculate_dt(df, 'Timestamp', 'dt', one_second=1000000)
    with_dx = calculate_dx(with_dt, 'Lat', 'Lon', 'dx')
    enriched = calculate_speed(with_dx, 'dx', 'dt', 'v')
    too_fast = enriched['v'] > max_speed
    return enriched, enriched[too_fast]

In [29]:
def remove_speed_anomalies(df, max_speed=100.0):
    """Iteratively drop rows whose speed exceeds ``max_speed``.

    On every pass the notebook-level ``cleaner`` recomputes the anomalies,
    ``cleaner.remove_anomaly`` is applied, and the first anomalous row is
    recorded and dropped. The loop ends when no anomaly is left.

    Parameters
    ----------
    df : pandas.DataFrame
        Records of a single vehicle.
    max_speed : float, default 100.0
        Threshold handed to ``cleaner.calculate_anomalies``.

    Returns
    -------
    tuple
        ``(cleaned_df, removed)`` where ``removed`` is a DataFrame holding
        exactly the rows that were dropped (in drop order), or ``None`` when
        nothing had to be removed.
    """
    # Collect the dropped rows in a list and concatenate once at the end:
    # DataFrame.append no longer exists in pandas 2.x and was quadratic anyway.
    removed_rows = []
    df, anom = cleaner.calculate_anomalies(df, max_speed)
    while anom.shape[0] > 0:
        # NOTE(review): remove_anomaly is assumed to patch the neighbouring
        # row(s) and leave the anomalous row itself in place — the lookup and
        # drop below rely on the row still being present.
        df = cleaner.remove_anomaly(df, anom)

        idx0 = anom.index[0]
        # Record only the row that is actually dropped. The former label slice
        # ``idx0:idx0+1`` also captured the row labelled idx0+1 when it existed,
        # although that row was never removed.
        removed_rows.append(df.loc[[idx0]].copy())

        df = df.drop(idx0)
        df, anom = cleaner.calculate_anomalies(df, max_speed)

    removed = pd.concat(removed_rows) if removed_rows else None
    return df, removed

In [None]:
def par_remove_speed_anomalies(vd):
    """Worker for parallel mapping: clean one vehicle's records.

    Parameters
    ----------
    vd : VehicleData
        Record exposing ``id`` and ``df`` attributes.

    Returns
    -------
    dict
        ``{'id': <vehicle id>, 'df': <result of remove_speed_anomalies>}``.
        The 'df' entry is whatever ``remove_speed_anomalies`` returns, i.e. a
        ``(cleaned_df, removed)`` tuple in this notebook.
    """
    # The keys must be string literals: the bare names ``id`` and ``df`` would
    # evaluate to the builtin function and to a notebook global respectively.
    return {
        'id': vd.id,
        'df': remove_speed_anomalies(vd.df, max_speed=100.0),
    }

In [6]:
# Lightweight (id, df) record pairing a vehicle identifier with that vehicle's rows.
VehicleData = collections.namedtuple('VehicleData', ['id', 'df'])

In [None]:
# Build one VehicleData record per vehicle.
# A single groupby pass replaces one full boolean scan of df per vehicle;
# sort=False keeps the vehicles in order of first appearance, matching the
# order of df['VehicleID'].unique().
# NOTE: groupby skips rows whose VehicleID is missing.
vd_list = [
    VehicleData(id=vehicle_id, df=vehicle_df.copy())
    for vehicle_id, vehicle_df in tqdm(df.groupby('VehicleID', sort=False))
]
# Immutable sequence handed to the worker pool.
vd_tuple = tuple(vd_list)

In [None]:
# Drop the reference to the full frame (presumably to free memory once the
# per-vehicle copies exist).
# NOTE(review): cells that still read df (e.g. the vd dictionary build) fail
# if they are run after this one.
df = None

In [19]:
# Per-vehicle frames keyed by VehicleID, each sorted by Timestamp.
# A single groupby pass replaces one full boolean scan of df per vehicle;
# sort=False keeps the vehicles in order of first appearance.
# sort_values already returns a new frame, so no extra copy is needed.
# NOTE: groupby skips rows whose VehicleID is missing.
vd = {
    vehicle_id: vehicle_df.sort_values(by='Timestamp')
    for vehicle_id, vehicle_df in tqdm(df.groupby('VehicleID', sort=False))
}
# Placeholder for per-vehicle anomaly frames.
anom = {}

100%|██████████| 911/911 [01:02<00:00, 21.53it/s]


In [26]:
# Pick one vehicle (the sixth distinct id) to experiment on.
v = unique_vehicles[5]

In [27]:
# Keep a copy of its rows so the experiment can be reset.
vd_back = vd[v].copy()

In [30]:
# Reset the working frame to the backup.
# NOTE(review): this stores the backup object itself, not a copy — if the
# cleaner mutates frames in place the backup changes too; confirm.
vd[v] = vd_back

In [31]:
# Drop speed anomalies for this vehicle (default max_speed=100.0) and keep the
# second return value, the removed rows.
vd[v], removed = remove_speed_anomalies(vd[v])

In [32]:
# Check what kind of object the removed rows came back as.
type(removed)

pandas.core.frame.DataFrame

In [34]:
# Display the rows collected in `removed` by remove_speed_anomalies.
removed

Unnamed: 0,Timestamp,LineID,Direction,PatternID,TimeFrame,JourneyID,Operator,Congestion,Lon,Lat,...,VehicleID,StopID,AtStop,DateTime,Day,Hour,Minute,dt,dx,v
18387,1357030344000000,15,0,,2013-01-01,12721,RD,0,-6.247300,53.343735,...,33488,0,0,2013-01-01 08:52:24,1,8,52,8.0,317.633849,142.935232
20472,1357030644000000,15,0,,2013-01-01,12721,RD,0,-6.272433,53.339451,...,33488,0,0,2013-01-01 08:57:24,1,8,57,6.0,202.438540,121.463124
20913,1357030703000000,15,0,,2013-01-01,12721,RD,0,-6.272400,53.337517,...,33488,0,0,2013-01-01 08:58:23,1,8,58,6.0,189.171700,113.503020
22647,1357030944000000,15,0,,2013-01-01,12721,RD,0,-6.276516,53.328266,...,33488,0,0,2013-01-01 09:02:24,1,9,2,6.0,207.985256,124.791154
24029,1357031124000000,15,0,,2013-01-01,12721,RD,0,-6.284483,53.321117,...,33488,0,0,2013-01-01 09:05:24,1,9,5,4.0,286.192273,257.573046
24486,1357031183000000,15,0,,2013-01-01,12721,RD,0,-6.294283,53.316284,...,33488,0,0,2013-01-01 09:06:23,1,9,6,4.0,315.158138,283.642324
24981,1357031248000000,15,0,,2013-01-01,12721,RD,0,-6.302850,53.310249,...,33488,0,0,2013-01-01 09:07:28,1,9,7,8.0,235.985343,106.193404
25461,1357031303000000,15,0,,2013-01-01,12721,RD,0,-6.307200,53.304634,...,33488,0,0,2013-01-01 09:08:23,1,9,8,4.0,351.994228,316.794805
25981,1357031364000000,15,0,,2013-01-01,12721,RD,0,-6.309300,53.297749,...,33488,0,0,2013-01-01 09:09:24,1,9,9,6.0,312.213362,187.328017
26440,1357031423000000,15,0,,2013-01-01,12721,RD,0,-6.308833,53.294117,...,33488,0,0,2013-01-01 09:10:23,1,9,10,4.0,186.555388,167.899849


In [None]:
# Row/column count of the working frame for vehicle v ...
vd[v].shape

In [None]:
# ... compared with the backup copy.
vd_back.shape

In [None]:
# Positions of the first anomalous row (i1) and of the rows just before (i0)
# and just after (i2) it.
# NOTE(review): anom is created as an empty dict and no visible cell stores
# anom[v]; this lookup needs that entry to exist.
i1 = vd[v].index.get_loc(anom[v].index[0])
i0 = i1 - 1
i2 = i1 + 1

In [None]:
# Index labels corresponding to those three positions.
idx2 = vd[v].index[i2]
idx1 = vd[v].index[i1]
idx0 = vd[v].index[i0]

In [None]:
# Show the three rows around the anomaly.
(vd[v])[i0:i2+1]

In [None]:
# dt of the row before the anomaly.
vd[v].iloc[i0]['dt']

In [None]:
# dt of the anomalous row itself.
vd[v].iloc[i1]['dt']

In [None]:
# Recalculate the time difference: fold the anomalous row's dt into the row
# that follows it.
vd[v].loc[idx2,'dt'] += vd[v].loc[idx1, 'dt']

In [None]:
# Recalculate the distance between the row before the anomaly (idx0) and the
# row after it (idx2), skipping the anomalous row.
lat1 = vd[v].loc[idx0, 'Lat']
lon1 = vd[v].loc[idx0, 'Lon']
lat2 = vd[v].loc[idx2, 'Lat']
lon2 = vd[v].loc[idx2, 'Lon']

vd[v].loc[idx2,'dx'] = num_haversine(lat1, lon1, lat2, lon2)

In [None]:
# Recalculate the speed from the patched dx and dt.
# NOTE(review): 3.6 looks like the m/s -> km/h factor, which assumes dx is in
# metres and dt in seconds — TODO confirm.
vd[v].loc[idx2,'v'] = vd[v].loc[idx2, 'dx'] / vd[v].loc[idx2, 'dt'] * 3.6

In [None]:
# Clean every vehicle in parallel. The context manager shuts the worker
# processes down once the (blocking) map call has returned, instead of leaving
# the pool open for the lifetime of the kernel.
with multiprocessing.Pool() as pool:
    result = pool.map(par_remove_speed_anomalies, vd_tuple)

In [None]:
# From here on `df` is a copy of the selected vehicle's rows
# (the name is rebound, it no longer refers to the full dataset).
df = vd[v].copy()

In [None]:
# Rows with v above 70; `anom` is rebound from the dict created earlier to a DataFrame.
anom = df[df['v'] > 70]

In [None]:
# Repeatedly discard the first over-speed row, then rebuild dt, dx and v,
# until no row has v above 70.
while len(anom.index) != 0:
    df = df.drop(anom.index[0])
    df = calculate_speed(
        calculate_dx(
            calculate_dt(df, 'Timestamp', 'dt', one_second=1000000),
            'Lat', 'Lon', 'dx',
        ),
        'dx', 'dt', 'v',
    )
    anom = df[df['v'] > 70]

In [None]:
# Rows of the current df whose Day is 3.
vt1 = df[(df['Day'] == 3)]

In [None]:
# Same Day == 3 filter applied to the frame stored in vd[v], for comparison.
vt0 = vd[v]
vt0 = vt0[vt0['Day'] == 3]

In [None]:
# Compare the sizes of the two selections.
vt1.shape

In [None]:
vt0.shape

In [None]:
# Rows of df belonging to the first distinct vehicle id.
v0 = df[df['VehicleID'] == unique_vehicles[0]]

In [None]:
# Largest dt value for that vehicle.
v0['dt'].max()

In [None]:
# Largest dx value for that vehicle.
v0['dx'].max()

In [None]:
# Export the selection to CSV without the index column.
v0.to_csv("data/v0.csv", index=False)

In [None]:
# df[df['VehicleID'] == unique_vehicles[10]].head(20)

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

In [None]:
# Box plot of v restricted to values strictly between 0 and 80.
df[(df['v'] > 0) & (df['v'] < 80)].boxplot(['v'], figsize=(6,6))

In [None]:
# df['v'].plot.kde()

In [None]:
# Distinct JourneyID values that contain at least one row with v above 70.
bad_journeys = df[df['v'] > 70]['JourneyID'].unique()

In [None]:
bad_journeys

In [None]:
bad_journeys.shape

In [None]:
# Size of the first such journey.
df[df['JourneyID'] == bad_journeys[0]].shape

In [None]:
# Rows of vehicle 43055 on Day 2.
j0 = df[(df['VehicleID'] == 43055) & (df['Day'] == 2)]

In [None]:
# Rows of vehicle 28047 on Day 2 (vt is rebound by the cells below).
vt = df[(df['VehicleID'] == 28047) & (df['Day'] == 2)]

In [None]:
# Rows of vehicle 33231 within the first bad journey, ordered by Timestamp.
vt = df.loc[(df['JourneyID'] == bad_journeys[0]) & (df['VehicleID'] == 33231)].sort_values(['Timestamp'])

In [None]:
vt.columns

In [None]:
# Final selection kept in vt: vehicle 43055 on Day 2 with Hour >= 18,
# ordered by Timestamp and copied.
vt = df.loc[(df['VehicleID'] == 43055) & (df['Day'] == 2) & (df['Hour'] >= 18)].sort_values(['Timestamp']).copy()

In [None]:
# Export the selection to CSV without the index column.
vt.to_csv("data/vt.csv", index=False)

In [None]:
# Coordinates of the selection as NumPy arrays.
lat = vt['Lat'].to_numpy()
lon = vt['Lon'].to_numpy()

In [None]:
# Number of points in the selection.
lat.shape[0]

In [None]:
@interact
def show_trajectory(t=(0, 226, 1)):
    """Plot the first ``t`` points of the notebook-level frame ``vt``.

    The ``(0, 226, 1)`` default is turned into an integer slider by
    ``interact``; ``t`` is the slider value. Longitude goes on the x axis and
    latitude on the y axis.

    NOTE(review): the upper bound 226 is hard-coded — presumably the number of
    rows ``vt`` had when this cell was written; confirm before reuse.
    """
    shown = vt.iloc[:t]
    plt.plot(shown['Lon'].to_numpy(), shown['Lat'].to_numpy())

In [None]:
# for p in points:
#     folium.map.Marker(p).add_to(map)

In [None]:
# map.save("html/index.html")

In [None]:
def create_map(df):
    """Build a folium map showing the rows of ``df`` as one polyline.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'Lat' and 'Lon' columns; row order defines the path.

    Returns
    -------
    folium.Map
        Map fitted to the points, with the polyline added to it.
    """
    latitudes = df['Lat'].to_numpy()
    longitudes = df['Lon'].to_numpy()
    points = list(zip(latitudes, longitudes))

    # Named route_map rather than map so the builtin is not shadowed.
    route_map = folium.Map()
    route = folium.vector_layers.PolyLine(points)
    route_map.fit_bounds(points)
    route.add_to(route_map)
    return route_map

In [None]:
# Nearest graph edge for three points.
# get_nearest_edges takes X (longitudes) before Y (latitudes); the latitudes
# were previously passed first.
ox.utils.get_nearest_edges(
    G,
    [-6.233653, -6.226733, -6.240146],   # X: longitudes
    [53.221329, 53.220623, 53.223812],   # Y: latitudes
    method='balltree',
)

In [None]:
# Nearest graph edge for a single point.
# get_nearest_edges takes X (longitude) before Y (latitude); the latitude was
# previously passed first.
ox.utils.get_nearest_edges(
    G,
    [-6.226733],   # X: longitude
    [53.220623],   # Y: latitude
    method='balltree',
)

In [None]:
# Nearest graph edge for a single point.
# get_nearest_edges takes X (longitude) before Y (latitude); the latitude was
# previously passed first.
ox.utils.get_nearest_edges(
    G,
    [-6.240146],   # X: longitude
    [53.223812],   # Y: latitude
    method='balltree',
)

In [None]:
# Look up an entry of the graph's edge view by a single integer key.
# NOTE(review): G is not defined in any visible cell; it is presumably an
# OSMnx street graph loaded elsewhere. If it is a multigraph, edge keys are
# tuples rather than a single id — confirm this lookup works.
G.edges[1713292579]

In [None]:
# Call the node view with each of two ids.
# NOTE(review): calling G.nodes(...) passes the id as the view's first
# argument rather than looking the node up; G.nodes[<id>] may have been
# intended — confirm.
n0 = G.nodes(1713292579)
n1 = G.nodes(31932030)

In [None]:
# Check whether the two resulting views share no elements.
n0.isdisjoint(n1)

In [None]:
# Rows of vt from position 50 onwards.
vt[50:]

In [None]:
# Index labels of the rows whose v exceeds 90.
idx = vt[vt['v'] > 90].index

In [None]:
idx

In [None]:
# The rows themselves.
vt[vt['v'] > 90]