# Dublin Buses - Clean Data

Prerequisites: `00-download-data.ipynb`

Before running the code in this notebook, you must download and concatenate all the original per-day data files into a single parquet file. Please use the above notebook to do this.

In [None]:
import numpy as np
import pandas as pd
import ipywidgets as widgets
import osmnx as ox
from ipywidgets import interact, interact_manual
from tqdm import tqdm_notebook as tqdm
import folium
import multiprocessing
import collections
import matplotlib.pyplot as plt

from sklearn.neighbors import BallTree

from geo.geomath import vec_haversine, num_haversine
from geo.df import DataCleaner
from par.allel import parallel_process

Read the data in parquet format, as generated by the first step. Note that not all columns are being read in.

In [None]:
# Load only a subset of the columns from the concatenated parquet file.
columns_to_read = [
    'Timestamp', 'LineID', 'Direction', 'PatternID',
    'JourneyID', 'Congestion', 'Lon', 'Lat',
    'Delay', 'BlockID', 'VehicleID', 'StopID', 'AtStop',
]
df = pd.read_parquet(
    "data/sir010113-310113.parquet",
    columns=columns_to_read,
)

In [None]:
journeys = df.JourneyID.unique()

In [None]:
journeys.shape

In [None]:
cleaner = DataCleaner()

In [None]:
vehicles = df['VehicleID']

In [None]:
unique_vehicles = df['VehicleID'].unique()

In [None]:
def zero_runs(a):
    """Locate every run of consecutive zeros in a 1-D sequence.

    Adapted from
    https://stackoverflow.com/questions/24885092/finding-the-consecutive-zeros-in-a-numpy-array

    Returns an integer array of shape ``(n_runs, 2)``. Each row holds the
    index of the first zero in a run and the index one past its last zero.
    """
    # Flag zero entries with 1, then pad both ends with 0 so that runs
    # touching either edge of the input still have a start and an end.
    zero_flags = np.equal(a, 0).astype(np.int8)
    padded = np.concatenate(([0], zero_flags, [0]))
    # Every 0->1 or 1->0 transition marks a run boundary.
    transitions = np.abs(np.diff(padded))
    boundaries = np.where(transitions == 1)[0]
    # Boundaries alternate start, end, start, end, ...
    return boundaries.reshape(-1, 2)

In [None]:
def get_max_speed(df, column='v', whisker=1.5):
    """Return the upper Tukey fence ``Q3 + whisker * IQR`` of a column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the values to analyse.
    column : str, default 'v'
        Name of the column whose upper fence is computed.
    whisker : float, default 1.5
        Multiplier applied to the inter-quartile range.

    Returns
    -------
    float
        ``Q3 + whisker * (Q3 - Q1)``; NaN when the column has no valid data.
    """
    # Only the first and third quartiles are needed for the fence.
    q1, q3 = df[column].quantile([0.25, 0.75])
    return q3 + whisker * (q3 - q1)

In [None]:
# def par_remove_speed_anomalies(vd):
#     return { id: vd.id, df: remove_speed_anomalies(vd.df, max_speed=100.0) } 

In [None]:
# VehicleData = collections.namedtuple('VehicleData', ['id', 'df'])

In [None]:
# Split the observations into one time-ordered DataFrame per vehicle and let
# the cleaner add its derived columns to each.
vd = {}
anom = {}
# A single groupby pass replaces re-filtering the whole frame once per
# vehicle. sort=False keeps the groups in order of first appearance, the same
# order that df['VehicleID'].unique() yields.
# NOTE(review): groupby drops rows whose VehicleID is NaN; the previous
# per-vehicle filter produced an empty frame for such a key instead.
vehicle_groups = df.groupby('VehicleID', sort=False)
for v, vehicle_df in tqdm(vehicle_groups, total=vehicle_groups.ngroups):
    # sort_values returns a new frame, so the original rows are untouched.
    vd[v] = cleaner.calculate_derived_columns(vehicle_df.sort_values(by='Timestamp'))

In [None]:
df = None

In [None]:
# Upper speed fence for every vehicle, keyed by vehicle id.
max_v = {
    vehicle_id: get_max_speed(vd[vehicle_id])
    for vehicle_id in tqdm(unique_vehicles)
}

In [None]:
import statistics
statistics.mean(max_v.values())

In [None]:
statistics.median(max_v.values())

In [None]:
# Histogram of the per-vehicle upper speed fences.
# Materialise the dict view as a list so matplotlib receives a real sequence.
x = list(max_v.values())
num_bins = 50
plt.figure(figsize=(12,8))
n, bins, patches = plt.hist(x, num_bins, facecolor='blue', alpha=0.5)
plt.xlabel("Upper speed fence (same units as 'v')")
plt.ylabel("Number of vehicles")
plt.title("Distribution of per-vehicle upper speed fences");

In [None]:
v = unique_vehicles[99]

In [None]:
df = vd[v].copy()

In [None]:
df = cleaner.fix_type1_anomalies(df)

In [None]:
df.shape

In [None]:
q = df['v'].quantile([.25, .5, .75])

In [None]:
iqr = q.loc[0.75] - q.loc[0.25]

In [None]:
q.loc[0.75] + 1.5 * iqr

In [None]:
df[df['v'] < 160]['v'].plot.box()

In [None]:
df, anomalies = cleaner.fix_type2_anomalies(df, max_speed=70.0)

In [None]:
df.shape

In [None]:
anomalies.shape

In [None]:
df['v'].plot.box()

In [None]:
anomalies['v'].plot.box()

In [None]:
# anomalies = None
# type2 = df[df['v'] > 70.0]
# while type2.shape[0] > 0:
#     df = cleaner.fix_anomaly(df, type2)
#     idx = type2.index[0]
#     # print(idx)
#     row = df.loc[idx:idx].copy()
#     if anomalies is None:
#         anomalies = row
#     else:
#         anomalies = pd.concat([anomalies, row])
#     df = df.drop(index=idx)
#     type2 = df[df['v'] > 70.0]

In [None]:
locations = np.radians(df[['Lat', 'Lon']].to_numpy())

In [None]:
tree = BallTree(locations, metric='haversine')

In [None]:
dist, idx = tree.query(np.radians(anomalies[['Lat', 'Lon']].to_numpy()), k=4, sort_results=True)

In [None]:
dist.shape

In [None]:
idx.shape

In [None]:
idx[0]

In [None]:
anomalies.head()

In [None]:
df.iloc[idx[0]]

In [None]:
# The tree was built with metric='haversine' on coordinates converted via
# np.radians, so `dist` holds angular distances in radians. Multiplying by
# 6371000.0 (approximately Earth's mean radius in metres) converts the
# distances for the first anomaly's neighbours to metres.
dist[0] * 6371000.0

In [None]:
df['v'].hist(bins=100, figsize=(12,8))

In [None]:
anomalies[anomalies['v'] < 150.0]['v'].hist(bins=100, figsize=(12,8))

In [None]:
df0 = df[df.dx > 0.0].sort_values(by='Timestamp').copy()

In [None]:
df0.shape

In [None]:
df1 = cleaner.calculate_derived_columns(df0)

In [None]:
df1['v'].plot.box()

In [None]:
df1.to_csv("data/df1.csv", index=False)

In [None]:
journeys = df1['JourneyID'].unique()

In [None]:
journeys.shape

In [None]:
df = df.reset_index(drop=True)

In [None]:
df.head()

In [None]:
v = df.v.to_numpy()

In [None]:
zero_runs(v).shape[0]

In [None]:
idx0 = (df['dx'] == 0.0)

In [None]:
dx0 = df[df['dx'] == 0.0]

In [None]:
idx0

## Calculate type-1 anomalies
These occur when the vehicle appears to be stopped for exactly one observation and the following observation shows an abnormally high speed. This probably happens when the GPS reading fails and defaults to the previous position, falsely reporting that the vehicle has not moved. The next observation then has the correct time delta but carries the distance accumulated over both intervals, yielding a falsely high speed.

In [None]:
# df_type1 = cleaner.get_type1_anomalies(df)

In [None]:
# Store the type-1-cleaned frame under its own name: the following cells read
# `df_clean1` (and compare its shape with `df`), and no other visible cell
# assigns it.
df_clean1 = cleaner.fix_type1_anomalies(df)

In [None]:
df_clean1.to_csv("data/clean1.csv", index=False)

In [None]:
df_clean1.shape

In [None]:
df.shape

In [None]:
df_clean1['v'].plot.box()

In [None]:
df_clean1[(df_clean1['v'] < 75.0)]['v'].plot.box()

In [None]:
(vd[v])[i0:i2+1]

In [None]:
vd[v].iloc[i0]['dt']

In [None]:
vd[v].iloc[i1]['dt']

In [None]:
# Recalculate the time difference
vd[v].loc[idx2,'dt'] += vd[v].loc[idx1, 'dt']

In [None]:
# Recalculate the distance
# NOTE(review): the most recent visible assignment to `idx0` is the boolean
# mask `df['dx'] == 0.0`, and `idx2` is not assigned in any cell shown above.
# These lookups look like they expect scalar row labels of vd[v] -- confirm
# where idx0/idx2 should come from before relying on this cell.
lat1 = vd[v].loc[idx0, 'Lat']
lon1 = vd[v].loc[idx0, 'Lon']
lat2 = vd[v].loc[idx2, 'Lat']
lon2 = vd[v].loc[idx2, 'Lon']

# Overwrite the distance at idx2 with the haversine distance between the two points.
vd[v].loc[idx2,'dx'] = num_haversine(lat1, lon1, lat2, lon2)

In [None]:
# Recalculate the speed
vd[v].loc[idx2,'v'] = vd[v].loc[idx2, 'dx'] / vd[v].loc[idx2, 'dt'] * 3.6

In [None]:
# NOTE(review): `par_remove_speed_anomalies` only appears as commented-out
# code earlier in the notebook and `vd_tuple` is not assigned in any visible
# cell -- both must exist before this cell can run.
# The context manager shuts the worker processes down once the blocking
# map() call has returned, so the pool is no longer leaked.
with multiprocessing.Pool() as pool:
    result = pool.map(par_remove_speed_anomalies, vd_tuple)

In [None]:
df = vd[v].copy()

In [None]:
anom = df[df['v'] > 70]

In [None]:
# Remove over-speed rows one at a time: drop the first offender, re-derive
# dt / dx / v for the remaining rows, and look again until none exceed 70.
# NOTE(review): calculate_dt / calculate_dx / calculate_speed are not defined
# in any visible cell -- confirm where they are meant to come from.
while len(anom) > 0:
    first_offender = anom.index[0]
    df = df.drop(first_offender)
    df = calculate_dt(df, 'Timestamp', 'dt', one_second=1000000)
    df = calculate_dx(df, 'Lat', 'Lon', 'dx')
    df = calculate_speed(df, 'dx', 'dt', 'v')
    too_fast = df['v'] > 70
    anom = df[too_fast]

In [None]:
vt1 = df[(df['Day'] == 3)]

In [None]:
vt0 = vd[v]
vt0 = vt0[vt0['Day'] == 3]

In [None]:
vt1.shape

In [None]:
vt0.shape

In [None]:
# Take the first vehicle's observations from the per-vehicle dictionary.
# When the notebook runs top to bottom, `df` holds a single vehicle's rows
# here (it was rebound from vd[v]), so filtering it by another VehicleID
# would give an empty frame. The copy keeps v0 independent of the entry
# stored in vd.
v0 = vd[unique_vehicles[0]].copy()

In [None]:
v0['dt'].max()

In [None]:
v0['dx'].max()

In [None]:
v0.to_csv("data/v0.csv", index=False)

In [None]:
# df[df['VehicleID'] == unique_vehicles[10]].head(20)

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

In [None]:
df[(df['v'] > 0) & (df['v'] < 80)].boxplot(['v'], figsize=(6,6))

In [None]:
# df['v'].plot.kde()

In [None]:
bad_journeys = df[df['v'] > 70]['JourneyID'].unique()

In [None]:
bad_journeys

In [None]:
bad_journeys.shape

In [None]:
df[df['JourneyID'] == bad_journeys[0]].shape

In [None]:
j0 = df[(df['VehicleID'] == 43055) & (df['Day'] == 2)]

In [None]:
vt = df[(df['VehicleID'] == 28047) & (df['Day'] == 2)]

In [None]:
vt = df.loc[(df['JourneyID'] == bad_journeys[0]) & (df['VehicleID'] == 33231)].sort_values(['Timestamp'])

In [None]:
vt.columns

In [None]:
vt = df.loc[(df['VehicleID'] == 43055) & (df['Day'] == 2) & (df['Hour'] >= 18)].sort_values(['Timestamp']).copy()

In [None]:
vt.to_csv("data/vt.csv", index=False)

In [None]:
lat = vt['Lat'].to_numpy()
lon = vt['Lon'].to_numpy()

In [None]:
lat.shape[0]

In [None]:
@interact
def show_trajectory(t=(0, len(vt), 1)):
    """Plot the first ``t`` positions of ``vt`` as a longitude/latitude line.

    The slider runs from 0 to the number of rows in ``vt`` (evaluated when
    this cell is run) instead of a hard-coded row count.
    """
    # .iloc makes the positional "first t rows" slice explicit.
    lat = vt['Lat'].iloc[:t].to_numpy()
    lon = vt['Lon'].iloc[:t].to_numpy()
    plt.plot(lon, lat)
    plt.xlabel('Lon')
    plt.ylabel('Lat')

In [None]:
# for p in points:
#     folium.map.Marker(p).add_to(map)

In [None]:
# map.save("html/index.html")

In [None]:
def create_map(df, polyline=True):
    """Build a folium map showing the positions stored in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'Lat' and 'Lon' columns; rows are used in their given order.
    polyline : bool, default True
        When truthy, draw the points as one connected line; otherwise place
        one marker per point.

    Returns
    -------
    folium.Map
        Map fitted to the bounds of the points, with the chosen layer added.
    """
    latitudes = df['Lat'].to_numpy()
    longitudes = df['Lon'].to_numpy()
    points = [(lat, lon) for lat, lon in zip(latitudes, longitudes)]

    fmap = folium.Map()
    fmap.fit_bounds(points)

    if not polyline:
        for point in points:
            folium.Marker(location=point).add_to(fmap)
        return fmap

    folium.vector_layers.PolyLine(points).add_to(fmap)
    return fmap

In [None]:
# dd = df[(df['Day']==16) & (df['Hour']==20) & (df['Minute'] > 50)]
dd = df[(df['Day']==17) & (df['Hour'] == 6) & (df['Minute'] > 38)]

create_map(dd, polyline=True)

In [None]:
df[(df['Day'] == 17) & (df['Hour'] == 6) & (df['Minute'] > 38)]

In [None]:
df[(df['Day']==16) & (df['Hour']==20) & (df['Minute'] > 50)].to_csv("data/outlier.csv", index=False)

In [None]:
removed.to_csv("data/removed.csv", index=False)

In [None]:
df.to_csv("data/retained.csv", index=False)

In [None]:
vd[v].to_csv("data/original.csv", index=False)

In [None]:
# get_nearest_edges takes X = longitudes and Y = latitudes, so the longitudes
# (about -6.2) must be passed first and the latitudes (about 53.2) second.
# NOTE(review): `G` is not assigned in any visible cell.
ox.utils.get_nearest_edges(G, [-6.233653, -6.226733, -6.240146], [53.221329, 53.220623, 53.223812], method='balltree')

In [None]:
# get_nearest_edges takes X = longitudes and Y = latitudes, so the longitude
# is passed first and the latitude second.
ox.utils.get_nearest_edges(G, [-6.226733], [53.220623], method='balltree')

In [None]:
# get_nearest_edges takes X = longitudes and Y = latitudes, so the longitude
# is passed first and the latitude second.
ox.utils.get_nearest_edges(G, [-6.240146], [53.223812], method='balltree')