# Trajectory Clustering 2
In this notebook we calculate the trajectory endpoints from the VED data.

In [1]:
import numpy as np
import pandas as pd
import os
import math
import folium

from sklearn.cluster import DBSCAN
from collections import Counter

In [2]:
def get_ends(df):
    """Return the first and last rows of ``df`` stacked into one DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of a single trajectory, already in the desired order.

    Returns
    -------
    pandas.DataFrame
        The first row followed by the last row, with their original index
        labels. A one-row input yields that row twice, so every non-empty
        trajectory contributes exactly two rows.
    """
    # DataFrame.append was removed in pandas 2.0; pd.concat is the
    # supported way to stack the two one-row frames.
    return pd.concat([df.head(n=1), df.tail(n=1)])

In [3]:
def cluster_ends(df, eps_in_meters=50.0, num_samples=10):
    """Cluster trajectory endpoints with DBSCAN and label each row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the ``Latitude[deg]`` and ``Longitude[deg]`` columns,
        in degrees.
    eps_in_meters : float, default 50.0
        Neighbourhood radius, expressed in meters on the ground.
    num_samples : int, default 10
        Value passed to DBSCAN as ``min_samples``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the cluster labels written to a new
        (or overwritten) ``end_id`` column. Note that the input is modified
        in place.
    """
    # The haversine metric works on angles, so convert degrees to radians.
    pts = np.radians(df[['Latitude[deg]', 'Longitude[deg]']])

    # Express the radius as an angle: the fraction of the Earth's perimeter
    # covered by eps, times a full turn.
    earth_perimeter = 40070000.0  # In meters
    eps_in_radians = eps_in_meters / earth_perimeter * (2 * math.pi)

    end_clusters = DBSCAN(eps=eps_in_radians,
                          min_samples=num_samples,
                          metric='haversine',
                          algorithm='ball_tree').fit_predict(pts)
    df['end_id'] = end_clusters
    return df

In [4]:
def make_path_map(df_list):
    """Draw one polyline per trajectory on a folium map and zoom to fit them.

    Parameters
    ----------
    df_list : iterable of pandas.DataFrame
        Each frame holds one path and must contain the ``Latitude[deg]``
        and ``Longitude[deg]`` columns; rows are drawn in frame order.

    Returns
    -------
    folium.Map
        The map with every non-empty path added. When there is nothing to
        draw, the map is returned with its default view.
    """
    m = folium.Map()

    # Only the bounding-box corners of each path are needed to fit the view;
    # passing every point again would duplicate the polyline data.
    corners = []
    for df in df_list:
        points = df[['Latitude[deg]', 'Longitude[deg]']].to_numpy()
        if len(points) == 0:
            # Nothing to draw for this path, and min/max are undefined.
            continue
        folium.vector_layers.PolyLine(locations=points).add_to(m)
        corners.append(points.min(axis=0).tolist())
        corners.append(points.max(axis=0).tolist())

    # Skip fitting when no path contributed any point.
    if corners:
        m.fit_bounds(corners)
    return m

`make_link_map`: Create a map based on a link, defined as the two terminal node identifiers.

In [5]:
def make_link_map(node_ini, node_end):
    """Map every trip that starts at ``node_ini`` and finishes at ``node_end``.

    Reads the notebook-level ``trip_df`` (trip table with ``NodeIni`` and
    ``NodeEnd``) and ``df`` (the full signal table), so both must exist
    before this function is called.

    Parameters
    ----------
    node_ini, node_end
        Cluster identifiers of the link's start and end nodes.

    Returns
    -------
    The map produced by ``make_path_map`` for the matching trips.
    """
    on_link = (trip_df.NodeIni == node_ini) & (trip_df.NodeEnd == node_end)
    link_trips = trip_df[on_link]

    def trip_path(day_num, veh_id):
        # All signal rows of one trip, in chronological order.
        rows = df[(df.DayNum == day_num) & (df.VehId == veh_id)]
        return rows.sort_values(by=['DayNum', 'Timestamp(ms)'])

    trip_keys = zip(link_trips.DayNum, link_trips.VehId)
    return make_path_map([trip_path(day, veh) for day, veh in trip_keys])

## Read the Data
We prepared the dataset file in the previous notebook, so it is readily available for use. Here, we set up a few variables related to the data folder name and file names.

In [6]:
# Folder that holds the prepared data files.
data_path = "./data"

# Full VED dataset, read below.
parquet_file = os.path.join(data_path, "ved.parquet")
# Path reserved for the endpoints table (not used in the cells shown here).
endpoints_file = os.path.join(data_path, "endpoints.parquet")

Read the dataset from the parquet file. If this file does not exist, please create it by using the code in the previous notebook: `1-convert-ved.ipynb`.

In [7]:
# Load the prepared VED dataset from the parquet file into a DataFrame.
df = pd.read_parquet(parquet_file)

### The Trip DataFrame
Individual trips are identified by unique values of the `DayNum` and `VehId` pair. This DataFrame relates the unique pairs to the start and end node identifiers, to be later calculated. For now, we set them to `-1`.

In [8]:
# One row per trip: count the signal rows recorded for each (DayNum, VehId) pair.
trip_df = (
    df.groupby(by=['DayNum', 'VehId'])
      .size()
      .reset_index(name='Count')
)

In [10]:
# Placeholder node identifiers; -1 means "not assigned yet".
trip_df = trip_df.assign(NodeIni=-1, NodeEnd=-1)

Here's how the `trip_df` table looks right now.

In [11]:
# Preview the first ten trips; NodeIni and NodeEnd still hold the -1 placeholder here.
trip_df.head(10)

Unnamed: 0,DayNum,VehId,Count,NodeIni,NodeEnd
0,1.002938,550,131,-1,-1
1,1.015493,540,545,-1,-1
2,1.017633,156,690,-1,-1
3,1.025782,588,1150,-1,-1
4,1.054483,267,495,-1,-1
5,1.058991,11,396,-1,-1
6,1.062756,130,452,-1,-1
7,1.065486,174,273,-1,-1
8,1.082547,374,471,-1,-1
9,1.101627,156,697,-1,-1


In [None]:
def get_trip_ini(row):
    """Return the cluster id of the first endpoint of the trip in ``row``.

    ``row`` must expose ``DayNum`` and ``VehId``; the lookup is made against
    the notebook-level ``ends_df``, which must already carry ``end_id``.
    Raises ``IndexError`` when the trip has no rows in ``ends_df``.
    """
    same_trip = (ends_df.DayNum == row.DayNum) & (ends_df.VehId == row.VehId)
    # The first matching row is the trip's initial endpoint.
    return ends_df.loc[same_trip, 'end_id'].to_numpy()[0]

def get_trip_end(row):
    """Return the cluster id of the last endpoint of the trip in ``row``.

    ``row`` must expose ``DayNum`` and ``VehId``; the lookup is made against
    the notebook-level ``ends_df``, which must already carry ``end_id``.
    Raises ``IndexError`` when the trip has no rows in ``ends_df``.
    """
    same_trip = (ends_df.DayNum == row.DayNum) & (ends_df.VehId == row.VehId)
    # The last matching row is the trip's final endpoint.
    return ends_df.loc[same_trip, 'end_id'].to_numpy()[-1]

In [None]:
# Group the signal rows by trip; each (DayNum, VehId) pair identifies one trip.
grp = df.groupby(by=['DayNum', 'VehId'])

In [None]:
# Keep only the first and last row of every trip, stacked in group order
# (two consecutive rows per trip).
ends_df = pd.concat([get_ends(trip_rows) for _, trip_rows in grp])

In [None]:
# Cluster the endpoints; cluster_ends writes each point's cluster label to `end_id`.
ends_df = cluster_ends(ends_df)

In [None]:
# Inspect the clustered endpoints.
ends_df

In [None]:
# for k, d in grp:
#     print(type(k), type(d))
#     break

In [None]:
# Replace the placeholders with the start/end cluster id of every trip.
# Each lookup filters ends_df once per trip row.
trip_df = trip_df.assign(
    NodeIni=lambda trips: trips.apply(get_trip_ini, axis=1),
    NodeEnd=lambda trips: trips.apply(get_trip_end, axis=1),
)

In [None]:
# Inspect the trips with their start and end cluster ids filled in.
trip_df

In [None]:
# Cluster labels of all endpoints as a flat array, in ends_df row order.
ids = ends_df['end_id'].to_numpy()

In [None]:
# ends_df stores each trip as two consecutive rows (start, end), so folding
# the flat label array into two columns gives one (start, end) pair per trip.
id_pairs = ids.reshape(-1, 2)

In [None]:
# Inspect the (start, end) cluster id pairs.
id_pairs

In [None]:
# Keep only trips whose two endpoints were both assigned to a cluster
# (a label of -1 is excluded).
edges = [(start_id, end_id)
         for start_id, end_id in id_pairs
         if start_id != -1 and end_id != -1]

In [None]:
# Count how many trips share each (start, end) cluster pair.
cnt = Counter(edges)

In [None]:
# Flatten the counter into (start, end, trip count) records.
data = [(start_id, end_id, trips) for (start_id, end_id), trips in cnt.items()]

In [None]:
# Tabulate the links: one row per (NodeIni, NodeEnd) pair with its trip count.
edge_df = pd.DataFrame(data, columns=['NodeIni', 'NodeEnd', 'Count'])

In [None]:
# Order the links from the most to the least travelled.
edge_df = edge_df.sort_values(by=['Count'], ascending=False)

In [None]:
# Show the twenty most frequent links.
edge_df.head(20)

In [None]:
# Draw every trip that goes from cluster 1 to cluster 3.
m = make_link_map(node_ini=1, node_end=3)

In [None]:
# Create the output folder first so the save also works on a fresh checkout
# where ./html does not exist yet.
os.makedirs("./html", exist_ok=True)
m.save("./html/map-1-3.html")