In [6]:
from tslearn.metrics import dtw, dtw_path, dtw_path_from_metric
import pandas as pd
import numpy as np
import ast
import matplotlib.pyplot as plt
from importable_functions.render_vehicle_tracks import *
from import_scripts.get_x_y_tuples import *

In [4]:
# ********************
# ****** IMPORT ******
# ********************
# For every intersection/year two CSV files are read: the raw table (`*_cuid.csv`) and the
# grouped table (`*_cuid_grouped.csv`). The coordinate columns of the grouped tables are stored
# as stringified Python literals, so they are parsed back with ast.literal_eval before
# get_x_y_tuple_list combines the selected column pair.

# --- k729, 2022: x/y plus vx/vy (presumably velocity components -- confirm against the dataset) ---
df_k729_2022_cuid = pd.read_csv('./datasets/k729_2022_cuid.csv')
df_k729_2022_cuid_grouped = pd.read_csv('./datasets/k729_2022_cuid_grouped.csv')
df_k729_2022_cuid_grouped['x'] = df_k729_2022_cuid_grouped['x'].apply(ast.literal_eval)
df_k729_2022_cuid_grouped['y'] = df_k729_2022_cuid_grouped['y'].apply(ast.literal_eval)
df_k729_2022_cuid_grouped['vx'] = df_k729_2022_cuid_grouped['vx'].apply(ast.literal_eval)
df_k729_2022_cuid_grouped['vy'] = df_k729_2022_cuid_grouped['vy'].apply(ast.literal_eval)
list_k729_2022_x_y_tuples = get_x_y_tuple_list(df_k729_2022_cuid_grouped, ['x','y'])
k729_2022_vx_vy_tuples = get_x_y_tuple_list(df_k729_2022_cuid_grouped, ['vx','vy'])

# --- k733, 2020: x/y only ---
df_k733_2020_cuid = pd.read_csv('./datasets/k733_2020_cuid.csv')
df_k733_2020_cuid_grouped = pd.read_csv('./datasets/k733_2020_cuid_grouped.csv')
df_k733_2020_cuid_grouped['x'] = df_k733_2020_cuid_grouped['x'].apply(ast.literal_eval)
df_k733_2020_cuid_grouped['y'] = df_k733_2020_cuid_grouped['y'].apply(ast.literal_eval)
list_k733_2020_x_y_tuples = get_x_y_tuple_list(df_k733_2020_cuid_grouped, ['x','y'])

# --- k733, 2018: x/y only ---
df_k733_2018_cuid = pd.read_csv('./datasets/k733_2018_cuid.csv')
df_k733_2018_cuid_grouped = pd.read_csv('./datasets/k733_2018_cuid_grouped.csv')
df_k733_2018_cuid_grouped['x'] = df_k733_2018_cuid_grouped['x'].apply(ast.literal_eval)
df_k733_2018_cuid_grouped['y'] = df_k733_2018_cuid_grouped['y'].apply(ast.literal_eval)
list_k733_2018_x_y_tuples = get_x_y_tuple_list(df_k733_2018_cuid_grouped, ['x','y'])

In [4]:
def knn_dtw_unsupervised(data, k, sakoe_chiba_radius=1, metric='euclidean', **metric_parameters):
    """
    k-NN outlier scoring using DTW as the distance measure with a custom metric.

    Every unordered pair of series is compared once with ``dtw_path_from_metric``
    and the distance is mirrored into a symmetric ``n x n`` matrix. The score of a
    series is the mean of the ``k`` smallest entries of its row, excluding the
    smallest one (the zero on the diagonal, i.e. the distance to itself).

    :param data: List of time series (input data)
    :param k: Number of nearest neighbors to consider. If fewer than ``k`` other
        series exist, the mean is taken over those that are available; if none
        are available (``k < 1`` or a single series) the score is ``nan``.
    :param sakoe_chiba_radius: Sakoe-Chiba radius forwarded to ``dtw_path_from_metric``
    :param metric: Metric forwarded to ``dtw_path_from_metric`` (tslearn hands it to
        scikit-learn's pairwise distance computation)
    :param metric_parameters: Extra keyword arguments for the metric (e.g. ``p=3``
        for ``'minkowski'``), forwarded unchanged to ``dtw_path_from_metric``

    :return: A list of average distances for each time series, which can be used to identify outliers
    """
    n = len(data)
    distances = np.zeros((n, n))

    # Compute all-pair DTW distances; DTW is symmetric, so only the upper
    # triangle is evaluated and each value is written to both halves.
    for i in range(n):
        for j in range(i + 1, n):
            # Metric parameters are unpacked as real keyword arguments; an empty
            # dict unpacks to nothing, so no special case is needed.
            _, dist = dtw_path_from_metric(
                data[i],
                data[j],
                sakoe_chiba_radius=sakoe_chiba_radius,
                metric=metric,
                **metric_parameters,
            )
            distances[i, j] = distances[j, i] = dist

    # For each time series, find the average distance to its k nearest neighbors
    avg_distances = []
    for row in distances:
        # skip the first sorted entry since it's the distance to itself
        nearest = np.sort(row)[1:k + 1]
        avg_distances.append(np.mean(nearest))

    return avg_distances

In [5]:
def detect_outliers(avg_distances, threshold):
    """
    Pick out the positions whose average distance exceeds a threshold.

    :param avg_distances: List of average distances, one per time series
    :param threshold: Cut-off value; only strictly greater distances count as outliers

    :return: List of indices (positions in ``avg_distances``) that are considered outliers
    """
    outliers = []
    for index, distance in enumerate(avg_distances):
        if distance > threshold:
            outliers.append(index)
    return outliers

In [6]:
from importable_functions.render_vehicle_tracks import render_vehicle_tracks_highlight_tracks
def render_outliers(df_cuid, df_cuid_grouped, list_x_y_tuples, k_start, k_stop, intersection_name='undefined_intersection', metric='euclidean', sakoe_chiba_radius=1, **metric_params):
    '''
    Computes k-NN outlier detection using DTW (optionally with a custom metric) for every
    k in ``range(k_start, k_stop)`` and renders the outliers highlighted in a plot of the intersection.

    For each k, a series counts as an outlier when its average k-NN DTW distance is more than
    two standard deviations above the mean of all average distances.

    The target location ``knn_dtw/render_outliers/<metric>/radius_<sakoe_chiba_radius>/<metric_params>``
    is passed as ``file_path`` to ``render_vehicle_tracks_highlight_tracks``, which is expected to
    write the plot (saving is not done in this function).

    :param df_cuid: passed through unchanged to ``render_vehicle_tracks_highlight_tracks``
    :param df_cuid_grouped: passed through unchanged to ``render_vehicle_tracks_highlight_tracks``
    :param list_x_y_tuples: list of time series handed to ``knn_dtw_unsupervised``
    :param k_start: first k to evaluate (inclusive)
    :param k_stop: end of the k range (exclusive)
    :param intersection_name: passed through to the render function
    :param metric: string compatible with sklearn.metrics.DistanceMetric
    :param sakoe_chiba_radius: Sakoe-Chiba radius handed to ``knn_dtw_unsupervised``
    :param metric_params: extra keyword arguments for the metric; except for ``'mahalanobis'``
        they are also encoded into the plot title and the output path
    '''
    file_path_suffix = ''
    title = ''
    # Mahalanobis parameters are left out of the title/path (the commented example in this
    # notebook passes a whole matrix as VI).
    if metric_params and metric != 'mahalanobis':
        # str() first: metric parameters may be numbers (e.g. p=3), which have no .lower()
        formatted_items = [f'{name.lower()}_{str(value).lower()}' for name, value in metric_params.items()]
        file_path_suffix = '_'.join(formatted_items)
        title = ' - '.join(formatted_items)

    file_path = f'knn_dtw/render_outliers/{metric}/radius_{sakoe_chiba_radius}/{file_path_suffix}'

    for k in range(k_start, k_stop):
        # NOTE: every call recomputes all pairwise DTW distances, although they do not depend on k.
        # Unpacking an empty dict is a no-op, so one call covers both cases.
        avg_dtw_distance = knn_dtw_unsupervised(list_x_y_tuples, k, sakoe_chiba_radius, metric, **metric_params)

        # Outlier cut-off: mean + 2 standard deviations of the average k-NN distances
        threshold = np.mean(avg_dtw_distance) + 2 * np.std(avg_dtw_distance)

        list_outliers = detect_outliers(avg_dtw_distance, threshold)

        suptitle = f'Outliers highglighted for k={k} with metric {metric}'
        file_name = f'outliers_{metric}_k_{k}'
        # The outliers are positions in list_x_y_tuples, passed on as strings.
        # NOTE(review): presumably these positions match the track identifiers the render
        # function expects -- confirm against render_vehicle_tracks_highlight_tracks.
        list_outliers_strings = [str(x) for x in list_outliers]
        render_vehicle_tracks_highlight_tracks(df_cuid, df_cuid_grouped, list_outliers_strings, title=title, suptitle=suptitle, file_name=file_name, intersection_name=intersection_name, file_path=file_path)

In [7]:
def filter_array_indices(list_of_arrays, threshold):
    """
    Return the positions of all arrays that contain at least one value above a threshold.

    :param list_of_arrays: Iterable of array-likes; each one is converted to a NumPy array
    :param threshold: Value that an element must strictly exceed for its array to be selected

    :return: List of indices into ``list_of_arrays``, in ascending order
    """
    return [
        position
        for position, values in enumerate(list_of_arrays)
        if np.any(np.array(values) > threshold)
    ]

In [8]:
# %%capture
# # SAKOE-CHIBA RADIUS R=3
# # k733 2018
# render_outliers(df_k733_2018_cuid, df_k733_2018_cuid_grouped,list_k733_2018_x_y_tuples,1,20,intersection_name='k733_2018', file_prefix='sakobe_r_3', title_prefix='Sakobe radius 3: ')
# render_outliers(df_k733_2018_cuid, df_k733_2018_cuid_grouped,list_k733_2018_x_y_tuples,1,20,intersection_name='k733_2018', metric='canberra', file_prefix='sakobe_r_3', title_prefix='Sakobe radius 3: ')
# render_outliers(df_k733_2018_cuid, df_k733_2018_cuid_grouped,list_k733_2018_x_y_tuples,1,20,intersection_name='k733_2018', metric='minkowski', file_prefix='sakobe_r_3', title_prefix='Sakobe radius 3: ')
# # k733 2020
# render_outliers(df_k733_2020_cuid, df_k733_2020_cuid_grouped,list_k733_2020_x_y_tuples,1,10,intersection_name='k733_2020', file_prefix='sakobe_r_3', title_prefix='Sakobe radius 3: ')
# render_outliers(df_k733_2020_cuid, df_k733_2020_cuid_grouped,list_k733_2020_x_y_tuples,1,10,intersection_name='k733_2020', metric='canberra', file_prefix='sakobe_r_3', title_prefix='Sakobe radius 3: ')
# render_outliers(df_k733_2020_cuid, df_k733_2020_cuid_grouped,list_k733_2020_x_y_tuples,1,10,intersection_name='k733_2020', metric='minkowski', file_prefix='sakobe_r_3', title_prefix='Sakobe radius 3: ')
# # k729 2022
# render_outliers(df_k729_2022_cuid, df_k729_2022_cuid_grouped,list_k729_2022_x_y_tuples,1,20,intersection_name='k729_2022', file_prefix='sakobe_r_3', title_prefix='Sakobe radius 3: ')
# render_outliers(df_k729_2022_cuid, df_k729_2022_cuid_grouped,list_k729_2022_x_y_tuples,1,20,intersection_name='k729_2022', metric='canberra', file_prefix='sakobe_r_3', title_prefix='Sakobe radius 3: ')
# render_outliers(df_k729_2022_cuid, df_k729_2022_cuid_grouped,list_k729_2022_x_y_tuples,1,20,intersection_name='k729_2022', metric='minkowski', file_prefix='sakobe_r_3', title_prefix='Sakobe radius 3: ')

In [9]:
# %%capture
# # VANILLA
# # k733 2018
# render_outliers(df_k733_2018_cuid, df_k733_2018_cuid_grouped,list_k733_2018_x_y_tuples,1,10,intersection_name='k733_2018')
# render_outliers(df_k733_2018_cuid, df_k733_2018_cuid_grouped,list_k733_2018_x_y_tuples,1,10,intersection_name='k733_2018', metric='canberra')
# render_outliers(df_k733_2018_cuid, df_k733_2018_cuid_grouped,list_k733_2018_x_y_tuples,1,10,intersection_name='k733_2018', metric='minkowski')
# # k733 2020
# render_outliers(df_k733_2020_cuid, df_k733_2020_cuid_grouped,list_k733_2020_x_y_tuples,1,10,intersection_name='k733_2020')
# render_outliers(df_k733_2020_cuid, df_k733_2020_cuid_grouped,list_k733_2020_x_y_tuples,1,10,intersection_name='k733_2020', metric='canberra')
# render_outliers(df_k733_2020_cuid, df_k733_2020_cuid_grouped,list_k733_2020_x_y_tuples,1,10,intersection_name='k733_2020', metric='minkowski')
# # k729 2022
# render_outliers(df_k729_2022_cuid, df_k729_2022_cuid_grouped,list_k729_2022_x_y_tuples,1,10,intersection_name='k729_2022')
# render_outliers(df_k729_2022_cuid, df_k729_2022_cuid_grouped,list_k729_2022_x_y_tuples,1,10,intersection_name='k729_2022', metric='canberra')
# render_outliers(df_k729_2022_cuid, df_k729_2022_cuid_grouped,list_k729_2022_x_y_tuples,1,10,intersection_name='k729_2022', metric='minkowski')

In [10]:
# %%capture
# # SAKOE-CHIBA RADIUS R=0.5
# # k733 2018
# render_outliers(df_k733_2018_cuid, df_k733_2018_cuid_grouped,list_k733_2018_x_y_tuples,1,10,intersection_name='k733_2018', file_prefix='sakobe_r_0.5', title_prefix='Sakobe radius 0.5: ')
# render_outliers(df_k733_2018_cuid, df_k733_2018_cuid_grouped,list_k733_2018_x_y_tuples,1,10,intersection_name='k733_2018', metric='canberra', file_prefix='sakobe_r_0.5', title_prefix='Sakobe radius 0.5: ')
# render_outliers(df_k733_2018_cuid, df_k733_2018_cuid_grouped,list_k733_2018_x_y_tuples,1,10,intersection_name='k733_2018', metric='minkowski', file_prefix='sakobe_r_0.5', title_prefix='Sakobe radius 0.5: ')
# # k733 2020
# render_outliers(df_k733_2020_cuid, df_k733_2020_cuid_grouped,list_k733_2020_x_y_tuples,1,10,intersection_name='k733_2020', file_prefix='sakobe_r_0.5', title_prefix='Sakobe radius 0.5: ')
# render_outliers(df_k733_2020_cuid, df_k733_2020_cuid_grouped,list_k733_2020_x_y_tuples,1,10,intersection_name='k733_2020', metric='canberra', file_prefix='sakobe_r_0.5', title_prefix='Sakobe radius 0.5: ')
# render_outliers(df_k733_2020_cuid, df_k733_2020_cuid_grouped,list_k733_2020_x_y_tuples,1,10,intersection_name='k733_2020', metric='minkowski', file_prefix='sakobe_r_0.5', title_prefix='Sakobe radius 0.5: ')
# # k729 2022
# render_outliers(df_k729_2022_cuid, df_k729_2022_cuid_grouped,list_k729_2022_x_y_tuples,1,10,intersection_name='k729_2022', file_prefix='sakobe_r_0.5', title_prefix='Sakobe radius 0.5: ')
# render_outliers(df_k729_2022_cuid, df_k729_2022_cuid_grouped,list_k729_2022_x_y_tuples,1,10,intersection_name='k729_2022', metric='canberra', file_prefix='sakobe_r_0.5', title_prefix='Sakobe radius 0.5: ')
# render_outliers(df_k729_2022_cuid, df_k729_2022_cuid_grouped,list_k729_2022_x_y_tuples,1,10,intersection_name='k729_2022', metric='minkowski', file_prefix='sakobe_r_0.5', title_prefix='Sakobe radius 0.5: ')

In [11]:
# %%capture
# render_outliers(df_k729_2022_cuid, df_k729_2022_cuid_grouped, list_k729_2022_x_y_tuples, 0, 10, 'k729_2022', 'manhattan', 1)
# render_outliers(df_k729_2022_cuid, df_k729_2022_cuid_grouped, list_k729_2022_x_y_tuples, 0, 10, 'k729_2022', 'euclidean', 1)
# render_outliers(df_k729_2022_cuid, df_k729_2022_cuid_grouped, list_k729_2022_x_y_tuples, 0, 10, 'k729_2022', 'chebyshev', 1)

In [12]:
# cov_matr = np.identity(len(df_k729_2022_cuid_grouped['x']))
# render_outliers(df_k729_2022_cuid, df_k729_2022_cuid_grouped, list_k729_2022_x_y_tuples, 0, 10, 'k729_2022', 'mahalanobis', 1, VI=cov_matr)

In [8]:
def compute_dtw_distance_matrix(data, **metric_parameters):
    """
    Compute the symmetric matrix of pairwise DTW distances between all time series.

    Each unordered pair is evaluated once with ``dtw_path_from_metric``; the result is
    written to both ``[i, j]`` and ``[j, i]``. The diagonal stays zero.

    :param data: List of time series (input data)
    :param metric_parameters: Keyword arguments forwarded unchanged to
        ``dtw_path_from_metric`` (e.g. ``metric=...``, ``sakoe_chiba_radius=...``);
        when none are given, that function's own defaults apply

    :return: ``numpy.ndarray`` of shape ``(len(data), len(data))`` with the pairwise DTW distances
    """
    n = len(data)
    distances = np.zeros((n, n))

    # Compute all-pair DTW distances (upper triangle only, mirrored to the lower one)
    for i in range(n):
        for j in range(i + 1, n):
            # Only the distance is needed; the alignment path is discarded.
            # Unpacking an empty dict passes no extra arguments.
            _, dist = dtw_path_from_metric(data[i], data[j], **metric_parameters)
            distances[i, j] = distances[j, i] = dist

    return distances
    

In [9]:
# Pairwise DTW distance matrix for the k729 2022 x/y series, using the default
# settings of dtw_path_from_metric (no metric parameters are passed).
# One DTW computation is performed per unordered pair of series.
distances = compute_dtw_distance_matrix(list_k729_2022_x_y_tuples)