In [3]:
import numpy as np
import pandas as pd
import ast
import matplotlib.pyplot as plt
from tslearn.metrics import dtw_path_from_metric
import random



# ********************
# ****** IMPORT ******
# ********************
df_k729_2022_cuid = pd.read_csv('./datasets/k729_2022_cuid.csv')
df_k729_2022_cuid_grouped = pd.read_csv('./datasets/k729_2022_cuid_grouped.csv')
df_k729_2022_cuid_grouped['x'] = df_k729_2022_cuid_grouped['x'].apply(lambda x: ast.literal_eval(x))
df_k729_2022_cuid_grouped['y'] = df_k729_2022_cuid_grouped['y'].apply(lambda y: ast.literal_eval(y))
df_k729_2022_cuid_grouped['vx'] = df_k729_2022_cuid_grouped['vx'].apply(lambda vx: ast.literal_eval(vx))
df_k729_2022_cuid_grouped['vy'] = df_k729_2022_cuid_grouped['vy'].apply(lambda vy: ast.literal_eval(vy))
df_k729_2022_cuid_grouped['v'] = df_k729_2022_cuid_grouped['v'].apply(lambda v: ast.literal_eval(v))

df_k733_2020_cuid = pd.read_csv('./datasets/k733_2020_cuid.csv')
df_k733_2020_cuid_grouped = pd.read_csv('./datasets/k733_2020_cuid_grouped.csv')
df_k733_2020_cuid_grouped['x'] = df_k733_2020_cuid_grouped['x'].apply(lambda x: ast.literal_eval(x))
df_k733_2020_cuid_grouped['y'] = df_k733_2020_cuid_grouped['y'].apply(lambda y: ast.literal_eval(y))

df_k733_2018_cuid = pd.read_csv('./datasets/k733_2018_cuid.csv')
df_k733_2018_cuid_grouped = pd.read_csv('./datasets/k733_2018_cuid_grouped.csv')
df_k733_2018_cuid_grouped['x'] = df_k733_2018_cuid_grouped['x'].apply(lambda x: ast.literal_eval(x))
df_k733_2018_cuid_grouped['y'] = df_k733_2018_cuid_grouped['y'].apply(lambda y: ast.literal_eval(y))
df_k733_2018_cuid_grouped['vx'] = df_k733_2018_cuid_grouped['vx'].apply(lambda vx: ast.literal_eval(vx))
df_k733_2018_cuid_grouped['vy'] = df_k733_2018_cuid_grouped['vy'].apply(lambda vy: ast.literal_eval(vy))

In [4]:
# create x,y coordinate vectors
def get_x_y_tuple_list(df_cuid_grouped, x_y_feature: list) -> list:
    '''
    Returns a list of [x,y] element lists for each entry in a grouped dataframe

    :param df_cuid_grouped: Grouped dataframe
    :param x_y_feature: List of two features as strings, supported strings: 'x','y','vx','vy'
    '''
    list_all_tuples = []
    list_individual_tuples = []
    x_feature = x_y_feature[0]
    y_feature = x_y_feature[1]
    for i in range(len(df_cuid_grouped[x_feature])):
        for j in range(len(df_cuid_grouped[x_feature][i])): # iterate over all measurements for one track_id
            list_item = [df_cuid_grouped[x_feature][i][j], df_cuid_grouped[y_feature][i][j]] # create x-y tuples for each measumerement point
            list_individual_tuples.append(list_item) # add tuple to list for individual track_id
        list_all_tuples.append(list_individual_tuples) # add tuple list to list for all track_ids
        list_individual_tuples = []

    return list_all_tuples

In [10]:
def compute_dtw_matrix(data, metric:str='euclidean',sakoe_chiba_radius:float=1,**metric_parameters):
    """Compute the DTW distance matrix for all pairs of vectors."""

    n = len(data)
    distances = np.zeros((n, n))

    for i in range(n):
        for j in range(i + 1, n):
            path, dist = dtw_path_from_metric(data[i], data[j], sakoe_chiba_radius=sakoe_chiba_radius, metric=metric, **metric_parameters)
            distances[i][j] = dist
            distances[j][i] = dist

In [11]:
def find_clusters(dtw_matrix, threshold, paths_per_cluster):
    """Find clusters where all DTW distances are below or equal to the threshold."""
    n = len(dtw_matrix)
    clusters = []
    visited_paths = [False] * n
    
    for i in range(n):
        if visited_paths[i]:
            continue
        
        # check dtw distance to all other paths
        cluster = [i]
        for j in range(n):
            if i != j and not visited_paths[j] and dtw_matrix[i, j] <= threshold:
                cluster.append(j)
        
        # Check if the cluster size is at least paths_per_cluster
        if len(cluster) >= paths_per_cluster:
            clusters.append(cluster)
            for idx in cluster:
                visited_paths[idx] = True

    return clusters

In [12]:
def dtw_clustering(vectors, paths_per_cluster, max_threshold=10.0, step=0.5):
    """Perform clustering using DTW distance with increasing threshold."""
    dtw_matrix = compute_dtw_matrix(vectors)
    
    # Find clusters for varying thresholds
    for threshold in np.arange(0, max_threshold, step):
        clusters = find_clusters(dtw_matrix, threshold, paths_per_cluster)
        # Check if we can find a cluster with at least 10 vectors
        large_clusters = [cluster for cluster in clusters if len(cluster) >= 10]
        if large_clusters:
            print(f"Found clusters with at least 10 vectors at threshold {threshold}")
            return large_clusters, threshold

    print("No suitable cluster found.")
    return None, None

In [13]:

k729_2022_vectors = get_x_y_tuple_list(df_k729_2022_cuid_grouped,['x','y'])
# Perform DTW clustering with at least 10 vectors per cluster
clusters, threshold = dtw_clustering(k729_2022_vectors, paths_per_cluster=10)

# If clusters are found, display them
if clusters:
    print(f"Clusters found at DTW threshold {threshold}:")
    for i, cluster in enumerate(clusters):
        print(f"Cluster {i + 1}: {cluster}")

    # Optionally visualize some trajectories in a cluster
    plt.figure(figsize=(10, 6))
    for idx in clusters[0][:5]:  # Visualize up to 5 trajectories from the first cluster
        trajectory = k729_2022_vectors[idx]
        plt.plot(trajectory[:, 0], trajectory[:, 1], marker='o', label=f'Trajectory {idx}')
    plt.legend()
    plt.title(f"Trajectories in Cluster 1 (Threshold: {threshold})")
    plt.show()


TypeError: object of type 'NoneType' has no len()