In [None]:
# PASTE THIS TO THE FIRST CELL OF THE NOTEBOOK IN ORDER TO HAVE WORKING IMPORTS
import sys
import os
# Resolve the project root relative to the notebook's working directory so that
# the `src.*` imports in the cells below can be found.
current_dir = os.getcwd()
parent_parent_dir = os.path.abspath(os.path.join(current_dir, '../..')) # tweak so that you get dir of code project

# Guarded so that re-running this cell does not keep appending the same entry.
if parent_parent_dir not in sys.path:
    sys.path.append(parent_parent_dir)

In [None]:
from sklearn.cluster import OPTICS
import numpy as np
import pandas as pd
import ast

## data import

In [None]:
from src.features.get_x_y_tuples import get_x_y_tuple_list
from src.features.get_first_and_last_x_y_coordinates import get_first_and_last_x_y_coordinates 
from src.features.get_first_x_y_coordinates import get_first_x_y_coordinates
from src.features.get_last_x_y_coordinates import get_last_x_y_coordinates
def import_data(df_path):
    """Load a track CSV and its grouped companion and derive coordinate arrays.

    Parameters
    ----------
    df_path : str
        Path to the per-point CSV. The grouped CSV is expected next to it,
        with ``_grouped`` inserted before the file extension
        (``foo.csv`` -> ``foo_grouped.csv``).

    Returns
    -------
    tuple
        ``(df_cuid, df_cuid_grouped, X, first_coordinates, last_coordinates)``.
        The two data frames are the parsed CSVs (the grouped one with its
        ``x``/``y`` columns converted from strings to Python objects). The
        three arrays are the transposed ``[x_values, y_values]`` pairs returned
        by ``get_first_and_last_x_y_coordinates``, ``get_first_x_y_coordinates``
        and ``get_last_x_y_coordinates`` respectively -- presumably one row per
        coordinate with x in column 0 and y in column 1; verify against the
        helpers.
    """
    df_cuid = pd.read_csv(df_path)

    # Only touch the extension: str.replace('.csv', ...) would also rewrite a
    # '.csv' that happens to occur earlier in the path (e.g. a directory name).
    path_stem, path_ext = os.path.splitext(df_path)
    df_cuid_grouped = pd.read_csv(f'{path_stem}_grouped{path_ext}')

    # 1.1 CONVERT FEATURES TO NUMBERS
    # The 'x' and 'y' columns are read as strings; parse them back into
    # Python literals.
    for column in ('x', 'y'):
        df_cuid_grouped[column] = df_cuid_grouped[column].apply(ast.literal_eval)

    list_x_y_tuples = get_x_y_tuple_list(df_cuid_grouped, ['x', 'y'])

    def _as_columns(x_values, y_values):
        # Stack the two sequences as rows, then transpose.
        return np.array([x_values, y_values]).T

    X = _as_columns(*get_first_and_last_x_y_coordinates(list_x_y_tuples))
    first_coordinates = _as_columns(*get_first_x_y_coordinates(list_x_y_tuples))
    last_coordinates = _as_columns(*get_last_x_y_coordinates(list_x_y_tuples))
    return df_cuid, df_cuid_grouped, X, first_coordinates, last_coordinates
        

In [None]:
# Locations of the processed per-dataset CSV files under the project root.
df_path_k729_2022 = f'{parent_parent_dir}/data/processed/k729_2022_cuid.csv'
df_path_k733_2020 = f'{parent_parent_dir}/data/processed/k733_2020_cuid.csv'
df_path_k733_2018 = f'{parent_parent_dir}/data/processed/k733_2018_cuid.csv'

# Load each dataset: raw frame, grouped frame, X array, first and last coordinates.
(
    df_cuid_k729_2022,
    df_cuid_grouped_k729_2022,
    X_k729_2022,
    first_coordinates_k729_2022,
    last_coordinates_k729_2022,
) = import_data(df_path_k729_2022)
(
    df_cuid_k733_2020,
    df_cuid_grouped_k733_2020,
    X_k733_2020,
    first_coordinates_k733_2020,
    last_coordinates_k733_2020,
) = import_data(df_path_k733_2020)
(
    df_cuid_k733_2018,
    df_cuid_grouped_k733_2018,
    X_k733_2018,
    first_coordinates_k733_2018,
    last_coordinates_k733_2018,
) = import_data(df_path_k733_2018)

In [None]:
import json
def get_optics_params(path):
    """Read a JSON file of OPTICS parameters and return the decoded object.

    Parameters
    ----------
    path : str or path-like
        Location of the JSON file.

    Returns
    -------
    object
        Whatever ``json.load`` decodes from the file. The cells below index
        the result by string key, e.g. ``['silhouette']``.
    """
    # JSON is UTF-8 by specification; do not depend on the platform's default
    # text encoding when opening the file.
    with open(path, encoding='utf-8') as f:
        return json.load(f)

#### get optimized optics params

In [None]:
# import optimized clustering parameters using the json file
# Load the stored OPTICS parameters for each dataset from its JSON file.
k729_2022_optics_params = get_optics_params(
    f'{parent_parent_dir}/src/models/optimized_optics_clustering_parameters/k729_2022_optics_optimized.json'
)
k733_2020_optics_params = get_optics_params(
    f'{parent_parent_dir}/src/models/optimized_optics_clustering_parameters/k733_2020_optics_optimized.json'
)
k733_2018_optics_params = get_optics_params(
    f'{parent_parent_dir}/src/models/optimized_optics_clustering_parameters/k733_2018_optics_optimized.json'
)

In [None]:
def extract_optic_params_from_dict(params):
    """Unpack one OPTICS parameter mapping into a positional tuple.

    Returns ``(max_eps, min_samples, metric, cluster_method, xi)``, where
    ``max_eps`` is read from the ``'epsilon'`` key. A missing key raises
    ``KeyError``.
    """
    return (
        params['epsilon'],  # stored as 'epsilon', used as OPTICS' max_eps
        params['min_samples'],
        params['metric'],
        params['cluster_method'],
        params['xi'],
    )

In [None]:
# Select the parameter set stored under the 'silhouette' key of each loaded
# parameter file.
k729_2022_silhouette_params = k729_2022_optics_params['silhouette']
k733_2020_silhouette_params = k733_2020_optics_params['silhouette']
k733_2018_silhouette_params = k733_2018_optics_params['silhouette']

#### fit optimized optics model


In [None]:
from src.models.DISTANCE_METRICS_WITH_ADDITIONAL_ARGS import DISTANCE_METRICS_WITH_ADDITIONAL_ARGS
from src.models.clustering_optimization.ensure_distance_metric_params import ensure_distance_metric_params
def fit_optics(X, max_eps, min_samples, metric, cluster_method, xi):
    """Build an ``OPTICS`` estimator from the given settings and fit it on ``X``.

    For metrics listed in ``DISTANCE_METRICS_WITH_ADDITIONAL_ARGS`` the extra
    ``metric_params`` are obtained from ``ensure_distance_metric_params(X, metric)``
    and printed; otherwise ``metric_params`` is ``None``. ``xi`` is forwarded
    only when ``cluster_method == 'xi'``; for any other method the estimator's
    own default is left in place.

    Returns the fitted ``OPTICS`` instance.
    """
    metric_params = None
    if metric in DISTANCE_METRICS_WITH_ADDITIONAL_ARGS:
        metric_params = ensure_distance_metric_params(X, metric)
        print(f"metric_params: {metric_params}")

    optics_kwargs = {
        'max_eps': max_eps,
        'min_samples': min_samples,
        'metric': metric,
        'cluster_method': cluster_method,
        'metric_params': metric_params,
    }
    if cluster_method == 'xi':
        optics_kwargs['xi'] = xi

    optics = OPTICS(**optics_kwargs)
    optics.fit(X)
    return optics

In [None]:
# Fit OPTICS on the k729_2022 data with the stored 'silhouette' parameter set.
# extract_optic_params_from_dict returns the values in fit_optics' positional order.
optics_k729_2022_silhouette = fit_optics(
    X_k729_2022,
    *extract_optic_params_from_dict(k729_2022_silhouette_params)
)
optics_k729_2022_silhouette

#### calculate cluster medoids and convex hulls

In [None]:
from src.models.optics.calculate_cluster_medoids import calculate_cluster_medoids
from src.models.optics.calculate_cluster_convex_hulls import calculate_cluster_convex_hulls
# Cluster medoids and convex hulls for the silhouette-parameter model.
k729_2022_optics_silhouette_medoids = calculate_cluster_medoids(
    X_k729_2022, optics_k729_2022_silhouette
)
k729_2022_optics_silhouette_hulls = calculate_cluster_convex_hulls(
    X_k729_2022, optics_k729_2022_silhouette
)

#### plot vehicle tracks, cluster medoids and convex hulls

In [None]:
from src.visualization.plot_vehicle_tracks_in_notebook import plot_vehicle_tracks_in_notebook
import matplotlib.pyplot as plt

# Draw the vehicle tracks on the current axes.
plot_vehicle_tracks_in_notebook(
    plt.gca(),
    df_cuid_k729_2022,
    df_cuid_grouped_k729_2022,
    'k729_2022 vehicle tracks and optics medoids',
)

# Fetch the current axes again after the helper call and reuse them for the overlays.
ax = plt.gca()
ax.scatter(
    k729_2022_optics_silhouette_medoids[:, 0],
    k729_2022_optics_silhouette_medoids[:, 1],
    c='red',
    s=100,
)

# Outline every convex hull.
for hull in k729_2022_optics_silhouette_hulls:
    # Repeat the first vertex at the end so the outline is drawn closed.
    hull_with_closure = np.vstack([hull, hull[0]])
    ax.plot(
        hull_with_closure[:, 0],
        hull_with_closure[:, 1],
        c='green',
        linestyle='--',  # dashed green outline
        linewidth=2,
    )

plt.show()


#### optimize with hull size (alpha=0.9, beta=0.1)

In [None]:
from src.models.clustering_optimization.optimize_optics_parallelized_with_hull_size_optimization import optimize_optics_parallelized_with_hull_size_optimization
# Search grid for the OPTICS hyper-parameters.
# NOTE(review): np.arange with a float step is subject to rounding at the end
# point -- confirm the grids contain the intended values.
max_eps_range = np.arange(0.1, 10, 0.1)
min_samples_range = np.arange(5, 10, 1)
xis = np.arange(0.01, 0.1, 0.01)
cluster_methods = ['dbscan', 'xi']
metrics = [
    # 'cosine' is currently disabled
    'euclidean',
    'manhattan',
    'braycurtis',
    'canberra',
    'chebyshev',
    'correlation',
    'mahalanobis',
    'minkowski',
    'seuclidean',
    'sqeuclidean',
]

# Run the hull-size-aware optimisation with weights alpha=0.9 and beta=0.1.
k729_2022_hull_optimization_results = optimize_optics_parallelized_with_hull_size_optimization(
    X_k729_2022,
    max_eps_range=max_eps_range,
    min_samples_range=min_samples_range,
    metrics=metrics,
    cluster_methods=cluster_methods,
    xis=xis,
    alpha=0.9,
    beta=0.1,
)

In [None]:
# Show the optimisation result; the cells below read its 'combined_silhouette',
# 'combined_calinski_harabasz' and 'combined_davies_bouldin' entries.
print(k729_2022_hull_optimization_results)

#### silhouette

In [None]:
# Refit OPTICS with the 'combined_silhouette' parameter set of the optimisation.
# extract_optic_params_from_dict returns the values in fit_optics' positional order.
optics_k729_2022_silhouette_hull_optimized = fit_optics(
    X_k729_2022,
    *extract_optic_params_from_dict(k729_2022_hull_optimization_results['combined_silhouette'])
)
optics_k729_2022_silhouette_hull_optimized

In [None]:
# Convex hulls and medoids for the 'combined_silhouette' model.
k729_2022_optics_silhouette_hull_optimized_hulls = calculate_cluster_convex_hulls(
    X_k729_2022, optics_k729_2022_silhouette_hull_optimized
)
k729_2022_optics_silhouette_hull_optimized_medoids = calculate_cluster_medoids(
    X_k729_2022, optics_k729_2022_silhouette_hull_optimized
)

In [None]:
# Draw the vehicle tracks on the current axes.
plot_vehicle_tracks_in_notebook(
    plt.gca(),
    df_cuid_k729_2022,
    df_cuid_grouped_k729_2022,
    'k729_2022 vehicle tracks and optics medoids',
)

# Fetch the current axes again after the helper call and reuse them for the overlays.
ax = plt.gca()
ax.scatter(
    k729_2022_optics_silhouette_hull_optimized_medoids[:, 0],
    k729_2022_optics_silhouette_hull_optimized_medoids[:, 1],
    c='red',
    s=100,
)

# Outline every convex hull.
for hull in k729_2022_optics_silhouette_hull_optimized_hulls:
    # Repeat the first vertex at the end so the outline is drawn closed.
    hull_with_closure = np.vstack([hull, hull[0]])
    ax.plot(
        hull_with_closure[:, 0],
        hull_with_closure[:, 1],
        c='green',
        linestyle='--',  # dashed green outline
        linewidth=2,
    )

plt.show()


#### Calinski Harabasz

In [None]:
# Refit OPTICS with the 'combined_calinski_harabasz' parameter set of the optimisation.
# extract_optic_params_from_dict returns the values in fit_optics' positional order.
optics_k729_2022_combined_calinski_harabasz_hull_optimized = fit_optics(
    X_k729_2022,
    *extract_optic_params_from_dict(k729_2022_hull_optimization_results['combined_calinski_harabasz'])
)
optics_k729_2022_combined_calinski_harabasz_hull_optimized

In [None]:
# Convex hulls and medoids for the 'combined_calinski_harabasz' model.
k729_2022_optics_combined_calinski_harabasz_hull_optimized_hulls = calculate_cluster_convex_hulls(
    X_k729_2022, optics_k729_2022_combined_calinski_harabasz_hull_optimized
)
k729_2022_optics_combined_calinski_harabasz_hull_optimized_medoids = calculate_cluster_medoids(
    X_k729_2022, optics_k729_2022_combined_calinski_harabasz_hull_optimized
)

In [None]:
# Draw the vehicle tracks on the current axes.
plot_vehicle_tracks_in_notebook(
    plt.gca(),
    df_cuid_k729_2022,
    df_cuid_grouped_k729_2022,
    'k729_2022 vehicle tracks and optics medoids',
)

# Fetch the current axes again after the helper call and reuse them for the overlays.
ax = plt.gca()
ax.scatter(
    k729_2022_optics_combined_calinski_harabasz_hull_optimized_medoids[:, 0],
    k729_2022_optics_combined_calinski_harabasz_hull_optimized_medoids[:, 1],
    c='red',
    s=100,
)

# Outline every convex hull.
for hull in k729_2022_optics_combined_calinski_harabasz_hull_optimized_hulls:
    # Repeat the first vertex at the end so the outline is drawn closed.
    hull_with_closure = np.vstack([hull, hull[0]])
    ax.plot(
        hull_with_closure[:, 0],
        hull_with_closure[:, 1],
        c='green',
        linestyle='--',  # dashed green outline
        linewidth=2,
    )

plt.show()


#### Davies Bouldin

In [None]:
# Refit OPTICS with the 'combined_davies_bouldin' parameter set of the optimisation.
# extract_optic_params_from_dict returns the values in fit_optics' positional order.
optics_k729_2022_combined_davies_bouldin_hull_optimized = fit_optics(
    X_k729_2022,
    *extract_optic_params_from_dict(k729_2022_hull_optimization_results['combined_davies_bouldin'])
)
optics_k729_2022_combined_davies_bouldin_hull_optimized

In [None]:
# Convex hulls and medoids for the 'combined_davies_bouldin' model.
k729_2022_optics_combined_davies_bouldin_hull_optimized_hulls = calculate_cluster_convex_hulls(
    X_k729_2022, optics_k729_2022_combined_davies_bouldin_hull_optimized
)
k729_2022_optics_combined_davies_bouldin_hull_optimized_medoids = calculate_cluster_medoids(
    X_k729_2022, optics_k729_2022_combined_davies_bouldin_hull_optimized
)

In [None]:
# Draw the vehicle tracks on the current axes.
plot_vehicle_tracks_in_notebook(
    plt.gca(),
    df_cuid_k729_2022,
    df_cuid_grouped_k729_2022,
    'k729_2022 vehicle tracks and optics medoids',
)

# Fetch the current axes again after the helper call and reuse them for the overlays.
ax = plt.gca()
ax.scatter(
    k729_2022_optics_combined_davies_bouldin_hull_optimized_medoids[:, 0],
    k729_2022_optics_combined_davies_bouldin_hull_optimized_medoids[:, 1],
    c='red',
    s=100,
)

# Outline every convex hull.
for hull in k729_2022_optics_combined_davies_bouldin_hull_optimized_hulls:
    # Repeat the first vertex at the end so the outline is drawn closed.
    hull_with_closure = np.vstack([hull, hull[0]])
    ax.plot(
        hull_with_closure[:, 0],
        hull_with_closure[:, 1],
        c='green',
        linestyle='--',  # dashed green outline
        linewidth=2,
    )

plt.show()


#### optimize with hull size (alpha=0.9, beta=0.01)

In [None]:
from src.models.clustering_optimization.optimize_optics_parallelized_with_hull_size_optimization import optimize_optics_parallelized_with_hull_size_optimization
# Search grid for the OPTICS hyper-parameters.
# NOTE(review): np.arange with a float step is subject to rounding at the end
# point -- confirm the grids contain the intended values.
max_eps_range = np.arange(0.1, 10, 0.1)
min_samples_range = np.arange(5, 10, 1)
xis = np.arange(0.01, 0.1, 0.01)
cluster_methods = ['dbscan', 'xi']
metrics = [
    # 'cosine' is currently disabled
    'euclidean',
    'manhattan',
    'braycurtis',
    'canberra',
    'chebyshev',
    'correlation',
    'mahalanobis',
    'minkowski',
    'seuclidean',
    'sqeuclidean',
]

# Weights passed to the optimisation for this run.
alpha = 0.9
beta = 0.01

# This assignment replaces the result stored under the same name by the
# earlier optimisation cell.
k729_2022_hull_optimization_results = optimize_optics_parallelized_with_hull_size_optimization(
    X_k729_2022,
    max_eps_range=max_eps_range,
    min_samples_range=min_samples_range,
    metrics=metrics,
    cluster_methods=cluster_methods,
    xis=xis,
    alpha=alpha,
    beta=beta,
)

In [None]:
# Show the optimisation result; the cells below read its 'combined_silhouette',
# 'combined_calinski_harabasz' and 'combined_davies_bouldin' entries.
print(k729_2022_hull_optimization_results)

#### silhouette

In [None]:
# Refit OPTICS with the 'combined_silhouette' parameter set of the optimisation.
# extract_optic_params_from_dict returns the values in fit_optics' positional order.
optics_k729_2022_silhouette_hull_optimized = fit_optics(
    X_k729_2022,
    *extract_optic_params_from_dict(k729_2022_hull_optimization_results['combined_silhouette'])
)
optics_k729_2022_silhouette_hull_optimized

In [None]:
# Convex hulls and medoids for the 'combined_silhouette' model.
k729_2022_optics_silhouette_hull_optimized_hulls = calculate_cluster_convex_hulls(
    X_k729_2022, optics_k729_2022_silhouette_hull_optimized
)
k729_2022_optics_silhouette_hull_optimized_medoids = calculate_cluster_medoids(
    X_k729_2022, optics_k729_2022_silhouette_hull_optimized
)

In [None]:
# Draw the vehicle tracks on the current axes.
plot_vehicle_tracks_in_notebook(
    plt.gca(),
    df_cuid_k729_2022,
    df_cuid_grouped_k729_2022,
    'k729_2022 vehicle tracks and optics medoids',
)

# Fetch the current axes again after the helper call and reuse them for the overlays.
ax = plt.gca()
ax.scatter(
    k729_2022_optics_silhouette_hull_optimized_medoids[:, 0],
    k729_2022_optics_silhouette_hull_optimized_medoids[:, 1],
    c='red',
    s=100,
)

# Outline every convex hull.
for hull in k729_2022_optics_silhouette_hull_optimized_hulls:
    # Repeat the first vertex at the end so the outline is drawn closed.
    hull_with_closure = np.vstack([hull, hull[0]])
    ax.plot(
        hull_with_closure[:, 0],
        hull_with_closure[:, 1],
        c='green',
        linestyle='--',  # dashed green outline
        linewidth=2,
    )

plt.show()


#### Calinski Harabasz

In [None]:
# Refit OPTICS with the 'combined_calinski_harabasz' parameter set of the optimisation.
# extract_optic_params_from_dict returns the values in fit_optics' positional order.
optics_k729_2022_combined_calinski_harabasz_hull_optimized = fit_optics(
    X_k729_2022,
    *extract_optic_params_from_dict(k729_2022_hull_optimization_results['combined_calinski_harabasz'])
)
optics_k729_2022_combined_calinski_harabasz_hull_optimized

In [None]:
# Convex hulls and medoids for the 'combined_calinski_harabasz' model.
k729_2022_optics_combined_calinski_harabasz_hull_optimized_hulls = calculate_cluster_convex_hulls(
    X_k729_2022, optics_k729_2022_combined_calinski_harabasz_hull_optimized
)
k729_2022_optics_combined_calinski_harabasz_hull_optimized_medoids = calculate_cluster_medoids(
    X_k729_2022, optics_k729_2022_combined_calinski_harabasz_hull_optimized
)

In [None]:
# Draw the vehicle tracks on the current axes.
plot_vehicle_tracks_in_notebook(
    plt.gca(),
    df_cuid_k729_2022,
    df_cuid_grouped_k729_2022,
    'k729_2022 vehicle tracks and optics medoids',
)

# Fetch the current axes again after the helper call and reuse them for the overlays.
ax = plt.gca()
ax.scatter(
    k729_2022_optics_combined_calinski_harabasz_hull_optimized_medoids[:, 0],
    k729_2022_optics_combined_calinski_harabasz_hull_optimized_medoids[:, 1],
    c='red',
    s=100,
)

# Outline every convex hull.
for hull in k729_2022_optics_combined_calinski_harabasz_hull_optimized_hulls:
    # Repeat the first vertex at the end so the outline is drawn closed.
    hull_with_closure = np.vstack([hull, hull[0]])
    ax.plot(
        hull_with_closure[:, 0],
        hull_with_closure[:, 1],
        c='green',
        linestyle='--',  # dashed green outline
        linewidth=2,
    )

plt.show()


#### Davies Bouldin

In [None]:
# Refit OPTICS with the 'combined_davies_bouldin' parameter set of the optimisation.
# extract_optic_params_from_dict returns the values in fit_optics' positional order.
optics_k729_2022_combined_davies_bouldin_hull_optimized = fit_optics(
    X_k729_2022,
    *extract_optic_params_from_dict(k729_2022_hull_optimization_results['combined_davies_bouldin'])
)
optics_k729_2022_combined_davies_bouldin_hull_optimized

In [None]:
# Convex hulls and medoids for the 'combined_davies_bouldin' model.
k729_2022_optics_combined_davies_bouldin_hull_optimized_hulls = calculate_cluster_convex_hulls(
    X_k729_2022, optics_k729_2022_combined_davies_bouldin_hull_optimized
)
k729_2022_optics_combined_davies_bouldin_hull_optimized_medoids = calculate_cluster_medoids(
    X_k729_2022, optics_k729_2022_combined_davies_bouldin_hull_optimized
)

In [None]:
# Draw the vehicle tracks on the current axes.
plot_vehicle_tracks_in_notebook(
    plt.gca(),
    df_cuid_k729_2022,
    df_cuid_grouped_k729_2022,
    'k729_2022 vehicle tracks and optics medoids',
)

# Fetch the current axes again after the helper call and reuse them for the overlays.
ax = plt.gca()
ax.scatter(
    k729_2022_optics_combined_davies_bouldin_hull_optimized_medoids[:, 0],
    k729_2022_optics_combined_davies_bouldin_hull_optimized_medoids[:, 1],
    c='red',
    s=100,
)

# Outline every convex hull.
for hull in k729_2022_optics_combined_davies_bouldin_hull_optimized_hulls:
    # Repeat the first vertex at the end so the outline is drawn closed.
    hull_with_closure = np.vstack([hull, hull[0]])
    ax.plot(
        hull_with_closure[:, 0],
        hull_with_closure[:, 1],
        c='green',
        linestyle='--',  # dashed green outline
        linewidth=2,
    )

plt.show()
