In [1]:
import os
import sys
current_dir = os.path.dirname(os.getcwd())
target_dir_name = 'ba_code_project'
while True:
    # Check if the target directory exists in the current directory
    potential_target = os.path.join(current_dir, target_dir_name)
    if os.path.isdir(potential_target):
        code_root_dir = potential_target
        break
    # Move one level up
    parent_dir = os.path.dirname(current_dir)
    # If we're at the root of the file system and still haven't found it, stop
    if parent_dir == current_dir:
        code_root_dir = None
        break
    current_dir = parent_dir
if code_root_dir:
    # Add the found target directory to sys.path
    sys.path.append(code_root_dir)
else:
    print(f'Target directory not found.')

In [20]:
import numpy as np
import pandas as pd
import ast
from src.features.get_first_and_last_x_y_coordinates import get_first_and_last_x_y_coordinates
from src.features.get_x_y_tuple_list import get_x_y_tuple_list
from src.visualization.plot_vehicle_tracks_in_notebook import plot_vehicle_tracks_in_notebook
from sklearn.cluster import OPTICS
import matplotlib.pyplot as plt

In [21]:
# to access while testing
# intersection_name = 'k733_2018'
# intersection_name = 'k733_2020'
intersection_name = 'k729_2022'
data_path = f'{code_root_dir}/data/processed/{intersection_name}_cuid.csv'

df_cuid = pd.read_csv(data_path)
df_cuid_grouped_path = data_path.replace('.csv', '_grouped.csv')
df_cuid_grouped = pd.read_csv(df_cuid_grouped_path)
df_cuid_grouped['x'] = df_cuid_grouped['x'].apply(lambda x: ast.literal_eval(x))
df_cuid_grouped['y'] = df_cuid_grouped['y'].apply(lambda y: ast.literal_eval(y))
list_x_y_tuples = get_x_y_tuple_list(df_cuid_grouped, ['x','y'])
first_last_x_coords, first_last_y_coords = get_first_and_last_x_y_coordinates(list_x_y_tuples)
X = np.array([first_last_x_coords, first_last_y_coords]).T

In [22]:
max_eps_range = np.arange(2, 10, .1)
min_samples_range = np.arange(5, 15, 1)
metrics = [
    'euclidean', 
    'manhattan', 
    'chebyshev', 
    'mahalanobis', 
]
xis = np.arange(0.01, .2, 0.01)
cluster_methods = ['dbscan','xi']

# # sample run
# max_eps_range = np.arange(1, 4, 1)
# min_samples_range = np.arange(5, 7, 1)
# metrics = [
#     'euclidean', 
#     'manhattan', 
#     'chebyshev', 
#     'mahalanobis', 
#     'minkowski', 

# ]
# xis = np.arange(0, 0.1, 0.05)
# cluster_methods = ['dbscan','xi']


In [23]:
#save optimization initializations as json
import json
initializations = {
    'max_eps_range': max_eps_range.tolist(),
    'min_samples_range': min_samples_range.tolist(),
    'metrics': metrics,
    'xis': xis.tolist(),
    'cluster_methods': cluster_methods
}

# choose a random word to append to the file name
import random
import string
random_word = ''.join(random.choices(string.ascii_lowercase, k=5))
initializations_path = f'{code_root_dir}/data/optics_init_params/optics_start_end_initializations_params_{random_word}.json'
with open(initializations_path, 'w') as f:
    json.dump(initializations, f)

In [24]:
from src.models.optics.optimize_optics_parallelized_no_aic_bic import optimize_optics_parallelized_no_aic_bic


optimization_results = optimize_optics_parallelized_no_aic_bic(X=X, 
                                                                         max_eps_range=max_eps_range, 
                                                                         min_samples_range=min_samples_range,
                                                                            metrics=metrics,
                                                                            xis=xis,
                                                                            cluster_methods=cluster_methods)

Iterations: 23500
Iterations: 26400
Iterations: 23500
Iterations: 23700
Iterations: 25100
Iterations: 23100
Iterations: 23300
Iterations: 23300
Iterations: 25100
Iterations: 26500
Iterations: 23600
Iterations: 23600
Iterations: 26500
Iterations: 23800
Iterations: 25200
Iterations: 23400
Iterations: 23400
Iterations: 23200
Iterations: 23700
Iterations: 25200
Iterations: 23500
Iterations: 23500
Iterations: 26600
Iterations: 23800
Iterations: 23700
Iterations: 26600
Iterations: 25300
Iterations: 23900
Iterations: 23300
Iterations: 23600
Iterations: 23600
Iterations: 23900
Iterations: 25300
Iterations: 26700
Iterations: 23700
Iterations: 23700
Iterations: 24000
Iterations: 23800
Iterations: 26700
Iterations: 24000
Iterations: 25400
Iterations: 23400
Iterations: 23800
Iterations: 23800
Iterations: 24100
Iterations: 25400
Iterations: 26800
Iterations: 23900
Iterations: 23900
Iterations: 23900
Iterations: 26800
Iterations: 24100
Iterations: 25500
Iterations: 24200
Iterations: 23500
Iterations

In [28]:
print(optimization_results)

{'silhouette': {'score': 0.6830023540258303, 'epsilon': 3.200000000000001, 'min_samples': 5, 'metric': 'chebyshev', 'cluster_method': 'dbscan', 'xi': None}, 'calinski_harabasz': {'score': 1037.1553423234657, 'epsilon': 4.500000000000002, 'min_samples': 9, 'metric': 'euclidean', 'cluster_method': 'dbscan', 'xi': None}, 'davies_bouldin': {'score': 0.46570219632192716, 'epsilon': 4.100000000000001, 'min_samples': 5, 'metric': 'euclidean', 'cluster_method': 'dbscan', 'xi': None}, 'initialization_params': {'max_eps_range': [2.0, 2.1, 2.2, 2.3000000000000003, 2.4000000000000004, 2.5000000000000004, 2.6000000000000005, 2.7000000000000006, 2.8000000000000007, 2.900000000000001, 3.000000000000001, 3.100000000000001, 3.200000000000001, 3.300000000000001, 3.4000000000000012, 3.5000000000000013, 3.6000000000000014, 3.7000000000000015, 3.8000000000000016, 3.9000000000000017, 4.000000000000002, 4.100000000000001, 4.200000000000002, 4.3000000000000025, 4.400000000000002, 4.500000000000002, 4.60000000

In [29]:
print(optimization_results)
key_to_del = []
for k,v in optimization_results.items():
    if k not in ['silhouette', 'calinski_harabasz', 'davies_bouldin']:
        key_to_del.append(k)
for k in key_to_del:
    del optimization_results[k]
print(optimization_results)

{'silhouette': {'score': 0.6830023540258303, 'epsilon': 3.200000000000001, 'min_samples': 5, 'metric': 'chebyshev', 'cluster_method': 'dbscan', 'xi': None}, 'calinski_harabasz': {'score': 1037.1553423234657, 'epsilon': 4.500000000000002, 'min_samples': 9, 'metric': 'euclidean', 'cluster_method': 'dbscan', 'xi': None}, 'davies_bouldin': {'score': 0.46570219632192716, 'epsilon': 4.100000000000001, 'min_samples': 5, 'metric': 'euclidean', 'cluster_method': 'dbscan', 'xi': None}, 'initialization_params': {'max_eps_range': [2.0, 2.1, 2.2, 2.3000000000000003, 2.4000000000000004, 2.5000000000000004, 2.6000000000000005, 2.7000000000000006, 2.8000000000000007, 2.900000000000001, 3.000000000000001, 3.100000000000001, 3.200000000000001, 3.300000000000001, 3.4000000000000012, 3.5000000000000013, 3.6000000000000014, 3.7000000000000015, 3.8000000000000016, 3.9000000000000017, 4.000000000000002, 4.100000000000001, 4.200000000000002, 4.3000000000000025, 4.400000000000002, 4.500000000000002, 4.60000000

In [30]:
import numpy as np
import json

# Function to recursively convert numpy arrays and numpy types to Python lists and floats
def convert_numpy_to_json_compatible(obj):
    if isinstance(obj, dict):
        return {k: convert_numpy_to_json_compatible(v) for k, v in obj.items()}
    elif isinstance(obj, list):
        return [convert_numpy_to_json_compatible(item) for item in obj]
    elif isinstance(obj, np.ndarray):
        return obj.tolist()
    elif isinstance(obj, (np.float64, np.float32, np.int64, np.int32)):
        return obj.item()
    else:
        return obj

optimization_results['initialization_params'] = {
    'max_eps_range': max_eps_range.tolist(),
    'min_samples_range': min_samples_range.tolist(),
    'metrics': metrics,
    'xis': xis.tolist(),
    'cluster_methods': cluster_methods
}

# Apply the conversion to make the data JSON-compatible
optimization_results_json_compatible = convert_numpy_to_json_compatible(optimization_results)


# Save to JSON
file_path = f'{code_root_dir}/src/models/OUTLIER_DETECTION/OPTICS_MODELS/{intersection_name}_optics_optimized_params.json'
with open(file_path, 'w') as f:
    json.dump(optimization_results_json_compatible, f, indent=4)
