In [None]:
# PASTE THIS TO THE FIRST CELL OF THE NOTEBOOK IN ORDER TO HAVE WORKING IMPORTS
import sys
import os
# Resolve the project root relative to the directory the notebook runs from.
current_dir = os.getcwd()
parent_parent_dir = os.path.abspath(os.path.join(current_dir, '../../..')) # tweak so that you get the root project folder

# Guard the append so that re-running this cell does not stack duplicate
# entries of the same folder on sys.path.
if parent_parent_dir not in sys.path:
    sys.path.append(parent_parent_dir)

In [None]:
from sklearn.cluster import OPTICS
import numpy as np
import pandas as pd
import ast

## data import

In [None]:
from src.features.get_x_y_tuple_list import get_x_y_tuple_list
from src.features.get_first_and_last_x_y_coordinates import get_first_and_last_x_y_coordinates 
from src.features.get_first_x_y_coordinates import get_first_x_y_coordinates
from src.features.get_last_x_y_coordinates import get_last_x_y_coordinates
def import_data(df_path):
    """Load a CUID table plus its grouped companion file and derive coordinate arrays.

    The grouped file is expected next to ``df_path`` with ``_grouped`` inserted
    before the trailing ``.csv`` (``foo.csv`` -> ``foo_grouped.csv``).

    Parameters
    ----------
    df_path : str
        Path of the raw CSV file.

    Returns
    -------
    tuple
        ``(df_cuid, df_cuid_grouped, X, first_coordinates, last_coordinates)``.
        The two data frames are the raw and grouped tables; the three arrays
        are the transposed x/y sequences returned by the ``src.features``
        helpers (presumably one row per coordinate pair -- confirm against
        those helpers).
    """
    df_cuid = pd.read_csv(df_path)

    # Rewrite only the trailing extension: a plain str.replace would also
    # alter any earlier '.csv' occurring in a directory or file name.
    csv_suffix = '.csv'
    if df_path.endswith(csv_suffix):
        df_cuid_grouped_path = df_path[:-len(csv_suffix)] + '_grouped' + csv_suffix
    else:
        # Previous behaviour, kept for paths that do not end in '.csv'.
        df_cuid_grouped_path = df_path.replace(csv_suffix, '_grouped' + csv_suffix)
    df_cuid_grouped = pd.read_csv(df_cuid_grouped_path)

    # 1.1 CONVERT FEATURES TO NUMBERS
    # The 'x' and 'y' columns hold Python literals serialised as text.
    for column in ('x', 'y'):
        df_cuid_grouped[column] = df_cuid_grouped[column].apply(ast.literal_eval)

    list_x_y_tuples = get_x_y_tuple_list(df_cuid_grouped, ['x', 'y'])

    first_last_x_coords, first_last_y_coords = get_first_and_last_x_y_coordinates(list_x_y_tuples)
    X = np.array([first_last_x_coords, first_last_y_coords]).T

    first_x_coords, first_y_coords = get_first_x_y_coordinates(list_x_y_tuples)
    first_coordinates = np.array([first_x_coords, first_y_coords]).T

    last_x_coords, last_y_coords = get_last_x_y_coordinates(list_x_y_tuples)
    last_coordinates = np.array([last_x_coords, last_y_coords]).T

    return df_cuid, df_cuid_grouped, X, first_coordinates, last_coordinates
        

In [None]:
# Relative locations of the processed CUID tables, one per data set.
df_path_k729_2022 = '../../../data/processed/k729_2022_cuid.csv'
df_path_k733_2020 = '../../../data/processed/k733_2020_cuid.csv'
df_path_k733_2018 = '../../../data/processed/k733_2018_cuid.csv'

# Each call yields: raw frame, grouped frame, first+last coordinates,
# first coordinates, last coordinates.
(
    df_cuid_k729_2022,
    df_cuid_grouped_k729_2022,
    X_k729_2022,
    first_coordinates_k729_2022,
    last_coordinates_k729_2022,
) = import_data(df_path_k729_2022)

(
    df_cuid_k733_2020,
    df_cuid_grouped_k733_2020,
    X_k733_2020,
    first_coordinates_k733_2020,
    last_coordinates_k733_2020,
) = import_data(df_path_k733_2020)

(
    df_cuid_k733_2018,
    df_cuid_grouped_k733_2018,
    X_k733_2018,
    first_coordinates_k733_2018,
    last_coordinates_k733_2018,
) = import_data(df_path_k733_2018)

In [None]:
from scipy.cluster.hierarchy import linkage, cophenet
# Validate the input before it is handed to the estimator
print(np.any(np.isnan(X_k729_2022)))  # Should print False
print(np.any(np.isinf(X_k729_2022)))  # Should print False

# Apply OPTICS clustering
optics = OPTICS(min_samples=5, max_eps=5.0, cluster_method='dbscan', metric='euclidean')
optics.fit(X_k729_2022)

# Inspect the reachability distances
# True is expected here: the first point of the ordering (and any point not
# reachable within max_eps) keeps an infinite reachability distance.
print(np.any(np.isinf(optics.reachability_)))
print(np.any(np.isnan(optics.reachability_)))  # Should print False


# Replace inf values in reachability distances with the largest finite value
reachability = optics.reachability_.copy()
reachability[np.isinf(reachability)] = np.max(reachability[np.isfinite(reachability)])

# Generate linkage matrix
# reachability_ is indexed by sample, so leaf i of the hierarchy is sample i.
# Permuting it with optics.ordering_ would relabel the leaves and make the
# cophenetic distances incomparable with pdist(X), which uses sample order.
linkage_matrix = linkage(reachability.reshape(-1, 1), method='single')


In [None]:
from scipy.spatial.distance import pdist
# Compute the pairwise distance matrix (condensed form, sample order)
pairwise_distances = pdist(X_k729_2022)

# Calculate cophenetic correlation coefficient
# cophenet(Z, Y) returns the correlation FIRST and the condensed cophenetic
# distance matrix second.
coph_corr, coph_dist = cophenet(linkage_matrix, pairwise_distances)

print(f"Cophenetic Correlation Coefficient: {coph_corr}")

In [None]:
from sklearn.model_selection import ParameterGrid
from scipy.cluster.hierarchy import linkage, cophenet
from scipy.spatial.distance import pdist
from sklearn.cluster import OPTICS
import numpy as np

def optimize_optics(X, param_grid):
    """Grid-search OPTICS parameters, scoring each fit by cophenetic correlation.

    For every parameter combination an OPTICS model is fitted, a single-linkage
    hierarchy is built on the (finite) reachability distances, and the
    cophenetic correlation between that hierarchy and the pairwise distances
    of ``X`` is used as the score.

    Parameters
    ----------
    X : array-like
        Samples to cluster; passed to ``OPTICS.fit`` and ``pdist``.
    param_grid : dict
        Grid accepted by ``ParameterGrid``; every combination must provide
        the keys ``'min_samples'`` and ``'max_eps'``.

    Returns
    -------
    tuple
        ``(best_params, best_coph_corr)``. ``best_params`` is ``None`` and the
        score is ``-1.0`` when no combination produced a score above ``-1``.
    """
    best_coph_corr = -1.0  # Lower bound of a correlation coefficient
    best_params = None

    # The reference distances do not depend on the parameters: compute once.
    pairwise_distances = pdist(X)

    for params in ParameterGrid(param_grid):
        # Apply OPTICS with current parameters
        optics = OPTICS(min_samples=params['min_samples'], max_eps=params['max_eps'])
        optics.fit(X)

        # Replace infinite reachability distances by the largest finite one
        reachability = optics.reachability_.copy()
        finite_mask = np.isfinite(reachability)
        if not finite_mask.any():
            # No finite reachability at all: there is nothing to build a
            # hierarchy from, so this combination cannot be scored.
            continue
        reachability[np.isinf(reachability)] = reachability[finite_mask].max()

        # Keep the sample order (reachability_ is indexed by sample) so that
        # leaf i of the hierarchy matches sample i in pairwise_distances.
        linkage_matrix = linkage(reachability.reshape(-1, 1), method='single')

        # cophenet(Z, Y) returns (correlation, condensed cophenetic distances)
        coph_corr, _ = cophenet(linkage_matrix, pairwise_distances)

        # A NaN correlation compares False and is therefore never selected.
        if coph_corr > best_coph_corr:
            best_coph_corr = float(coph_corr)
            best_params = params

    return best_params, best_coph_corr

# Candidate values explored by the grid search
param_grid = dict(
    min_samples=[5, 10, 15],
    max_eps=[3.0, 4.0, 5.0],
)

# Search the grid on the k729 (2022) coordinates
best_params, best_coph_corr = optimize_optics(X=X_k729_2022, param_grid=param_grid)

print(f"Best parameters: {best_params}")
print(f"Best Cophenetic Correlation Coefficient: {best_coph_corr}")


In [None]:
import matplotlib.pyplot as plt
import numpy as np

def plot_clusters_with_optimized_params(X, best_params):
    """Fit OPTICS with the given parameters and scatter-plot the resulting clusters.

    Parameters
    ----------
    X : numpy.ndarray
        Samples to cluster. Columns 0 and 1 are used as plot coordinates and
        rows are selected with a boolean mask.
    best_params : dict
        Must provide the keys ``'min_samples'`` and ``'max_eps'``.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    # Run OPTICS with the best parameters
    optics = OPTICS(min_samples=best_params['min_samples'], max_eps=best_params['max_eps'])
    optics.fit(X)

    # Extract cluster labels (-1 indicates noise points)
    labels = optics.labels_
    unique_labels = set(labels)

    # Number of clusters in labels, ignoring noise if present.
    n_clusters_ = len(unique_labels) - (1 if -1 in labels else 0)

    # plt.get_cmap is used because the plt.cm.get_cmap accessor is no longer
    # available in recent matplotlib releases.
    colors = plt.get_cmap("tab10", n_clusters_ + 1)

    # Generate a scatter plot, one series per label
    plt.figure(figsize=(8, 6))
    for k in unique_labels:
        if k == -1:
            # Black color for noise
            color = (0, 0, 0, 1)
            series_name = 'Noise'
        else:
            # n_clusters_ is at least 1 whenever a non-noise label exists
            color = tuple(colors(k / n_clusters_))
            series_name = f'Cluster {k}'

        class_member_mask = (labels == k)
        xy = X[class_member_mask]
        plt.plot(xy[:, 0], xy[:, 1], 'o', color=color, markersize=6, label=series_name)

    plt.title(f'OPTICS Clustering with Best Params: min_samples={best_params["min_samples"]}, max_eps={best_params["max_eps"]}')
    plt.xlabel("Feature 1")
    plt.ylabel("Feature 2")
    plt.show()

# Visualise the clustering obtained with the grid-search parameters
plot_clusters_with_optimized_params(X=X_k729_2022, best_params=best_params)


In [None]:
from src.models.optics.optimize_optics_parallelized_CCC import optimize_optics_parallelized_CCC
# Search space handed to the parallelised optimiser
max_eps_range = np.arange(1, 11)
min_samples_range = np.arange(5, 15)
metrics = [
    'euclidean', 'manhattan', 'chebyshev', 'mahalanobis',
    'minkowski', 'seuclidean', 'sqeuclidean',
]
xis = np.arange(0.01, 0.1, 0.01)
cluster_methods = ['dbscan', 'xi']

optimization_results = optimize_optics_parallelized_CCC(
    X=X_k729_2022,
    max_eps_range=max_eps_range,
    min_samples_range=min_samples_range,
    metrics=metrics,
    xis=xis,
    cluster_methods=cluster_methods,
)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

def plot_clusters_with_optimized_params_2(X, best_params):
    """Fit OPTICS from an optimiser result dict and scatter-plot the clusters.

    Parameters
    ----------
    X : numpy.ndarray
        Samples to cluster. Columns 0 and 1 are used as plot coordinates and
        rows are selected with a boolean mask.
    best_params : dict
        Must provide ``'best_min_samples'``, ``'best_epsilon'``,
        ``'best_cluster_method'``, ``'best_metric'`` and ``'best_kwargs'``.
        ``'best_xi'`` is read only when the cluster method is not ``'dbscan'``.

    Returns
    -------
    None
        The set of labels is printed and the figure is shown with ``plt.show()``.
    """
    # Run OPTICS with the best parameters
    optics_kwargs = dict(
        min_samples=best_params['best_min_samples'],
        max_eps=best_params['best_epsilon'],
        cluster_method=best_params['best_cluster_method'],
        metric=best_params['best_metric'],
        metric_params=best_params['best_kwargs'],
    )
    if best_params['best_cluster_method'] != 'dbscan':
        # xi is only forwarded for the non-dbscan extraction method
        optics_kwargs['xi'] = best_params['best_xi']
    optics = OPTICS(**optics_kwargs)
    optics.fit(X)

    # Extract cluster labels (-1 indicates noise points)
    labels = optics.labels_
    unique_labels = set(labels)
    print(unique_labels)

    # Number of clusters in labels, ignoring noise if present.
    n_clusters_ = len(unique_labels) - (1 if -1 in labels else 0)

    # plt.get_cmap is used because the plt.cm.get_cmap accessor is no longer
    # available in recent matplotlib releases.
    colors = plt.get_cmap("tab10", n_clusters_ + 1)

    # Generate a scatter plot, one series per label
    plt.figure(figsize=(8, 6))
    for k in unique_labels:
        if k == -1:
            # Black color for noise
            color = (0, 0, 0, 1)
            series_name = 'Noise'
        else:
            # n_clusters_ is at least 1 whenever a non-noise label exists
            color = tuple(colors(k / n_clusters_))
            series_name = f'Cluster {k}'

        class_member_mask = (labels == k)
        xy = X[class_member_mask]
        plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=color, markersize=6, label=series_name)

    plt.title(f'OPTICS Clustering with Best Params: min_samples={best_params["best_min_samples"]}, max_eps={best_params["best_epsilon"]}')
    plt.legend(loc='best')
    plt.xlabel("Feature 1")
    plt.ylabel("Feature 2")
    plt.show()


In [None]:
# Take the 'optimized' entry of the optimiser result, plot it, then show it
best_params = optimization_results['optimized']
plot_clusters_with_optimized_params_2(X=X_k729_2022, best_params=best_params)
print(best_params)