# Human Activity Recognition Dataset

In [193]:
from aeon.datasets import load_from_tsfile
import numpy as np # for some mathematical operations
def load_data(DATA_PATH,t):
    if t=='train':
        train_x, train_y = load_from_tsfile(DATA_PATH + "/MotionSenseHAR/MotionSenseHAR_TRAIN.ts")
        return [train_x,train_y]
    elif t=='test':
        test_x, test_y = load_from_tsfile(DATA_PATH + "/MotionSenseHAR/MotionSenseHAR_TEST.ts")
        return [test_x, test_y]

In [194]:
#Test the load_data function
[data, data_y] = load_data(r"C:\Users\pappe\OneDrive - IPH Hannover gGmbH\General\Privat\Studienarbeit_Karla\Studienarbeit_Code\Studienarbeit_Code\datasets",'train')  # 'path', 'test'/'train'

In [195]:
print('First time series:')
print(data[0],"\n")
print('First target:')
print(data_y[0],"\n")
print('Shape of train dataset:') #(samples, features,timestamps) 
print(np.shape(data))

First time series:
[[ 1.528132  1.527992  1.527765 ...  1.242636  1.172438  1.088215]
 [-0.733896 -0.716987 -0.706999 ... -0.606728 -0.604834 -0.590965]
 [ 0.696372  0.677762  0.670951 ... -2.673003 -2.664214 -2.658315]
 ...
 [ 0.294894  0.219405  0.010714 ... -0.178695  0.016807  0.549265]
 [-0.184493  0.035846  0.134701 ...  0.007637  0.077783  0.142422]
 [ 0.377542  0.114866 -0.167808 ...  0.353873  0.264784 -0.060651]] 

First target:
dws 

Shape of train dataset:
(966, 12, 1000)


In [196]:
from tslearn.preprocessing import TimeSeriesScalerMeanVariance

def standarize(data):
    scaler = TimeSeriesScalerMeanVariance(mu=0.0, std=1.0)  # Standardize to mean=0, std=1
    scaled_data_3d = scaler.fit_transform(data)
    return scaled_data_3d
    

In [197]:
# Transform from (samples, features,timestamps) to (samples, timestamps, features) to apply standarisation
reshaped_data = np.transpose(data, (0, 2, 1))
print("Original shape:", data.shape)          # (instances, features, timepoints)
print("Reshaped shape:", reshaped_data.shape) # (instances, timepoints, features)

Original shape: (966, 12, 1000)
Reshaped shape: (966, 1000, 12)


In [198]:
scaled_data=standarize(reshaped_data)
# scaled_data
print('First time series:')
print(scaled_data[0],"\n")

First time series:
[[ 6.51829386e-01 -1.67061485e-01  7.02820502e-01 ...  6.56908619e-01
  -6.39617182e-01  9.59936638e-01]
 [ 6.51252682e-01 -6.17606338e-02  6.92783970e-01 ...  4.37960751e-01
   1.43309164e-01  1.46918625e-01]
 [ 6.50317597e-01  4.39667952e-04  6.89110739e-01 ... -1.67325443e-01
   4.94568741e-01 -7.27995923e-01]
 ...
 [-5.24219044e-01  6.24877639e-01 -1.11431238e+00 ... -7.16686233e-01
   4.30746647e-02  8.86677858e-01]
 [-8.13386835e-01  6.36672530e-01 -1.10957240e+00 ... -1.49653340e-01
   2.92323102e-01  6.10935263e-01]
 [-1.16032804e+00  7.23041771e-01 -1.10639102e+00 ...  1.39468483e+00
   5.22003622e-01 -3.96330406e-01]] 



## Test 1: intrinsic metrics before vs. after Feature selection

### Before FS:
* Representation Entropy correlation based
* Variance
* Redundancy Rate RED

TO DO:
* Information Gain Ratio

In [199]:
# Representation Entropy
import numpy as np
from scipy.linalg import eigh

def compute_representation_entropy(data):
    """
    Compute Representation Entropy (RE) of a multivariate dataset.
    
    Args:
        data (numpy.ndarray): 2D-Data with shape (samples, features).
    
    Returns:
        float: Representation Entropy (RE).
    """
    # Step 1: Compute the covariance matrix of the dataset (features x features)
    covariance_matrix = np.cov(data, rowvar=False)  # rowvar=False means variables are columns

    # # Step 2: Compute eigenvalues of the covariance matrix
    # eigenvalues = np.linalg.eigvals(covariance_matrix) THIS METHOD WAS REPLACED BC OF INSTABILITY
    eigenvalues, eigenvectors = eigh(covariance_matrix)

    # Step 3: Normalize the eigenvalues to act as probabilities
    eigenvalues_sum = np.sum(eigenvalues)
    normalized_eigenvalues = eigenvalues / eigenvalues_sum

    # Step 4: Compute Representation Entropy using the formula
    representation_entropy = -np.sum(normalized_eigenvalues * np.log(normalized_eigenvalues))

    return representation_entropy

In [200]:
#Need to flatten the Time Dimension
data_flattened = scaled_data.reshape(-1, scaled_data.shape[2])

In [201]:
np.shape(data_flattened)

(966000, 12)

In [202]:
compute_representation_entropy(data_flattened)

2.332264108310466

In [203]:
import pandas as pd

# calculate variance
overall_variance = data_flattened.var().mean()
print('overall variance:', overall_variance)

# Compute the correlation matrix
corr_matrix = pd.DataFrame(data_flattened).corr().abs()
# Calculate average absolute correlation (excluding the diagonal)
avg_corr = (corr_matrix.values.sum() - len(corr_matrix)) / (len(corr_matrix) * (len(corr_matrix) - 1))
redundancy_rate = avg_corr
print("Redundancy Rate (Correlation-Based):", redundancy_rate)

overall variance: 0.9999999999999986
Redundancy Rate (Correlation-Based): 0.07660471326380351


In [204]:
# Load TEST dataset
[TESTdata, TESTdata_y] = load_data(r"C:\Users\pappe\OneDrive - IPH Hannover gGmbH\General\Privat\Studienarbeit_Karla\Studienarbeit_Code\Studienarbeit_Code\datasets",'test')  # 'path', 'test'/'train'

In [205]:
#Preprocess the TEST data

reshaped_TESTdata = np.transpose(TESTdata, (0, 2, 1))
print("Original shape:", data.shape)          # (instances, features, timepoints)
print("Reshaped shape:", reshaped_data.shape) # (instances, timepoints, features)

Original shape: (966, 12, 1000)
Reshaped shape: (966, 1000, 12)


In [206]:
scaled_TESTdata=standarize(reshaped_TESTdata)

In [207]:
# Compute all intrinsic metrics again for original data_TEST

# Compute representation entropy
before_TESTdata_flattened = scaled_TESTdata.reshape(-1, scaled_TESTdata.shape[2])
before_representation_entropy = compute_representation_entropy(before_TESTdata_flattened)
print('before Representation entropy: ', before_representation_entropy)

# calculate variance
before_overall_variance = before_TESTdata_flattened.var().mean()
print('before overall variance:', before_overall_variance)

# Compute the correlation matrix
before_corr_matrix = pd.DataFrame(before_TESTdata_flattened).corr().abs()
# Calculate average absolute correlation (excluding the diagonal)
before_avg_corr = (before_corr_matrix.values.sum() - len(before_corr_matrix)) / (len(before_corr_matrix) * (len(before_corr_matrix) - 1))
before_redundancy_rate = before_avg_corr
print("before Redundancy Rate (Correlation-Based):", before_redundancy_rate)

before Representation entropy:  2.333380847639658
before overall variance: 0.9999999999999997
before Redundancy Rate (Correlation-Based): 0.07103940208190185


In [208]:
np.shape(before_TESTdata_flattened)

(265000, 12)

### Feature Selection 1: CLeVer Hybrid

In [209]:
#Improved Code
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import linkage, fcluster
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import os
os.environ["OMP_NUM_THREADS"] = "1"

def compute_dcpcs(data, variance_threshold=0.8):
    """
    Compute Descriptive Common Principal Components (DCPCs) for a set of multivariate time series.

    Parameters:
    - data: ndarray of shape (num_samples, num_features, time_steps), the time-series dataset.
    - variance_threshold: float, the cumulative variance explained to determine the number of PCs.

    Returns:
    - dcpc_loadings: ndarray of shape (num_dcpcs, num_features), loadings of the DCPCs.
    """
    num_samples, num_features, time_steps = data.shape

    # Step 1: Compute PCs for each MTS item
    pc_matrices = []  # Store PC loadings for each sample
    for sample in range(num_samples):
        # Compute correlation matrix for each sample
        correlation_matrix = np.corrcoef(data[sample])
        # Perform PCA on the correlation matrix
        pca = PCA()
        pca.fit(correlation_matrix)
        pc_matrices.append(pca.components_[:pca.n_components_])

    # Step 2: Compute DCPCs across all samples using SVD
    all_pc_matrices = np.concatenate(pc_matrices, axis=0)  # Combine PC loadings from all samples
    dcpc_covariance = all_pc_matrices.T @ all_pc_matrices
    eigvals, eigvecs = np.linalg.eigh(dcpc_covariance)
    sorted_indices = np.argsort(eigvals)[::-1]
    eigvecs = eigvecs[:, sorted_indices]  # Sort eigenvectors by eigenvalues

    # Select DCPCs explaining the desired variance threshold
    cumulative_variance = np.cumsum(eigvals[sorted_indices]) / np.sum(eigvals)
    num_dcpcs = np.searchsorted(cumulative_variance, variance_threshold) + 1
    dcpc_loadings = eigvecs[:, :num_dcpcs].T

    return dcpc_loadings

def cluster_features(dcpc_loadings, n_clusters):
    """
    Cluster features based on their DCPC loadings using K-means.

    Parameters:
    - dcpc_loadings: ndarray of shape (num_dcpcs, num_features), loadings of the DCPCs.
    - n_clusters: int, number of clusters.

    Returns:
    - cluster_labels: ndarray of shape (num_features,), cluster assignments for each feature.
    """
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    cluster_labels = kmeans.fit_predict(dcpc_loadings.T)
    return cluster_labels

def rank_features(dcpc_loadings, cluster_labels):
    """
    Rank features within each cluster based on their contribution to DCPCs.

    Parameters:
    - dcpc_loadings: ndarray of shape (num_dcpcs, num_features), loadings of the DCPCs.
    - cluster_labels: ndarray of shape (num_features,), cluster assignments for each feature.

    Returns:
    - ranked_features: dict, keys are cluster labels, values are ranked feature indices.
    """
    ranked_features = {}
    for cluster in np.unique(cluster_labels):
        cluster_indices = np.where(cluster_labels == cluster)[0]
        cluster_loadings = dcpc_loadings[:, cluster_indices]
        scores = np.linalg.norm(cluster_loadings, axis=0)  # L2 norm of loadings
        ranking = cluster_indices[np.argsort(scores)[::-1]]  # Sort by descending contribution
        ranked_features[cluster] = ranking
    return ranked_features

def select_top_features(ranked_features, top_n=1):
    """
    Select top-ranked features from each cluster.

    Parameters:
    - ranked_features: dict, keys are cluster labels, values are ranked feature indices.
    - top_n: int, number of features to select from each cluster.

    Returns:
    - selected_features: list, indices of selected features.
    """
    selected_features = []
    for features in ranked_features.values():
        selected_features.extend(features[:top_n])
    return selected_features

def clever_hybrid(data, variance_threshold=0.8, n_clusters=None, top_n=1):
    """
    Perform feature selection using the CLeVer-Hybrid algorithm.

    Parameters:
    - data: ndarray of shape (num_samples, num_features, time_steps), the time-series dataset.
    - variance_threshold: float, variance threshold for selecting DCPCs.
    - n_clusters: int, number of clusters (if None, sqrt of num_features is used).
    - top_n: int, number of features to select from each cluster.

    Returns:
    - selected_features: list, indices of selected features.
    """
    num_samples, num_features, _ = data.shape
    if n_clusters is None:
        n_clusters = int(np.sqrt(num_features))

    # Step 1: Compute DCPCs
    dcpc_loadings = compute_dcpcs(data, variance_threshold)

    # Step 2: Cluster features based on DCPC loadings
    cluster_labels = cluster_features(dcpc_loadings, n_clusters)

    # Step 3: Rank features within clusters
    ranked_features = rank_features(dcpc_loadings, cluster_labels)

    # Step 4: Select top features from each cluster
    selected_features = select_top_features(ranked_features, top_n)

    return selected_features

In [210]:
scaled_data_ift= np.transpose(scaled_data, (0, 2, 1))
np.shape(scaled_data_ift)

(966, 12, 1000)

In [211]:
selected_features_CLeVerH=clever_hybrid(scaled_data_ift, n_clusters=4, top_n=1)
print("Selected features CLeVer Hybrid: ", selected_features_CLeVerH)

Selected features CLeVer Hybrid:  [7, 11, 10, 0]


In [212]:
np.shape(scaled_TESTdata)

(265, 1000, 12)

In [213]:
# Filter the TEST Dataset according to the selected features from CLeVer
selected_TESTdata_CLeVerH = scaled_TESTdata[:, :, selected_features_CLeVerH]
print('Filtered TEST dataset shape: ', np.shape(selected_TESTdata_CLeVerH))

Filtered TEST dataset shape:  (265, 1000, 4)


In [214]:
# Compute all intrinsic metrics again for selected_TESTdata_CLeVerH

# Compute representation entropy
CLEVERH_TESTdata_flattened = selected_TESTdata_CLeVerH.reshape(-1, selected_TESTdata_CLeVerH.shape[2])
CLEVERH_representation_entropy = compute_representation_entropy(CLEVERH_TESTdata_flattened)
print('CLEVER Hybrid Representation entropy: ', CLEVERH_representation_entropy)

# calculate variance
CLEVERH_overall_variance = CLEVERH_TESTdata_flattened.var().mean()
print('CLEVER Hybrid overall variance:', CLEVERH_overall_variance)

# Compute the correlation matrix
CLEVERH_corr_matrix = pd.DataFrame(CLEVERH_TESTdata_flattened).corr().abs()
# Calculate average absolute correlation (excluding the diagonal)
CLEVERH_avg_corr = (CLEVERH_corr_matrix.values.sum() - len(CLEVERH_corr_matrix)) / (len(CLEVERH_corr_matrix) * (len(CLEVERH_corr_matrix) - 1))
CLEVERH_redundancy_rate = CLEVERH_avg_corr
print("CLEVER Hybrid Redundancy Rate (Correlation-Based):", CLEVERH_redundancy_rate)

CLEVER Hybrid Representation entropy:  1.3824341826604127
CLEVER Hybrid overall variance: 1.0000000000000002
CLEVER Hybrid Redundancy Rate (Correlation-Based): 0.037143214580118634


In [215]:
np.shape(CLEVERH_TESTdata_flattened)

(265000, 4)

### Feature Selection CLeVer CLUSTERING

In [216]:
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from scipy.spatial.distance import pdist, squareform
import numpy as np
import matplotlib.pyplot as plt
os.environ["OMP_NUM_THREADS"] = "1"

# def compute_dcpcs(data, variance_threshold=0.8):
#     """
#     Compute Descriptive Common Principal Components (DCPCs).

#     Parameters:
#     - data: np.array of shape (samples, features, time_steps)
#     - variance_threshold: Minimum variance explained by selected PCs

#     Returns:
#     - dcpc_loadings: Matrix of DCPC loadings for features
#     """
#     num_samples, num_features, time_steps = data.shape

#     # Step 1: Perform PCA for each sample's time-series data
#     pc_matrices = []  # Store the PC loadings for all samples
#     for sample in range(num_samples):
#         sample_data = data[sample]  # Shape: (features, time_steps)
#         pca = PCA()
#         pca.fit(sample_data.T)
        
#         # Select the number of PCs to retain (based on variance threshold)
#         cum_variance = np.cumsum(pca.explained_variance_ratio_)
#         num_pcs = np.searchsorted(cum_variance, variance_threshold) + 1
#         pc_matrices.append(pca.components_[:num_pcs])  # Retain only the top PCs

#     # Step 2: Compute the DCPC loadings (common across all samples)
#     H = np.zeros((num_features, num_features))
#     for pc_matrix in pc_matrices:
#         H += pc_matrix.T @ pc_matrix

#     eigvals, eigvecs = np.linalg.eigh(H)  # Eigen decomposition
#     eigvecs = eigvecs[:, ::-1]  # Sort eigenvectors in descending order of eigenvalues

#     return eigvecs.T  # DCPC loadings (features x components)

def clever_cluster(data, n_clusters=None, variance_threshold=0.8):
    """
    CLeVer-Cluster implementation.

    Parameters:
    - data: np.array of shape (samples, features, time_steps)
    - n_clusters: Number of feature clusters (if None, heuristic is used)
    - variance_threshold: Minimum variance explained by selected PCs

    Returns:
    - selected_features: List of representative feature indices for each cluster
    """
    num_samples, num_features, time_steps = data.shape

    # Step 1: Compute DCPC loadings
    dcpc_loadings = compute_dcpcs(data, variance_threshold=variance_threshold)  # Shape: (components, features)

    # Step 2: Transpose DCPC loadings to cluster features
    feature_embeddings = dcpc_loadings.T  # Shape: (features, components)

    # Step 3: Determine number of clusters
    if n_clusters is None:
        n_clusters = int(np.sqrt(num_features))  # Heuristic for cluster count

    # Step 4: Perform K-means clustering on DCPC loadings
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    cluster_labels = kmeans.fit_predict(feature_embeddings)

    # Step 5: Select representative features (closest to cluster centroids)
    selected_features = []
    for cluster in range(n_clusters):
        cluster_indices = np.where(cluster_labels == cluster)[0]
        centroid = kmeans.cluster_centers_[cluster]

        # Find the feature closest to the centroid
        distances = np.linalg.norm(feature_embeddings[cluster_indices] - centroid, axis=1)
        representative_feature = cluster_indices[np.argmin(distances)]
        selected_features.append(representative_feature)

    return selected_features


In [217]:
#  Usage
# Assuming `data` is a NumPy array with shape (samples, features, time_steps)
selected_features_CLeVerC = clever_cluster(scaled_data_ift, n_clusters=4)
print(f'CLeVer Cluster Selected feature indices: {selected_features_CLeVerC}')

CLeVer Cluster Selected feature indices: [7, 4, 10, 0]


In [218]:
np.shape(scaled_TESTdata)

(265, 1000, 12)

In [219]:
# Filter the TEST Dataset according to the selected features from CLeVer
selected_TESTdata_CLeVerC = scaled_TESTdata[:, :, selected_features_CLeVerC]
print('Filtered TEST dataset shape: ', np.shape(selected_TESTdata_CLeVerC))

Filtered TEST dataset shape:  (265, 1000, 4)


In [220]:
# Compute all intrinsic metrics again for selected_TESTdata_CLeVerC

# Compute representation entropy
CLEVERC_TESTdata_flattened = selected_TESTdata_CLeVerC.reshape(-1, selected_TESTdata_CLeVerC.shape[2])
CLEVERC_representation_entropy = compute_representation_entropy(CLEVERC_TESTdata_flattened)
print('CLEVER Cluster Representation entropy: ', CLEVERC_representation_entropy)

# calculate variance
CLEVERC_overall_variance = CLEVERC_TESTdata_flattened.var().mean()
print('CLEVER Cluster overall variance:', CLEVERC_overall_variance)

# Compute the correlation matrix
CLEVERC_corr_matrix = pd.DataFrame(CLEVERC_TESTdata_flattened).corr().abs()
# Calculate average absolute correlation (excluding the diagonal)
CLEVERC_avg_corr = (CLEVERC_corr_matrix.values.sum() - len(CLEVERC_corr_matrix)) / (len(CLEVERC_corr_matrix) * (len(CLEVERC_corr_matrix) - 1))
CLEVERC_redundancy_rate = CLEVERC_avg_corr
print("CLEVER Cluster Redundancy Rate (Correlation-Based):", CLEVERC_redundancy_rate)

CLEVER Cluster Representation entropy:  1.381493946362083
CLEVER Cluster overall variance: 1.0000000000000002
CLEVER Cluster Redundancy Rate (Correlation-Based): 0.04656726879094227


### Feature Selection CLeVer Ranking

In [221]:
import numpy as np
from sklearn.decomposition import PCA

# def compute_dcpcs(data, variance_threshold=0.8):
#     """
#     Compute Descriptive Common Principal Components (DCPCs).

#     Parameters:
#     - data: np.array of shape (samples, features, time_steps)
#     - variance_threshold: Minimum variance explained by selected PCs

#     Returns:
#     - dcpc_loadings: Matrix of DCPC loadings for features
#     """
#     num_samples, num_features, time_steps = data.shape

#     # Step 1: Perform PCA for each sample's time-series data
#     pc_matrices = []  # Store the PC loadings for all samples
#     for sample in range(num_samples):
#         sample_data = data[sample]  # Shape: (features, time_steps)
#         pca = PCA()
#         pca.fit(sample_data.T)
        
#         # Select the number of PCs to retain (based on variance threshold)
#         cum_variance = np.cumsum(pca.explained_variance_ratio_)
#         num_pcs = np.searchsorted(cum_variance, variance_threshold) + 1
#         pc_matrices.append(pca.components_[:num_pcs])  # Retain only the top PCs

#     # Step 2: Compute the DCPC loadings (common across all samples)
#     H = np.zeros((num_features, num_features))
#     for pc_matrix in pc_matrices:
#         H += pc_matrix.T @ pc_matrix

#     eigvals, eigvecs = np.linalg.eigh(H)  # Eigen decomposition
#     eigvecs = eigvecs[:, ::-1]  # Sort eigenvectors in descending order of eigenvalues

#     return eigvecs.T  # DCPC loadings (features x components)


def clever_ranking(data, num_features_to_select=5, variance_threshold=0.8):
    """
    CLeVer Ranking method for feature selection.

    Parameters:
    - data: np.array of shape (samples, features, time_steps)
    - num_features_to_select: Number of top-ranked features to select
    - variance_threshold: Variance threshold for PCA

    Returns:
    - selected_features: List of indices of the top-ranked features
    """
    # Step 1: Compute DCPC loadings
    dcpc_loadings = compute_dcpcs(data)

    # Step 2: Rank features based on their contribution to the DCPCs
    feature_scores = np.linalg.norm(dcpc_loadings, axis=1)  # L2 norm of DCPC loadings
    ranked_features = np.argsort(feature_scores)[::-1]  # Sort in descending order

    # Step 3: Select top features
    selected_features = ranked_features[:num_features_to_select]

    return selected_features


In [222]:
selected_features_CLeVerR = clever_ranking(scaled_data_ift, num_features_to_select=4)
print(f'CLeVer Rank Selected feature indices: {selected_features_CLeVerR}')

CLeVer Rank Selected feature indices: [1 8 7 6]


In [223]:
# Filter the TEST Dataset according to the selected features from CLeVer
selected_TESTdata_CLeVerR = scaled_TESTdata[:, :, selected_features_CLeVerR]
print('Filtered TEST dataset shape: ', np.shape(selected_TESTdata_CLeVerR))

Filtered TEST dataset shape:  (265, 1000, 4)


In [224]:
# Compute all intrinsic metrics again for selected_TESTdata_CLeVerR

# Compute representation entropy
CLEVERR_TESTdata_flattened = selected_TESTdata_CLeVerR.reshape(-1, selected_TESTdata_CLeVerR.shape[2])
CLEVERR_representation_entropy = compute_representation_entropy(CLEVERR_TESTdata_flattened)
print('CLEVER Cluster Representation entropy: ', CLEVERR_representation_entropy)

# calculate variance
CLEVERR_overall_variance = CLEVERR_TESTdata_flattened.var().mean()
print('CLEVER Cluster overall variance:', CLEVERR_overall_variance)

# Compute the correlation matrix
CLEVERR_corr_matrix = pd.DataFrame(CLEVERR_TESTdata_flattened).corr().abs()
# Calculate average absolute correlation (excluding the diagonal)
CLEVERR_avg_corr = (CLEVERR_corr_matrix.values.sum() - len(CLEVERR_corr_matrix)) / (len(CLEVERR_corr_matrix) * (len(CLEVERR_corr_matrix) - 1))
CLEVERR_redundancy_rate = CLEVERR_avg_corr
print("CLEVER Cluster Redundancy Rate (Correlation-Based):", CLEVERR_redundancy_rate)

CLEVER Cluster Representation entropy:  1.3789000451618967
CLEVER Cluster overall variance: 0.9999999999999998
CLEVER Cluster Redundancy Rate (Correlation-Based): 0.05296186142263156


## Test 2: Perform Timeseries-k-Means and evaluate clustering performance UNSUPERVISED

TO DO: 
Check required cluster input shape 

Clusterin evaluation Metrics:
* Silhouette 
* Davies-Bouldin Index

### Before FS

DTW Time series clsutering erfolgreich 133 min

In [225]:
from tslearn.clustering import TimeSeriesKMeans
seed = 0
np.random.seed(seed)
print("DTW k-means")
sdtw_km = TimeSeriesKMeans(n_clusters=6,
                           metric="dtw",
                           verbose=True,
                           random_state=seed)

DTW k-means


In [226]:
# y_pred = sdtw_km.fit_predict(scaled_data)

Clustering should be done on the TEST data

In [227]:
np.shape(scaled_TESTdata)

(265, 1000, 12)

thefit_predict(X, y=None)
Fit k-means clustering using X and then predict the closest cluster each time series in X belongs to.

Parameters:
Xarray-like of shape=(n_ts, sz, d)
n_ts: instance, sz:timestamps, d:features

In [228]:
before_y_pred = sdtw_km.fit_predict(scaled_TESTdata)

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.4s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    1.9s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.4s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    1.9s
[Parallel(n_jobs=1)]: Done 449 tasks      | elapsed:    4.5s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.4s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    2.1s
[Parallel(n_jobs=1)]: Done 449 tasks      | elapsed:    4.7s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.4s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    2.0s
[Parallel(n_jobs=1)]: Done 449 tasks      | elapsed:    4.6s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.4s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    2.0s
[Parallel(n_jobs=1)]: Done 449 tasks      | elapsed:    4.8s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.5s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    2.0s
[Parallel(n_jobs=1)]: Do

16085.256 --> 

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.4s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    1.8s
[Parallel(n_jobs=1)]: Done 449 tasks      | elapsed:    4.5s
[Parallel(n_jobs=1)]: Done 799 tasks      | elapsed:    8.6s
[Parallel(n_jobs=1)]: Done 1249 tasks      | elapsed:   13.1s


8800.553 --> 

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.4s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    1.9s
[Parallel(n_jobs=1)]: Done 449 tasks      | elapsed:    4.6s
[Parallel(n_jobs=1)]: Done 799 tasks      | elapsed:    9.2s
[Parallel(n_jobs=1)]: Done 1249 tasks      | elapsed:   15.3s


8689.389 --> 

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.5s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    2.1s
[Parallel(n_jobs=1)]: Done 449 tasks      | elapsed:    4.5s
[Parallel(n_jobs=1)]: Done 799 tasks      | elapsed:    7.8s
[Parallel(n_jobs=1)]: Done 1249 tasks      | elapsed:   12.9s


8646.228 --> 

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.4s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    1.9s
[Parallel(n_jobs=1)]: Done 449 tasks      | elapsed:    4.5s
[Parallel(n_jobs=1)]: Done 799 tasks      | elapsed:    8.2s
[Parallel(n_jobs=1)]: Done 1249 tasks      | elapsed:   12.8s


8646.228 --> 


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.4s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    1.9s
[Parallel(n_jobs=1)]: Done 449 tasks      | elapsed:    4.4s
[Parallel(n_jobs=1)]: Done 799 tasks      | elapsed:    7.9s
[Parallel(n_jobs=1)]: Done 1249 tasks      | elapsed:   12.9s


In [229]:
before_y_pred

array([1, 1, 2, 3, 1, 2, 1, 1, 2, 1, 1, 1, 1, 5, 5, 1, 1, 3, 4, 1, 1, 1,
       1, 1, 1, 1, 1, 2, 3, 0, 3, 0, 5, 0, 3, 3, 0, 3, 0, 0, 3, 3, 0, 5,
       0, 3, 3, 5, 3, 3, 2, 3, 0, 5, 5, 0, 3, 0, 5, 3, 5, 5, 3, 5, 5, 5,
       3, 3, 3, 3, 3, 4, 4, 2, 2, 3, 3, 3, 5, 5, 3, 5, 5, 5, 5, 5, 3, 4,
       0, 3, 3, 3, 5, 4, 4, 3, 2, 0, 0, 5, 5, 5, 5, 3, 5, 0, 3, 3, 0, 3,
       3, 3, 3, 3, 3, 3, 3, 4, 0, 3, 3, 0, 5, 2, 2, 3, 2, 4, 3, 0, 1, 0,
       5, 0, 2, 0, 4, 2, 2, 3, 3, 3, 0, 5, 2, 2, 4, 3, 3, 4, 2, 2, 3, 0,
       5, 3, 3, 3, 3, 3, 3, 2, 3, 4, 0, 5, 0, 2, 3, 2, 0, 5, 0, 3, 5, 4,
       3, 3, 3, 0, 3, 2, 0, 3, 5, 3, 2, 3, 3, 5, 5, 1, 1, 4, 2, 4, 1, 2,
       4, 4, 1, 1, 4, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 5, 1, 1, 5, 1, 1,
       4, 4, 1, 1, 1, 1, 4, 1, 2, 4, 2, 1, 1, 1, 1, 1, 1, 1, 1, 4, 4, 4,
       1, 1, 1, 1, 1, 5, 5, 1, 1, 1, 1, 1, 1, 5, 5, 1, 1, 1, 1, 1, 1, 1,
       1], dtype=int64)

In [230]:
from sklearn.metrics import silhouette_score

labels = before_y_pred  # Cluster labels from the model
# Flatten the time series for silhouette_score into (instances,timestamps*features)
scaled_TESTdata_flattened_instances = scaled_TESTdata.reshape(scaled_TESTdata.shape[0], -1)  

silhouette_avg = silhouette_score(scaled_TESTdata_flattened_instances, labels, metric='euclidean')
print(f"Silhouette Score: {silhouette_avg:.5f}")


Silhouette Score: 0.00186


In [231]:
from sklearn.metrics import davies_bouldin_score

db_index = davies_bouldin_score(scaled_TESTdata_flattened_instances, labels)
print(f"Davies-Bouldin Index: {db_index:.5f}")

Davies-Bouldin Index: 7.68617


### CLeVer Hybrid

In [232]:
np.shape(selected_TESTdata_CLeVerH)

(265, 1000, 4)

In [233]:
#Clustering
CLEVERH_y_pred = sdtw_km.fit_predict(selected_TESTdata_CLeVerH)

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.3s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    1.5s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.3s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    1.4s
[Parallel(n_jobs=1)]: Done 449 tasks      | elapsed:    3.4s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.3s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    1.5s
[Parallel(n_jobs=1)]: Done 449 tasks      | elapsed:    3.4s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.3s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    1.4s
[Parallel(n_jobs=1)]: Done 449 tasks      | elapsed:    3.2s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.3s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    1.3s
[Parallel(n_jobs=1)]: Done 449 tasks      | elapsed:    3.4s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.3s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    1.4s
[Parallel(n_jobs=1)]: Do

4163.076 --> 

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.3s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    1.5s
[Parallel(n_jobs=1)]: Done 449 tasks      | elapsed:    3.5s
[Parallel(n_jobs=1)]: Done 799 tasks      | elapsed:    6.2s
[Parallel(n_jobs=1)]: Done 1249 tasks      | elapsed:    9.9s


2478.618 --> 

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.2s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    1.4s
[Parallel(n_jobs=1)]: Done 449 tasks      | elapsed:    3.2s
[Parallel(n_jobs=1)]: Done 799 tasks      | elapsed:    5.8s
[Parallel(n_jobs=1)]: Done 1249 tasks      | elapsed:    9.1s


2441.339 --> 

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.3s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    1.5s
[Parallel(n_jobs=1)]: Done 449 tasks      | elapsed:    3.5s
[Parallel(n_jobs=1)]: Done 799 tasks      | elapsed:    6.3s
[Parallel(n_jobs=1)]: Done 1249 tasks      | elapsed:    9.7s


2431.729 --> 

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.3s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    1.4s
[Parallel(n_jobs=1)]: Done 449 tasks      | elapsed:    3.3s
[Parallel(n_jobs=1)]: Done 799 tasks      | elapsed:    6.0s
[Parallel(n_jobs=1)]: Done 1249 tasks      | elapsed:    9.5s


2431.729 --> 


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.3s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    1.7s
[Parallel(n_jobs=1)]: Done 449 tasks      | elapsed:    3.6s
[Parallel(n_jobs=1)]: Done 799 tasks      | elapsed:    6.4s
[Parallel(n_jobs=1)]: Done 1249 tasks      | elapsed:   10.1s


In [234]:
# Compute clustering metrics
labels_CH = CLEVERH_y_pred  # Cluster labels from the model
# Flatten the time series for silhouette_score into (instances,timestamps*features)
scaled_TESTdataCH_flattened_instances = selected_TESTdata_CLeVerH.reshape(selected_TESTdata_CLeVerH.shape[0], -1)  

CLEVERH_silhouette_avg = silhouette_score(scaled_TESTdataCH_flattened_instances, labels_CH, metric='euclidean')
print(f"CLEVER Hybrid Silhouette Score: {CLEVERH_silhouette_avg:.5f}")
CLEVERH_db_index = davies_bouldin_score(scaled_TESTdataCH_flattened_instances, labels_CH)
print(f"CLEVER Hybrid Davies-Bouldin Index: {CLEVERH_db_index:.5f}")

CLEVER Silhouette Score: -0.00291
CLEVER Davies-Bouldin Index: 8.45873


### CLeVer Cluster

In [180]:
#Clustering
CLEVERC_y_pred = sdtw_km.fit_predict(selected_TESTdata_CLeVerC)

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.3s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    1.4s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.3s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    1.4s
[Parallel(n_jobs=1)]: Done 449 tasks      | elapsed:    3.3s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.3s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    1.6s
[Parallel(n_jobs=1)]: Done 449 tasks      | elapsed:    3.6s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.2s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    1.3s
[Parallel(n_jobs=1)]: Done 449 tasks      | elapsed:    3.1s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.3s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    1.3s
[Parallel(n_jobs=1)]: Done 449 tasks      | elapsed:    3.1s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.3s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    1.3s
[Parallel(n_jobs=1)]: Do

4061.476 --> 

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.3s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    1.4s
[Parallel(n_jobs=1)]: Done 449 tasks      | elapsed:    3.4s
[Parallel(n_jobs=1)]: Done 799 tasks      | elapsed:    6.2s
[Parallel(n_jobs=1)]: Done 1249 tasks      | elapsed:    9.9s


2322.929 --> 

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.3s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    1.4s
[Parallel(n_jobs=1)]: Done 449 tasks      | elapsed:    3.2s
[Parallel(n_jobs=1)]: Done 799 tasks      | elapsed:    6.0s
[Parallel(n_jobs=1)]: Done 1249 tasks      | elapsed:    9.3s


2251.083 --> 

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.2s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    1.2s
[Parallel(n_jobs=1)]: Done 449 tasks      | elapsed:    3.1s
[Parallel(n_jobs=1)]: Done 799 tasks      | elapsed:    5.5s
[Parallel(n_jobs=1)]: Done 1249 tasks      | elapsed:    8.6s


2229.514 --> 

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.3s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    1.5s
[Parallel(n_jobs=1)]: Done 449 tasks      | elapsed:    3.6s
[Parallel(n_jobs=1)]: Done 799 tasks      | elapsed:    6.2s
[Parallel(n_jobs=1)]: Done 1249 tasks      | elapsed:    9.3s


2209.777 --> 

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.3s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    1.5s
[Parallel(n_jobs=1)]: Done 449 tasks      | elapsed:    3.4s
[Parallel(n_jobs=1)]: Done 799 tasks      | elapsed:    5.8s
[Parallel(n_jobs=1)]: Done 1249 tasks      | elapsed:    8.9s


2201.411 --> 

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.2s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    1.3s
[Parallel(n_jobs=1)]: Done 449 tasks      | elapsed:    3.4s
[Parallel(n_jobs=1)]: Done 799 tasks      | elapsed:    6.1s
[Parallel(n_jobs=1)]: Done 1249 tasks      | elapsed:    9.2s


2199.057 --> 

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.2s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    1.3s
[Parallel(n_jobs=1)]: Done 449 tasks      | elapsed:    3.0s
[Parallel(n_jobs=1)]: Done 799 tasks      | elapsed:    5.4s
[Parallel(n_jobs=1)]: Done 1249 tasks      | elapsed:    8.5s


2197.776 --> 

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.2s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    1.2s
[Parallel(n_jobs=1)]: Done 449 tasks      | elapsed:    3.0s
[Parallel(n_jobs=1)]: Done 799 tasks      | elapsed:    5.4s
[Parallel(n_jobs=1)]: Done 1249 tasks      | elapsed:    8.5s


2197.776 --> 


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.2s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    1.3s
[Parallel(n_jobs=1)]: Done 449 tasks      | elapsed:    3.0s
[Parallel(n_jobs=1)]: Done 799 tasks      | elapsed:    5.4s
[Parallel(n_jobs=1)]: Done 1249 tasks      | elapsed:    8.5s


In [181]:
# Compute clustering metrics
labels_CC = CLEVERC_y_pred  # Cluster labels from the model
# Flatten the time series for silhouette_score into (instances,timestamps*features)
scaled_TESTdataCC_flattened_instances = selected_TESTdata_CLeVerC.reshape(selected_TESTdata_CLeVerC.shape[0], -1)  

CLEVERC_silhouette_avg = silhouette_score(scaled_TESTdataCC_flattened_instances, labels_CC, metric='euclidean')
print(f"CLEVER Cluster Silhouette Score: {CLEVERC_silhouette_avg:.5f}")
CLEVERC_db_index = davies_bouldin_score(scaled_TESTdataCC_flattened_instances, labels_CC)
print(f"CLEVER Cluster Davies-Bouldin Index: {CLEVERC_db_index:.5f}")

CLEVER Silhouette Score: 0.00473
CLEVER Davies-Bouldin Index: 7.28129


### CLeVer Rank

In [182]:
#Clustering
CLEVERR_y_pred = sdtw_km.fit_predict(selected_TESTdata_CLeVerR)

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.2s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    1.3s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.3s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    1.4s
[Parallel(n_jobs=1)]: Done 449 tasks      | elapsed:    3.4s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.3s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    1.3s
[Parallel(n_jobs=1)]: Done 449 tasks      | elapsed:    3.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.2s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    1.2s
[Parallel(n_jobs=1)]: Done 449 tasks      | elapsed:    3.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.2s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    1.3s
[Parallel(n_jobs=1)]: Done 449 tasks      | elapsed:    3.0s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.2s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    1.2s
[Parallel(n_jobs=1)]: Do

3982.516 --> 

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.2s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    1.3s
[Parallel(n_jobs=1)]: Done 449 tasks      | elapsed:    3.0s
[Parallel(n_jobs=1)]: Done 799 tasks      | elapsed:    5.4s
[Parallel(n_jobs=1)]: Done 1249 tasks      | elapsed:    8.5s


2392.891 --> 

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.2s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    1.3s
[Parallel(n_jobs=1)]: Done 449 tasks      | elapsed:    3.0s
[Parallel(n_jobs=1)]: Done 799 tasks      | elapsed:    5.4s
[Parallel(n_jobs=1)]: Done 1249 tasks      | elapsed:    8.5s


2360.331 --> 

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.2s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    1.3s
[Parallel(n_jobs=1)]: Done 449 tasks      | elapsed:    3.0s
[Parallel(n_jobs=1)]: Done 799 tasks      | elapsed:    5.4s
[Parallel(n_jobs=1)]: Done 1249 tasks      | elapsed:    8.4s


2351.837 --> 

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.2s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    1.3s
[Parallel(n_jobs=1)]: Done 449 tasks      | elapsed:    3.0s
[Parallel(n_jobs=1)]: Done 799 tasks      | elapsed:    5.4s
[Parallel(n_jobs=1)]: Done 1249 tasks      | elapsed:    8.5s


2349.271 --> 

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.2s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    1.3s
[Parallel(n_jobs=1)]: Done 449 tasks      | elapsed:    3.0s
[Parallel(n_jobs=1)]: Done 799 tasks      | elapsed:    5.4s
[Parallel(n_jobs=1)]: Done 1249 tasks      | elapsed:    8.4s


2347.229 --> 

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.2s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    1.4s
[Parallel(n_jobs=1)]: Done 449 tasks      | elapsed:    3.3s
[Parallel(n_jobs=1)]: Done 799 tasks      | elapsed:    5.7s
[Parallel(n_jobs=1)]: Done 1249 tasks      | elapsed:    8.8s


2347.229 --> 


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.2s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    1.3s
[Parallel(n_jobs=1)]: Done 449 tasks      | elapsed:    3.0s
[Parallel(n_jobs=1)]: Done 799 tasks      | elapsed:    5.4s
[Parallel(n_jobs=1)]: Done 1249 tasks      | elapsed:    8.5s


In [183]:
# Compute clustering metrics
labels_CR = CLEVERR_y_pred  # Cluster labels from the model
# Flatten the time series for silhouette_score into (instances,timestamps*features)
scaled_TESTdataCR_flattened_instances = selected_TESTdata_CLeVerR.reshape(selected_TESTdata_CLeVerR.shape[0], -1)  

CLEVERR_silhouette_avg = silhouette_score(scaled_TESTdataCR_flattened_instances, labels_CR, metric='euclidean')
print(f"CLEVER Rank Silhouette Score: {CLEVERR_silhouette_avg:.5f}")
CLEVERR_db_index = davies_bouldin_score(scaled_TESTdataCR_flattened_instances, labels_CR)
print(f"CLEVER Rank Davies-Bouldin Index: {CLEVERR_db_index:.5f}")

CLEVER Rank Silhouette Score: 0.00020
CLEVER Rank Davies-Bouldin Index: 7.71866


## Validierung mit clustering accuracy

### Before FS

In [184]:
# compare clustering vs labels

from sklearn.metrics import confusion_matrix
from scipy.optimize import linear_sum_assignment

def clustering_accuracy(true_labels, predicted_labels):
    # Create a confusion matrix
    cm = confusion_matrix(true_labels, predicted_labels)

    # Use the Hungarian algorithm to find the optimal assignment of clusters
    row_ind, col_ind = linear_sum_assignment(-cm)  # Maximize the matching (negative to maximize)
    
    # Calculate accuracy based on optimal matching
    accuracy = cm[row_ind, col_ind].sum() / len(true_labels)
    
    return accuracy

In [185]:
TESTdata_y

array(['dws', 'dws', 'dws', 'dws', 'dws', 'dws', 'dws', 'dws', 'dws',
       'dws', 'jog', 'jog', 'jog', 'jog', 'jog', 'jog', 'jog', 'jog',
       'jog', 'jog', 'jog', 'jog', 'jog', 'jog', 'jog', 'jog', 'jog',
       'jog', 'sit', 'sit', 'sit', 'sit', 'sit', 'sit', 'sit', 'sit',
       'sit', 'sit', 'sit', 'sit', 'sit', 'sit', 'sit', 'sit', 'sit',
       'sit', 'sit', 'sit', 'sit', 'sit', 'sit', 'sit', 'sit', 'sit',
       'sit', 'sit', 'sit', 'sit', 'sit', 'sit', 'sit', 'sit', 'sit',
       'sit', 'sit', 'sit', 'sit', 'sit', 'sit', 'sit', 'sit', 'sit',
       'sit', 'sit', 'sit', 'sit', 'sit', 'sit', 'sit', 'sit', 'sit',
       'sit', 'sit', 'sit', 'sit', 'sit', 'sit', 'sit', 'sit', 'sit',
       'sit', 'sit', 'sit', 'sit', 'sit', 'sit', 'sit', 'sit', 'sit',
       'sit', 'sit', 'sit', 'sit', 'sit', 'sit', 'sit', 'sit', 'sit',
       'sit', 'sit', 'sit', 'sit', 'sit', 'sit', 'sit', 'sit', 'sit',
       'sit', 'sit', 'sit', 'std', 'std', 'std', 'std', 'std', 'std',
       'std', 'std',

In [186]:
before_y_pred

array([1, 1, 2, 3, 1, 2, 1, 1, 2, 1, 1, 1, 1, 5, 5, 1, 1, 3, 4, 1, 1, 1,
       1, 1, 1, 1, 1, 2, 3, 0, 3, 0, 5, 0, 3, 3, 0, 3, 0, 0, 3, 3, 0, 5,
       0, 3, 3, 5, 3, 3, 2, 3, 0, 5, 5, 0, 3, 0, 5, 3, 5, 5, 3, 5, 5, 5,
       3, 3, 3, 3, 3, 4, 4, 2, 2, 3, 3, 3, 5, 5, 3, 5, 5, 5, 5, 5, 3, 4,
       0, 3, 3, 3, 5, 4, 4, 3, 2, 0, 0, 5, 5, 5, 5, 3, 5, 0, 3, 3, 0, 3,
       3, 3, 3, 3, 3, 3, 3, 4, 0, 3, 3, 0, 5, 2, 2, 3, 2, 4, 3, 0, 1, 0,
       5, 0, 2, 0, 4, 2, 2, 3, 3, 3, 0, 5, 2, 2, 4, 3, 3, 4, 2, 2, 3, 0,
       5, 3, 3, 3, 3, 3, 3, 2, 3, 4, 0, 5, 0, 2, 3, 2, 0, 5, 0, 3, 5, 4,
       3, 3, 3, 0, 3, 2, 0, 3, 5, 3, 2, 3, 3, 5, 5, 1, 1, 4, 2, 4, 1, 2,
       4, 4, 1, 1, 4, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 5, 1, 1, 5, 1, 1,
       4, 4, 1, 1, 1, 1, 4, 1, 2, 4, 2, 1, 1, 1, 1, 1, 1, 1, 1, 4, 4, 4,
       1, 1, 1, 1, 1, 5, 5, 1, 1, 1, 1, 1, 1, 5, 5, 1, 1, 1, 1, 1, 1, 1,
       1], dtype=int64)

In [187]:
def convert_labels_to_numeric(labels):
    """
    Convert string labels to numeric labels based on a predefined mapping.

    Args:
        labels (list or array): The string labels to be converted.

    Returns:
        list: Numeric labels.
    """
    # Define the mapping
    label_mapping = {
        'dws': 0,
        'jog': 1,
        'sit': 2,
        'std': 3,
        'ups': 4,
        'wlk': 5
    }
    
    # Map labels
    numeric_labels = [label_mapping[label] for label in labels]

    return numeric_labels

In [188]:
numeric_labels = convert_labels_to_numeric(TESTdata_y)
print("Original labels:", TESTdata_y)
print("Numeric labels:", numeric_labels)

Original labels: ['dws' 'dws' 'dws' 'dws' 'dws' 'dws' 'dws' 'dws' 'dws' 'dws' 'jog' 'jog'
 'jog' 'jog' 'jog' 'jog' 'jog' 'jog' 'jog' 'jog' 'jog' 'jog' 'jog' 'jog'
 'jog' 'jog' 'jog' 'jog' 'sit' 'sit' 'sit' 'sit' 'sit' 'sit' 'sit' 'sit'
 'sit' 'sit' 'sit' 'sit' 'sit' 'sit' 'sit' 'sit' 'sit' 'sit' 'sit' 'sit'
 'sit' 'sit' 'sit' 'sit' 'sit' 'sit' 'sit' 'sit' 'sit' 'sit' 'sit' 'sit'
 'sit' 'sit' 'sit' 'sit' 'sit' 'sit' 'sit' 'sit' 'sit' 'sit' 'sit' 'sit'
 'sit' 'sit' 'sit' 'sit' 'sit' 'sit' 'sit' 'sit' 'sit' 'sit' 'sit' 'sit'
 'sit' 'sit' 'sit' 'sit' 'sit' 'sit' 'sit' 'sit' 'sit' 'sit' 'sit' 'sit'
 'sit' 'sit' 'sit' 'sit' 'sit' 'sit' 'sit' 'sit' 'sit' 'sit' 'sit' 'sit'
 'sit' 'sit' 'sit' 'sit' 'sit' 'sit' 'sit' 'sit' 'sit' 'sit' 'sit' 'sit'
 'std' 'std' 'std' 'std' 'std' 'std' 'std' 'std' 'std' 'std' 'std' 'std'
 'std' 'std' 'std' 'std' 'std' 'std' 'std' 'std' 'std' 'std' 'std' 'std'
 'std' 'std' 'std' 'std' 'std' 'std' 'std' 'std' 'std' 'std' 'std' 'std'
 'std' 'std' 'std' 'std' 'std' 'st

In [189]:
ClusteringACC_before = clustering_accuracy(numeric_labels,before_y_pred)
print(f"Clustering Accuracy before FS: {ClusteringACC_before:.2f}")

Clustering Accuracy before FS: 0.39


### FS 1: CLeVer Hybrid

In [190]:
# compare clustering vs labels
ClusteringACC_CLEVERH = clustering_accuracy(numeric_labels,CLEVERH_y_pred)
print(f"Clustering Accuracy CLEVER Hybrid: {ClusteringACC_CLEVERH:.2f}")

Clustering Accuracy CLEVER Hybrid: 0.37


### FS 2: CLeVer Cluster

In [191]:
ClusteringACC_CLEVERC = clustering_accuracy(numeric_labels,CLEVERC_y_pred)
print(f"Clustering Accuracy CLEVER Cluster: {ClusteringACC_CLEVERC:.2f}")

Clustering Accuracy CLEVER Cluster: 0.38


### FS 3: CLeVer Rank

In [192]:
ClusteringACC_CLEVERR = clustering_accuracy(numeric_labels,CLEVERR_y_pred)
print(f"Clustering Accuracy CLEVER Rank: {ClusteringACC_CLEVERR:.2f}")

Clustering Accuracy CLEVER Rank: 0.39
