# Motor Imagery Dataset

In [5]:
from aeon.datasets import load_from_tsfile
import numpy as np # for some mathematical operations
def load_data(DATA_PATH,t):
    if t=='train':
        train_x, train_y = load_from_tsfile(DATA_PATH + "/MotorImagery/MotorImagery_TRAIN.ts")
        return [train_x,train_y]
    elif t=='test':
        test_x, test_y = load_from_tsfile(DATA_PATH + "/MotorImagery/MotorImagery_TEST.ts")
        return [test_x, test_y]

In [6]:
#Test the load_data function
[data, data_y] = load_data("datasets",'train')  # 'path', 'test'/'train'

In [7]:
print('First time series:')
print(data[0],"\n")
print('First target:')
print(data_y[0],"\n")
print('Shape of train dataset:') #(samples, features,timestamps) 
print(np.shape(data))

First time series:
[[  6.59375   6.59375   6.1875  ...  -0.46875  -0.21875  -0.0625 ]
 [ 13.       12.71875  12.84375 ... -13.3125  -13.09375 -12.84375]
 [ 11.9375   12.4375   12.96875 ...  -8.75     -9.3125  -10.5    ]
 ...
 [-10.125    -9.       -8.8125  ...  20.8125   19.34375  18.09375]
 [ -6.46875  -5.96875  -5.90625 ...   9.09375   9.9375   10.59375]
 [ -7.6875   -8.1875   -8.59375 ...  16.09375  15.96875  14.9375 ]] 

First target:
finger 

Shape of train dataset:
(278, 64, 3000)


In [8]:
from tslearn.preprocessing import TimeSeriesScalerMeanVariance

def standarize(data):
    scaler = TimeSeriesScalerMeanVariance(mu=0.0, std=1.0)  # Standardize to mean=0, std=1
    scaled_data_3d = scaler.fit_transform(data)
    return scaled_data_3d
    

Install h5py to use hdf5 features: http://docs.h5py.org/
  warn(h5py_msg)


In [9]:
# Transform from (samples, features,timestamps) to (samples, timestamps, features) to apply standarisation
reshaped_data = np.transpose(data, (0, 2, 1))
print("Original shape:", data.shape)          # (instances, features, timepoints)
print("Reshaped shape:", reshaped_data.shape) # (instances, timepoints, features)

Original shape: (278, 64, 3000)
Reshaped shape: (278, 3000, 64)


In [10]:
scaled_data=standarize(reshaped_data)
# scaled_data
print('First time series:')
print(scaled_data[0],"\n")

First time series:
[[ 0.01036473  1.14424664  0.97613651 ... -1.41133536 -0.85179719
  -1.04100093]
 [ 0.01036473  1.09728199  1.05420102 ... -1.2337847  -0.77437809
  -1.11495918]
 [-0.06236365  1.11815517  1.13714456 ... -1.20419293 -0.7647007
  -1.17505026]
 ...
 [-1.2539901  -3.24955691 -2.25378255 ...  3.47130764  1.55787228
   2.47663847]
 [-1.20923417 -3.21302885 -2.34160513 ...  3.2395054   1.68851701
   2.45814891]
 [-1.18126172 -3.1712825  -2.52700834 ...  3.04222689  1.79012957
   2.30561001]] 



In [11]:
np.shape(scaled_data)

(278, 3000, 64)

## Test 1: intrinsic metrics before vs. after Feature selection

### Before FS:
* Representation Entropy correlation based
* Variance
* Redundancy Rate RED

TO DO:
* Information Gain Ratio

In [12]:
# Representation Entropy
import numpy as np
from scipy.linalg import eigh

def compute_representation_entropy(data):
    """
    Compute Representation Entropy (RE) of a multivariate dataset.
    
    Args:
        data (numpy.ndarray): 2D-Data with shape (samples, features).
    
    Returns:
        float: Representation Entropy (RE).
    """
    # Step 1: Compute the covariance matrix of the dataset (features x features)
    #covariance_matrix = np.cov(data, rowvar=False)  # rowvar=False means variables are columns

    block_size=100
    data_centered = data - np.mean(data, axis=0)
    num_features = data.shape[1]
    covariance_matrix = np.zeros((num_features, num_features), dtype=np.float64)

    for i in range(0, num_features, block_size):
        for j in range(i, num_features, block_size):
            block_i = data_centered[:, i:i+block_size]
            block_j = data_centered[:, j:j+block_size]
            block_cov = np.dot(block_i.T, block_j) / (data.shape[0] - 1)
            covariance_matrix[i:i+block_size, j:j+block_size] = block_cov
            if i != j:
                covariance_matrix[j:j+block_size, i:i+block_size] = block_cov.T


    # # Step 2: Compute eigenvalues of the covariance matrix
    # eigenvalues = np.linalg.eigvals(covariance_matrix) THIS METHOD WAS REPLACED BC OF INSTABILITY
    eigenvalues, eigenvectors = eigh(covariance_matrix)

    # Step 3: Normalize the eigenvalues to act as probabilities
    eigenvalues_sum = np.sum(eigenvalues)
    normalized_eigenvalues = eigenvalues / eigenvalues_sum

    # Step 4: Compute Representation Entropy using the formula
    representation_entropy = -np.sum(normalized_eigenvalues * np.log(normalized_eigenvalues))

    return representation_entropy

In [13]:
#Need to flatten the Time Dimension
data_flattened = scaled_data.reshape(-1, scaled_data.shape[2])
np.shape(data_flattened)

(834000, 64)

In [14]:
compute_representation_entropy(data_flattened)

3.291436807583561

In [15]:
import pandas as pd

# calculate variance
overall_variance = data_flattened.var().mean()
print('overall variance:', overall_variance)

# Compute the correlation matrix
corr_matrix = pd.DataFrame(data_flattened).corr().abs()
# Calculate average absolute correlation (excluding the diagonal)
avg_corr = (corr_matrix.values.sum() - len(corr_matrix)) / (len(corr_matrix) * (len(corr_matrix) - 1))
redundancy_rate = avg_corr
print("Redundancy Rate (Correlation-Based):", redundancy_rate)

overall variance: 1.0000000000000027
Redundancy Rate (Correlation-Based): 0.21865414234438776


In [16]:
# Load TEST dataset
[TESTdata, TESTdata_y] = load_data("datasets",'test')  # 'path', 'test'/'train'

In [17]:
#Preprocess the TEST data

reshaped_TESTdata = np.transpose(TESTdata, (0, 2, 1))
print("Original shape:", data.shape)          # (instances, features, timepoints)
print("Reshaped shape:", reshaped_data.shape) # (instances, timepoints, features)
scaled_TESTdata=standarize(reshaped_TESTdata)

Original shape: (278, 64, 3000)
Reshaped shape: (278, 3000, 64)


In [18]:
# Compute all intrinsic metrics again for original data_TEST

# Compute representation entropy
before_TESTdata_flattened = scaled_TESTdata.reshape(-1, scaled_TESTdata.shape[2])
before_representation_entropy = compute_representation_entropy(before_TESTdata_flattened)
print('before Representation entropy: ', before_representation_entropy)

# calculate variance
before_overall_variance = before_TESTdata_flattened.var().mean()
print('before overall variance:', before_overall_variance)

# Compute the correlation matrix
before_corr_matrix = pd.DataFrame(before_TESTdata_flattened).corr().abs()
# Calculate average absolute correlation (excluding the diagonal)
before_avg_corr = (before_corr_matrix.values.sum() - len(before_corr_matrix)) / (len(before_corr_matrix) * (len(before_corr_matrix) - 1))
before_redundancy_rate = before_avg_corr
print("before Redundancy Rate (Correlation-Based):", before_redundancy_rate)

before Representation entropy:  3.3064268109668387
before overall variance: 1.000000000000001
before Redundancy Rate (Correlation-Based): 0.16871039936347354


In [19]:
np.shape(before_TESTdata_flattened)

(300000, 64)

### Feature Selection 1: CLeVer Hybrid

In [20]:
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import linkage, fcluster
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import os
os.environ["OMP_NUM_THREADS"] = "1"

def compute_dcpcs(data, variance_threshold=0.8):
    """
    Compute Descriptive Common Principal Components (DCPCs) for a set of multivariate time series.

    Parameters:
    - data: ndarray of shape (num_samples, num_features, time_steps), the time-series dataset.
    - variance_threshold: float, the cumulative variance explained to determine the number of PCs.

    Returns:
    - dcpc_loadings: ndarray of shape (num_dcpcs, num_features), loadings of the DCPCs.
    """
    num_samples, num_features, time_steps = data.shape

    # Step 1: Compute PCs for each MTS item
    pc_matrices = []  # Store PC loadings for each sample
    for sample in range(num_samples):
        # Compute correlation matrix for each sample
        correlation_matrix = np.corrcoef(data[sample])
        # Perform PCA on the correlation matrix
        pca = PCA()
        pca.fit(correlation_matrix)
        pc_matrices.append(pca.components_[:pca.n_components_])

    # Step 2: Compute DCPCs across all samples using SVD
    all_pc_matrices = np.concatenate(pc_matrices, axis=0)  # Combine PC loadings from all samples
    dcpc_covariance = all_pc_matrices.T @ all_pc_matrices
    eigvals, eigvecs = np.linalg.eigh(dcpc_covariance)
    sorted_indices = np.argsort(eigvals)[::-1]
    eigvecs = eigvecs[:, sorted_indices]  # Sort eigenvectors by eigenvalues

    # Select DCPCs explaining the desired variance threshold
    cumulative_variance = np.cumsum(eigvals[sorted_indices]) / np.sum(eigvals)
    num_dcpcs = np.searchsorted(cumulative_variance, variance_threshold) + 1
    dcpc_loadings = eigvecs[:, :num_dcpcs].T

    return dcpc_loadings

def cluster_features(dcpc_loadings, n_clusters):
    """
    Cluster features based on their DCPC loadings using K-means.

    Parameters:
    - dcpc_loadings: ndarray of shape (num_dcpcs, num_features), loadings of the DCPCs.
    - n_clusters: int, number of clusters.

    Returns:
    - cluster_labels: ndarray of shape (num_features,), cluster assignments for each feature.
    """
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    cluster_labels = kmeans.fit_predict(dcpc_loadings.T)
    return cluster_labels

def rank_features(dcpc_loadings, cluster_labels):
    """
    Rank features within each cluster based on their contribution to DCPCs.

    Parameters:
    - dcpc_loadings: ndarray of shape (num_dcpcs, num_features), loadings of the DCPCs.
    - cluster_labels: ndarray of shape (num_features,), cluster assignments for each feature.

    Returns:
    - ranked_features: dict, keys are cluster labels, values are ranked feature indices.
    """
    ranked_features = {}
    for cluster in np.unique(cluster_labels):
        cluster_indices = np.where(cluster_labels == cluster)[0]
        cluster_loadings = dcpc_loadings[:, cluster_indices]
        scores = np.linalg.norm(cluster_loadings, axis=0)  # L2 norm of loadings
        ranking = cluster_indices[np.argsort(scores)[::-1]]  # Sort by descending contribution
        ranked_features[cluster] = ranking
    return ranked_features

def select_top_features(ranked_features, top_n=1):
    """
    Select top-ranked features from each cluster.

    Parameters:
    - ranked_features: dict, keys are cluster labels, values are ranked feature indices.
    - top_n: int, number of features to select from each cluster.

    Returns:
    - selected_features: list, indices of selected features.
    """
    selected_features = []
    for features in ranked_features.values():
        selected_features.extend(features[:top_n])
    return selected_features

def clever_hybrid(data, variance_threshold=0.8, n_clusters=None, top_n=1):
    """
    Perform feature selection using the CLeVer-Hybrid algorithm.

    Parameters:
    - data: ndarray of shape (num_samples, num_features, time_steps), the time-series dataset.
    - variance_threshold: float, variance threshold for selecting DCPCs.
    - n_clusters: int, number of clusters (if None, sqrt of num_features is used).
    - top_n: int, number of features to select from each cluster.

    Returns:
    - selected_features: list, indices of selected features.
    """
    num_samples, num_features, _ = data.shape
    if n_clusters is None:
        n_clusters = int(np.sqrt(num_features))

    # Step 1: Compute DCPCs
    dcpc_loadings = compute_dcpcs(data, variance_threshold)

    # Step 2: Cluster features based on DCPC loadings
    cluster_labels = cluster_features(dcpc_loadings, n_clusters)

    # Step 3: Rank features within clusters
    ranked_features = rank_features(dcpc_loadings, cluster_labels)

    # Step 4: Select top features from each cluster
    selected_features = select_top_features(ranked_features, top_n)

    return selected_features

In [21]:
scaled_data_ift= np.transpose(scaled_data, (0, 2, 1))
np.shape(scaled_data_ift)

(278, 64, 3000)

In [22]:
selected_features_CLeVerH=clever_hybrid(scaled_data_ift, n_clusters=15,top_n=1)
print("Selected features CLeVer Hybrid: ", selected_features_CLeVerH)

Selected features CLeVer Hybrid:  [55, 35, 54, 62, 27, 52, 17, 60, 24, 7, 37, 56, 44, 0, 57]


In [23]:
np.shape(scaled_TESTdata)

(100, 3000, 64)

In [24]:
# Filter the TEST Dataset according to the selected features from CLeVer
selected_TESTdata_CLeVerH = scaled_TESTdata[:, :, selected_features_CLeVerH]
print('Filtered TEST dataset shape: ', np.shape(selected_TESTdata_CLeVerH))

Filtered TEST dataset shape:  (100, 3000, 15)


In [25]:
# Compute all intrinsic metrics again for selected_TESTdata_CLeVerH

# Compute representation entropy
CLEVERH_TESTdata_flattened = selected_TESTdata_CLeVerH.reshape(-1, selected_TESTdata_CLeVerH.shape[2])
CLEVERH_representation_entropy = compute_representation_entropy(CLEVERH_TESTdata_flattened)
print('CLEVER Hybrid Representation entropy: ', CLEVERH_representation_entropy)

# calculate variance
CLEVERH_overall_variance = CLEVERH_TESTdata_flattened.var().mean()
print('CLEVER Hybrid overall variance:', CLEVERH_overall_variance)

# Compute the correlation matrix
CLEVERH_corr_matrix = pd.DataFrame(CLEVERH_TESTdata_flattened).corr().abs()
# Calculate average absolute correlation (excluding the diagonal)
CLEVERH_avg_corr = (CLEVERH_corr_matrix.values.sum() - len(CLEVERH_corr_matrix)) / (len(CLEVERH_corr_matrix) * (len(CLEVERH_corr_matrix) - 1))
CLEVERH_redundancy_rate = CLEVERH_avg_corr
print("CLEVER Hybrid Redundancy Rate (Correlation-Based):", CLEVERH_redundancy_rate)

CLEVER Hybrid Representation entropy:  2.3197363756658502
CLEVER Hybrid overall variance: 1.0000000000000007
CLEVER Hybrid Redundancy Rate (Correlation-Based): 0.18181327379886564


### Feature Selection 2: CLeVer Cluster

In [26]:
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import numpy as np
import matplotlib.pyplot as plt
os.environ["OMP_NUM_THREADS"] = "1"

def clever_cluster(data, n_clusters=None, variance_threshold=0.8):
    """
    CLeVer-Cluster implementation.

    Parameters:
    - data: np.array of shape (samples, features, time_steps)
    - n_clusters: Number of feature clusters (if None, heuristic is used)
    - variance_threshold: Minimum variance explained by selected PCs

    Returns:
    - selected_features: List of representative feature indices for each cluster
    """
    num_samples, num_features, time_steps = data.shape

    # Step 1: Compute DCPC loadings
    dcpc_loadings = compute_dcpcs(data, variance_threshold=variance_threshold)  # Shape: (components, features)

    # Step 2: Transpose DCPC loadings to cluster features
    feature_embeddings = dcpc_loadings.T  # Shape: (features, components)

    # Step 3: Determine number of clusters
    if n_clusters is None:
        n_clusters = int(np.sqrt(num_features))  # Heuristic for cluster count

    # Step 4: Perform K-means clustering on DCPC loadings
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    cluster_labels = kmeans.fit_predict(feature_embeddings)

    # Step 5: Select representative features (closest to cluster centroids)
    selected_features = []
    for cluster in range(n_clusters):
        cluster_indices = np.where(cluster_labels == cluster)[0]
        centroid = kmeans.cluster_centers_[cluster]

        # Find the feature closest to the centroid
        distances = np.linalg.norm(feature_embeddings[cluster_indices] - centroid, axis=1)
        representative_feature = cluster_indices[np.argmin(distances)]
        selected_features.append(representative_feature)

    return selected_features


In [27]:
selected_features_CLeVerC = clever_cluster(scaled_data_ift, n_clusters=15)
print(f'CLeVer Cluster Selected feature indices: {selected_features_CLeVerC}')

CLeVer Cluster Selected feature indices: [55, 1, 10, 62, 27, 52, 17, 60, 24, 7, 37, 56, 44, 0, 57]


In [28]:
np.shape(scaled_TESTdata)

(100, 3000, 64)

In [29]:
# Filter the TEST Dataset according to the selected features from CLeVer
selected_TESTdata_CLeVerC = scaled_TESTdata[:, :, selected_features_CLeVerC]
print('Filtered TEST dataset shape: ', np.shape(selected_TESTdata_CLeVerC))

Filtered TEST dataset shape:  (100, 3000, 15)


In [30]:
# Compute all intrinsic metrics again for selected_TESTdata_CLeVerC

# Compute representation entropy
CLEVERC_TESTdata_flattened = selected_TESTdata_CLeVerC.reshape(-1, selected_TESTdata_CLeVerC.shape[2])
CLEVERC_representation_entropy = compute_representation_entropy(CLEVERC_TESTdata_flattened)
print('CLEVER Cluster Representation entropy: ', CLEVERC_representation_entropy)

# calculate variance
CLEVERC_overall_variance = CLEVERC_TESTdata_flattened.var().mean()
print('CLEVER Cluster overall variance:', CLEVERC_overall_variance)

# Compute the correlation matrix
CLEVERC_corr_matrix = pd.DataFrame(CLEVERC_TESTdata_flattened).corr().abs()
# Calculate average absolute correlation (excluding the diagonal)
CLEVERC_avg_corr = (CLEVERC_corr_matrix.values.sum() - len(CLEVERC_corr_matrix)) / (len(CLEVERC_corr_matrix) * (len(CLEVERC_corr_matrix) - 1))
CLEVERC_redundancy_rate = CLEVERC_avg_corr
print("CLEVER Cluster Redundancy Rate (Correlation-Based):", CLEVERC_redundancy_rate)

CLEVER Cluster Representation entropy:  2.386785643166536
CLEVER Cluster overall variance: 1.0000000000000004
CLEVER Cluster Redundancy Rate (Correlation-Based): 0.15827578681799678


### Feature Selection 3: CLeVer Rank

In [31]:
import numpy as np
from sklearn.decomposition import PCA

def clever_ranking(data, num_features_to_select=5, variance_threshold=0.8):
    """
    CLeVer Ranking method for feature selection.

    Parameters:
    - data: np.array of shape (samples, features, time_steps)
    - num_features_to_select: Number of top-ranked features to select
    - variance_threshold: Variance threshold for PCA

    Returns:
    - selected_features: List of indices of the top-ranked features
    """
    # Step 1: Compute DCPC loadings
    dcpc_loadings = compute_dcpcs(data,variance_threshold=variance_threshold)

    # Step 2: Rank features based on their contribution to the DCPCs
    feature_scores = np.linalg.norm(dcpc_loadings, axis=1)  # L2 norm of DCPC loadings
    ranked_features = np.argsort(feature_scores)[::-1]  # Sort in descending order

    # Step 3: Select top features
    selected_features = ranked_features[:num_features_to_select]

    return selected_features

In [32]:
selected_features_CLeVerR = clever_ranking(scaled_data_ift, num_features_to_select=15)
print(f'CLeVer Rank Selected feature indices: {selected_features_CLeVerR}')

CLeVer Rank Selected feature indices: [ 9 41  2 45  4 22 30  1 48  3 44 19  7 10 37]


In [33]:
# Filter the TEST Dataset according to the selected features from CLeVer
selected_TESTdata_CLeVerR = scaled_TESTdata[:, :, selected_features_CLeVerR]
print('Filtered TEST dataset shape: ', np.shape(selected_TESTdata_CLeVerR))

Filtered TEST dataset shape:  (100, 3000, 15)


In [34]:
# Compute all intrinsic metrics again for selected_TESTdata_CLeVerR

# Compute representation entropy
CLEVERR_TESTdata_flattened = selected_TESTdata_CLeVerR.reshape(-1, selected_TESTdata_CLeVerR.shape[2])
CLEVERR_representation_entropy = compute_representation_entropy(CLEVERR_TESTdata_flattened)
print('CLEVER Cluster Representation entropy: ', CLEVERR_representation_entropy)

# calculate variance
CLEVERR_overall_variance = CLEVERR_TESTdata_flattened.var().mean()
print('CLEVER Cluster overall variance:', CLEVERR_overall_variance)

# Compute the correlation matrix
CLEVERR_corr_matrix = pd.DataFrame(CLEVERR_TESTdata_flattened).corr().abs()
# Calculate average absolute correlation (excluding the diagonal)
CLEVERR_avg_corr = (CLEVERR_corr_matrix.values.sum() - len(CLEVERR_corr_matrix)) / (len(CLEVERR_corr_matrix) * (len(CLEVERR_corr_matrix) - 1))
CLEVERR_redundancy_rate = CLEVERR_avg_corr
print("CLEVER Cluster Redundancy Rate (Correlation-Based):", CLEVERR_redundancy_rate)

CLEVER Cluster Representation entropy:  2.458904459993201
CLEVER Cluster overall variance: 1.0000000000000004
CLEVER Cluster Redundancy Rate (Correlation-Based): 0.15861265696490554


## Test 2: Perform Timeseries-k-Means and evaluate clustering performance UNSUPERVISED

Clustering evaluation Metrics:
* Silhouette 
* Davies-Bouldin Index

### Before FS

DTW Time series clsutering erfolgreich 133 min
clustering of training dataset not required, commented out on last line

In [35]:
from tslearn.clustering import TimeSeriesKMeans
seed = 0
np.random.seed(seed)
print("DTW k-means")
sdtw_km = TimeSeriesKMeans(n_clusters=2,
                           metric="dtw",
                           verbose=True,
                           random_state=seed)
#y_pred = sdtw_km.fit_predict(scaled_data)

DTW k-means


Clustering should be done on the TEST data

In [36]:
np.shape(scaled_TESTdata)

(100, 3000, 64)

thefit_predict(X, y=None)
Fit k-means clustering using X and then predict the closest cluster each time series in X belongs to.

Parameters:
Xarray-like of shape=(n_ts, sz, d)
n_ts: instance, sz:timestamps, d:features

In [37]:
before_y_pred = sdtw_km.fit_predict(scaled_TESTdata)

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:   22.1s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:   20.7s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:  1.4min
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:   21.8s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:  1.4min


148539.284 --> 

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:   20.1s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:  1.4min


76658.675 --> 

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:   20.6s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:  1.4min


76232.465 --> 

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:   19.7s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:  1.4min


76151.274 --> 

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:   20.3s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:  1.4min


76151.274 --> 


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:   20.8s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:  1.4min


In [38]:
before_y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0,
       0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0], dtype=int64)

In [39]:
from sklearn.metrics import silhouette_score

labels = before_y_pred  # Cluster labels from the model
# Flatten the time series for silhouette_score into (instances,timestamps*features)
scaled_TESTdata_flattened_instances = scaled_TESTdata.reshape(scaled_TESTdata.shape[0], -1)  

before_silhouette_avg = silhouette_score(scaled_TESTdata_flattened_instances, labels, metric='euclidean')
print(f"before Silhouette Score: {before_silhouette_avg:.5f}")


before Silhouette Score: 0.31458


In [40]:
from sklearn.metrics import davies_bouldin_score

before_db_index = davies_bouldin_score(scaled_TESTdata_flattened_instances, labels)
print(f"before Davies-Bouldin Index: {before_db_index:.5f}")

before Davies-Bouldin Index: 1.28375


### FS1: CLeVer Hybrid

In [41]:
np.shape(selected_TESTdata_CLeVerH)

(100, 3000, 15)

In [42]:
#Clustering
CLEVERH_y_pred = sdtw_km.fit_predict(selected_TESTdata_CLeVerH)

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    5.2s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    4.9s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:   20.6s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    4.7s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:   19.7s


30154.482 --> 

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    5.4s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:   22.6s


15841.853 --> 

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    5.5s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:   21.5s


15718.864 --> 

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    5.4s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:   20.8s


15718.864 --> 


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    6.1s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:   23.1s


In [43]:
# Compute clustering metrics
labels_CH = CLEVERH_y_pred  # Cluster labels from the model
# Flatten the time series for silhouette_score into (instances,timestamps*features)
scaled_TESTdataCH_flattened_instances = selected_TESTdata_CLeVerH.reshape(selected_TESTdata_CLeVerH.shape[0], -1)  

CLEVERH_silhouette_avg = silhouette_score(scaled_TESTdataCH_flattened_instances, labels_CH, metric='euclidean')
print(f"CLEVER Hybrid Silhouette Score: {CLEVERH_silhouette_avg:.5f}")
CLEVERH_db_index = davies_bouldin_score(scaled_TESTdataCH_flattened_instances, labels_CH)
print(f"CLEVER Hybrid Davies-Bouldin Index: {CLEVERH_db_index:.5f}")

CLEVER Hybrid Silhouette Score: 0.26295
CLEVER Hybrid Davies-Bouldin Index: 1.45908


### FS 2: CLeVer Cluster

In [44]:
#Clustering
CLEVERC_y_pred = sdtw_km.fit_predict(selected_TESTdata_CLeVerC)

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    5.6s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    5.1s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:   20.8s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    4.9s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:   21.0s


30755.182 --> 

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    5.1s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:   22.1s


16976.424 --> 

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    5.1s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:   21.0s


16810.813 --> 

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    5.3s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:   22.4s


16810.813 --> 


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    5.8s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:   23.1s


In [45]:
# Compute clustering metrics
labels_CC = CLEVERC_y_pred  # Cluster labels from the model
# Flatten the time series for silhouette_score into (instances,timestamps*features)
scaled_TESTdataCC_flattened_instances = selected_TESTdata_CLeVerC.reshape(selected_TESTdata_CLeVerC.shape[0], -1)  

CLEVERC_silhouette_avg = silhouette_score(scaled_TESTdataCC_flattened_instances, labels_CC, metric='euclidean')
print(f"CLEVER Cluster Silhouette Score: {CLEVERC_silhouette_avg:.5f}")
CLEVERC_db_index = davies_bouldin_score(scaled_TESTdataCC_flattened_instances, labels_CC)
print(f"CLEVER Cluster Davies-Bouldin Index: {CLEVERC_db_index:.5f}")

CLEVER Cluster Silhouette Score: 0.25703
CLEVER Cluster Davies-Bouldin Index: 1.48753


### FS3: CLeVer Rank

In [46]:
#Clustering
CLEVERR_y_pred = sdtw_km.fit_predict(selected_TESTdata_CLeVerR)

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    5.5s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    5.3s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:   20.7s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    5.1s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:   21.8s


38056.341 --> 

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    5.2s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:   21.8s


20396.010 --> 

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    6.4s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:   25.2s


20146.008 --> 

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    5.9s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:   22.2s


20103.296 --> 

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    5.1s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:   22.0s


20068.713 --> 

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    5.1s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:   21.3s


20068.713 --> 


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    5.4s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:   22.2s


In [47]:
# Compute clustering metrics
labels_CR = CLEVERR_y_pred  # Cluster labels from the model
# Flatten the time series for silhouette_score into (instances,timestamps*features)
scaled_TESTdataCR_flattened_instances = selected_TESTdata_CLeVerR.reshape(selected_TESTdata_CLeVerR.shape[0], -1)  

CLEVERR_silhouette_avg = silhouette_score(scaled_TESTdataCR_flattened_instances, labels_CR, metric='euclidean')
print(f"CLEVER Rank Silhouette Score: {CLEVERR_silhouette_avg:.5f}")
CLEVERR_db_index = davies_bouldin_score(scaled_TESTdataCR_flattened_instances, labels_CR)
print(f"CLEVER Rank Davies-Bouldin Index: {CLEVERR_db_index:.5f}")

CLEVER Rank Silhouette Score: 0.30075
CLEVER Rank Davies-Bouldin Index: 1.34867


## Validierung mit clustering accuracy

### Before FS

In [48]:
# compare clustering vs labels

from sklearn.metrics import confusion_matrix
from scipy.optimize import linear_sum_assignment

def clustering_accuracy(true_labels, predicted_labels):
    # Create a confusion matrix
    cm = confusion_matrix(true_labels, predicted_labels)

    # Use the Hungarian algorithm to find the optimal assignment of clusters
    row_ind, col_ind = linear_sum_assignment(-cm)  # Maximize the matching (negative to maximize)
    
    # Calculate accuracy based on optimal matching
    accuracy = cm[row_ind, col_ind].sum() / len(true_labels)
    
    return accuracy

In [49]:
TESTdata_y

array(['tongue', 'tongue', 'finger', 'finger', 'finger', 'finger',
       'finger', 'finger', 'tongue', 'finger', 'tongue', 'tongue',
       'tongue', 'tongue', 'tongue', 'tongue', 'finger', 'finger',
       'tongue', 'tongue', 'tongue', 'finger', 'finger', 'tongue',
       'tongue', 'finger', 'tongue', 'tongue', 'finger', 'tongue',
       'tongue', 'finger', 'tongue', 'finger', 'tongue', 'tongue',
       'tongue', 'tongue', 'finger', 'tongue', 'tongue', 'finger',
       'tongue', 'finger', 'tongue', 'finger', 'finger', 'tongue',
       'tongue', 'finger', 'finger', 'finger', 'tongue', 'tongue',
       'finger', 'finger', 'tongue', 'finger', 'tongue', 'finger',
       'finger', 'finger', 'finger', 'finger', 'tongue', 'tongue',
       'tongue', 'finger', 'tongue', 'finger', 'tongue', 'finger',
       'tongue', 'finger', 'finger', 'tongue', 'tongue', 'finger',
       'tongue', 'tongue', 'finger', 'tongue', 'finger', 'tongue',
       'tongue', 'finger', 'finger', 'finger', 'finger', 'fing

In [50]:
before_y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0,
       0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0], dtype=int64)

In [51]:
def convert_labels_to_numeric(labels):
    """
    Convert string labels to numeric labels based on a predefined mapping.

    Args:
        labels (list or array): The string labels to be converted.

    Returns:
        list: Numeric labels.
    """
    # Define the mapping
    label_mapping = {
        'tongue': 0,
        'finger': 1,
    }
    
    # Map labels
    numeric_labels = [label_mapping[label] for label in labels]

    return numeric_labels

In [52]:
numeric_labels = convert_labels_to_numeric(TESTdata_y)
print("Original labels:", TESTdata_y)
print("Numeric labels:", numeric_labels)

Original labels: ['tongue' 'tongue' 'finger' 'finger' 'finger' 'finger' 'finger' 'finger'
 'tongue' 'finger' 'tongue' 'tongue' 'tongue' 'tongue' 'tongue' 'tongue'
 'finger' 'finger' 'tongue' 'tongue' 'tongue' 'finger' 'finger' 'tongue'
 'tongue' 'finger' 'tongue' 'tongue' 'finger' 'tongue' 'tongue' 'finger'
 'tongue' 'finger' 'tongue' 'tongue' 'tongue' 'tongue' 'finger' 'tongue'
 'tongue' 'finger' 'tongue' 'finger' 'tongue' 'finger' 'finger' 'tongue'
 'tongue' 'finger' 'finger' 'finger' 'tongue' 'tongue' 'finger' 'finger'
 'tongue' 'finger' 'tongue' 'finger' 'finger' 'finger' 'finger' 'finger'
 'tongue' 'tongue' 'tongue' 'finger' 'tongue' 'finger' 'tongue' 'finger'
 'tongue' 'finger' 'finger' 'tongue' 'tongue' 'finger' 'tongue' 'tongue'
 'finger' 'tongue' 'finger' 'tongue' 'tongue' 'finger' 'finger' 'finger'
 'finger' 'finger' 'finger' 'finger' 'tongue' 'finger' 'finger' 'finger'
 'finger' 'tongue' 'tongue' 'tongue']
Numeric labels: [0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1

In [53]:
ClusteringACC_before = clustering_accuracy(numeric_labels,before_y_pred)
print(f"Clustering Accuracy before FS: {ClusteringACC_before:.2f}")

Clustering Accuracy before FS: 0.58


### FS 1: CLeVer Hybrid

In [54]:
# compare clustering vs labels
ClusteringACC_CLEVERH = clustering_accuracy(numeric_labels,CLEVERH_y_pred)
print(f"Clustering Accuracy CLEVER Hybrid: {ClusteringACC_CLEVERH:.5f}")

Clustering Accuracy CLEVER Hybrid: 0.57000


### FS 2: CLeVer Cluster

In [55]:
ClusteringACC_CLEVERC = clustering_accuracy(numeric_labels,CLEVERC_y_pred)
print(f"Clustering Accuracy CLEVER Cluster: {ClusteringACC_CLEVERC:.5f}")

Clustering Accuracy CLEVER Cluster: 0.58000


### FS3: CLeVer Rank

In [56]:
ClusteringACC_CLEVERR = clustering_accuracy(numeric_labels,CLEVERR_y_pred)
print(f"Clustering Accuracy CLEVER Rank: {ClusteringACC_CLEVERR:.5f}")

Clustering Accuracy CLEVER Rank: 0.58000
