In [None]:
import os
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import torch
from matplotlib import pyplot as plt
from scipy.stats import skew, spearmanr
import shap
from sktime.transformations.panel.rocket import Rocket
from scipy.spatial.distance import euclidean
from sklearn.metrics import silhouette_score

from sklearn.model_selection import train_test_split, KFold, LeaveOneOut
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from scipy.spatial.distance import cdist


from sklearn_extra.cluster import KMedoids 
import statistics


from sklearn.model_selection import LeaveOneOut
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error, r2_score
import numpy as np


from sklearn.metrics import mean_absolute_error, r2_score

import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor

# One shared seed for the notebook; it is also passed as `random_state` to the
# KFold used in the SHAP feature-ranking cell.
seed = 69
# Seed NumPy's legacy global RNG.
np.random.seed(seed)
# Seed PyTorch's default generator.
torch.manual_seed(seed)


In [52]:
# Load the imputed feature table. Every modality below is a list of column
# names picked by substring match on the column labels.
data = pd.read_csv('all-features-imputed-v2.csv')

# Motion: columns matching any of these keywords, concatenated keyword by keyword.
motion = [
    column
    for keyword in ('motion', 'acceleration', 'step', 'position')
    for column in data.filter(like=keyword).columns.tolist()
]

# Heart rate: 'heartrate' columns whose name does not contain 'sleep'.
heartrate = (
    data
    .filter(like='heartrate')
    .filter(regex='^(?!.*sleep)')
    .columns
    .tolist()
)

# Sleep: every column whose name contains 'sleep'.
sleep = data.filter(like='sleep').columns.tolist()

demographics = ['age', 'sex']

# Full day-level design matrix: motion, heart rate, sleep, then demographics.
modalities = [*motion, *heartrate, *sleep, *demographics]
sensor = data.loc[:, modalities]
x = np.array(sensor)




In [53]:
# Set Clinical Score here
target = data['sis']
participant = data['participant']

# One NumPy matrix per modality; rows follow the row order of `data`.
motion_sensor, heartrate_sensor, sleep_sensor = (
    np.array(data[columns]) for columns in (motion, heartrate, sleep)
)
demographic_array = data[demographics].to_numpy()

# Day-level label vector and the participant id of each row.
y = np.array(target)
p = np.array(participant)

In [None]:
# ---------------------------------------------------------------------------
# Rank the day-level features by mean |SHAP| over a 5-fold CV, then pick a
# per-modality quota of the top-ranked features.
# ---------------------------------------------------------------------------
SHAP = []            # one SHAP matrix per fold (rows = held-out days)
X_TEST = []          # the scaled held-out matrix of each fold, row-aligned with SHAP
y_true_folds = []    # per-fold pieces, concatenated once after the loop
y_pred_folds = []

cv = KFold(n_splits=5, shuffle=True, random_state=seed)

for fold, (train_idx, test_idx) in enumerate(cv.split(x), start=1):

    print(f"Fold {fold}: train={len(train_idx)} test={len(test_idx)}")

    x_train, x_test = x[train_idx], x[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # Both scalers are fitted on the training fold only and then applied to the test fold.
    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)

    normalizer = MinMaxScaler()
    x_train = normalizer.fit_transform(x_train)
    x_test = normalizer.transform(x_test)

    model = CatBoostRegressor(
        iterations=1000,
        learning_rate=0.1,
        depth=3,
        loss_function='RMSE',
        verbose=False
    )

    # The eval set is the training fold itself, so best-iteration selection is
    # driven by training loss; the held-out fold is not seen during fitting.
    model.fit(x_train, y_train, eval_set=(x_train, y_train), use_best_model=True, early_stopping_rounds=100)
    y_preds = model.predict(x_test)

    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(x_test)

    SHAP.append(shap_values)
    X_TEST.append(x_test)
    y_true_folds.append(y_test)
    y_pred_folds.append(y_preds)

# Concatenate once instead of growing arrays with np.append inside the loop.
# The leading empty float array keeps the same dtype promotion as before.
Y_TRUES = np.concatenate([np.empty(0)] + [np.ravel(part) for part in y_true_folds])
Y_PREDS = np.concatenate([np.empty(0)] + [np.ravel(part) for part in y_pred_folds])

# Mean absolute SHAP value per feature over all held-out rows, highest first.
shap_df = pd.DataFrame({
    'feature': modalities,
    'mean_abs_shap': np.abs(np.vstack(SHAP)).mean(axis=0).round(4)
}).sort_values(by='mean_abs_shap', ascending=False)

# Per-modality rankings (each keeps the descending order of shap_df).
motion_shap = shap_df[shap_df['feature'].isin(motion)]
heartrate_shap = shap_df[shap_df['feature'].isin(heartrate)]
sleep_shap = shap_df[shap_df['feature'].isin(sleep)]

# Quota per group: proportional to the group's share of all ranked features,
# at least 1, and never more than the group actually contains.
total_features = 16
n_motion = min(len(motion_shap), max(1, round(total_features * len(motion)/len(shap_df))))
n_heartrate = min(len(heartrate_shap), max(1, round(total_features * len(heartrate)/len(shap_df))))
n_sleep = min(len(sleep_shap), max(1, round(total_features * len(sleep)/len(shap_df))))

# Over budget: trim the currently largest group until the total fits.
while n_motion + n_heartrate + n_sleep > total_features:
    if n_motion >= n_heartrate and n_motion >= n_sleep:
        n_motion -= 1
    elif n_heartrate >= n_motion and n_heartrate >= n_sleep:
        n_heartrate -= 1
    else:
        n_sleep -= 1

# Under budget (rounding, and the demographic columns counted in len(shap_df),
# can leave slots unused): give each spare slot to the group whose next-ranked
# feature has the largest mean |SHAP|, until the budget is met or all groups
# are exhausted.
while n_motion + n_heartrate + n_sleep < total_features:
    next_scores = {}
    if n_motion < len(motion_shap):
        next_scores['motion'] = motion_shap['mean_abs_shap'].iloc[n_motion]
    if n_heartrate < len(heartrate_shap):
        next_scores['heartrate'] = heartrate_shap['mean_abs_shap'].iloc[n_heartrate]
    if n_sleep < len(sleep_shap):
        next_scores['sleep'] = sleep_shap['mean_abs_shap'].iloc[n_sleep]
    if not next_scores:
        break  # fewer than total_features features exist across the three groups
    best_group = max(next_scores, key=next_scores.get)
    if best_group == 'motion':
        n_motion += 1
    elif best_group == 'heartrate':
        n_heartrate += 1
    else:
        n_sleep += 1

# Select top features from each group
top_motion = motion_shap.head(n_motion)['feature'].tolist()
top_heartrate = heartrate_shap.head(n_heartrate)['feature'].tolist()
top_sleep = sleep_shap.head(n_sleep)['feature'].tolist()

print(top_motion)
print(top_sleep)
print(top_heartrate)

# NOTE(review): the weekly bags built in the next cell are reshaped from
# motion_sensor / heartrate_sensor / sleep_sensor (all columns), not from these
# selected-feature matrices. Confirm whether the selection is meant to be used.
x_motion = np.array(data[top_motion])
x_heartrate = np.array(data[top_heartrate])
x_sleep = np.array(data[top_sleep])






In [55]:









# Group the day-level rows into weekly bags of DAYS_PER_WEEK consecutive rows.
# Assumes the rows of `data` are ordered so that every consecutive run of 7 rows
# belongs to one week of one participant; TODO confirm against the CSV.
DAYS_PER_WEEK = 7
if len(y) % DAYS_PER_WEEK != 0:
    raise ValueError(
        f"Expected a whole number of {DAYS_PER_WEEK}-day weeks, got {len(y)} rows"
    )
# Derived from the data instead of a hard-coded 80: with a fixed first axis and a
# -1 last axis, extra rows would be folded silently into the feature axis.
n_weeks = len(y) // DAYS_PER_WEEK

# One score per week: the label of the week's first row
# (presumably the score is constant within a week; verify).
weekly_scores = y.reshape(n_weeks, DAYS_PER_WEEK)[:, 0]  # Shape: (n_weeks,)

# Sensor bags, shape (n_weeks, DAYS_PER_WEEK, n_features) per modality.
motion_bags = motion_sensor.reshape(n_weeks, DAYS_PER_WEEK, motion_sensor.shape[1])
heartrate_bags = heartrate_sensor.reshape(n_weeks, DAYS_PER_WEEK, heartrate_sensor.shape[1])
#demographic_bags = demographic_array.reshape(n_weeks, DAYS_PER_WEEK, demographic_array.shape[1])
sleep_bags = sleep_sensor.reshape(n_weeks, DAYS_PER_WEEK, sleep_sensor.shape[1])








In [56]:
def compute_connectivity(distance_matrix, cluster_labels, k_neighbors=5):
    """
    Compute a normalised connectivity score for a clustering.

    For every sample, its nearest neighbours are looked up in
    ``distance_matrix`` and each neighbour carrying a different cluster label
    counts as one violation. The score is the fraction of violated
    (sample, neighbour) pairs, so it lies in [0, 1]; lower is better.

    Parameters
    ----------
    distance_matrix : array-like, shape (n_samples, n_samples)
        Pairwise distances. The input is not modified.
    cluster_labels : array-like, shape (n_samples,)
        Cluster label of each sample.
    k_neighbors : int, default 5
        Neighbours inspected per sample. Capped at ``n_samples - 1``, the
        number of other samples available.

    Returns
    -------
    float
        Fraction of neighbour pairs that cross a cluster boundary, or 0 when
        there are no neighbours to inspect (fewer than two samples, or
        ``k_neighbors <= 0``).
    """
    distances = np.array(distance_matrix, dtype=float)  # private copy, safe to edit
    labels = np.asarray(cluster_labels)
    n_samples = distances.shape[0]

    k = min(k_neighbors, n_samples - 1)
    if k <= 0:
        return 0

    # Exclude each sample from its own neighbour list explicitly. Dropping
    # "column 0 of the argsort" is wrong when another sample is at distance 0:
    # the tie can push self out of first place, so a real neighbour would be
    # discarded and self kept instead.
    np.fill_diagonal(distances, np.inf)
    neighbor_indices = np.argsort(distances, axis=1, kind='stable')[:, :k]

    # mismatches[i, j] is True when the j-th neighbour of sample i is in another cluster.
    mismatches = labels[neighbor_indices] != labels[:, np.newaxis]

    # Normalise by the number of pairs actually inspected.
    return float(mismatches.sum()) / (n_samples * k)


def compute_distance_matrix(bags):
    """Build the symmetric matrix of pairwise average Hausdorff distances between bags."""
    count = len(bags)
    pairwise = np.zeros((count, count))

    # Visit every unordered pair once (strict upper triangle) and mirror the value;
    # the diagonal stays at zero.
    upper_rows, upper_cols = np.triu_indices(count, k=1)
    for row, col in zip(upper_rows, upper_cols):
        pairwise[row, col] = pairwise[col, row] = average_hausdorff_distance(bags[row], bags[col])

    return pairwise

def clustering(bags,matrix, n_clusters=10):
    """Cluster bags with K-medoids (PAM, "build" init) on a precomputed distance matrix.

    Prints the silhouette score and returns ``(medoids, silhouette_avg, clusters)``.

    NOTE: a second ``clustering`` is defined further down in this same cell.
    It rebinds the name and returns four values, so this three-value version
    is not the one the rest of the notebook ends up calling.
    """
    model = KMedoids(n_clusters=n_clusters, metric='precomputed', method='pam', init="build")

    clusters = model.fit_predict(matrix)
    silhouette_avg = silhouette_score(matrix, clusters, metric="precomputed")
    print(f"Silhouette Score: {silhouette_avg:.3f}")

    # Keep only medoid indices that address an existing bag, then fetch those bags.
    n_bags = len(bags)
    medoids = [bags[index] for index in model.medoid_indices_ if index < n_bags]

    return medoids,silhouette_avg, clusters



def average_hausdorff_distance(bag1, bag2):
    """Calculate the average Hausdorff distance between two bags of instances.

    Rows containing NaN or +/-inf are dropped first. For every remaining row of
    one bag the Euclidean distance to its nearest row in the other bag is
    taken; these nearest-neighbour distances are averaged per direction and
    the two directional means are averaged.

    Parameters
    ----------
    bag1, bag2 : array-like, shape (n_rows, n_features)
        The two bags; both need the same number of features.

    Returns
    -------
    numpy.float64
        The symmetric average Hausdorff distance.

    Raises
    ------
    ValueError
        If either bag has no fully finite row left after filtering.
    """
    bag1 = np.asarray(bag1, dtype=float)
    bag2 = np.asarray(bag2, dtype=float)

    # isfinite rejects NaN as well as +/-inf, so one filter per bag is enough.
    bag1 = bag1[np.isfinite(bag1).all(axis=1)]
    bag2 = bag2[np.isfinite(bag2).all(axis=1)]

    if len(bag1) == 0 or len(bag2) == 0:
        raise ValueError(
            "average_hausdorff_distance needs at least one fully finite row in each bag"
        )

    # One vectorised pairwise computation; nearest-neighbour distances are read
    # off its row minima (bag1 -> bag2) and column minima (bag2 -> bag1).
    dist_matrix = cdist(bag1, bag2, metric='euclidean')
    mean_a_to_b = dist_matrix.min(axis=1).mean()
    mean_b_to_a = dist_matrix.min(axis=0).mean()

    return (mean_a_to_b + mean_b_to_a) / 2


def clustering(bags, distance_matrix, n_clusters=10):
    """Run K-medoids (PAM, "build" init) on a precomputed distance matrix.

    Returns ``(medoids, silhouette_avg, connectivity, clusters)``: the bags
    chosen as medoids, the silhouette score and connectivity score of the
    partition, and the cluster label of every bag.
    """
    model = KMedoids(
        metric='precomputed',
        method='pam',
        init="build",
        n_clusters=n_clusters,
    )
    clusters = model.fit_predict(distance_matrix)

    # Quality metrics of the partition, both computed from the same distance matrix.
    silhouette_avg = silhouette_score(distance_matrix, clusters, metric="precomputed")
    connectivity = compute_connectivity(distance_matrix, clusters)

    # Keep only medoid indices that address an existing bag, then fetch those bags.
    n_bags = len(bags)
    medoids = [bags[index] for index in model.medoid_indices_ if index < n_bags]

    return medoids, silhouette_avg, connectivity, clusters






def get_scaling_param(training_bags):
    """Return per-feature mean, std, min and max over every row of every bag.

    Each bag is flattened to 2-D with its last axis kept as the feature axis,
    all bags are stacked row-wise, and the four statistics are taken column by
    column.
    """
    rows_per_bag = [bag.reshape(-1, bag.shape[-1]) for bag in training_bags]
    stacked = np.concatenate(rows_per_bag, axis=0)

    reducers = (('mean', np.mean), ('std', np.std), ('min', np.min), ('max', np.max))
    return {key: reducer(stacked, axis=0) for key, reducer in reducers}



In [57]:
def compute_cluster_features(cluster_assignments, y, n_clusters):
    """Summarise the regression labels inside each cluster.

    Returns an array with one row per cluster id in ``range(n_clusters)``,
    holding ``[min, max, mean, std, median]`` of the labels assigned to that
    cluster. A cluster with no members gets a row of five NaNs.
    """
    rows = []
    for cluster_id in range(n_clusters):
        members = y[cluster_assignments == cluster_id]
        if len(members) > 0:
            row = [
                np.min(members),
                np.max(members),
                np.mean(members),
                np.std(members),
                np.median(members),
            ]
        else:
            row = [np.nan] * 5
        rows.append(row)
    return np.array(rows)
def transform_bag(training_bag, consensus_medoids):
    """Embed one multi-view sample as its distances to the consensus clusters.

    ``training_bag`` maps a view name to that view's bag, e.g.
    ``{'motion': motion_bag, 'heartrate': hr_bag, 'sleep': sleep_bag}``.
    ``consensus_medoids[k]`` is a list of ``(view, medoid_bag)`` pairs for
    consensus cluster ``k``, with ``k`` running over ``range(len(consensus_medoids))``.

    Feature ``k`` is the mean average-Hausdorff distance between the sample's
    bag and each medoid bag of cluster ``k`` (compared view by view), or 0
    when the cluster has no medoids.
    """
    embedding = []
    for consensus_id in range(len(consensus_medoids)):
        per_view = [
            average_hausdorff_distance(training_bag[view_name], reference_bag)
            for view_name, reference_bag in consensus_medoids[consensus_id]
        ]
        embedding.append(np.mean(per_view) if per_view else 0)
    return np.array(embedding)
def perform_consensus_clustering(train_bags, y_train,
                                 views=('motion', 'heartrate', 'sleep'),
                                 n_clusters_per_view=None,
                                 consensus_k=6,
                                 return_diagnostics=False):
    """Multi-view consensus clustering with fixed cluster counts.

    Steps:
      1. Each view's bags are clustered with K-medoids on their pairwise
         average-Hausdorff distance matrix.
      2. Each per-view cluster is described by label statistics of its members
         (see ``compute_cluster_features``).
      3. These descriptors, pooled over all views, are clustered again with
         K-medoids (Euclidean) into ``consensus_k`` consensus clusters.
      4. Each consensus cluster is mapped back to the medoid bags of the
         per-view clusters it contains.

    Parameters
    ----------
    train_bags : dict
        Maps a view name to the sequence of training bags of that view.
    y_train : numpy.ndarray
        Regression label of each training sample, aligned with the bags.
    views : sequence of str
        Views to cluster, in this order.
    n_clusters_per_view : dict, optional
        Overrides for the per-view cluster counts. Defaults are
        motion=6, heartrate=6, sleep=9, demographics=6.
    consensus_k : int, default 6
        Number of consensus clusters.
    return_diagnostics : bool, default False
        When True, also return a dict of clustering quality metrics.

    Returns
    -------
    dict, or (dict, dict) when ``return_diagnostics`` is True
        ``consensus_medoids[k]`` is a list of ``(view, medoid_bag)`` pairs for
        consensus cluster ``k``. The diagnostics dict holds per-view
        silhouette/connectivity, the consensus silhouette, and the mean
        distance of a descriptor to its consensus medoid.

    Raises
    ------
    ValueError
        If a requested view has no configured cluster count.
    """
    cluster_counts = {'motion': 6, 'heartrate': 6, 'sleep': 9, 'demographics': 6}
    if n_clusters_per_view:
        cluster_counts.update(n_clusters_per_view)

    # Fail loudly instead of silently reusing the previous view's count.
    unconfigured = [view for view in views if view not in cluster_counts]
    if unconfigured:
        raise ValueError(f"No cluster count configured for view(s): {unconfigured}")

    # 1. Cluster every view on its own.
    view_clusters = {}
    view_medoids = {}
    view_diagnostics = {}
    for view in views:
        bags = train_bags[view]
        distance_matrix = compute_distance_matrix(bags)
        medoids, silhouette_avg, connectivity, clusters = clustering(
            bags, distance_matrix, cluster_counts[view]
        )
        view_clusters[view] = clusters
        view_medoids[view] = medoids
        view_diagnostics[view] = {'silhouette': silhouette_avg, 'connectivity': connectivity}

    # 2. Describe each per-view cluster by the labels of its members, and
    #    remember which (view, cluster) each descriptor row came from.
    all_features = []
    cluster_origins = []
    for view in views:
        label_stats = compute_cluster_features(view_clusters[view], y_train, cluster_counts[view])
        for cluster_idx, stats in enumerate(label_stats):
            all_features.append(stats)
            cluster_origins.append((view, cluster_idx))
    all_features = np.array(all_features)

    # 3. Consensus clustering of the pooled descriptors.
    kmedoids_consensus = KMedoids(
        n_clusters=consensus_k,
        metric='euclidean'
    )
    consensus_labels = kmedoids_consensus.fit_predict(all_features)

    # 4. Map each consensus cluster back to the medoid bags of its members.
    consensus_medoids = {}
    for consensus_idx in range(consensus_k):
        member_rows = np.where(consensus_labels == consensus_idx)[0]
        medoids = []
        for view, orig_cluster in (cluster_origins[i] for i in member_rows):
            # Bounds check: skip clusters whose medoid bag was not returned.
            if orig_cluster < len(view_medoids[view]):
                medoids.append((view, view_medoids[view][orig_cluster]))
        consensus_medoids[consensus_idx] = medoids

    if not return_diagnostics:
        return consensus_medoids

    # Mean Euclidean distance of every descriptor to its consensus medoid,
    # normalised once over all descriptors rather than once per iteration.
    assigned_medoids = all_features[kmedoids_consensus.medoid_indices_[consensus_labels]]
    diagnostics = {
        'per_view': view_diagnostics,
        'consensus_silhouette': silhouette_score(all_features, consensus_labels),
        'consensus_mean_medoid_distance': float(
            np.mean(np.linalg.norm(all_features - assigned_medoids, axis=1))
        ),
    }
    return consensus_medoids, diagnostics

In [58]:


def cross(bags_dict, y, n_folds=5, random_state=None, val_size=0.2):
    """Perform K-fold cross-validation with consensus clustering and CatBoost regression.

    For every fold: bags are min-max scaled with training-fold statistics,
    consensus medoids are learned on the training fold, every sample is
    embedded as its distances to those medoids, and a CatBoost regressor is
    fitted on the embeddings.

    The held-out fold is used for scoring only. Scaling statistics come from
    the training bags, and early stopping / best-iteration selection use a
    validation split carved out of the training fold. The consensus medoids
    are learned from the whole training fold, validation part included.

    Parameters
    ----------
    bags_dict : dict
        Maps 'motion', 'heartrate' and 'sleep' to an indexable collection of bags.
    y : numpy.ndarray
        One regression target per sample, aligned with the bags.
    n_folds : int, default 5
        Number of folds.
    random_state : int or None, default None
        Seed for the fold shuffling and the inner validation split.
    val_size : float or int, default 0.2
        Share (or count) of each training fold held back for early stopping.

    Returns
    -------
    dict
        'global_mae' and 'global_r2' (means of the per-fold scores) and
        'fold_details' (one dict per fold).
    """
    views = ('motion', 'heartrate', 'sleep')

    def scale_bag(bag, stats):
        """Min-max scale one bag feature-wise with the given statistics."""
        # Standardising and then min-max scaling with statistics of the same
        # data reduces to plain min-max scaling, so only min and max are needed.
        span = stats['max'] - stats['min']
        span = np.where(span == 0, 1.0, span)  # constant feature: map to 0, avoid 0/0
        return (bag - stats['min']) / span

    kf = KFold(n_splits=n_folds, shuffle=True, random_state=random_state)
    mae_scores = []
    r2_scores = []
    fold_info = []

    for fold, (train_idx, test_idx) in enumerate(kf.split(y), start=1):
        print(f"Processing fold {fold}/{n_folds}")

        # Split data for this fold
        train_bags = {view: [bags_dict[view][i] for i in train_idx] for view in bags_dict}
        test_bags = {view: [bags_dict[view][i] for i in test_idx] for view in bags_dict}
        y_train, y_test = y[train_idx], y[test_idx]

        # Scaling statistics are estimated on the training bags only and then
        # applied unchanged to the test bags.
        for view in views:
            stats = get_scaling_param(train_bags[view])
            train_bags[view] = [scale_bag(bag, stats) for bag in train_bags[view]]
            test_bags[view] = [scale_bag(bag, stats) for bag in test_bags[view]]

        # Set up consensus medoids using training bags
        consensus_medoids = perform_consensus_clustering(train_bags, y_train)

        # Embed every sample as its distances to the consensus medoids
        X_train_transformed = np.array([
            transform_bag({view: train_bags[view][i] for view in views}, consensus_medoids)
            for i in range(len(train_idx))
        ])
        X_test_transformed = np.array([
            transform_bag({view: test_bags[view][i] for view in views}, consensus_medoids)
            for i in range(len(test_idx))
        ])

        # Inner validation split for early stopping; the test fold stays unseen.
        fit_idx, val_idx = train_test_split(
            np.arange(len(y_train)), test_size=val_size, random_state=random_state
        )

        # Train and evaluate CatBoost
        model = CatBoostRegressor(
            iterations=1000,
            learning_rate=0.1,
            depth=3,
            loss_function='RMSE',
            verbose=False
        )
        model.fit(
            X_train_transformed[fit_idx], y_train[fit_idx],
            eval_set=(X_train_transformed[val_idx], y_train[val_idx]),
            use_best_model=True,
            early_stopping_rounds=100
        )

        y_pred = model.predict(X_test_transformed)

        # Store metrics
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        mae_scores.append(mae)
        r2_scores.append(r2)

        fold_info.append({
            'fold': fold,
            'mae': mae,
            'r2': r2,
            'n_train': len(train_idx),
            'n_test': len(test_idx)
        })

    return {
        'global_mae': np.mean(mae_scores),
        'global_r2': np.mean(r2_scores),
        'fold_details': fold_info
    }

In [59]:
def loso(bags_dict, y, random_state=None, val_size=0.2):
    """Perform Leave-One-Sample-Out cross-validation with consensus clustering and CatBoost regression.

    Each sample is held out once. On the remaining samples the bags are
    min-max scaled, consensus medoids are learned, every sample is embedded as
    its distances to those medoids, and a CatBoost regressor is fitted; the
    held-out sample is then predicted.

    The held-out sample is used for scoring only. Scaling statistics come from
    the training bags, and early stopping / best-iteration selection use a
    validation split carved out of the training samples. The consensus medoids
    are learned from all training samples, validation part included.

    NOTE(review): the split is per sample. If one participant contributes
    several samples, the others stay in the training set; confirm whether a
    leave-one-participant-out split is wanted instead.

    Parameters
    ----------
    bags_dict : dict
        Maps 'motion', 'heartrate' and 'sleep' to an indexable collection of bags.
    y : numpy.ndarray
        One regression target per sample, aligned with the bags.
    random_state : int or None, default None
        Seed for the inner validation split.
    val_size : float or int, default 0.2
        Share (or count) of the training samples held back for early stopping.

    Returns
    -------
    dict
        'global_mae', 'global_r2' (computed over all held-out predictions),
        'fold_details', 'all_true' and 'all_pred'.
    """
    views = ('motion', 'heartrate', 'sleep')

    def scale_bag(bag, stats):
        """Min-max scale one bag feature-wise with the given statistics."""
        # Standardising and then min-max scaling with statistics of the same
        # data reduces to plain min-max scaling, so only min and max are needed.
        span = stats['max'] - stats['min']
        span = np.where(span == 0, 1.0, span)  # constant feature: map to 0, avoid 0/0
        return (bag - stats['min']) / span

    n_samples = len(y)
    loo = LeaveOneOut()
    abs_errors = []
    all_y_true = []
    all_y_pred = []
    fold_info = []

    for fold, (train_idx, test_idx) in enumerate(loo.split(y), start=1):
        print(f"Processing fold {fold}/{n_samples}")

        # Split data for this fold
        train_bags = {view: [bags_dict[view][i] for i in train_idx] for view in bags_dict}
        test_bags = {view: [bags_dict[view][i] for i in test_idx] for view in bags_dict}
        y_train, y_test = y[train_idx], y[test_idx]
        true_val = y_test[0]  # Extract scalar value

        # Scaling statistics are estimated on the training bags only and then
        # applied unchanged to the held-out bag.
        for view in views:
            stats = get_scaling_param(train_bags[view])
            train_bags[view] = [scale_bag(bag, stats) for bag in train_bags[view]]
            test_bags[view] = [scale_bag(bag, stats) for bag in test_bags[view]]

        # 1. Cluster using TRAINING DATA ONLY
        consensus_medoids = perform_consensus_clustering(train_bags, y_train)

        # 2. Embed every sample as its distances to the consensus medoids
        X_train_transformed = np.array([
            transform_bag({view: train_bags[view][i] for view in views}, consensus_medoids)
            for i in range(len(train_idx))
        ])
        X_test_transformed = np.array([
            transform_bag({view: test_bags[view][0] for view in views}, consensus_medoids)
        ])

        # Inner validation split for early stopping; the held-out sample stays unseen.
        fit_idx, val_idx = train_test_split(
            np.arange(len(y_train)), test_size=val_size, random_state=random_state
        )

        # 3. Train and evaluate CatBoost
        model = CatBoostRegressor(
            iterations=1000,
            learning_rate=0.1,
            depth=3,
            loss_function='RMSE',
            verbose=False
        )
        model.fit(
            X_train_transformed[fit_idx], y_train[fit_idx],
            eval_set=(X_train_transformed[val_idx], y_train[val_idx]),
            use_best_model=True,
            early_stopping_rounds=100
        )

        pred_val = model.predict(X_test_transformed)[0]

        # Store metrics for final calculation
        abs_error = abs(true_val - pred_val)
        abs_errors.append(abs_error)
        all_y_true.append(true_val)
        all_y_pred.append(pred_val)

        fold_info.append({
            'fold': fold,
            'abs_error': abs_error,
            'true': true_val,
            'pred': pred_val,
            'n_train': len(train_idx),
            'n_test': 1
        })

    global_mae = np.mean(abs_errors)
    global_r2 = r2_score(all_y_true, all_y_pred)

    return {
        'global_mae': global_mae,
        'global_r2': global_r2,
        'fold_details': fold_info,
        'all_true': all_y_true,
        'all_pred': all_y_pred
    }

In [None]:


# Module-level accumulators; nothing in this cell appends to them.
sil_list, con_list = [], []

# One collection of weekly bags per view, keyed by view name.
bags_dict = dict(
    motion=motion_bags,
    heartrate=heartrate_bags,
    sleep=sleep_bags,
    # demographics=demographic_bags,
)

# Single evaluation run; widen the range to repeat it.
for i in range(1):
    results = loso(bags_dict, weekly_scores)
    print(f"Final MAE: {results['global_mae']:.3f}, R²: {results['global_r2']:.3f}")


