In [5]:
import pandas as pd

# Read the HDF5 file into `df1` (a single file is loaded here, not parquet).
# NOTE(review): later cells also use `df2`, which no visible cell defines —
# confirm where it is loaded so the notebook survives Restart & Run All.
df1 = pd.read_hdf("cognitive-circles/df40participants.h5")

# Last expression of the cell: preview the first rows.
df1.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,TaskPerceivedPhysicalEffort,TaskPerceivedDifficulty,TaskPerceivedFrustration,TaskPerceivedStress,ParticipantWantedToRerateTaskPerceivedDifficultyLevel,TaskPerceivedMentalDemmandAfterRerated,TaskPerceivedPhysicalEffortAfterRerated,TaskPerceivedDifficultyAfterRerated,TaskPerceivedFrustrationAfterRerated,TaskPerceivedStressAfterRerated
0,378.950623,390.868329,404.769576,422.604988,443.42394,467.209975,482.904738,498.319701,512.78404,527.204489,...,2,5,5,6,,6,2,5,5,6
1,454.875312,465.725686,476.588529,488.401496,500.25187,511.177057,522.0399,532.902743,537.995012,542.438903,...,2,2,4,5,,3,2,2,4,5
2,462.274314,442.548628,423.781796,405.987531,390.097257,376.15212,363.24813,357.103491,351.678928,343.234414,...,3,7,7,7,,7,3,7,7,7
3,621.979426,622.958853,623.0,620.246883,613.720075,605.987531,597.296135,585.975062,573.407107,565.043017,...,2,6,7,5,,6,2,6,7,5
4,612.955611,613.0,613.0,610.266334,605.554863,599.799002,593.087531,584.598005,576.598005,568.775561,...,2,4,5,4,,5,2,4,5,4


In [24]:
import numpy as np
from scipy.spatial.distance import cdist
def medoid(X):
    """Locate the medoid row of a 2-D sample array.

    The medoid is the sample whose summed Euclidean distance to every
    sample in ``X`` (itself included) is smallest.

    Parameters
    ----------
    X : array-like, one row per sample.

    Returns
    -------
    Positional index of the medoid row; on ties the first such row wins.
    """
    pairwise = cdist(X, X, metric='euclidean')
    # Row i of `pairwise` holds the distances from sample i to all samples.
    return pairwise.sum(axis=1).argmin()

# Result containers, keyed by variable code ('X' or 'V') and then by class label.
# `medoid_ids` and `global_medoid_ids` are read again by the metadata cell.
medoids = {'X': {}, 'V': {}}
medoid_ids = {'X': {}, 'V': {}}
centroids = {'X': {}, 'V': {}}
global_medoid_ids = {'X': -1, 'V': -1}

# NOTE(review): `df2` is not defined in any cell visible here; it must be
# loaded before this cell runs — confirm where it comes from.
for VAR, VARNAME, df in (('X', 'X', df2), ('V', 'velocity', df1)):
    medoids_VX = medoids[VAR]
    medoid_ids_VX = medoid_ids[VAR]
    centroids_VX = centroids[VAR]

    for cls, group in df.groupby("RealDifficulty"):
        # --- per (class, task): medoid and centroid of the task's instances ---
        for task, subgroup in group.groupby('Task'):
            # Features are the float64 columns of the *task subgroup*. The
            # previous code took them from the whole class `group`, so every
            # task of a class produced the same class-wide medoid/centroid.
            X_subgroup = subgroup.select_dtypes(include=[np.float64])
            id_task = medoid(X_subgroup.to_numpy())
            inst_id = X_subgroup.index[id_task]  # index label of the medoid row
            X_subgroup.iloc[id_task].to_frame().to_csv(
                f'data/{VARNAME}_{cls}_{task}_{inst_id}_medoid.csv', header=False)
            # The column-wise mean is already a Series indexed by feature name;
            # re-indexing it by the row labels (as before) discarded the values.
            centroid_task = X_subgroup.mean(axis=0)
            centroid_task.to_frame().to_csv(
                f'data/{VARNAME}_{cls}_{task}_centroid.csv', header=False)

        # --- per class: medoid and centroid over all instances of the class ---
        X_class = group.drop(columns=["RealDifficulty"]).select_dtypes(include=[np.float64])
        X = X_class.to_numpy()
        idx = medoid(X)
        centroids_VX[f'{cls}'] = np.mean(X, axis=0)
        medoids_VX[cls] = group.iloc[idx]        # full row, metadata columns included
        medoid_ids_VX[cls] = group.index[idx]    # index label, not position
        # Label the centroid values with the feature names. The previous code
        # passed the global centroid's *values* as the index.
        pd.Series(centroids_VX[f'{cls}'], index=X_class.columns).to_frame().to_csv(
            f'data/{VARNAME}_{cls}_centroid.csv', header=False)

    # --- global: centroid and medoid over every instance of this variable ---
    X_all = df.select_dtypes(include=[np.float64])
    global_centroid_VX = X_all.mean(axis=0)
    global_centroid_VX.to_frame().to_csv(f'data/{VARNAME}_global_centroid.csv', header=False)

    # NOTE(review): this stores a row *position*, whereas `medoid_ids` stores
    # index labels; the two only agree when `df` has a default 0..n-1 index.
    global_medoid_ids[VAR] = medoid(X_all.to_numpy())
    df.iloc[global_medoid_ids[VAR]].to_frame().to_csv(f'data/{VARNAME}_global_medoid.csv', header=False)
    

In [25]:
# Write one CSV per instance and collect a metadata record for each, keyed by
# '<VAR><index>'. Each record points at the instance file plus the reference
# series (opposite-class medoid/centroid, global medoid/centroid) written by
# the medoid/centroid cell.
metadata_dict = {}

# NOTE(review): `df2` is not defined in any cell visible here — confirm where
# it is loaded.
for VAR, VARNAME, df in (('X', 'X', df2), ('V', 'velocity', df1)):
    # Loop-invariant: distinct class labels, in order of first appearance.
    class_labels = list(df['RealDifficulty'].unique())

    for index, row in df.iterrows():
        cls = row["RealDifficulty"]
        # NOTE(review): `cls` comes from an iterrows() row, `opposite_class`
        # from the column itself; if every column were numeric the row would be
        # upcast and the two could format differently in file names — verify.
        filename = f'data/{VARNAME}_instance_{index:03d}_{cls}.csv'
        # Series.drop returns a new Series, so `row` itself stays untouched.
        row.drop("RealDifficulty").to_frame().to_csv(filename, header=False)

        # First label that differs from this row's class. With exactly two
        # classes this is "the other one"; with more it is now deterministic
        # instead of depending on set iteration order.
        opposite_class = next(label for label in class_labels if label != cls)
        # Bound to a local so the f-string below needs no nested quotes
        # (same-quote nesting is a SyntaxError before Python 3.12).
        opposite_medoid_id = medoid_ids[VAR][opposite_class]

        metadata_dict[f'{VAR}' + str(index)] = {
            'series': filename,
            'class': cls,
            'participant': row['ParticipantID'],
            'task': row['Task'],
            'medoid_id_opposite_class': opposite_medoid_id,
            'medoid_opposite_class_series': f'data/{VARNAME}_instance_{opposite_medoid_id:03d}_{opposite_class}.csv',
            # Uses VARNAME (not VAR) to match the file the centroid cell writes,
            # e.g. data/velocity_global_centroid.csv.
            'global_centroid_series': f'data/{VARNAME}_global_centroid.csv',
            'centroid_opposite_class_series': f'data/{VARNAME}_{opposite_class}_centroid.csv',
            'beta_attributions': f'{VARNAME}/beta_inst_{index:03d}.csv',
            'global_medoid_id': global_medoid_ids[VAR],
            'global_medoid_series': f'data/{VARNAME}_global_medoid.csv',
        }

In [26]:
# `metadata_dict` maps instance key -> record, so the frame has one column per
# instance; transpose it so that each instance becomes a row in the CSV.
meta_df = pd.DataFrame(data=metadata_dict)
# index=False: the instance keys themselves are not written to the file.
meta_df.transpose().to_csv(path_or_buf="metadata.csv", index=False)
