In [1]:
from pathlib import Path
import os

from bids import BIDSLayout
import numpy as np
import mne
from pyclustering.cluster.kmeans import kmeans
from pyclustering.cluster.center_initializer import random_center_initializer
import pandas as pd

In [2]:
# Root directory of the BIDS dataset. The location comes from the
# `biomag2020_data-bids` environment variable so that no machine-specific path
# is stored in the notebook; a KeyError is raised if the variable is not set.
bids_root = Path(os.environ['biomag2020_data-bids'])

In [3]:
# Index the dataset under `bids_root` with BIDS validation switched on.
# derivatives=True asks pybids to index the dataset's derivatives as well.
layout = BIDSLayout(bids_root, validate=True, derivatives=True)
# Last expression of the cell: display the derivative layouts that were found.
layout.derivatives

{'01_preprocessing': BIDS Layout: ...s\derivatives\01_preprocessing | Subjects: 33 | Sessions: 66 | Runs: 0,
 '02_eigenvalues': BIDS Layout: ...ids\derivatives\02_eigenvalues | Subjects: 33 | Sessions: 66 | Runs: 0}

In [4]:
# Query the layout for every file whose suffix is 'eigenvalues' and whose
# extension is 'npy'. The order of this list defines the row order used by the
# cells that follow.
# NOTE(review): presumably these files live in the '02_eigenvalues' derivative — confirm.
eigenvalue_files = layout.get(suffix='eigenvalues', extension='npy')

In [5]:
# Pull the 'subject' and 'session' entity of every eigenvalue file.
# Each list has one entry per file, in the same order as `eigenvalue_files`.
subjects, sessions = (
    [eig_file.entities[entity] for eig_file in eigenvalue_files]
    for entity in ('subject', 'session')
)

In [6]:
# One row per eigenvalue file. The positional index is exposed as the
# `eigs_id` column so each row can later be matched to its cluster by position.
df = (
    pd.DataFrame(list(zip(subjects, sessions)), columns=['subject', 'session'])
    .rename_axis('eigs_id')
    .reset_index()
)
df.head()

Unnamed: 0,eigs_id,subject,session
0,0,BQBBKEBX,1457629800
1,1,BQBBKEBX,1458832200
2,2,BYADLMJH,1416503760
3,3,BYADLMJH,1417706220
4,4,CECMHHYP,1364481360


In [7]:
# Load every eigenvalue array and stack them along a new leading axis, so that
# entry i of `eigs_all` comes from `eigenvalue_files[i]`.
eigs_all = np.stack(list(map(np.load, eigenvalue_files)))

In [8]:
# Number of clusters to fit.
k = 2
# Seed for drawing the initial centers. Without it the initializer seeds itself
# from the system time, so cluster membership (and which cluster is labelled 1
# or 2) changes on every Restart & Run All.
# NOTE(review): the `random_state` keyword requires a pyclustering release that
# supports it (0.10 or later, as far as I know) — confirm against the installed version.
kmeans_seed = 0

# Draw `k` random starting centers for the stacked eigenvalue arrays.
initial_centers = random_center_initializer(
    eigs_all, k, random_state=kmeans_seed
).initialize()
kmeans_instance = kmeans(eigs_all, initial_centers)

# Run the clustering, then read back the result:
# `clusters` is one collection of row indices into `eigs_all` per cluster,
# `final_centers` holds the fitted cluster centers.
kmeans_instance.process()
clusters = kmeans_instance.get_clusters()
final_centers = kmeans_instance.get_centers()
 

In [9]:
# Flatten the per-cluster index lists into a long table with one row per
# clustered item. Cluster labels start at 1; `eigs_id` is the item's index
# within the clustered data.
cluster_assignment = pd.DataFrame(
    [
        {'cluster_id': label, 'eigs_id': member}
        for label, members in enumerate(clusters, start=1)
        for member in members
    ],
    columns=['cluster_id', 'eigs_id'],
)
cluster_assignment.head()

Unnamed: 0,cluster_id,eigs_id
0,1,1
1,1,2
2,1,3
3,1,4
4,1,5


In [10]:
# Number of items assigned to each cluster, in cluster order.
list(map(len, clusters))

[37, 29]

In [11]:
# Attach each row's cluster label through the shared `eigs_id` key.
# A `cluster_id` column left over from an earlier run of this cell is dropped
# first, so re-running the cell (e.g. after re-clustering) replaces the labels
# instead of producing `cluster_id_x` / `cluster_id_y` columns.
# validate='one_to_one' makes the merge fail loudly if an `eigs_id` appears
# more than once on either side, rather than silently duplicating rows.
df = (
    df
    .drop(columns='cluster_id', errors='ignore')
    .merge(cluster_assignment, on='eigs_id', validate='one_to_one')
)
df.head()

Unnamed: 0,eigs_id,subject,session,cluster_id
0,0,BQBBKEBX,1457629800,2
1,1,BQBBKEBX,1458832200,1
2,2,BYADLMJH,1416503760,1
3,3,BYADLMJH,1417706220,1
4,4,CECMHHYP,1364481360,1


In [12]:
# For every subject take the smallest and largest cluster label across its
# rows, then count how many subjects share each (min, max) combination.
# Subjects whose min and max differ have rows in more than one cluster.
(
    df
    .groupby('subject')
    .agg({'cluster_id': ['min', 'max']})
    .value_counts()
    .sort_index()
)

(cluster_id, min)  (cluster_id, max)
1                  1                    12
                   2                    13
2                  2                     8
dtype: int64

In [13]:
# Per subject, count the distinct values among (min, max) of the cluster label:
# 1 means all of the subject's rows fall in one cluster, 2 means they do not.
# Then tally how many subjects fall into each case.
(
    df
    .groupby('subject')
    .agg({'cluster_id': ['min', 'max']})
    .nunique(axis='columns')
    .value_counts()
)

1    20
2    13
dtype: int64