In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import os
import subprocess
from scripts.GetSummary import summary

Let's use the `summary` function to collect the data from all participants.

In [None]:
# Collect one summary frame per participant folder found under baseDir.
participants = []
baseDir = 'data/participantdata'

# os.listdir returns entries in arbitrary order; sorting keeps the row order of
# the concatenated frame identical from one run (and one machine) to the next.
for folder in sorted(os.listdir(baseDir)):
    path = os.path.join(baseDir, folder)
    if not os.path.isdir(path):
        continue  # plain files sitting next to the participant folders are ignored
    try:
        # Tag every row with the folder name and use it as the index.
        summ = (
            summary(path)
            .reset_index()
            .assign(participant=folder)
            .set_index('participant')
        )
        participants.append(summ)
    except Exception as e:
        # Best effort: a participant whose summary fails is reported and skipped
        # so that the remaining folders are still processed.
        print(f"Skipping {folder}: {e}")

# pd.concat([]) would raise ValueError("No objects to concatenate"); raise the
# same exception type with a message that points at the actual cause.
if not participants:
    raise ValueError(f"No participant summaries could be loaded from {baseDir!r}")

summDF = pd.concat(participants)
summDF.head()

Now, let's pivot our dataframe to get a 16D vector for each participant.

In [None]:
# Reshape to one row per participant, with one column for every
# (statistic, Condition) pair of the 'accuracy' and 'RT' values.
pivot = pd.pivot_table(
    summDF,
    values=['accuracy', 'RT'],
    index='participant',
    columns=['Condition'],
)

# Flatten the two-level column labels into '<condition>_<statistic>' strings.
flat_names = []
for stat, condition in pivot.columns:
    flat_names.append(f'{condition}_{stat}')
pivot.columns = flat_names

df = pivot
df.head()

Now, let's filter out participants whose average accuracy on any of the tasks is below 0.5.

In [None]:
# Keep a participant only if their lowest value across all '*_accuracy'
# columns is at least 0.5.
accCols = [name for name in df.columns if name.endswith('_accuracy')]
worst_accuracy = df[accCols].min(axis=1)
df = df[worst_accuracy >= 0.5]
df.head()

Let's export this dataframe to a CSV file so that we can analyze it with the NbClust package in R to determine the optimal number of clusters.

In [None]:
# Write only the feature columns; index=False leaves the participant index out
# of the file.
vectors_path = 'data/preprocessed_vectors.csv'
df.to_csv(vectors_path, index=False)

Then, we run our R script and get back the optimal number of clusters.

In [None]:
# Run the R evaluation script. Its stdout is discarded, and check=True raises
# CalledProcessError if Rscript exits with a non-zero status.
subprocess.run(
    ['Rscript', 'scripts/NBClustEval.R'],
    stdout=subprocess.DEVNULL,
    check=True,
)

# NOTE(review): presumably the R script writes its chosen k to this file --
# confirm against scripts/NBClustEval.R.
with open('data/optimal_k.txt', 'r') as k_file:
    raw_k = k_file.read()
optimalK = int(raw_k)

print(f"Optimal number of clusters: {optimalK}")

Now that we've found the optimal number of clusters, let's run the clustering.

In [None]:
# Fixed seed for KMeans: without it the centroid initialisation is random, so
# the cluster labels could change on every Restart & Run All.
RANDOM_STATE = 42

# Standardise every column before clustering.
# NOTE(review): if any participant lacks a condition, the pivoted frame will
# contain NaN here -- confirm the upstream data is complete.
scaled = StandardScaler().fit_transform(df)

kmeans = KMeans(n_clusters=optimalK, n_init='auto', random_state=RANDOM_STATE)
labels = kmeans.fit_predict(scaled)

# Attach the labels to a copy so that df itself keeps only the feature columns.
clusteredDf = df.copy()
clusteredDf['cluster'] = labels
clusteredDf.head()


Exporting the clustered dataframe for further analysis.

In [None]:
# Save the labelled vectors; to_csv's default index=True keeps the participant
# index in the file.
clustered_path = 'data/clustered_data.csv'
clusteredDf.to_csv(clustered_path)