In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import os
from scripts.GetSummary import summary

Let's use the summary function to collect the data from all participants

In [2]:
participants = []
baseDir = 'data/participantdata'

for folder in os.listdir(baseDir):
    path = os.path.join(baseDir, folder)
    if os.path.isdir(path):
        try:
            summ = summary(path).reset_index()
            summ['participant'] = folder
            summ = summ.set_index('participant')
            participants.append(summ)
        except Exception as e:
            print(f"Skipping {folder}: {e}")

summDF = pd.concat(participants)
summDF.head()

Skipping reldom2_s041: "['Order', 'Type'] not in index"


Unnamed: 0_level_0,Condition,accuracy,RT
participant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
reldom2_s001,FO_audwhat,1.0,1.015675
reldom2_s001,FO_audwhere,0.78125,1.720722
reldom2_s001,FO_viswhat,1.0,1.29637
reldom2_s001,FO_viswhere,0.96875,1.341634
reldom2_s001,SO_audwhat,1.0,0.808944


Now, let's pivot our dataframe to get a 16D vector for each participant.

In [3]:
pivot = summDF.pivot_table(
    index='participant',
    columns=['Condition'],
    values=['accuracy', 'RT']
)

pivot.columns = [f'{condition}_{stat}' for stat, condition in pivot.columns]
df = pivot
df.head()

Unnamed: 0_level_0,FO_audwhat_RT,FO_audwhere_RT,FO_viswhat_RT,FO_viswhere_RT,SO_audwhat_RT,SO_audwhere_RT,SO_viswhat_RT,SO_viswhere_RT,FO_audwhat_accuracy,FO_audwhere_accuracy,FO_viswhat_accuracy,FO_viswhere_accuracy,SO_audwhat_accuracy,SO_audwhere_accuracy,SO_viswhat_accuracy,SO_viswhere_accuracy
participant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
reldom2_s001,1.015675,1.720722,1.29637,1.341634,0.808944,1.606454,1.251278,1.128,1.0,0.78125,1.0,0.96875,1.0,0.75,0.875,1.0
reldom2_s002,1.34929,1.356664,1.165648,1.05839,1.461668,1.584935,1.322362,1.095812,0.75,0.8125,0.9375,0.935484,0.875,0.6875,0.9375,0.875
reldom2_s003,1.077459,1.149573,0.877843,0.902444,1.103434,1.035821,0.832946,0.76639,0.9375,1.0,1.0,1.0,0.75,1.0,1.0,1.0
reldom2_s004,0.960171,1.142065,1.030101,0.832803,1.052119,1.008416,0.955326,0.707979,0.96875,1.0,1.0,0.96875,1.0,1.0,1.0,1.0
reldom2_s005,1.505764,1.259297,1.154793,1.004116,1.255591,0.999182,0.895621,0.822283,0.8125,0.9375,0.96875,0.9375,0.9375,1.0,0.9375,0.875


Now, let's filter out participants who have average accuracies on any of the tasks less than 0.5.

In [4]:
accCols = [i for i in df.columns if i.endswith('_accuracy')]
df = df[df[accCols].min(axis=1) >= 0.5]
df.head()

Unnamed: 0_level_0,FO_audwhat_RT,FO_audwhere_RT,FO_viswhat_RT,FO_viswhere_RT,SO_audwhat_RT,SO_audwhere_RT,SO_viswhat_RT,SO_viswhere_RT,FO_audwhat_accuracy,FO_audwhere_accuracy,FO_viswhat_accuracy,FO_viswhere_accuracy,SO_audwhat_accuracy,SO_audwhere_accuracy,SO_viswhat_accuracy,SO_viswhere_accuracy
participant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
reldom2_s001,1.015675,1.720722,1.29637,1.341634,0.808944,1.606454,1.251278,1.128,1.0,0.78125,1.0,0.96875,1.0,0.75,0.875,1.0
reldom2_s002,1.34929,1.356664,1.165648,1.05839,1.461668,1.584935,1.322362,1.095812,0.75,0.8125,0.9375,0.935484,0.875,0.6875,0.9375,0.875
reldom2_s003,1.077459,1.149573,0.877843,0.902444,1.103434,1.035821,0.832946,0.76639,0.9375,1.0,1.0,1.0,0.75,1.0,1.0,1.0
reldom2_s004,0.960171,1.142065,1.030101,0.832803,1.052119,1.008416,0.955326,0.707979,0.96875,1.0,1.0,0.96875,1.0,1.0,1.0,1.0
reldom2_s005,1.505764,1.259297,1.154793,1.004116,1.255591,0.999182,0.895621,0.822283,0.8125,0.9375,0.96875,0.9375,0.9375,1.0,0.9375,0.875


Assuming k=3, let's implement clustering through KMeans

In [6]:
scaled = StandardScaler().fit_transform(df)

kmeans = KMeans(n_clusters=3, n_init='auto')
labels = kmeans.fit_predict(scaled)

df['cluster'] = labels
df.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cluster'] = labels


Unnamed: 0_level_0,FO_audwhat_RT,FO_audwhere_RT,FO_viswhat_RT,FO_viswhere_RT,SO_audwhat_RT,SO_audwhere_RT,SO_viswhat_RT,SO_viswhere_RT,FO_audwhat_accuracy,FO_audwhere_accuracy,FO_viswhat_accuracy,FO_viswhere_accuracy,SO_audwhat_accuracy,SO_audwhere_accuracy,SO_viswhat_accuracy,SO_viswhere_accuracy,cluster
participant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
reldom2_s001,1.015675,1.720722,1.29637,1.341634,0.808944,1.606454,1.251278,1.128,1.0,0.78125,1.0,0.96875,1.0,0.75,0.875,1.0,0
reldom2_s002,1.34929,1.356664,1.165648,1.05839,1.461668,1.584935,1.322362,1.095812,0.75,0.8125,0.9375,0.935484,0.875,0.6875,0.9375,0.875,2
reldom2_s003,1.077459,1.149573,0.877843,0.902444,1.103434,1.035821,0.832946,0.76639,0.9375,1.0,1.0,1.0,0.75,1.0,1.0,1.0,1
reldom2_s004,0.960171,1.142065,1.030101,0.832803,1.052119,1.008416,0.955326,0.707979,0.96875,1.0,1.0,0.96875,1.0,1.0,1.0,1.0,1
reldom2_s005,1.505764,1.259297,1.154793,1.004116,1.255591,0.999182,0.895621,0.822283,0.8125,0.9375,0.96875,0.9375,0.9375,1.0,0.9375,0.875,0
