In [61]:
import pandas as pd

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

import subprocess
import sys
import os
sys.path.append(os.path.abspath('..'))

from src.GetSummary import summary

Let's use the summary function to collect the data from all participants

In [62]:
# Build one long table of per-condition summaries, indexed by participant ID.
participants = []
baseDir = '../data/participantdata'

# os.listdir returns entries in arbitrary order; sorting keeps the row order
# of summDF (and therefore summDF.head()) stable from run to run.
for folder in sorted(os.listdir(baseDir)):
    path = os.path.join(baseDir, folder)
    if not os.path.isdir(path):
        continue  # only sub-directories are treated as participants

    try:
        # NOTE(review): summary() comes from src.GetSummary; it is assumed to
        # return a DataFrame with one row per condition -- confirm there.
        summ = summary(path).reset_index()
        summ['participant'] = folder  # the folder name doubles as participant ID
        summ = summ.set_index('participant')
    except Exception as e:
        # Deliberately best-effort: report the folder that failed and move on.
        print(f"Skipping {folder}: {e}")
    else:
        participants.append(summ)

# Fail with an actionable message instead of pandas' generic
# "No objects to concatenate" when nothing could be loaded.
if not participants:
    raise ValueError(f"No participant summaries could be loaded from {baseDir!r}")

summDF = pd.concat(participants)
summDF.head()

Skipping reldom2_s041: "['Order', 'Type'] not in index"


Unnamed: 0_level_0,Condition,accuracy,RT
participant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
reldom2_s001,FO_audwhat,1.0,1.015675
reldom2_s001,FO_audwhere,0.78125,1.720722
reldom2_s001,FO_viswhat,1.0,1.29637
reldom2_s001,FO_viswhere,0.96875,1.341634
reldom2_s001,SO_audwhat,1.0,0.808944


Now, let's pivot our dataframe to get a 16D vector for each participant.

In [63]:
# Reshape the long summary table into one row per participant, with one column
# per (statistic, condition) pair for the 'accuracy' and 'RT' statistics.
pivot = pd.pivot_table(
    summDF,
    values=['accuracy', 'RT'],
    index='participant',
    columns=['Condition'],
)

# The pivoted columns are (stat, condition) tuples; flatten each one into a
# single "<condition>_<stat>" label.
pivot.columns = list(map(lambda pair: f'{pair[1]}_{pair[0]}', pivot.columns))

# df is another name for the same object as pivot (no copy is made here).
df = pivot
df.head()

Unnamed: 0_level_0,FO_audwhat_RT,FO_audwhere_RT,FO_viswhat_RT,FO_viswhere_RT,SO_audwhat_RT,SO_audwhere_RT,SO_viswhat_RT,SO_viswhere_RT,FO_audwhat_accuracy,FO_audwhere_accuracy,FO_viswhat_accuracy,FO_viswhere_accuracy,SO_audwhat_accuracy,SO_audwhere_accuracy,SO_viswhat_accuracy,SO_viswhere_accuracy
participant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
reldom2_s001,1.015675,1.720722,1.29637,1.341634,0.808944,1.606454,1.251278,1.128,1.0,0.78125,1.0,0.96875,1.0,0.75,0.875,1.0
reldom2_s002,1.34929,1.356664,1.165648,1.05839,1.461668,1.584935,1.322362,1.095812,0.75,0.8125,0.9375,0.935484,0.875,0.6875,0.9375,0.875
reldom2_s003,1.077459,1.149573,0.877843,0.902444,1.103434,1.035821,0.832946,0.76639,0.9375,1.0,1.0,1.0,0.75,1.0,1.0,1.0
reldom2_s004,0.960171,1.142065,1.030101,0.832803,1.052119,1.008416,0.955326,0.707979,0.96875,1.0,1.0,0.96875,1.0,1.0,1.0,1.0
reldom2_s005,1.505764,1.259297,1.154793,1.004116,1.255591,0.999182,0.895621,0.822283,0.8125,0.9375,0.96875,0.9375,0.9375,1.0,0.9375,0.875


Now, let's filter out participants whose average accuracy on any task is below 0.5.

In [64]:
# Keep only participants whose lowest per-condition accuracy is at least 0.5.
accCols = [name for name in df.columns if name.endswith('_accuracy')]

# Row-wise minimum over the accuracy columns, i.e. each participant's worst condition.
worstAccuracy = df[accCols].min(axis=1)
df = df.loc[worstAccuracy.ge(0.5)]
df.head()

Unnamed: 0_level_0,FO_audwhat_RT,FO_audwhere_RT,FO_viswhat_RT,FO_viswhere_RT,SO_audwhat_RT,SO_audwhere_RT,SO_viswhat_RT,SO_viswhere_RT,FO_audwhat_accuracy,FO_audwhere_accuracy,FO_viswhat_accuracy,FO_viswhere_accuracy,SO_audwhat_accuracy,SO_audwhere_accuracy,SO_viswhat_accuracy,SO_viswhere_accuracy
participant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
reldom2_s001,1.015675,1.720722,1.29637,1.341634,0.808944,1.606454,1.251278,1.128,1.0,0.78125,1.0,0.96875,1.0,0.75,0.875,1.0
reldom2_s002,1.34929,1.356664,1.165648,1.05839,1.461668,1.584935,1.322362,1.095812,0.75,0.8125,0.9375,0.935484,0.875,0.6875,0.9375,0.875
reldom2_s003,1.077459,1.149573,0.877843,0.902444,1.103434,1.035821,0.832946,0.76639,0.9375,1.0,1.0,1.0,0.75,1.0,1.0,1.0
reldom2_s004,0.960171,1.142065,1.030101,0.832803,1.052119,1.008416,0.955326,0.707979,0.96875,1.0,1.0,0.96875,1.0,1.0,1.0,1.0
reldom2_s005,1.505764,1.259297,1.154793,1.004116,1.255591,0.999182,0.895621,0.822283,0.8125,0.9375,0.96875,0.9375,0.9375,1.0,0.9375,0.875


Let's export this dataframe to CSV so that we can analyze it with the NbClust package in R to determine the optimal number of clusters.

In [65]:
# Write the feature vectors for the R step. index=False leaves the participant
# index out of the file, so only the feature columns are written.
df.to_csv(path_or_buf='../data/preprocessed_vectors.csv', index=False)

Then, we run our R script and get back the optimal number of clusters.

In [66]:
# Run the R evaluation script. check=True raises CalledProcessError if Rscript
# exits with a non-zero status, so a failed run is not silently ignored.
rscriptCommand = ['Rscript', '../src/NBClustEval.R']
subprocess.run(rscriptCommand, check=True)

# NOTE(review): presumably NBClustEval.R writes its chosen k to this file -- confirm in the script.
with open('../data/optimal_k.txt', mode='r') as f:
    rawK = f.read()
optimalK = int(rawK)  # int() tolerates surrounding whitespace/newlines

print(f"Optimal number of clusters: {optimalK}")

package ‘NbClust’ was built under R version 4.1.3 


*** : The Hubert index is a graphical method of determining the number of clusters.
                In the plot of Hubert index, we seek a significant knee that corresponds to a 
                significant increase of the value of the measure i.e the significant peak in Hubert
                index second differences plot. 
 
*** : The D index is a graphical method of determining the number of clusters. 
                In the plot of D index, we seek a significant knee (the significant peak in Dindex
                second differences plot) that corresponds to a significant increase of the value of
                the measure. 
 
******************************************************************* 
* Among all indices:                                                
* 10 proposed 2 as the best number of clusters 
* 3 proposed 3 as the best number of clusters 
* 3 proposed 4 as the best number of clusters 
* 5 proposed 9 as the best number of clusters 
* 2 proposed 10 as the best numbe

Now that we've found the optimal number of clusters, let's run the clustering.

In [67]:
# Standardise every feature column so that no single column dominates the
# Euclidean distances k-means relies on purely because of its scale.
scaled = StandardScaler().fit_transform(df)

# k-means starts from randomly chosen centroids. Without a fixed random_state
# the cluster assignments (and which group is called 0 or 1) can change on
# every run, making the exported CSVs and the tables below irreproducible.
kmeans = KMeans(n_clusters=optimalK, n_init='auto', random_state=42)
labels = kmeans.fit_predict(scaled)

# Attach the labels to a new frame; df itself is left without a 'cluster' column.
clusteredDf = df.assign(cluster=labels)
clusteredDf.head()

Unnamed: 0_level_0,FO_audwhat_RT,FO_audwhere_RT,FO_viswhat_RT,FO_viswhere_RT,SO_audwhat_RT,SO_audwhere_RT,SO_viswhat_RT,SO_viswhere_RT,FO_audwhat_accuracy,FO_audwhere_accuracy,FO_viswhat_accuracy,FO_viswhere_accuracy,SO_audwhat_accuracy,SO_audwhere_accuracy,SO_viswhat_accuracy,SO_viswhere_accuracy,cluster
participant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
reldom2_s001,1.015675,1.720722,1.29637,1.341634,0.808944,1.606454,1.251278,1.128,1.0,0.78125,1.0,0.96875,1.0,0.75,0.875,1.0,0
reldom2_s002,1.34929,1.356664,1.165648,1.05839,1.461668,1.584935,1.322362,1.095812,0.75,0.8125,0.9375,0.935484,0.875,0.6875,0.9375,0.875,1
reldom2_s003,1.077459,1.149573,0.877843,0.902444,1.103434,1.035821,0.832946,0.76639,0.9375,1.0,1.0,1.0,0.75,1.0,1.0,1.0,0
reldom2_s004,0.960171,1.142065,1.030101,0.832803,1.052119,1.008416,0.955326,0.707979,0.96875,1.0,1.0,0.96875,1.0,1.0,1.0,1.0,0
reldom2_s005,1.505764,1.259297,1.154793,1.004116,1.255591,0.999182,0.895621,0.822283,0.8125,0.9375,0.96875,0.9375,0.9375,1.0,0.9375,0.875,0


Exporting the clustered dataframe for further analysis.

In [68]:
# Save the per-participant vectors together with their cluster label; the
# participant index is written as the first column (to_csv's default).
clusteredDf.to_csv(path_or_buf='../data/clustered_data.csv')

In [69]:
# Mean of every feature column within each cluster. groupby(...).mean()
# yields flat column names directly, so no MultiIndex needs to be flattened.
grouped = clusteredDf.groupby('cluster').mean()
grouped.to_csv('../data/statsAcrossModalities.csv')
grouped

Unnamed: 0_level_0,FO_audwhat_RT,FO_audwhere_RT,FO_viswhat_RT,FO_viswhere_RT,SO_audwhat_RT,SO_audwhere_RT,SO_viswhat_RT,SO_viswhere_RT,FO_audwhat_accuracy,FO_audwhere_accuracy,FO_viswhat_accuracy,FO_viswhere_accuracy,SO_audwhat_accuracy,SO_audwhere_accuracy,SO_viswhat_accuracy,SO_viswhere_accuracy
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,1.152056,1.296829,1.094133,1.02228,1.072727,1.168934,0.924976,0.928302,0.916532,0.952802,0.988441,0.959937,0.922917,0.95625,0.98125,0.970833
1,1.487076,1.473058,1.286926,1.160971,1.201618,1.730225,1.314087,1.101382,0.809728,0.875,0.960938,0.97085,0.729167,0.671875,0.826042,0.760417


In [70]:
# Collapse the per-condition cluster means into one accuracy figure and one
# RT figure per cluster (an unweighted mean across the condition columns).
accuracyCols = [name for name in grouped.columns if name.endswith('_accuracy')]
rtCols = [name for name in grouped.columns if name.endswith('_RT')]

accuracyMeans = grouped[accuracyCols].mean(axis=1)
rtMeans = grouped[rtCols].mean(axis=1)

# Place the two series side by side, labelling the columns through `keys`.
summary_df = pd.concat(
    [accuracyMeans, rtMeans],
    axis=1,
    keys=['mean_accuracy', 'mean_RT'],
)
summary_df

Unnamed: 0_level_0,mean_accuracy,mean_RT
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.95612,1.08253
1,0.825502,1.344418
