# TAUKADIAL Dataset 

By Jimuel Celeste, Jr. 

Objective: Generate a clean metadata for TAUKADIAL dataset. 



In [1]:
import os 
import pandas as pd

## Train

In [2]:
train = '/Users/jimuelcelestejr/Downloads/Dataset/TAUKADIAL/archive/train/train_groundtruth.csv'
train_df = pd.read_csv(train)
train_df.head()

Unnamed: 0,tkdname,age,sex,mmse,dx
0,taukdial-002-1.wav,72,F,29,NC
1,taukdial-002-2.wav,72,F,29,NC
2,taukdial-002-3.wav,72,F,29,NC
3,taukdial-003-1.wav,80,M,23,MCI
4,taukdial-003-2.wav,80,M,23,MCI


## Test

In [3]:
test = '/Users/jimuelcelestejr/Downloads/Dataset/TAUKADIAL/archive/test/testgroundtruth.csv'
test_df = pd.read_csv(test, sep=';')
test_df.head()

Unnamed: 0,tkdname,mmse,dx
0,taukdial-001-1.wav,27,NC
1,taukdial-001-2.wav,27,NC
2,taukdial-001-3.wav,27,NC
3,taukdial-010-1.wav,27,MCI
4,taukdial-010-2.wav,27,MCI


In [4]:
test_metadata = '/Users/jimuelcelestejr/Downloads/Dataset/TAUKADIAL/archive/test/meta_test.csv'
test_metadata_df = pd.read_csv(test_metadata, sep=';')
test_metadata_df.head()

Unnamed: 0,tkdname,age,sex
0,taukdial-036-1.wav,67,M
1,taukdial-036-2.wav,67,M
2,taukdial-036-3.wav,67,M
3,taukdial-022-1.wav,86,M
4,taukdial-022-2.wav,86,M


In [5]:
test_df.merge(test_metadata_df, on='tkdname', how='outer')

Unnamed: 0,tkdname,mmse,dx,age,sex
0,taukdial-001-1.wav,27,NC,70,F
1,taukdial-001-2.wav,27,NC,70,F
2,taukdial-001-3.wav,27,NC,70,F
3,taukdial-010-1.wav,27,MCI,91,F
4,taukdial-010-2.wav,27,MCI,91,F
...,...,...,...,...,...
115,taukdial-166-2.wav,28,MCI,88,M
116,taukdial-166-3.wav,28,MCI,88,M
117,taukdial-168-1.wav,29,NC,65,M
118,taukdial-168-2.wav,29,NC,65,M


In [6]:
test_df.shape, test_metadata_df.shape

((120, 3), (120, 3))

In [7]:
test_df = test_metadata_df.merge(test_df, on='tkdname', how='outer')
test_df

Unnamed: 0,tkdname,age,sex,mmse,dx
0,taukdial-001-1.wav,70,F,27,NC
1,taukdial-001-2.wav,70,F,27,NC
2,taukdial-001-3.wav,70,F,27,NC
3,taukdial-010-1.wav,91,F,27,MCI
4,taukdial-010-2.wav,91,F,27,MCI
...,...,...,...,...,...
115,taukdial-166-2.wav,88,M,28,MCI
116,taukdial-166-3.wav,88,M,28,MCI
117,taukdial-168-1.wav,65,M,29,NC
118,taukdial-168-2.wav,65,M,29,NC


## Merge: Train + Test

In [8]:
merged_df = pd.concat([train_df, test_df])

In [9]:
for column in merged_df.columns: 
    merged_df[column] = merged_df[column].apply(lambda x: x.strip() if type(x) == str else x)

In [10]:
merged_df = merged_df.rename(columns={
    'tkdname': 'filename'
})
merged_df['record_id'] = merged_df['filename'].apply(lambda x: x.split('.')[0])
merged_df['subject_id'] = merged_df['filename'].apply(lambda x: x.split('.')[0][:-2])
merged_df['dx_binary'] = merged_df['dx'].apply(lambda x: 1 if x == 'MCI' else 0)
merged_df = merged_df[[
    'filename', 
    'record_id', 
    'subject_id', 
    'age', 
    'sex', 
    'mmse', 
    'dx', 
    'dx_binary'
]]
merged_df = merged_df.sort_values(by='filename')
merged_df

Unnamed: 0,filename,record_id,subject_id,age,sex,mmse,dx,dx_binary
0,taukdial-001-1.wav,taukdial-001-1,taukdial-001,70,F,27,NC,0
1,taukdial-001-2.wav,taukdial-001-2,taukdial-001,70,F,27,NC,0
2,taukdial-001-3.wav,taukdial-001-3,taukdial-001,70,F,27,NC,0
0,taukdial-002-1.wav,taukdial-002-1,taukdial-002,72,F,29,NC,0
1,taukdial-002-2.wav,taukdial-002-2,taukdial-002,72,F,29,NC,0
...,...,...,...,...,...,...,...,...
118,taukdial-168-2.wav,taukdial-168-2,taukdial-168,65,M,29,NC,0
119,taukdial-168-3.wav,taukdial-168-3,taukdial-168,65,M,29,NC,0
384,taukdial-169-1.wav,taukdial-169-1,taukdial-169,81,F,28,MCI,1
385,taukdial-169-2.wav,taukdial-169-2,taukdial-169,81,F,28,MCI,1


In [11]:
merged_df['dx'].value_counts()

dx
MCI    285
NC     222
Name: count, dtype: int64

## Save: metadata.csv

In [12]:
output_dir = '/Users/jimuelcelestejr/Downloads/Dataset/TAUKADIAL'
output_file = os.path.join(output_dir, 'metadata.csv')
merged_df.to_csv(output_file, index=False)

## Check if all files in metadata are in the folder

In [13]:
metadata = pd.read_csv(output_file)
i = 0
for file in metadata['filename']:
    file = os.path.join(output_dir)
    if os.path.exists(file):
        i += 1
        # print(f"'{file}' exists.")
    else:
        print(f"'{file}' does not exist.")
print(metadata.shape, i)

(507, 8) 507


## Check if all files in the folder are in the metadata

In [14]:
i = 0
for root, dirs, files in os.walk(output_dir):
    for file in files:
        base, ext = os.path.splitext(file)
        if ext == '.wav':
            if file in metadata['filename'].values:
                i += 1
            else:
                print(f"'{file}' not in metadata.")
print(i)

507
