## ADReSSo Dataset

By Jimuel Celeste, Jr. 

Objective: Prepare clean metadata for the ADReSSo dataset.

In [1]:
import os
import pandas as pd 

## Train

In [2]:
# Training split: the CSV's first column is read as the index and then
# dropped, leaving a fresh 0..n-1 index.
train = '/Users/jimuelcelestejr/Downloads/Dataset/ADReSSo/archive/ADReSSo21_train/adresso-train-mmse-scores.csv'
train_df = pd.read_csv(train, index_col=0)
train_df = train_df.reset_index(drop=True)
train_df

Unnamed: 0,adressfname,mmse,dx
0,adrso024,20,ad
1,adrso025,11,ad
2,adrso027,18,ad
3,adrso028,18,ad
4,adrso031,26,ad
...,...,...,...
161,adrso309,30,cn
162,adrso310,28,cn
163,adrso312,29,cn
164,adrso315,28,cn


## Test

In [3]:
# Test split: MMSE scores from the task 2 ground-truth file.
test_mmse = '/Users/jimuelcelestejr/Downloads/Dataset/ADReSSo/archive/ADReSSo21 2_test/test_results_task2_groundtruth.csv'
test_mmse_df = pd.read_csv(filepath_or_buffer=test_mmse)
test_mmse_df

Unnamed: 0,ID,MMSE
0,adrsdt15,30
1,adrsdt40,28
2,adrsdt26,29
3,adrsdt67,30
4,adrsdt58,29
...,...,...
66,adrsdt55,29
67,adrsdt68,30
68,adrsdt31,28
69,adrsdt5,29


In [4]:
# Test split: diagnosis labels from the task 1 ground-truth file.
test_dx = '/Users/jimuelcelestejr/Downloads/Dataset/ADReSSo/archive/ADReSSo21 2_test/test_results_task1_groundtruth.csv'
test_dx_df = pd.read_csv(filepath_or_buffer=test_dx)
test_dx_df

Unnamed: 0,ID,Dx
0,adrsdt15,Control
1,adrsdt40,Control
2,adrsdt26,Control
3,adrsdt67,Control
4,adrsdt58,Control
...,...,...
66,adrsdt55,Control
67,adrsdt68,Control
68,adrsdt31,Control
69,adrsdt5,Control


In [5]:
# Join the two test ground-truth tables on the recording ID, keeping IDs that
# appear in either table, then rename the columns to match the training table.
test_column_names = {'ID': 'adressfname', 'MMSE': 'mmse', 'Dx': 'dx'}
test_df = pd.merge(test_mmse_df, test_dx_df, on='ID', how='outer').rename(
    columns=test_column_names
)
test_df

Unnamed: 0,adressfname,mmse,dx
0,adrsdt1,19,ProbableAD
1,adrsdt10,27,Control
2,adrsdt11,30,Control
3,adrsdt12,30,Control
4,adrsdt13,20,ProbableAD
...,...,...,...
66,adrsdt7,24,ProbableAD
67,adrsdt70,18,ProbableAD
68,adrsdt71,24,ProbableAD
69,adrsdt8,10,ProbableAD


## Merge: train + test

In [6]:
# Stack train on top of test, then derive the identifier columns:
#   record_id  - the original `adressfname`
#   subject_id - a copy of record_id
#   filename   - record_id with a '.wav' suffix
# The index is rebuilt last so rows are numbered 0..n-1 across both splits.
merged_df = (
    pd.concat([train_df, test_df])
    .rename(columns={'adressfname': 'record_id'})
    .assign(
        subject_id=lambda frame: frame['record_id'],
        filename=lambda frame: frame['record_id'].apply(lambda rid: rid + '.wav'),
    )
    .reset_index(drop=True)
)
merged_df

Unnamed: 0,record_id,mmse,dx,subject_id,filename
0,adrso024,20,ad,adrso024,adrso024.wav
1,adrso025,11,ad,adrso025,adrso025.wav
2,adrso027,18,ad,adrso027,adrso027.wav
3,adrso028,18,ad,adrso028,adrso028.wav
4,adrso031,26,ad,adrso031,adrso031.wav
...,...,...,...,...,...
232,adrsdt7,24,ProbableAD,adrsdt7,adrsdt7.wav
233,adrsdt70,18,ProbableAD,adrsdt70,adrsdt70.wav
234,adrsdt71,24,ProbableAD,adrsdt71,adrsdt71.wav
235,adrsdt8,10,ProbableAD,adrsdt8,adrsdt8.wav


In [7]:
# Tally how often each diagnosis label occurs.
merged_df.loc[:, 'dx'].value_counts()

dx
ad            87
cn            79
Control       36
ProbableAD    35
Name: count, dtype: int64

In [8]:
# Normalise the diagnosis labels of both splits to one vocabulary.
# Labels that are not listed here pass through unchanged.
dx_label_map = {
    'ad': 'AD',
    'cn': 'NC',
    'Control': 'NC',
    'ProbableAD': 'Probable AD',
}
merged_df['dx'] = merged_df['dx'].apply(lambda label: dx_label_map.get(label, label))
merged_df['dx'].value_counts()

dx
NC             115
AD              87
Probable AD     35
Name: count, dtype: int64

In [9]:
# Binary target: 1 for 'AD' or 'Probable AD', 0 for every other label.
positive_labels = ['AD', 'Probable AD']
merged_df['dx_binary'] = merged_df['dx'].isin(positive_labels).astype('int64')
merged_df

Unnamed: 0,record_id,mmse,dx,subject_id,filename,dx_binary
0,adrso024,20,AD,adrso024,adrso024.wav,1
1,adrso025,11,AD,adrso025,adrso025.wav,1
2,adrso027,18,AD,adrso027,adrso027.wav,1
3,adrso028,18,AD,adrso028,adrso028.wav,1
4,adrso031,26,AD,adrso031,adrso031.wav,1
...,...,...,...,...,...,...
232,adrsdt7,24,Probable AD,adrsdt7,adrsdt7.wav,1
233,adrsdt70,18,Probable AD,adrsdt70,adrsdt70.wav,1
234,adrsdt71,24,Probable AD,adrsdt71,adrsdt71.wav,1
235,adrsdt8,10,Probable AD,adrsdt8,adrsdt8.wav,1


In [10]:
# Keep only the final metadata columns, in their output order.
metadata_columns = ['filename', 'record_id', 'subject_id', 'mmse', 'dx', 'dx_binary']
merged_df = merged_df.loc[:, metadata_columns].reset_index(drop=True)
merged_df

Unnamed: 0,filename,record_id,subject_id,mmse,dx,dx_binary
0,adrso024.wav,adrso024,adrso024,20,AD,1
1,adrso025.wav,adrso025,adrso025,11,AD,1
2,adrso027.wav,adrso027,adrso027,18,AD,1
3,adrso028.wav,adrso028,adrso028,18,AD,1
4,adrso031.wav,adrso031,adrso031,26,AD,1
...,...,...,...,...,...,...
232,adrsdt7.wav,adrsdt7,adrsdt7,24,Probable AD,1
233,adrsdt70.wav,adrsdt70,adrsdt70,18,Probable AD,1
234,adrsdt71.wav,adrsdt71,adrsdt71,24,Probable AD,1
235,adrsdt8.wav,adrsdt8,adrsdt8,10,Probable AD,1


In [11]:
# Show rows that exactly repeat an earlier row (an empty result means none).
merged_df.loc[merged_df.duplicated(keep='first')]

Unnamed: 0,filename,record_id,subject_id,mmse,dx,dx_binary


In [12]:
# Class balance of the final table.
merged_df.loc[:, 'dx'].value_counts()

dx
NC             115
AD              87
Probable AD     35
Name: count, dtype: int64

## Save: metadata.csv

In [13]:
# Write the table to `metadata.csv` inside `output_dir`, without the index.
output_dir = '/Users/jimuelcelestejr/Downloads/Dataset/ADReSSo'
output_file = os.path.join(output_dir, 'metadata.csv')
merged_df.to_csv(path_or_buf=output_file, index=False)

## Check if all files are in the folder

In [14]:
# Re-read the saved metadata and confirm that every listed audio file is
# present in `output_dir`. Prints each missing path, then the metadata shape
# and the number of files found.
metadata = pd.read_csv(output_file)
i = 0  # number of metadata rows whose file was found on disk
for file in metadata['filename']:
    # The filename must be part of the joined path: joining `output_dir`
    # alone yields the directory itself, which exists for every row.
    file = os.path.join(output_dir, file)
    if os.path.isfile(file):
        i += 1
    else:
        print(f"'{file}' does not exist.")
print(metadata.shape, i)

(237, 6) 237


## Check if all files in the folder are in the metadata

In [15]:
# Count the .wav files directly inside `output_dir` (subfolders are not
# searched) that appear in the metadata, and report any that do not.
_, _, root_files = next(os.walk(output_dir), (output_dir, [], []))
known_filenames = set(metadata['filename'].values)
i = 0
for file in root_files:
    if os.path.splitext(file)[1] != '.wav':
        continue
    if file in known_filenames:
        i += 1
    else:
        print(f"'{file}' not in metadata.")
print(i)

237
