# ADReSS-M Dataset

By Jimuel Celeste, Jr. 

Objective: To prepare a clean metadata for the ADReSS-M Dataset.


In [1]:
import os 
import pandas as pd

## Train

In [2]:
train = '/Users/jimuelcelestejr/Downloads/Dataset/ADReSS-M/archive/ADReSS-M-train/training-groundtruth.csv'
train_df = pd.read_csv(train)
train_df

Unnamed: 0,adressfname,age,gender,educ,dx,mmse
0,adrso002,70,female,,Control,26.0
1,adrso003,72,female,,Control,30.0
2,adrso004,74,female,,Control,30.0
3,adrso005,67,female,,Control,27.0
4,adrso006,65,female,,Control,28.0
...,...,...,...,...,...,...
232,adrso313,63,female,13.0,Control,28.0
233,adrso314,61,female,12.0,Control,29.0
234,adrso315,58,male,12.0,Control,28.0
235,adrso316,56,male,16.0,Control,30.0


## Test

In [3]:
test = '/Users/jimuelcelestejr/Downloads/Dataset/ADReSS-M/archive/test-gr/ADReSS-M-test-gr-groundtruth.csv'
test_df = pd.read_csv(test)
test_df = test_df.rename(columns={'addressfname': 'adressfname'})
test_df.head()

Unnamed: 0,adressfname,age,gender,educ,dx,mmse
0,madrs017,81,Female,12,Control,28
1,madrs067,74,Female,6,ProbableAD,18
2,madrs098,70,Female,17,Control,27
3,madrs040,75,Male,6,ProbableAD,24
4,madrs020,65,Female,10,ProbableAD,23


In [4]:
test_metadata = '/Users/jimuelcelestejr/Downloads/Dataset/ADReSS-M/archive/test-gr/ADReSS-M-meta.csv'
test_metadata_df = pd.read_csv(test_metadata)
test_metadata_df = test_metadata_df.rename(columns={'addressfname': 'adressfname'})
test_metadata_df.head()

Unnamed: 0,adressfname,age,gender,educ
0,madrs017,81,Female,12
1,madrs067,74,Female,6
2,madrs098,70,Female,17
3,madrs040,75,Male,6
4,madrs020,65,Female,10


Age, gender, and education are already included in test_df; I will use that.

In [5]:
test_df.shape, test_metadata_df.shape

((46, 6), (46, 4))

## Sample

In [6]:
sample = '/Users/jimuelcelestejr/Downloads/Dataset/ADReSS-M/archive/ADReSS-M-sample-gr/sample-gr-groundtruth.csv'
sample_df = pd.read_csv(sample)
sample_df = sample_df.rename(columns={'addressfname': 'adressfname'})
sample_df

Unnamed: 0,adressfname,age,gender,educ,dx,mmse
0,madrs046,77,Female,6,ProbableAD,20
1,madrs097,69,Male,16,ProbableAD,26
2,madrs047,75,Female,13,Control,30
3,madrs054,81,Female,17,ProbableAD,27
4,madrs009,60,Female,6,Control,29
5,madrs002,78,Female,0,ProbableAD,24
6,madrs081,69,Male,16,Control,29
7,madrs087,64,Female,6,Control,29


## Merge: train + test + sample

In [7]:
merged_df = pd.concat([train_df, test_df, sample_df])
merged_df.head()

Unnamed: 0,adressfname,age,gender,educ,dx,mmse
0,adrso002,70,female,,Control,26.0
1,adrso003,72,female,,Control,30.0
2,adrso004,74,female,,Control,30.0
3,adrso005,67,female,,Control,27.0
4,adrso006,65,female,,Control,28.0


In [8]:
merged_df[merged_df['educ'].isna()]

Unnamed: 0,adressfname,age,gender,educ,dx,mmse
0,adrso002,70,female,,Control,26.0
1,adrso003,72,female,,Control,30.0
2,adrso004,74,female,,Control,30.0
3,adrso005,67,female,,Control,27.0
4,adrso006,65,female,,Control,28.0
5,adrso007,62,female,,Control,30.0
6,adrso008,67,female,,Control,29.0
7,adrso009,68,female,,Control,29.0
8,adrso010,72,female,,Control,29.0
9,adrso011,71,female,,Control,30.0


In [9]:
merged_df.shape, train_df.shape[0] + test_df.shape[0] + sample_df.shape[0]

((291, 6), 291)

Records without education: 21.

In [10]:
merged_df = merged_df.rename(columns={
    'adressfname': 'record_id', 
    'gender': 'sex'
})
merged_df['filename'] = merged_df['record_id'].apply(lambda x: x + '.wav' if x[0:4] == 'madr' else x + '.mp3')
merged_df['subject_id'] = merged_df['record_id']
merged_df['dx'] = merged_df['dx'].apply(lambda x: 'NC' if x == 'Control' else 'Probable AD')
merged_df['dx_binary'] = merged_df['dx'].apply(lambda x: 1 if x == 'Probable AD' else 0)
merged_df['sex'] = merged_df['sex'].apply(lambda x: 'F' if x.lower() == 'female' else 'M')
merged_df = merged_df[[
    'filename', 
    'record_id', 
    'subject_id', 
    'age', 
    'sex', 
    'educ',
    'mmse',
    'dx', 
    'dx_binary'
]].reset_index(drop=True)
merged_df = merged_df.sort_values(by='filename')
merged_df

Unnamed: 0,filename,record_id,subject_id,age,sex,educ,mmse,dx,dx_binary
0,adrso002.mp3,adrso002,adrso002,70,F,,26.0,NC,0
1,adrso003.mp3,adrso003,adrso003,72,F,,30.0,NC,0
2,adrso004.mp3,adrso004,adrso004,74,F,,30.0,NC,0
3,adrso005.mp3,adrso005,adrso005,67,F,,27.0,NC,0
4,adrso006.mp3,adrso006,adrso006,65,F,,28.0,NC,0
...,...,...,...,...,...,...,...,...,...
261,madrs095.wav,madrs095,madrs095,66,F,8.0,20.0,Probable AD,1
273,madrs096.wav,madrs096,madrs096,61,F,16.0,30.0,NC,0
284,madrs097.wav,madrs097,madrs097,69,M,16.0,26.0,Probable AD,1
239,madrs098.wav,madrs098,madrs098,70,F,17.0,27.0,NC,0


In [11]:
merged_df[merged_df.duplicated()]

Unnamed: 0,filename,record_id,subject_id,age,sex,educ,mmse,dx,dx_binary


In [12]:
merged_df['dx'].value_counts()

dx
Probable AD    148
NC             143
Name: count, dtype: int64

## Save: metadata.csv

In [13]:
output_dir = '/Users/jimuelcelestejr/Downloads/Dataset/ADReSS-M'
output_file = os.path.join(output_dir, 'metadata.csv')
merged_df.to_csv(output_file, index=False)

## Check if all files are in the folder

In [14]:
metadata = pd.read_csv(output_file)
i = 0
for file in metadata['filename']:
    file = os.path.join(output_dir)
    if os.path.exists(file):
        i += 1
        # print(f"'{file}' exists.")
    else:
        print(f"'{file}' does not exist.")
print(metadata.shape, i)

(291, 9) 291


## Check if all files in the folder are in the metadata

In [15]:
i = 0
for root, dirs, files in os.walk(output_dir):
    for file in files:
        base, ext = os.path.splitext(file)
        if ext in ['.wav', '.mp3']:
            if file in metadata['filename'].values:
                i += 1
            else:
                print(f"'{file}' not in metadata.")
    break # in the root directory only
print(i)

291
