# ADReSS Dataset

By Jimuel Celeste, Jr. 

Objective: To prepare a clean metadata file for the ADReSS dataset.


In [1]:
import os 
import pandas as pd

## Train

In [2]:
# Training metadata read from cd_meta_data_train.txt; every row in this file is labelled dx = 'AD'.
train1 = '/Users/jimuelcelestejr/Downloads/Dataset/ADReSS/archive/ADReSS-IS2020-data_train/cd_meta_data_train.txt'
train1_df = pd.read_csv(train1, sep=';').assign(dx='AD')
train1_df.head()

Unnamed: 0,ID,age,gender,mmse,dx
0,S079,59,male,11,AD
1,S080,55,male,19,AD
2,S081,69,male,19,AD
3,S082,66,male,11,AD
4,S083,52,male,30,AD


In [3]:
# Training metadata read from cc_meta_data_train.txt; every row in this file is labelled dx = 'NC'.
train2 = '/Users/jimuelcelestejr/Downloads/Dataset/ADReSS/archive/ADReSS-IS2020-data_train/cc_meta_data_train.txt'
train2_df = pd.read_csv(train2, sep=';').assign(dx='NC')
train2_df.head()

Unnamed: 0,ID,age,gender,mmse,dx
0,S001,74,male,,NC
1,S002,62,female,30.0,NC
2,S003,69,female,29.0,NC
3,S004,71,female,30.0,NC
4,S005,74,female,30.0,NC


In [4]:
# Stack the AD and NC training rows into one frame. Each source frame keeps its
# own row labels here, so index values repeat until the index is reset later.
train_frames = [train1_df, train2_df]
train_df = pd.concat(train_frames)
train_df

Unnamed: 0,ID,age,gender,mmse,dx
0,S079,59,male,11,AD
1,S080,55,male,19,AD
2,S081,69,male,19,AD
3,S082,66,male,11,AD
4,S083,52,male,30,AD
...,...,...,...,...,...
49,S071,74,male,29,NC
50,S072,75,male,29,NC
51,S073,61,male,29,NC
52,S076,78,female,28,NC


In [5]:
# Sanity check: the row count of the concatenated frame should equal the sum of
# the row counts of the two source frames.
train_df.shape, train1_df.shape[0] + train2_df.shape[0]

((108, 5), 108)

In [6]:
# Inspect the raw column names (the displayed output shows some are padded with spaces).
train_df.columns

Index(['ID   ', ' age', ' gender ', 'mmse', 'dx'], dtype='object')

In [7]:
# Trim leading/trailing whitespace from every column name in a single rename.
train_df = train_df.rename(columns=str.strip)
train_df.columns

Index(['ID', 'age', 'gender', 'mmse', 'dx'], dtype='object')

In [8]:
# Remove surrounding whitespace from every string cell; non-string cells
# (numbers, NaN) are passed through untouched.
for col_name in train_df.columns:
    train_df[col_name] = train_df[col_name].apply(
        lambda cell: cell if type(cell) is not str else cell.strip()
    )

In [9]:
# Build the final training metadata table: standard column names, derived
# dx_binary / subject_id / filename columns, and single-letter sex codes.
train_df = train_df.rename(columns={
    'ID': 'record_id',
    'gender': 'sex'
})

# Translate sex through an explicit lookup rather than treating "anything that
# is not 'male'" as female. A missing or misspelled value, or an accidental
# re-run of this cell on already-coded 'M'/'F' values, now fails loudly instead
# of silently labelling rows as 'F'.
sex_codes = {'male': 'M', 'female': 'F'}
unexpected_sex = set(train_df['sex'].unique()) - set(sex_codes)
if unexpected_sex:
    raise ValueError(
        f"Unexpected sex values in train metadata: {sorted(unexpected_sex, key=str)}"
    )

train_df['dx_binary'] = train_df['dx'].apply(lambda x: 1 if x == 'AD' else 0)
# subject_id is a plain copy of record_id.
train_df['subject_id'] = train_df['record_id']
train_df['filename'] = train_df['record_id'].apply(lambda x: x + '.wav')
train_df['sex'] = train_df['sex'].map(sex_codes)

# Keep only the output columns, in their final order, and renumber rows from 0
# (the concatenated frame still carried each source frame's own row labels).
train_df = train_df[[
    'filename',
    'record_id',
    'subject_id',
    'age',
    'sex',
    'mmse',
    'dx',
    'dx_binary'
]].reset_index(drop=True)
train_df

Unnamed: 0,filename,record_id,subject_id,age,sex,mmse,dx,dx_binary
0,S079.wav,S079,S079,59,M,11,AD,1
1,S080.wav,S080,S080,55,M,19,AD,1
2,S081.wav,S081,S081,69,M,19,AD,1
3,S082.wav,S082,S082,66,M,11,AD,1
4,S083.wav,S083,S083,52,M,30,AD,1
...,...,...,...,...,...,...,...,...
103,S071.wav,S071,S071,74,M,29,NC,0
104,S072.wav,S072,S072,75,M,29,NC,0
105,S073.wav,S073,S073,61,M,29,NC,0
106,S076.wav,S076,S076,78,F,28,NC,0


## Test

In [10]:
# Test-set metadata file; per the displayed columns it carries Label and mmse
# in addition to ID, age and gender.
test = '/Users/jimuelcelestejr/Downloads/Dataset/ADReSS/archive/ADReSS-IS2020-data 2_test/ADReSS-meta_data_test.txt'
test_df = pd.read_csv(test, sep=';')
test_df.head()

Unnamed: 0,ID,age,gender,Label,mmse
0,S160,63,1,0,28
1,S161,55,1,0,29
2,S162,67,1,1,24
3,S163,71,0,0,30
4,S164,73,1,1,21


In [11]:
# Inspect the raw column names (the displayed output shows some are padded with spaces).
test_df.columns

Index(['ID   ', ' age', ' gender', 'Label ', 'mmse'], dtype='object')

In [12]:
# Trim leading/trailing whitespace from every column name in a single rename.
test_df = test_df.rename(columns=str.strip)
test_df.columns

Index(['ID', 'age', 'gender', 'Label', 'mmse'], dtype='object')

In [13]:
# Remove surrounding whitespace from every string cell; non-string cells
# (numbers, NaN) are passed through untouched.
for col_name in test_df.columns:
    test_df[col_name] = test_df[col_name].apply(
        lambda cell: cell if type(cell) is not str else cell.strip()
    )

In [14]:
# Second test-set metadata file (the displayed columns are ID, age and gender
# only); loaded so its IDs can be compared against test_df.
test_metadata = '/Users/jimuelcelestejr/Downloads/Dataset/ADReSS/archive/ADReSS-IS2020-data 2_test/meta_data_test.txt'
test_metadata_df = pd.read_csv(test_metadata, sep=';')
test_metadata_df.head()

Unnamed: 0,ID,age,gender
0,S160,63,female
1,S161,55,female
2,S162,67,female
3,S163,71,male
4,S164,73,female


In [15]:
# Inspect the raw column names (the displayed output shows they are padded with spaces).
test_metadata_df.columns

Index(['ID ', ' age ', ' gender'], dtype='object')

In [16]:
# Trim leading/trailing whitespace from every column name in a single rename.
test_metadata_df = test_metadata_df.rename(columns=str.strip)
test_metadata_df.columns

Index(['ID', 'age', 'gender'], dtype='object')

In [17]:
# Remove surrounding whitespace from every string cell; non-string cells
# (numbers, NaN) are passed through untouched.
for col_name in test_metadata_df.columns:
    test_metadata_df[col_name] = test_metadata_df[col_name].apply(
        lambda cell: cell if type(cell) is not str else cell.strip()
    )

In [18]:
# IDs present in test_df but absent from test_metadata_df (an empty set means none).
set(test_df['ID']) - set(test_metadata_df['ID'])

set()

In [19]:
# IDs present in test_metadata_df but absent from test_df (an empty set means none).
set(test_metadata_df['ID']) - set(test_df['ID'])

set()

In [20]:
# True when both frames hold exactly the same set of IDs. This compares the ID
# column only; age and gender values are not compared here.
set(test_metadata_df['ID']) == set(test_df['ID'])

True

Both files contain the same set of IDs, and `test_df` additionally has the label and MMSE columns, so I'll only use `test_df`.

In [21]:
# Preview test_df again before standardising it; gender and Label are numeric codes here.
test_df.head()

Unnamed: 0,ID,age,gender,Label,mmse
0,S160,63,1,0,28
1,S161,55,1,0,29
2,S162,67,1,1,24
3,S163,71,0,0,30
4,S164,73,1,1,21


In [22]:
# Build the final test metadata table with the same columns as the training
# table: standard names, derived dx / subject_id / filename columns, and
# single-letter sex codes.
test_df = test_df.rename(columns={
    'ID': 'record_id',
    'gender': 'sex',
    'Label': 'dx_binary'
})

# Decode the numeric codes through explicit lookups instead of "anything that
# is not 1 is NC / M". A missing or unexpected code, or an accidental re-run of
# this cell on already-decoded 'F'/'M' values, now fails loudly instead of
# silently relabelling rows.
# Sex coding: meta_data_test.txt lists S160 (gender 1 here) as female and
# S163 (gender 0 here) as male.
label_names = {1: 'AD', 0: 'NC'}
sex_codes = {1: 'F', 0: 'M'}
for coded_column, lookup in (('dx_binary', label_names), ('sex', sex_codes)):
    unexpected_codes = set(test_df[coded_column].unique()) - set(lookup)
    if unexpected_codes:
        raise ValueError(
            f"Unexpected {coded_column} values in test metadata: "
            f"{sorted(unexpected_codes, key=str)}"
        )

test_df['dx'] = test_df['dx_binary'].map(label_names)
# subject_id is a plain copy of record_id.
test_df['subject_id'] = test_df['record_id']
test_df['filename'] = test_df['record_id'].apply(lambda x: x + '.wav')
test_df['sex'] = test_df['sex'].map(sex_codes)

# Keep only the output columns, in their final order, and renumber rows from 0.
test_df = test_df[[
    'filename',
    'record_id',
    'subject_id',
    'age',
    'sex',
    'mmse',
    'dx',
    'dx_binary'
]].reset_index(drop=True)
test_df.head()

Unnamed: 0,filename,record_id,subject_id,age,sex,mmse,dx,dx_binary
0,S160.wav,S160,S160,63,F,28,NC,0
1,S161.wav,S161,S161,55,F,29,NC,0
2,S162.wav,S162,S162,67,F,24,AD,1
3,S163.wav,S163,S163,71,M,30,NC,0
4,S164.wav,S164,S164,73,F,21,AD,1


## Merge: train + test

In [23]:
# Combine both splits, order the rows by filename, and renumber the index from 0.
merged_df = (
    pd.concat([train_df, test_df])
    .sort_values(by='filename')
    .reset_index(drop=True)
)
merged_df

Unnamed: 0,filename,record_id,subject_id,age,sex,mmse,dx,dx_binary
0,S001.wav,S001,S001,74,M,,NC,0
1,S002.wav,S002,S002,62,F,30,NC,0
2,S003.wav,S003,S003,69,F,29,NC,0
3,S004.wav,S004,S004,71,F,30,NC,0
4,S005.wav,S005,S005,74,F,30,NC,0
...,...,...,...,...,...,...,...,...
151,S203.wav,S203,S203,56,M,18,AD,1
152,S204.wav,S204,S204,56,M,28,NC,0
153,S205.wav,S205,S205,50,M,23,AD,1
154,S206.wav,S206,S206,65,M,28,NC,0


In [24]:
# Expected number of rows in the merged table: train rows plus test rows.
train_df.shape[0] + test_df.shape[0]

156

In [25]:
# Number of rows per diagnosis label in the merged table.
merged_df['dx'].value_counts()

dx
NC    78
AD    78
Name: count, dtype: int64

## Save: metadata.csv

In [26]:
# Write the merged table to metadata.csv inside output_dir. index=False keeps
# the row index out of the file.
output_dir = '/Users/jimuelcelestejr/Downloads/Dataset/ADReSS'
output_file = os.path.join(output_dir, 'metadata.csv')
merged_df.to_csv(output_file, index=False)

## Check if all files are in the folder

In [27]:
# Verify that every filename listed in the saved metadata exists on disk.
metadata = pd.read_csv(output_file)

# Collect the name of every file found anywhere under output_dir. The lookup is
# recursive and by file name only, mirroring the os.walk-based reverse check
# further down, so files stored in sub-folders are found as well.
files_on_disk = set()
for _root, _dirs, names in os.walk(output_dir):
    files_on_disk.update(names)

# Each metadata filename must be looked up individually. Testing
# os.path.join(output_dir) on its own only checks that the folder exists, so
# such a check can never report a missing file.
i = 0
for file in metadata['filename']:
    if file in files_on_disk:
        i += 1
    else:
        print(f"'{file}' does not exist.")

# Row/column count of the metadata next to the number of files that were found.
print(metadata.shape, i)

(156, 8) 156


## Check if all files in the folder are in the metadata

In [28]:
# Reverse check: count the .wav files found anywhere under output_dir whose
# name is listed in the metadata, and report any that are not listed.
listed_files = set(metadata['filename'])
i = 0
for _root, _dirs, names in os.walk(output_dir):
    for file in names:
        if os.path.splitext(file)[1] != '.wav':
            continue  # only audio files are compared against the metadata
        if file in listed_files:
            i += 1
        else:
            print(f"'{file}' not in metadata.")
print(i)

156
