# ADReSS 2020 Metadata 

By Jimuel Celeste, Jr. 

Objective: To generate the metadata for the ADReSS 2020 Dataset. 

In [1]:
import pandas as pd 

## CD: Alzheimer's Disease

In [2]:
cd = '/Users/jimuelcelestejr/Documents/codebook/MLSpeech4MH/data/ADReSS2020/train_audio_full_wave/cd_meta_data.txt'
cd = pd.read_csv(cd, sep=';')
cd.head()

Unnamed: 0,ID,age,gender,mmse
0,S079,59,male,11
1,S080,55,male,19
2,S081,69,male,19
3,S082,66,male,11
4,S083,52,male,30


In [3]:
cd.columns

Index(['ID   ', ' age', ' gender ', 'mmse'], dtype='object')

In [4]:
for column in cd.columns:
    cd = cd.rename(columns={column: column.strip()})
cd.columns

Index(['ID', 'age', 'gender', 'mmse'], dtype='object')

In [5]:
for column in cd.columns: 
    print(column, type(cd[column][0])) # first sample

ID <class 'str'>
age <class 'numpy.int64'>
gender <class 'str'>
mmse <class 'numpy.int64'>


In [6]:
cd.head()

Unnamed: 0,ID,age,gender,mmse
0,S079,59,male,11
1,S080,55,male,19
2,S081,69,male,19
3,S082,66,male,11
4,S083,52,male,30


In [7]:
cd['dx'] = 'AD'
cd['binary_label'] = 1
cd.head()

Unnamed: 0,ID,age,gender,mmse,dx,binary_label
0,S079,59,male,11,AD,1
1,S080,55,male,19,AD,1
2,S081,69,male,19,AD,1
3,S082,66,male,11,AD,1
4,S083,52,male,30,AD,1


## CC: Healthy Control

In [8]:
cc = '/Users/jimuelcelestejr/Documents/codebook/MLSpeech4MH/data/ADReSS2020/train_audio_full_wave/cc_meta_data.txt'
cc = pd.read_csv(cc, sep=';')
cc.head()

Unnamed: 0,ID,age,gender,mmse
0,S001,74,male,
1,S002,62,female,30.0
2,S003,69,female,29.0
3,S004,71,female,30.0
4,S005,74,female,30.0


In [9]:
cc.columns

Index(['ID   ', ' age', ' gender ', 'mmse'], dtype='object')

In [10]:
for column in cc.columns:
    cc = cc.rename(columns={column: column.strip()})
cc.columns

Index(['ID', 'age', 'gender', 'mmse'], dtype='object')

In [11]:
for column in cc.columns: 
    print(column, type(cc[column][0])) # first sample

ID <class 'str'>
age <class 'numpy.int64'>
gender <class 'str'>
mmse <class 'str'>


In [12]:
cc['mmse'].apply(lambda x: int(x.strip()) if x.strip() != 'NA' else None).head()

0     NaN
1    30.0
2    29.0
3    30.0
4    30.0
Name: mmse, dtype: float64

In [13]:
cc['mmse'] = cc['mmse'].apply(lambda x: int(x.strip()) if x.strip() != 'NA' else None)
for column in cc.columns: 
    print(column, type(cc[column][0])) # first sample

ID <class 'str'>
age <class 'numpy.int64'>
gender <class 'str'>
mmse <class 'numpy.float64'>


In [14]:
cc.head()

Unnamed: 0,ID,age,gender,mmse
0,S001,74,male,
1,S002,62,female,30.0
2,S003,69,female,29.0
3,S004,71,female,30.0
4,S005,74,female,30.0


In [15]:
cc['dx'] = 'HC'
cc['binary_label'] = 0
cc.head()

Unnamed: 0,ID,age,gender,mmse,dx,binary_label
0,S001,74,male,,HC,0
1,S002,62,female,30.0,HC,0
2,S003,69,female,29.0,HC,0
3,S004,71,female,30.0,HC,0
4,S005,74,female,30.0,HC,0


## Metadata: Concatenate CC and CD

In [16]:
metadata = pd.concat([cc, cd])
metadata

Unnamed: 0,ID,age,gender,mmse,dx,binary_label
0,S001,74,male,,HC,0
1,S002,62,female,30.0,HC,0
2,S003,69,female,29.0,HC,0
3,S004,71,female,30.0,HC,0
4,S005,74,female,30.0,HC,0
...,...,...,...,...,...,...
49,S150,58,male,20.0,AD,1
50,S151,72,male,24.0,AD,1
51,S153,68,female,12.0,AD,1
52,S154,65,female,20.0,AD,1


In [17]:
cc.shape, cd.shape, metadata.shape

((54, 6), (54, 6), (108, 6))

In [18]:
metadata = metadata.rename(columns={'ID': 'record_id'})
metadata.head()

Unnamed: 0,record_id,age,gender,mmse,dx,binary_label
0,S001,74,male,,HC,0
1,S002,62,female,30.0,HC,0
2,S003,69,female,29.0,HC,0
3,S004,71,female,30.0,HC,0
4,S005,74,female,30.0,HC,0


In [19]:
metadata['filename'] = metadata['record_id'].apply(lambda x: x.strip() + ".wav")
metadata.head()

Unnamed: 0,record_id,age,gender,mmse,dx,binary_label,filename
0,S001,74,male,,HC,0,S001.wav
1,S002,62,female,30.0,HC,0,S002.wav
2,S003,69,female,29.0,HC,0,S003.wav
3,S004,71,female,30.0,HC,0,S004.wav
4,S005,74,female,30.0,HC,0,S005.wav


In [20]:
metadata['subject_id'] = metadata['record_id']
metadata.head()

Unnamed: 0,record_id,age,gender,mmse,dx,binary_label,filename,subject_id
0,S001,74,male,,HC,0,S001.wav,S001
1,S002,62,female,30.0,HC,0,S002.wav,S002
2,S003,69,female,29.0,HC,0,S003.wav,S003
3,S004,71,female,30.0,HC,0,S004.wav,S004
4,S005,74,female,30.0,HC,0,S005.wav,S005


## Missing Values

In [21]:
metadata[metadata['mmse'].isna()]

Unnamed: 0,record_id,age,gender,mmse,dx,binary_label,filename,subject_id
0,S001,74,male,,HC,0,S001.wav,S001


In [22]:
metadata['record_id'] = metadata['record_id'].apply(lambda x: x.strip())
metadata['subject_id'] = metadata['subject_id'].apply(lambda x: x.strip())
metadata['gender'] = metadata['gender'].apply(lambda x: x.strip())

Only one record has missing 'mmse' value.

## Audio Metadata

In [23]:
output_file = '/Users/jimuelcelestejr/Documents/codebook/MLSpeech4MH/data/ADReSS2020/train_audio_full_wave/metadata.csv'
metadata.to_csv(output_file, index=False)

## Text Metadata

In [24]:
metadata_text = metadata.copy()
metadata_text['filename'] = metadata['record_id'].apply(lambda x: x + '.txt')
metadata_text

Unnamed: 0,record_id,age,gender,mmse,dx,binary_label,filename,subject_id
0,S001,74,male,,HC,0,S001.txt,S001
1,S002,62,female,30.0,HC,0,S002.txt,S002
2,S003,69,female,29.0,HC,0,S003.txt,S003
3,S004,71,female,30.0,HC,0,S004.txt,S004
4,S005,74,female,30.0,HC,0,S005.txt,S005
...,...,...,...,...,...,...,...,...
49,S150,58,male,20.0,AD,1,S150.txt,S150
50,S151,72,male,24.0,AD,1,S151.txt,S151
51,S153,68,female,12.0,AD,1,S153.txt,S153
52,S154,65,female,20.0,AD,1,S154.txt,S154


In [25]:
output_file_text = '/Users/jimuelcelestejr/Documents/codebook/MLSpeech4MH/data/ADReSS2020/train_transcription_txt/metadata.csv'
metadata_text.to_csv(output_file_text, index=False)