# Combining Datasets

By: Jimuel Celeste, Jr. 

Objective: To generate clean metadata for the combined datasets.

In [1]:
import os 
import shutil

import pandas as pd

## ADReSS

In [2]:
# Load the ADReSS metadata and tag every row with its source dataset.
adress = '/Users/jimuelcelestejr/Downloads/Dataset/ADReSS/metadata.csv'
adress_df = pd.read_csv(adress).assign(dataset='ADReSS')
adress_df.head()

Unnamed: 0,filename,record_id,subject_id,age,sex,mmse,dx,dx_binary,dataset
0,S001.wav,S001,S001,74,M,,NC,0,ADReSS
1,S002.wav,S002,S002,62,F,30.0,NC,0,ADReSS
2,S003.wav,S003,S003,69,F,29.0,NC,0,ADReSS
3,S004.wav,S004,S004,71,F,30.0,NC,0,ADReSS
4,S005.wav,S005,S005,74,F,30.0,NC,0,ADReSS


In [3]:
# (rows, columns) of the ADReSS metadata, including the added `dataset` column.
adress_df.shape

(156, 9)

## ADReSSo

In [4]:
# Load the ADReSSo metadata and tag every row with its source dataset.
adresso = '/Users/jimuelcelestejr/Downloads/Dataset/ADReSSo/metadata.csv'
adresso_df = pd.read_csv(adresso).assign(dataset='ADReSSo')
adresso_df.head()

Unnamed: 0,filename,record_id,subject_id,mmse,dx,dx_binary,dataset
0,adrso024.wav,adrso024,adrso024,20,AD,1,ADReSSo
1,adrso025.wav,adrso025,adrso025,11,AD,1,ADReSSo
2,adrso027.wav,adrso027,adrso027,18,AD,1,ADReSSo
3,adrso028.wav,adrso028,adrso028,18,AD,1,ADReSSo
4,adrso031.wav,adrso031,adrso031,26,AD,1,ADReSSo


In [5]:
# (rows, columns) of the ADReSSo metadata, including the added `dataset` column.
adresso_df.shape

(237, 7)

## ADReSS-M

In [6]:
# Load the ADReSS-M metadata and tag every row with its source dataset.
adressm = '/Users/jimuelcelestejr/Downloads/Dataset/ADReSS-M/metadata.csv'
adressm_df = pd.read_csv(adressm).assign(dataset='ADReSS-M')
adressm_df.head()

Unnamed: 0,filename,record_id,subject_id,age,sex,educ,mmse,dx,dx_binary,dataset
0,adrso002.mp3,adrso002,adrso002,70,F,,26.0,NC,0,ADReSS-M
1,adrso003.mp3,adrso003,adrso003,72,F,,30.0,NC,0,ADReSS-M
2,adrso004.mp3,adrso004,adrso004,74,F,,30.0,NC,0,ADReSS-M
3,adrso005.mp3,adrso005,adrso005,67,F,,27.0,NC,0,ADReSS-M
4,adrso006.mp3,adrso006,adrso006,65,F,,28.0,NC,0,ADReSS-M


In [7]:
# (rows, columns) of the ADReSS-M metadata, including the added `dataset` column.
adressm_df.shape

(291, 10)

## TAUKADIAL

In [8]:
# Load the TAUKADIAL metadata and tag every row with its source dataset.
taukadial = '/Users/jimuelcelestejr/Downloads/Dataset/TAUKADIAL/metadata.csv'
taukadial_df = pd.read_csv(taukadial).assign(dataset='TAUKADIAL')
taukadial_df.head()

Unnamed: 0,filename,record_id,subject_id,age,sex,mmse,dx,dx_binary,dataset
0,taukdial-001-1.wav,taukdial-001-1,taukdial-001,70,F,27,NC,0,TAUKADIAL
1,taukdial-001-2.wav,taukdial-001-2,taukdial-001,70,F,27,NC,0,TAUKADIAL
2,taukdial-001-3.wav,taukdial-001-3,taukdial-001,70,F,27,NC,0,TAUKADIAL
3,taukdial-002-1.wav,taukdial-002-1,taukdial-002,72,F,29,NC,0,TAUKADIAL
4,taukdial-002-2.wav,taukdial-002-2,taukdial-002,72,F,29,NC,0,TAUKADIAL


In [9]:
# (rows, columns) of the TAUKADIAL metadata, including the added `dataset` column.
taukadial_df.shape

(507, 9)

## Merge: ADReSS + ADReSSo + ADReSS-M + TAUKADIAL

In [10]:
# Stack the four metadata tables row-wise; join='outer' keeps the union of
# their columns, so columns missing from a dataset are filled with NaN.
datasets = [adress_df, adresso_df, adressm_df, taukadial_df]
# ignore_index=True gives the merged frame a fresh 0..n-1 index. Without it
# each source frame contributes its own 0-based labels, so the same label
# refers to several unrelated rows and label-based lookups become ambiguous.
merged_df = pd.concat(datasets, join='outer', ignore_index=True)
merged_df

Unnamed: 0,filename,record_id,subject_id,age,sex,mmse,dx,dx_binary,dataset,educ
0,S001.wav,S001,S001,74.0,M,,NC,0,ADReSS,
1,S002.wav,S002,S002,62.0,F,30.0,NC,0,ADReSS,
2,S003.wav,S003,S003,69.0,F,29.0,NC,0,ADReSS,
3,S004.wav,S004,S004,71.0,F,30.0,NC,0,ADReSS,
4,S005.wav,S005,S005,74.0,F,30.0,NC,0,ADReSS,
...,...,...,...,...,...,...,...,...,...,...
502,taukdial-168-2.wav,taukdial-168-2,taukdial-168,65.0,M,29.0,NC,0,TAUKADIAL,
503,taukdial-168-3.wav,taukdial-168-3,taukdial-168,65.0,M,29.0,NC,0,TAUKADIAL,
504,taukdial-169-1.wav,taukdial-169-1,taukdial-169,81.0,F,28.0,MCI,1,TAUKADIAL,
505,taukdial-169-2.wav,taukdial-169-2,taukdial-169,81.0,F,28.0,MCI,1,TAUKADIAL,


In [11]:
# Sanity check: the merged row count should equal the sum of the inputs' row counts.
merged_df.shape, sum(len(d) for d in datasets)

((1191, 10), 1191)

## Remove duplicates: check record_id

In [12]:
# keep=False flags every occurrence of a repeated record_id, not only the later ones.
is_repeated = merged_df['record_id'].duplicated(keep=False)
duplicates = merged_df.loc[is_repeated]
duplicates.head()

Unnamed: 0,filename,record_id,subject_id,age,sex,mmse,dx,dx_binary,dataset,educ
0,adrso024.wav,adrso024,adrso024,,,20.0,AD,1,ADReSSo,
1,adrso025.wav,adrso025,adrso025,,,11.0,AD,1,ADReSSo,
2,adrso027.wav,adrso027,adrso027,,,18.0,AD,1,ADReSSo,
3,adrso028.wav,adrso028,adrso028,,,18.0,AD,1,ADReSSo,
4,adrso031.wav,adrso031,adrso031,,,26.0,AD,1,ADReSSo,


In [13]:
# (rows, columns) of the rows whose record_id occurs more than once.
duplicates.shape

(332, 10)

In [14]:
# First count how many rows share each record_id, then count how often each
# multiplicity occurs.
rows_per_record = duplicates['record_id'].value_counts()
rows_per_record.value_counts()

count
2    166
Name: count, dtype: int64

There are 166 unique record_id values with 2 samples each.

In [15]:
# Number of duplicated rows contributed by each source dataset.
duplicates['dataset'].value_counts()

dataset
ADReSSo     166
ADReSS-M    166
Name: count, dtype: int64

The duplicates are from ADReSSo and ADReSS-M datasets.

In [16]:
# Archive the duplicated rows for later inspection.
output_dir = '/Users/jimuelcelestejr/Downloads/Dataset/Combined'
duplicates_csv = os.path.join(output_dir, 'archive', 'duplicates.csv')
# Create the archive folder up front: to_csv does not create missing parent
# directories, so a fresh run would otherwise fail here.
os.makedirs(os.path.dirname(duplicates_csv), exist_ok=True)
duplicates.to_csv(duplicates_csv)

It seems that the files are either wav or mp3. Is there a one-to-one mapping between the mp3 and wav files? Checking manually, the duplicated records share the same base filename but differ in file type: mp3 in ADReSS-M and wav in ADReSSo.

In [17]:
# Use the text after the LAST dot as the extension, so a filename containing
# additional dots is still classified by its real file type.
ext = duplicates['filename'].str.rsplit('.', n=1).str[-1]
mp3 = duplicates[ext == 'mp3']
wav = duplicates[ext == 'wav']
# True when every duplicated record_id has both an mp3 row and a wav row.
set(mp3['record_id'].values) == set(wav['record_id'].values)

True

The mp3 files will be removed from the dataset.

In [18]:
# Keep every row except those whose filename is one of the duplicated mp3 files.
is_duplicate_mp3 = merged_df['filename'].isin(mp3['filename'].values)
merged_df = merged_df.loc[~is_duplicate_mp3]
merged_df.head()

Unnamed: 0,filename,record_id,subject_id,age,sex,mmse,dx,dx_binary,dataset,educ
0,S001.wav,S001,S001,74.0,M,,NC,0,ADReSS,
1,S002.wav,S002,S002,62.0,F,30.0,NC,0,ADReSS,
2,S003.wav,S003,S003,69.0,F,29.0,NC,0,ADReSS,
3,S004.wav,S004,S004,71.0,F,30.0,NC,0,ADReSS,
4,S005.wav,S005,S005,74.0,F,30.0,NC,0,ADReSS,


In [19]:
# (rows, columns) after the duplicated mp3 rows were removed.
merged_df.shape

(1025, 10)

## Remove records without classification labels

In [20]:
# Rows with no diagnosis label.
merged_df.loc[merged_df['dx'].isna()]

Unnamed: 0,filename,record_id,subject_id,age,sex,mmse,dx,dx_binary,dataset,educ


In [21]:
# Rows with no binary diagnosis label.
merged_df.loc[merged_df['dx_binary'].isna()]

Unnamed: 0,filename,record_id,subject_id,age,sex,mmse,dx,dx_binary,dataset,educ


All records include a diagnosis label.

## Remove records without MMSE score

In [22]:
# Rows whose MMSE score is missing.
mmse_missing = merged_df['mmse'].isna()
no_mmse = merged_df.loc[mmse_missing]
no_mmse

Unnamed: 0,filename,record_id,subject_id,age,sex,mmse,dx,dx_binary,dataset,educ
0,S001.wav,S001,S001,74.0,M,,NC,0,ADReSS,
180,adrso256.mp3,adrso256,adrso256,74.0,M,,NC,0,ADReSS-M,16.0


In [23]:
# Archive the rows that lack an MMSE score.
no_mmse_csv = os.path.join(output_dir, 'archive', 'no_mmse.csv')
no_mmse.to_csv(path_or_buf=no_mmse_csv)

Two records do not have MMSE scores.

In [24]:
# Keep every row except those whose filename appears in no_mmse.
lacks_mmse = merged_df['filename'].isin(no_mmse['filename'].values)
merged_df = merged_df.loc[~lacks_mmse]
merged_df.head()

Unnamed: 0,filename,record_id,subject_id,age,sex,mmse,dx,dx_binary,dataset,educ
1,S002.wav,S002,S002,62.0,F,30.0,NC,0,ADReSS,
2,S003.wav,S003,S003,69.0,F,29.0,NC,0,ADReSS,
3,S004.wav,S004,S004,71.0,F,30.0,NC,0,ADReSS,
4,S005.wav,S005,S005,74.0,F,30.0,NC,0,ADReSS,
5,S006.wav,S006,S006,67.0,F,29.0,NC,0,ADReSS,


In [25]:
# (rows, columns) after the rows without an MMSE score were removed.
merged_df.shape

(1023, 10)

## Age & Sex Distribution

In [26]:
# Per-dataset count of rows with no age.
merged_df.loc[merged_df['age'].isna(), 'dataset'].value_counts()

dataset
ADReSSo    237
Name: count, dtype: int64

In [27]:
# Per-dataset count of rows with no sex.
merged_df.loc[merged_df['sex'].isna(), 'dataset'].value_counts()

dataset
ADReSSo    237
Name: count, dtype: int64

Only the records from ADReSSo lack age and sex information. However, the mp3 records from ADReSS-M (see duplicates) include this information, so the data from that dataframe will be used to fill in the missing values.

In [28]:
# Fill age/sex for rows whose record_id also appears in the mp3 duplicates.
#
# The values are looked up BY record_id. Assigning mp3['age'].values directly
# would pair rows purely by position, but the matching rows in merged_df and
# the rows in mp3 come from different source tables and need not be in the
# same order, which would attach demographics to the wrong records.
mp3_demographics = (
    mp3.drop_duplicates(subset='record_id')
    .set_index('record_id')[['age', 'sex']]
)
# merged_df was produced by boolean filtering; take an explicit copy so the
# assignments below modify this frame and do not trigger SettingWithCopyWarning.
merged_df = merged_df.copy()
mp3_rows = merged_df['record_id'].isin(mp3_demographics.index)
matched_ids = merged_df.loc[mp3_rows, 'record_id']
# matched_ids is in the same row order as the masked target rows, so the
# mapped values can be assigned as plain arrays without index alignment.
merged_df.loc[mp3_rows, 'age'] = matched_ids.map(mp3_demographics['age']).to_numpy()
merged_df.loc[mp3_rows, 'sex'] = matched_ids.map(mp3_demographics['sex']).to_numpy()

In [29]:
# Per-dataset count of rows still missing age after the fill.
merged_df.loc[merged_df['age'].isna(), 'dataset'].value_counts()

dataset
ADReSSo    71
Name: count, dtype: int64

In [30]:
# Per-dataset count of rows still missing sex after the fill.
merged_df.loc[merged_df['sex'].isna(), 'dataset'].value_counts()

dataset
ADReSSo    71
Name: count, dtype: int64

**Age Distribution**

In [31]:
# Summary statistics over the rows that have an age.
age_known = merged_df['age'].dropna()
age_known.describe()

count    952.000000
mean      70.304622
std        7.504479
min       50.000000
25%       65.000000
50%       70.000000
75%       76.000000
max       91.000000
Name: age, dtype: float64

The mean age is 70 with a standard deviation of 7.5. The youngest participant is 50, while the oldest is 91.

**Sex Distribution**

In [32]:
# Row counts per sex, over the rows that have a sex value.
sex_known = merged_df['sex'].dropna()
sex_known.value_counts()

sex
F    584
M    368
Name: count, dtype: int64

In [33]:
# Share of each sex, in percent, among the rows that have a sex value.
sex_known = merged_df['sex'].dropna()
sex_known.value_counts() / len(sex_known) * 100

sex
F    61.344538
M    38.655462
Name: count, dtype: float64

There are more women (61.34%) than men (38.66%).

## Missing Values

In [34]:
# Number of missing values in each column.
merged_df.isna().sum()

filename        0
record_id       0
subject_id      0
age            71
sex            71
mmse            0
dx              0
dx_binary       0
dataset         0
educ          905
dtype: int64

## Save: Metadata

In [35]:
# Column order of the exported metadata.
metadata_columns = [
    'filename', 'record_id', 'subject_id',
    'age', 'sex', 'educ',
    'mmse', 'dx', 'dx_binary',
    'dataset',
]
# Select the columns, order the rows by filename, and renumber the index from 0.
metadata = (
    merged_df.loc[:, metadata_columns]
    .sort_values(by='filename')
    .reset_index(drop=True)
)
metadata.head()

Unnamed: 0,filename,record_id,subject_id,age,sex,educ,mmse,dx,dx_binary,dataset
0,S002.wav,S002,S002,62.0,F,,30.0,NC,0,ADReSS
1,S003.wav,S003,S003,69.0,F,,29.0,NC,0,ADReSS
2,S004.wav,S004,S004,71.0,F,,30.0,NC,0,ADReSS
3,S005.wav,S005,S005,74.0,F,,30.0,NC,0,ADReSS
4,S006.wav,S006,S006,67.0,F,,29.0,NC,0,ADReSS


In [36]:
# Write the final metadata table; index=False leaves the row index out of the file.
metadata_csv = os.path.join(output_dir, 'metadata.csv')
metadata.to_csv(path_or_buf=metadata_csv, index=False)

## Copy audio files to the output directory

In [37]:
datasets_dir = '/Users/jimuelcelestejr/Downloads/Dataset'
# Each recording is read from <datasets_dir>/<dataset>/<filename> and copied
# flat into output_dir under the same filename.
for dataset_name, audio_name in zip(metadata['dataset'], metadata['filename']):
    source_file = os.path.join(datasets_dir, dataset_name, audio_name)
    dest_file = os.path.join(output_dir, audio_name)
    shutil.copy(source_file, dest_file)
    print(f"File '{source_file}' copied to '{dest_file}'")

File '/Users/jimuelcelestejr/Downloads/Dataset/ADReSS/S002.wav' copied to '/Users/jimuelcelestejr/Downloads/Dataset/Combined/S002.wav'
File '/Users/jimuelcelestejr/Downloads/Dataset/ADReSS/S003.wav' copied to '/Users/jimuelcelestejr/Downloads/Dataset/Combined/S003.wav'
File '/Users/jimuelcelestejr/Downloads/Dataset/ADReSS/S004.wav' copied to '/Users/jimuelcelestejr/Downloads/Dataset/Combined/S004.wav'
File '/Users/jimuelcelestejr/Downloads/Dataset/ADReSS/S005.wav' copied to '/Users/jimuelcelestejr/Downloads/Dataset/Combined/S005.wav'
File '/Users/jimuelcelestejr/Downloads/Dataset/ADReSS/S006.wav' copied to '/Users/jimuelcelestejr/Downloads/Dataset/Combined/S006.wav'
File '/Users/jimuelcelestejr/Downloads/Dataset/ADReSS/S007.wav' copied to '/Users/jimuelcelestejr/Downloads/Dataset/Combined/S007.wav'
File '/Users/jimuelcelestejr/Downloads/Dataset/ADReSS/S009.wav' copied to '/Users/jimuelcelestejr/Downloads/Dataset/Combined/S009.wav'
File '/Users/jimuelcelestejr/Downloads/Dataset/ADReSS/S