# Data Preparation - Splitting

By: Jimuel Celeste, Jr. 

Objective: To split the combined dataset into an internal cross-validation set (English) and separate external validation sets (English TAUKADIAL, Chinese, and Greek).

In [1]:
import os 

import pandas as pd

In [2]:
# Root folder that holds every extracted feature set. Defaults to the original
# local path; set the THESIS_FEATURES_DIR environment variable to run the
# notebook on another machine without editing this cell.
FEATURES_ROOT = os.environ.get(
    'THESIS_FEATURES_DIR',
    '/Users/jimuelcelestejr/Downloads/Dataset/Thesis - Features',
)

# Sub-folders that group the acoustic and the linguistic feature sets.
ACOUSTIC_DIR = f'{FEATURES_ROOT}/Thesis - Acoustic Features'
LINGUISTIC_DIR = f'{FEATURES_ROOT}/Thesis - Linguistic Features'

# Display name -> directory of each feature set. Later cells look for
# `metadata.csv` in these folders and write the split metadata files into them.
feature_sets = {
    'eGeMAPS': f'{ACOUSTIC_DIR}/egemaps_functionals',
    'BoAW ComParE 2016': f'{ACOUSTIC_DIR}/boaw_size_200_compare_2016_lld',
    'LFTK': f'{LINGUISTIC_DIR}/lftk_features',
    'LFTK Syntax': f'{LINGUISTIC_DIR}/syntax_features',
    'LFTK Discourse': f'{LINGUISTIC_DIR}/discourse_features',
    'LFTK Lexico-Semantics': f'{LINGUISTIC_DIR}/lexico_semantics_features',
    'LFTK Surface': f'{LINGUISTIC_DIR}/surface_features',
    'BERT': f'{FEATURES_ROOT}/Thesis - BERT Features',
    'wav2vec': f'{FEATURES_ROOT}/Thesis - wav2vec Features',
}

In [3]:
# Confirm that every feature-set folder contains a metadata.csv before splitting
for set_name, set_dir in feature_sets.items():
    has_metadata = os.path.exists(os.path.join(set_dir, 'metadata.csv'))
    print(set_name, 'ok' if has_metadata else 'does not exist')

eGeMAPS ok
BoAW ComParE 2016 ok
LFTK ok
LFTK Syntax ok
LFTK Discourse ok
LFTK Lexico-Semantics ok
LFTK Surface ok
BERT ok
wav2vec ok


## External Validation Sets

In [4]:
# Load the metadata table of the LFTK feature set; every split below is
# derived from this single frame.
# NOTE(review): the displayed frame has an `Unnamed: 0` column, presumably an
# index that was written into metadata.csv earlier — confirm it is wanted.
metadata = os.path.join(feature_sets['LFTK'], 'metadata.csv') # reference feature set 
metadata_df = pd.read_csv(metadata)
metadata_df.head()

Unnamed: 0,filename,record_id,subject_id,age,sex,educ,mmse,dx,dx_binary,dataset,language
0,S002.csv,S002,S002,62.0,F,,30.0,NC,0,ADReSS,en
1,S003.csv,S003,S003,69.0,F,,29.0,NC,0,ADReSS,en
2,S004.csv,S004,S004,71.0,F,,30.0,NC,0,ADReSS,en
3,S005.csv,S005,S005,74.0,F,,30.0,NC,0,ADReSS,en
4,S006.csv,S006,S006,67.0,F,,29.0,NC,0,ADReSS,en


In [5]:
# Count the records per language code to see which subsets can be split off
metadata_df['language'].value_counts() # languages

language
en    708
zh    261
el     54
Name: count, dtype: int64

### Chinese

In [6]:
# Records whose language code is 'zh'
is_chinese = metadata_df['language'].eq('zh')
chinese = metadata_df.loc[is_chinese]
chinese.shape

(261, 11)

In [7]:
# Preview the first rows of the 'zh' subset
chinese.head()

Unnamed: 0,filename,record_id,subject_id,age,sex,educ,mmse,dx,dx_binary,dataset,language
516,taukdial-001-1.csv,taukdial-001-1,taukdial-001,70.0,F,,27.0,NC,0,TAUKADIAL,zh
517,taukdial-001-2.csv,taukdial-001-2,taukdial-001,70.0,F,,27.0,NC,0,TAUKADIAL,zh
518,taukdial-001-3.csv,taukdial-001-3,taukdial-001,70.0,F,,27.0,NC,0,TAUKADIAL,zh
522,taukdial-003-1.csv,taukdial-003-1,taukdial-003,80.0,M,,23.0,MCI,1,TAUKADIAL,zh
523,taukdial-003-2.csv,taukdial-003-2,taukdial-003,80.0,M,,23.0,MCI,1,TAUKADIAL,zh


In [8]:
# Which source datasets contribute the 'zh' records
chinese['dataset'].value_counts()

dataset
TAUKADIAL    261
Name: count, dtype: int64

### Greek

In [9]:
# Records whose language code is 'el'
is_greek = metadata_df['language'].eq('el')
greek = metadata_df.loc[is_greek]
greek.shape

(54, 11)

In [10]:
# Preview the first rows of the 'el' subset
greek.head()

Unnamed: 0,filename,record_id,subject_id,age,sex,educ,mmse,dx,dx_binary,dataset,language
462,madrs002.csv,madrs002,madrs002,78.0,F,0.0,24.0,Probable AD,1,ADReSS-M,el
463,madrs009.csv,madrs009,madrs009,60.0,F,6.0,29.0,NC,0,ADReSS-M,el
464,madrs010.csv,madrs010,madrs010,62.0,F,16.0,30.0,NC,0,ADReSS-M,el
465,madrs012.csv,madrs012,madrs012,82.0,F,15.0,23.0,Probable AD,1,ADReSS-M,el
466,madrs013.csv,madrs013,madrs013,63.0,F,10.0,29.0,NC,0,ADReSS-M,el


In [11]:
# Which source datasets contribute the 'el' records
greek['dataset'].value_counts()

dataset
ADReSS-M    54
Name: count, dtype: int64

### English

In [12]:
# English-language records that come from the TAUKADIAL dataset
is_english = metadata_df['language'].eq('en')
is_taukadial = metadata_df['dataset'].eq('TAUKADIAL')
english = metadata_df[is_english & is_taukadial]
english.size  # element count (rows x columns), not the number of rows

2706

In [13]:
# Preview the first rows of the English TAUKADIAL subset
english.head()

Unnamed: 0,filename,record_id,subject_id,age,sex,educ,mmse,dx,dx_binary,dataset,language
519,taukdial-002-1.csv,taukdial-002-1,taukdial-002,72.0,F,,29.0,NC,0,TAUKADIAL,en
520,taukdial-002-2.csv,taukdial-002-2,taukdial-002,72.0,F,,29.0,NC,0,TAUKADIAL,en
521,taukdial-002-3.csv,taukdial-002-3,taukdial-002,72.0,F,,29.0,NC,0,TAUKADIAL,en
528,taukdial-005-1.csv,taukdial-005-1,taukdial-005,68.0,F,,29.0,NC,0,TAUKADIAL,en
529,taukdial-005-2.csv,taukdial-005-2,taukdial-005,68.0,F,,29.0,NC,0,TAUKADIAL,en


In [14]:
# Confirm that the subset contains TAUKADIAL records only
english['dataset'].value_counts()

dataset
TAUKADIAL    246
Name: count, dtype: int64

In [15]:
# control case: english, cookie theft
def get_task_id(record_id):
    """Return the third hyphen-separated field of ``record_id`` as a string.

    For an identifier shaped like ``taukdial-002-3`` the result is ``'3'``.

    Raises:
        IndexError: if ``record_id`` has fewer than three hyphen-separated fields.
    """
    fields = record_id.split('-')
    return fields[2]

# Keep the records whose task id (third field of record_id) is '3'
is_task_3 = english['record_id'].apply(get_task_id) == '3'
english_cookie_theft = english[is_task_3]
english_cookie_theft.shape

(82, 11)

In [16]:
# Preview the first rows of the task-3 (control) subset
english_cookie_theft.head()

Unnamed: 0,filename,record_id,subject_id,age,sex,educ,mmse,dx,dx_binary,dataset,language
521,taukdial-002-3.csv,taukdial-002-3,taukdial-002,72.0,F,,29.0,NC,0,TAUKADIAL,en
530,taukdial-005-3.csv,taukdial-005-3,taukdial-005,68.0,F,,29.0,NC,0,TAUKADIAL,en
539,taukdial-008-3.csv,taukdial-008-3,taukdial-008,84.0,F,,29.0,NC,0,TAUKADIAL,en
545,taukdial-010-3.csv,taukdial-010-3,taukdial-010,91.0,F,,27.0,MCI,1,TAUKADIAL,en
548,taukdial-011-3.csv,taukdial-011-3,taukdial-011,66.0,F,,25.0,MCI,1,TAUKADIAL,en


In [17]:
# Sanity check: tally the task ids present in the control subset
english_cookie_theft['record_id'].apply(get_task_id).value_counts()

record_id
3    82
Name: count, dtype: int64

Task ids are all 3.

In [18]:
# Remaining English TAUKADIAL records: everything that is not in the control subset
control_record_ids = english_cookie_theft['record_id'].values
english_others = english[~english['record_id'].isin(control_record_ids)]
english_others.size  # element count (rows x columns), not the number of rows

1804

In [19]:
# Preview the first rows of the non-control English subset
english_others.head()

Unnamed: 0,filename,record_id,subject_id,age,sex,educ,mmse,dx,dx_binary,dataset,language
519,taukdial-002-1.csv,taukdial-002-1,taukdial-002,72.0,F,,29.0,NC,0,TAUKADIAL,en
520,taukdial-002-2.csv,taukdial-002-2,taukdial-002,72.0,F,,29.0,NC,0,TAUKADIAL,en
528,taukdial-005-1.csv,taukdial-005-1,taukdial-005,68.0,F,,29.0,NC,0,TAUKADIAL,en
529,taukdial-005-2.csv,taukdial-005-2,taukdial-005,68.0,F,,29.0,NC,0,TAUKADIAL,en
537,taukdial-008-1.csv,taukdial-008-1,taukdial-008,84.0,F,,29.0,NC,0,TAUKADIAL,en


In [20]:
# Sanity check: tally the task ids present in the non-control subset
english_others['record_id'].apply(get_task_id).value_counts()

record_id
1    82
2    82
Name: count, dtype: int64

### Save: metadata files

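`english_others` contains only tasks 1 and 2; the task 3 recordings are held out separately as the English control set (`english_cookie_theft`).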

In [21]:
# Write every external-validation subset into each feature-set directory

# Output file stem -> metadata rows that belong to that validation set
external_validation_sets = {
    'metadata_ev_english_control': english_cookie_theft,
    'metadata_ev_english': english_others,
    'metadata_ev_chinese': chinese,
    'metadata_ev_greek': greek
}

for fs, current_dir in feature_sets.items():
    print(fs)
    for evs, subset in external_validation_sets.items():
        out_csv = os.path.join(current_dir, f'{evs}.csv')
        subset.to_csv(out_csv, index=False)

        # '+' marks a file found on disk after the write, '-' one that is missing
        marker = '\t +' if os.path.exists(out_csv) else '\t -'
        print(marker, evs)


eGeMAPS
	 + metadata_ev_english_control
	 + metadata_ev_english
	 + metadata_ev_chinese
	 + metadata_ev_greek
BoAW ComParE 2016
	 + metadata_ev_english_control
	 + metadata_ev_english
	 + metadata_ev_chinese
	 + metadata_ev_greek
LFTK
	 + metadata_ev_english_control
	 + metadata_ev_english
	 + metadata_ev_chinese
	 + metadata_ev_greek
LFTK Syntax
	 + metadata_ev_english_control
	 + metadata_ev_english
	 + metadata_ev_chinese
	 + metadata_ev_greek
LFTK Discourse
	 + metadata_ev_english_control
	 + metadata_ev_english
	 + metadata_ev_chinese
	 + metadata_ev_greek
LFTK Lexico-Semantics
	 + metadata_ev_english_control
	 + metadata_ev_english
	 + metadata_ev_chinese
	 + metadata_ev_greek
LFTK Surface
	 + metadata_ev_english_control
	 + metadata_ev_english
	 + metadata_ev_chinese
	 + metadata_ev_greek
BERT
	 + metadata_ev_english_control
	 + metadata_ev_english
	 + metadata_ev_chinese
	 + metadata_ev_greek
wav2vec
	 + metadata_ev_english_control
	 + metadata_ev_english
	 + metadata_ev_chinese
	 + metadata_ev_greek

## Internal Cross-Validation

In [22]:
# Gather the record ids of every row assigned to an external validation set
external_cv_record_ids = [
    record_id
    for subset in external_validation_sets.values()
    for record_id in subset['record_id'].values
]
len(external_cv_record_ids)

561

In [23]:
# Rows left for internal CV = all records minus those assigned to the external
# validation sets. The count is taken from the id list rather than hard-coded,
# so it stays correct if the external sets change.
metadata_df.shape, metadata_df.shape[0] - len(external_cv_record_ids)

((1023, 11), 462)

462 samples left for internal CV.

In [24]:
# Internal CV set = records that were not assigned to any external validation set
is_external = metadata_df['record_id'].isin(external_cv_record_ids)
internal_cv = metadata_df[~is_external]
internal_cv.shape

(462, 11)

In [25]:
# Languages present in the internal cross-validation set
internal_cv['language'].value_counts()

language
en    462
Name: count, dtype: int64

In [26]:
# Source datasets that make up the internal cross-validation set
internal_cv['dataset'].value_counts()

dataset
ADReSSo     237
ADReSS      155
ADReSS-M     70
Name: count, dtype: int64

In [27]:
# Write the internal cross-validation metadata into each feature-set directory

cv_filename = 'metadata_cv_english.csv'
for fs, current_dir in feature_sets.items():
    print(fs)
    cv_csv = os.path.join(current_dir, cv_filename)
    internal_cv.to_csv(cv_csv, index=False)

    # '+' marks a file found on disk after the write, '-' one that is missing
    marker = '\t +' if os.path.exists(cv_csv) else '\t -'
    print(marker, cv_filename)

eGeMAPS
	 + metadata_cv_english.csv
BoAW ComParE 2016
	 + metadata_cv_english.csv
LFTK
	 + metadata_cv_english.csv
LFTK Syntax
	 + metadata_cv_english.csv
LFTK Discourse
	 + metadata_cv_english.csv
LFTK Lexico-Semantics
	 + metadata_cv_english.csv
LFTK Surface
	 + metadata_cv_english.csv
BERT
	 + metadata_cv_english.csv
wav2vec
	 + metadata_cv_english.csv
