<a href="https://colab.research.google.com/github/karlychann/neuro140/blob/main/uvfp_testxgboost_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Final models trained on the entire dataset, with and without biased features

In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from io import BytesIO
import pickle
import requests
pd.set_option('display.max_columns', None)

In [2]:
# Full set of feature column names (88 entries) fed to the 'biased' models.
# The training cells select these columns with df[feature_set].
all_features = [
    'F0semitoneFrom27.5Hz_sma3nz_amean',
    'F0semitoneFrom27.5Hz_sma3nz_stddevNorm',
    'F0semitoneFrom27.5Hz_sma3nz_percentile20.0',
    'F0semitoneFrom27.5Hz_sma3nz_percentile50.0',
    'F0semitoneFrom27.5Hz_sma3nz_percentile80.0',
    'F0semitoneFrom27.5Hz_sma3nz_pctlrange0-2',
    'F0semitoneFrom27.5Hz_sma3nz_meanRisingSlope',
    'F0semitoneFrom27.5Hz_sma3nz_stddevRisingSlope',
    'F0semitoneFrom27.5Hz_sma3nz_meanFallingSlope',
    'F0semitoneFrom27.5Hz_sma3nz_stddevFallingSlope',
    'loudness_sma3_amean',
    'loudness_sma3_stddevNorm',
    'loudness_sma3_percentile20.0',
    'loudness_sma3_percentile50.0',
    'loudness_sma3_percentile80.0',
    'loudness_sma3_pctlrange0-2',
    'loudness_sma3_meanRisingSlope',
    'loudness_sma3_stddevRisingSlope',
    'loudness_sma3_meanFallingSlope',
    'loudness_sma3_stddevFallingSlope',
    'spectralFlux_sma3_amean',
    'spectralFlux_sma3_stddevNorm',
    'mfcc1_sma3_amean',
    'mfcc1_sma3_stddevNorm',
    'mfcc2_sma3_amean',
    'mfcc2_sma3_stddevNorm',
    'mfcc3_sma3_amean',
    'mfcc3_sma3_stddevNorm',
    'mfcc4_sma3_amean',
    'mfcc4_sma3_stddevNorm',
    'jitterLocal_sma3nz_amean',
    'jitterLocal_sma3nz_stddevNorm',
    'shimmerLocaldB_sma3nz_amean',
    'shimmerLocaldB_sma3nz_stddevNorm',
    'HNRdBACF_sma3nz_amean',
    'HNRdBACF_sma3nz_stddevNorm',
    'logRelF0-H1-H2_sma3nz_amean',
    'logRelF0-H1-H2_sma3nz_stddevNorm',
    'logRelF0-H1-A3_sma3nz_amean',
    'logRelF0-H1-A3_sma3nz_stddevNorm',
    'F1frequency_sma3nz_amean',
    'F1frequency_sma3nz_stddevNorm',
    'F1bandwidth_sma3nz_amean',
    'F1bandwidth_sma3nz_stddevNorm',
    'F1amplitudeLogRelF0_sma3nz_amean',
    'F1amplitudeLogRelF0_sma3nz_stddevNorm',
    'F2frequency_sma3nz_amean',
    'F2frequency_sma3nz_stddevNorm',
    'F2bandwidth_sma3nz_amean',
    'F2bandwidth_sma3nz_stddevNorm',
    'F2amplitudeLogRelF0_sma3nz_amean',
    'F2amplitudeLogRelF0_sma3nz_stddevNorm',
    'F3frequency_sma3nz_amean',
    'F3frequency_sma3nz_stddevNorm',
    'F3bandwidth_sma3nz_amean',
    'F3bandwidth_sma3nz_stddevNorm',
    'F3amplitudeLogRelF0_sma3nz_amean',
    'F3amplitudeLogRelF0_sma3nz_stddevNorm',
    'alphaRatioV_sma3nz_amean',
    'alphaRatioV_sma3nz_stddevNorm',
    'hammarbergIndexV_sma3nz_amean',
    'hammarbergIndexV_sma3nz_stddevNorm',
    'slopeV0-500_sma3nz_amean',
    'slopeV0-500_sma3nz_stddevNorm',
    'slopeV500-1500_sma3nz_amean',
    'slopeV500-1500_sma3nz_stddevNorm',
    'spectralFluxV_sma3nz_amean',
    'spectralFluxV_sma3nz_stddevNorm',
    'mfcc1V_sma3nz_amean',
    'mfcc1V_sma3nz_stddevNorm',
    'mfcc2V_sma3nz_amean',
    'mfcc2V_sma3nz_stddevNorm',
    'mfcc3V_sma3nz_amean',
    'mfcc3V_sma3nz_stddevNorm',
    'mfcc4V_sma3nz_amean',
    'mfcc4V_sma3nz_stddevNorm',
    'alphaRatioUV_sma3nz_amean',
    'hammarbergIndexUV_sma3nz_amean',
    'slopeUV0-500_sma3nz_amean',
    'slopeUV500-1500_sma3nz_amean',
    'spectralFluxUV_sma3nz_amean',
    'loudnessPeaksPerSec',
    'VoicedSegmentsPerSec',
    'MeanVoicedSegmentLengthSec',
    'StddevVoicedSegmentLengthSec',
    'MeanUnvoicedSegmentLength',
    'StddevUnvoicedSegmentLength',
    'equivalentSoundLevel_dBp',
]


# Feature columns (44 entries) used for the 'less-biased' models: per
# classification_wo_correlated_features.ipynb, these are the features that
# correlate least with the biased features.
uncorrelated_features = [
    'mfcc4V_sma3nz_amean',
    'F0semitoneFrom27.5Hz_sma3nz_meanFallingSlope',
    'mfcc1_sma3_amean',
    'F3bandwidth_sma3nz_stddevNorm',
    'F0semitoneFrom27.5Hz_sma3nz_stddevRisingSlope',
    'F1frequency_sma3nz_stddevNorm',
    'jitterLocal_sma3nz_stddevNorm',
    'F0semitoneFrom27.5Hz_sma3nz_stddevFallingSlope',
    'alphaRatioV_sma3nz_stddevNorm',
    'mfcc1_sma3_stddevNorm',
    'F0semitoneFrom27.5Hz_sma3nz_meanRisingSlope',
    'mfcc4_sma3_amean',
    'F3frequency_sma3nz_amean',
    'mfcc2_sma3_amean',
    'VoicedSegmentsPerSec',
    'F1bandwidth_sma3nz_amean',
    'mfcc2V_sma3nz_amean',
    'F3frequency_sma3nz_stddevNorm',
    'hammarbergIndexV_sma3nz_stddevNorm',
    'logRelF0-H1-H2_sma3nz_amean',
    'slopeV500-1500_sma3nz_stddevNorm',
    'F2bandwidth_sma3nz_amean',
    'mfcc3_sma3_amean',
    'F2bandwidth_sma3nz_stddevNorm',
    'alphaRatioV_sma3nz_amean',
    'mfcc2_sma3_stddevNorm',
    'mfcc1V_sma3nz_amean',
    'slopeUV0-500_sma3nz_amean',
    'mfcc1V_sma3nz_stddevNorm',
    'mfcc3V_sma3nz_amean',
    'F2frequency_sma3nz_amean',
    'logRelF0-H1-A3_sma3nz_amean',
    'hammarbergIndexV_sma3nz_amean',
    'F1bandwidth_sma3nz_stddevNorm',
    'mfcc3_sma3_stddevNorm',
    'mfcc2V_sma3nz_stddevNorm',
    'F1frequency_sma3nz_amean',
    'F2frequency_sma3nz_stddevNorm',
    'logRelF0-H1-H2_sma3nz_stddevNorm',
    'mfcc4V_sma3nz_stddevNorm',
    'mfcc4_sma3_stddevNorm',
    'F3bandwidth_sma3nz_amean',
    'mfcc3V_sma3nz_stddevNorm',
    'slopeV0-500_sma3nz_stddevNorm',
]

In [3]:
# load pretrained model
# model_name = 'less-biased'
# training_model_name = 'rf'
# task = 'speech'
# feature_set = uncorrelated_features

# url_path = f'https://github.com/danielmlow/vfp/blob/main/data/output/{training_model_name}_{model_name}_{task}.pkl?raw=true' # speech models trained on reading task
# mfile = BytesIO(requests.get(url_path).content) # load from url
# model = pickle.load(mfile)

# How to extract features on your own wav files using egemaps

To test on your own data, the test set must match our features (eGeMAPS): use the same variables and the same sampling rate (16 kHz).

In [None]:
# from os.path import exists
# # config: depends whether you're on Google Colab or local


# # Get URL from github csv by clicking on Download > Copy Link Address

# load_from_google_drive = False

# if load_from_google_drive:
#       # On google colab
#       # Mount GDrive and attach it to the colab for data I/O
#     from google.colab import drive
#     drive.mount('/content/drive')
#     input_dir = '/content/drive/My Drive/datum/vfp/data/input/'
#     output_dir = '/content/drive/My Drive/datum/vfp/data/output/'
#     os.makedirs(output_dir, exist_ok=True)

# else:
#   # If using jupyter-lab or jupyter notebook, load locally:
#   input_dir = './data/input/'
#   output_dir = './data/output/'



In [5]:
# Colab-only setup: (re)mount Google Drive and define the project data folders.
from google.colab import drive

# Release any existing mount first, then mount Drive at /content/drive.
drive.flush_and_unmount()
drive.mount('/content/drive')

input_dir = '/content/drive/My Drive/neuro140/vfp/data/input/'
output_dir = '/content/drive/My Drive/neuro140/vfp/data/output/'

# The training cells pickle their models into output_dir, so make sure it exists.
os.makedirs(output_dir, exist_ok=True)

Drive not mounted, so nothing to flush and unmount.
Mounted at /content/drive


In [None]:
# !unzip /content/drive/MyDrive/neuro140/vfp/data/archive.zip -d /content/drive/MyDrive/neuro140/vfp/data

In [None]:
# Shell: print every line in the VOICEDDATASET .txt files that mentions
# 'Vocal fold paralysis' (grep prefixes each hit with its file path).
! grep 'Vocal fold paralysis' /content/drive/MyDrive/neuro140/vfp/data/VOICEDDATASET/*.txt

/content/drive/MyDrive/neuro140/vfp/data/VOICEDDATASET/voice087-info.txt:Diagnosis:	hypokinetic dysphonia (Vocal fold paralysis)
/content/drive/MyDrive/neuro140/vfp/data/VOICEDDATASET/voice093-info.txt:Diagnosis:	hypokinetic dysphonia (Vocal fold paralysis)
/content/drive/MyDrive/neuro140/vfp/data/VOICEDDATASET/voice112-info.txt:Diagnosis:	hypokinetic dysphonia (Vocal fold paralysis)
/content/drive/MyDrive/neuro140/vfp/data/VOICEDDATASET/voice136-info.txt:Diagnosis:	hypokinetic dysphonia (Vocal fold paralysis)
/content/drive/MyDrive/neuro140/vfp/data/VOICEDDATASET/voice170-info.txt:Diagnosis:	hyperkinetic dysphonia (Vocal fold paralysis)


In [None]:
# ! grep 'healthy' /content/drive/MyDrive/neuro140/vfp/data/VOICEDDATASET/*.txt | head

/content/drive/MyDrive/neuro140/vfp/data/VOICEDDATASET/voice002-info.txt:Diagnosis:	healthy
/content/drive/MyDrive/neuro140/vfp/data/VOICEDDATASET/voice019-info.txt:Diagnosis:	healthy
/content/drive/MyDrive/neuro140/vfp/data/VOICEDDATASET/voice024-info.txt:Diagnosis:	healthy
/content/drive/MyDrive/neuro140/vfp/data/VOICEDDATASET/voice025-info.txt:Diagnosis:	healthy
/content/drive/MyDrive/neuro140/vfp/data/VOICEDDATASET/voice032-info.txt:Diagnosis:	healthy
/content/drive/MyDrive/neuro140/vfp/data/VOICEDDATASET/voice034-info.txt:Diagnosis:	healthy
/content/drive/MyDrive/neuro140/vfp/data/VOICEDDATASET/voice040-info.txt:Diagnosis:	healthy
/content/drive/MyDrive/neuro140/vfp/data/VOICEDDATASET/voice045-info.txt:Diagnosis:	healthy
/content/drive/MyDrive/neuro140/vfp/data/VOICEDDATASET/voice049-info.txt:Diagnosis:	healthy
/content/drive/MyDrive/neuro140/vfp/data/VOICEDDATASET/voice051-info.txt:Diagnosis:	healthy


In [None]:
#pip install wfdb scipy

In [None]:
# import wfdb
# from scipy.io.wavfile import write

# def wfdb_to_wav(input_path, output_path, channel=0):
#     """
#     Convert a WFDB file to a WAV file.

#     Parameters:
#     - input_path: Path to the input WFDB file.
#     - output_path: Path to the output WAV file.
#     - channel: The channel of the WFDB file to convert (default is 0)
#     """
#     # Read the WFDB file
#     record = wfdb.rdrecord(input_path)

#     # Extract the signal from the specified channel
#     signal = record.p_signal[:, channel]

#     # Normalize the signal to be in the range of int16 (required for WAV files)
#     signal_normalized = ((signal - signal.min()) / (signal.max() - signal.min()) * (2**15 - 1) - 2**15).astype('int16')

#     # Write the normalized signal to a WAV file
#     # Note: The sample rate is set according to the WFDB record's sampling frequency
#     write(output_path, record.fs, signal_normalized)


# new_data = ['voice087', 'voice093', 'voice112', 'voice136', 'voice170',
#             'voice002', 'voice019', 'voice024', 'voice025', 'voice032']
# for input in new_data:
#   wfdb_to_wav(f'/content/drive/MyDrive/neuro140/vfp/data/VOICEDDATASET/{input}'
#               , f'/content/drive/MyDrive/neuro140/vfp/data/VOICEDDATASET/{input}.wav')


In [None]:
# ! rm /content/drive/MyDrive/neuro140/vfp/data/VOICEDDATASET/voice001.wav


In [None]:
# Shell: list the .wav files currently present in the VOICEDDATASET folder.
! ls /content/drive/MyDrive/neuro140/vfp/data/VOICEDDATASET/*.wav

/content/drive/MyDrive/neuro140/vfp/data/VOICEDDATASET/voice002.wav
/content/drive/MyDrive/neuro140/vfp/data/VOICEDDATASET/voice019.wav
/content/drive/MyDrive/neuro140/vfp/data/VOICEDDATASET/voice024.wav
/content/drive/MyDrive/neuro140/vfp/data/VOICEDDATASET/voice025.wav
/content/drive/MyDrive/neuro140/vfp/data/VOICEDDATASET/voice032.wav
/content/drive/MyDrive/neuro140/vfp/data/VOICEDDATASET/voice087.wav
/content/drive/MyDrive/neuro140/vfp/data/VOICEDDATASET/voice093.wav
/content/drive/MyDrive/neuro140/vfp/data/VOICEDDATASET/voice112.wav
/content/drive/MyDrive/neuro140/vfp/data/VOICEDDATASET/voice136.wav
/content/drive/MyDrive/neuro140/vfp/data/VOICEDDATASET/voice170.wav


In [None]:
# !pip install -q opensmile

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/996.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.6/996.2 kB[0m [31m4.5 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━[0m [32m532.5/996.2 kB[0m [31m7.5 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━[0m [32m952.3/996.2 kB[0m [31m9.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m996.2/996.2 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.3/66.3 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m140.9/140.9 kB[0m [31m6.3 MB/s[0m et

In [None]:
# import glob
# import opensmile

In [None]:
# wav_dir = '/content/drive/MyDrive/neuro140/vfp/data/VOICEDDATASET/*.wav'
# wav_paths = glob.glob(wav_dir)

In [None]:
# smile = opensmile.Smile(
#             feature_set=opensmile.FeatureSet.eGeMAPSv02, #or path to conf: 'gemaps/eGeMAPSv02.conf'
#             feature_level=opensmile.FeatureLevel.Functionals,
#             sampling_rate=16000,
#             resample=True,
#             # num_workers = 4,
#             verbose=True,
#         )
# feature_vectors = smile.process_files(wav_paths)
# df_voiced = feature_vectors.reset_index()




In [None]:
# Shell: search the project data folder recursively for CSV files.
# NOTE(review): the pattern '*.csv**' matches any name containing ".csv"
# (same as '*.csv*'); confirm that is intended rather than '*.csv'.
! find '/content/drive/MyDrive/neuro140/vfp/data/' -name '*.csv**'

/content/drive/MyDrive/neuro140/vfp/data/extracted_voiced.csv


In [6]:
# Load the features previously extracted from the VOICEDDATASET recordings.
voiced_csv_path = '/content/drive/MyDrive/neuro140/vfp/data/extracted_voiced.csv'
df_voiced = pd.read_csv(voiced_csv_path)

In [None]:
# Compare (rows, columns) of `df` against the newly loaded df_voiced.
# NOTE(review): no earlier cell visible here assigns `df` (it is first assigned
# inside the XGBoost training loop further down), so on a fresh
# Restart & Run All this cell raises NameError. Confirm which table `df`
# should be and load it before this cell.
df.shape, df_voiced.shape

((453, 96), (10, 94))

In [None]:
# Columns present in `df` but missing from df_voiced.
# NOTE(review): depends on `df`, which no earlier visible cell defines.
set(df.columns)-set(df_voiced.columns)

{'cpp_amean',
 'cpp_percentile20',
 'cpp_percentile80',
 'cpp_stddevNorm',
 'filename',
 'sid',
 'token'}

In [None]:
# Columns present in df_voiced but missing from `df`.
# NOTE(review): depends on `df`, which no earlier visible cell defines.
set(df_voiced.columns)-set(df.columns)

{'Unnamed: 0', 'Unnamed: 0.1', 'end', 'file', 'start'}

In [None]:
# Display the whole df_voiced table for a visual check.
df_voiced

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,file,start,end,F0semitoneFrom27.5Hz_sma3nz_amean,F0semitoneFrom27.5Hz_sma3nz_stddevNorm,F0semitoneFrom27.5Hz_sma3nz_percentile20.0,F0semitoneFrom27.5Hz_sma3nz_percentile50.0,F0semitoneFrom27.5Hz_sma3nz_percentile80.0,F0semitoneFrom27.5Hz_sma3nz_pctlrange0-2,F0semitoneFrom27.5Hz_sma3nz_meanRisingSlope,F0semitoneFrom27.5Hz_sma3nz_stddevRisingSlope,F0semitoneFrom27.5Hz_sma3nz_meanFallingSlope,F0semitoneFrom27.5Hz_sma3nz_stddevFallingSlope,loudness_sma3_amean,loudness_sma3_stddevNorm,loudness_sma3_percentile20.0,loudness_sma3_percentile50.0,loudness_sma3_percentile80.0,loudness_sma3_pctlrange0-2,loudness_sma3_meanRisingSlope,loudness_sma3_stddevRisingSlope,loudness_sma3_meanFallingSlope,loudness_sma3_stddevFallingSlope,spectralFlux_sma3_amean,spectralFlux_sma3_stddevNorm,mfcc1_sma3_amean,mfcc1_sma3_stddevNorm,mfcc2_sma3_amean,mfcc2_sma3_stddevNorm,mfcc3_sma3_amean,mfcc3_sma3_stddevNorm,mfcc4_sma3_amean,mfcc4_sma3_stddevNorm,jitterLocal_sma3nz_amean,jitterLocal_sma3nz_stddevNorm,shimmerLocaldB_sma3nz_amean,shimmerLocaldB_sma3nz_stddevNorm,HNRdBACF_sma3nz_amean,HNRdBACF_sma3nz_stddevNorm,logRelF0-H1-H2_sma3nz_amean,logRelF0-H1-H2_sma3nz_stddevNorm,logRelF0-H1-A3_sma3nz_amean,logRelF0-H1-A3_sma3nz_stddevNorm,F1frequency_sma3nz_amean,F1frequency_sma3nz_stddevNorm,F1bandwidth_sma3nz_amean,F1bandwidth_sma3nz_stddevNorm,F1amplitudeLogRelF0_sma3nz_amean,F1amplitudeLogRelF0_sma3nz_stddevNorm,F2frequency_sma3nz_amean,F2frequency_sma3nz_stddevNorm,F2bandwidth_sma3nz_amean,F2bandwidth_sma3nz_stddevNorm,F2amplitudeLogRelF0_sma3nz_amean,F2amplitudeLogRelF0_sma3nz_stddevNorm,F3frequency_sma3nz_amean,F3frequency_sma3nz_stddevNorm,F3bandwidth_sma3nz_amean,F3bandwidth_sma3nz_stddevNorm,F3amplitudeLogRelF0_sma3nz_amean,F3amplitudeLogRelF0_sma3nz_stddevNorm,alphaRatioV_sma3nz_amean,alphaRatioV_sma3nz_stddevNorm,hammarbergIndexV_sma3nz_amean,hammarbergIndexV_sma3nz_stddevNorm,slopeV0-500_sma3nz_amean,slopeV0-500_sma3nz_stddevNorm,slopeV500-1500_sma3nz_am
ean,slopeV500-1500_sma3nz_stddevNorm,spectralFluxV_sma3nz_amean,spectralFluxV_sma3nz_stddevNorm,mfcc1V_sma3nz_amean,mfcc1V_sma3nz_stddevNorm,mfcc2V_sma3nz_amean,mfcc2V_sma3nz_stddevNorm,mfcc3V_sma3nz_amean,mfcc3V_sma3nz_stddevNorm,mfcc4V_sma3nz_amean,mfcc4V_sma3nz_stddevNorm,alphaRatioUV_sma3nz_amean,hammarbergIndexUV_sma3nz_amean,slopeUV0-500_sma3nz_amean,slopeUV500-1500_sma3nz_amean,spectralFluxUV_sma3nz_amean,loudnessPeaksPerSec,VoicedSegmentsPerSec,MeanVoicedSegmentLengthSec,StddevVoicedSegmentLengthSec,MeanUnvoicedSegmentLength,StddevUnvoicedSegmentLength,equivalentSoundLevel_dBp,target
0,0,0,/content/drive/MyDrive/neuro140/vfp/data/VOICE...,0 days,0 days 00:00:04.760000,41.106068,0.010632,40.764374,41.2163,41.442463,0.678089,4.052532,1.518139,2.77226,2.565815,1.677246,0.299108,1.408646,1.869771,1.98744,0.578794,1.542931,1.708003,3.117692,1.895939,0.437334,0.351182,31.968632,0.193377,-15.0872,-0.626004,14.951662,0.256656,1.331085,6.264448,0.009515,0.379991,0.738326,0.399185,15.831425,0.019406,-3.141293,-0.580946,13.18476,0.244583,698.7873,0.177723,1472.1261,0.112237,-52.385483,-1.497761,1706.3716,0.061709,1060.0283,0.125768,-52.048748,-1.385113,2932.7512,0.021896,620.7319,0.234964,-59.20183,-1.158872,-22.150267,-0.073255,35.34545,0.061266,-0.08094,-0.060457,-0.006472,-0.719644,0.486579,0.18006,34.712284,0.082829,-18.000603,-0.262242,15.262449,0.248425,-2.05089,-2.558102,-33.24524,48.095634,-0.098215,0.00646,0.250932,1.263158,0.212766,3.79,0.0,0.44,0.35,-4.723413,1
1,1,1,/content/drive/MyDrive/neuro140/vfp/data/VOICE...,0 days,0 days 00:00:04.760000,37.20432,0.016108,36.793556,37.266777,37.707714,0.914158,10.261423,3.980399,15.291357,13.499083,1.876834,0.137128,1.843049,1.914129,1.973783,0.130734,3.702135,3.241131,2.709963,4.822861,0.622507,0.389533,31.338884,0.086164,-27.209099,-0.254807,-1.439971,-3.572105,1.710625,3.015475,0.016649,1.042344,0.813495,0.627839,13.86736,0.052134,-1.421866,-1.819328,12.752843,0.256537,634.50104,0.215446,1451.3651,0.148046,-5.794622,-6.850891,1471.4595,0.181315,1368.1002,0.252958,-2.326,-17.228569,2798.285,0.060613,999.9029,0.263721,-19.29652,-1.881067,-15.409818,-0.138588,38.89668,0.060541,-0.058884,-0.09312,0.015513,0.305979,0.621621,0.350658,31.590034,0.067121,-27.874657,-0.152972,-1.816402,-2.661994,1.741285,2.79173,-27.715534,46.730465,-0.081661,0.002009,0.632736,1.263158,0.852878,1.11,0.675315,0.0425,0.022776,-5.265399,1
2,2,2,/content/drive/MyDrive/neuro140/vfp/data/VOICE...,0 days,0 days 00:00:04.760000,35.428787,0.005093,35.293533,35.433758,35.558456,0.264923,1.79666,0.544999,1.764542,0.748955,1.607697,0.151625,1.537248,1.650445,1.732086,0.194837,0.65247,0.277681,1.132635,0.616883,0.488467,0.373802,38.86423,0.096682,-6.365295,-0.560211,5.621553,0.700412,-2.914335,-1.319369,0.009172,0.462453,0.831551,0.439327,13.438738,0.026903,1.724576,1.166209,19.205034,0.20626,640.69244,0.18929,1505.832,0.13908,-9.211846,-3.056225,1627.7123,0.174951,1576.0908,0.219038,-19.06639,-1.40193,2770.6323,0.093171,1132.7268,0.306635,-27.541555,-0.918543,-23.276455,-0.070173,38.845062,0.062881,-0.065167,-0.10634,-0.012343,-0.39263,0.497046,0.337763,39.29364,0.061196,-6.738256,-0.340642,5.489578,0.708529,-3.285148,-0.959987,-43.191696,58.81359,-0.118975,-0.004001,0.000861,0.421053,0.21322,4.58,0.0,0.09,0.0,-4.709769,1
3,3,3,/content/drive/MyDrive/neuro140/vfp/data/VOICE...,0 days,0 days 00:00:04.760000,40.850372,0.01521,40.323463,40.717545,41.136993,0.81353,6.483765,9.287079,4.604616,5.386822,1.944762,0.143177,1.828358,1.943302,2.156005,0.327647,2.008369,1.534833,0.608676,0.31893,0.479896,0.314984,34.50317,0.077715,-25.758411,-0.25985,-0.914443,-3.853494,10.382014,0.361173,0.008078,0.658741,0.848791,0.389521,15.772016,0.019279,-7.1127,-0.272598,10.689266,0.387414,657.0971,0.138422,1280.1122,0.107761,-6.671498,-3.999251,1661.741,0.121123,1404.7126,0.20321,-8.914583,-2.981258,2733.2285,0.06456,1140.562,0.233265,-19.018642,-1.321659,-17.889074,-0.126223,34.37605,0.074275,-0.07915,-0.09121,-0.00306,-1.824382,0.490753,0.27926,34.78932,0.046352,-26.481201,-0.157339,-1.144098,-2.68333,10.24622,0.361351,-43.18728,58.80553,-0.118996,-0.004144,0.000986,0.631579,0.21322,4.59,0.0,0.08,0.0,-4.526144,1
4,4,4,/content/drive/MyDrive/neuro140/vfp/data/VOICE...,0 days,0 days 00:00:04.800000,36.23328,0.059988,36.37696,36.602196,36.802765,0.425804,283.82495,273.56128,3.976065,5.979946,1.951305,0.15282,1.769434,2.0571,2.139549,0.370116,2.604363,0.974446,1.774735,1.285041,0.44925,0.505085,35.66005,0.087623,-22.229004,-0.284119,-1.398474,-3.655908,-3.277927,-1.261695,0.006651,1.774984,0.691918,0.730693,13.942858,0.080141,-1.999017,-1.578555,12.208253,0.267755,586.0009,0.128322,1172.4172,0.102521,-16.640812,-3.277353,1501.0493,0.095056,1254.0808,0.160787,-16.358696,-3.340623,2770.5425,0.024416,671.83527,0.224784,-28.58118,-1.782915,-17.605465,-0.1007,35.865105,0.081589,-0.056594,-0.116409,-0.002184,-2.150509,0.420588,0.416441,36.058186,0.060131,-23.227291,-0.171593,-1.222732,-3.808157,-3.704934,-0.971938,-24.22927,45.35611,-0.076723,-0.001166,0.751877,0.835073,0.845666,1.07,1.315485,0.0925,0.023848,-5.320218,1
5,5,5,/content/drive/MyDrive/neuro140/vfp/data/VOICE...,0 days,0 days 00:00:04.760000,26.695305,0.003511,26.615082,26.6767,26.783375,0.168293,1.602813,0.669348,1.243749,0.695814,1.657886,0.143102,1.573567,1.666054,1.80487,0.231302,3.533619,3.254621,9.292496,9.404042,0.290199,0.379394,40.2556,0.10283,-8.739722,-0.551302,-14.176144,-0.346187,-11.862214,-0.404914,0.001134,1.190704,0.200038,1.308644,9.464295,0.012392,0.075817,12.918955,12.366653,0.266591,548.7727,0.124064,1412.3701,0.168536,5.977069,4.7318,1283.9342,0.140055,1465.8936,0.126396,6.854311,4.185886,2530.8418,0.065658,929.55206,0.134217,-12.599596,-2.059884,-17.971264,-0.111975,40.372906,0.086804,-0.055526,-0.085737,-0.032319,-0.181946,0.291062,0.287341,40.641808,0.072216,-9.061907,-0.42242,-14.751199,-0.222682,-12.331261,-0.281377,-43.187252,58.810467,-0.118193,-0.003547,0.000863,0.631579,0.21322,4.59,0.0,0.08,0.0,-5.424413,0
6,6,6,/content/drive/MyDrive/neuro140/vfp/data/VOICE...,0 days,0 days 00:00:04.760000,32.850452,0.003456,32.74565,32.848923,32.93831,0.192657,2.044203,0.838449,2.043701,0.558836,1.871007,0.130897,1.809073,1.888086,1.98717,0.178096,1.077944,0.769276,0.363097,0.0,0.242101,0.415635,41.70554,0.093084,-4.829098,-0.732677,-6.483942,-0.64483,-7.4535,-0.512495,0.002426,0.788895,0.198923,0.71675,12.627122,0.007893,3.795844,0.395946,21.290071,0.180653,600.8079,0.138767,1497.7969,0.1123,-8.642124,-3.040251,1438.3898,0.116333,1620.2036,0.174076,-11.202502,-2.34986,2804.9226,0.03942,1031.5386,0.198674,-27.945599,-0.85616,-19.798729,-0.088502,42.02256,0.067673,-0.079074,-0.030416,-0.011655,-0.206219,0.241948,0.284839,42.1599,0.054341,-5.176785,-0.513959,-6.917502,-0.474289,-7.815508,-0.342651,-43.187252,58.808556,-0.11951,-0.004254,0.001048,0.210526,0.21322,4.59,0.0,0.08,0.0,-3.83874,0
7,7,7,/content/drive/MyDrive/neuro140/vfp/data/VOICE...,0 days,0 days 00:00:04.760000,32.972847,0.003848,32.875767,32.98311,33.081703,0.205936,2.079094,0.819861,1.559061,0.72127,1.64917,0.153983,1.529679,1.630333,1.821755,0.292076,4.626455,3.525516,3.018314,2.478646,0.267122,0.620177,42.010868,0.116464,3.088284,1.589301,2.641295,1.354276,-6.335207,-0.770151,0.001853,1.354973,0.269381,1.467224,12.657359,0.018514,-1.198298,-1.293996,20.2522,0.229739,454.33414,0.084211,1365.3839,0.094121,-6.867095,-4.099307,1369.703,0.068281,1229.7515,0.104403,-14.976548,-1.807332,2553.607,0.027489,802.4443,0.230224,-27.153296,-0.943858,-24.551783,-0.093229,40.32903,0.111468,-0.070611,-0.057942,-0.023845,-0.182867,0.266932,0.506095,42.508705,0.084377,2.850834,1.67567,2.367646,1.388496,-6.729807,-0.606138,-43.191616,58.812946,-0.119244,-0.004547,0.000924,0.842105,0.21322,4.58,0.0,0.09,0.0,-4.014141,0
8,8,8,/content/drive/MyDrive/neuro140/vfp/data/VOICE...,0 days,0 days 00:00:04.800000,35.27871,0.114073,36.9644,37.137844,37.193092,0.228691,253.21909,101.37671,66.60473,55.721546,1.814318,0.133779,1.743313,1.836948,1.952879,0.209566,3.645113,2.120595,0.215894,0.0,0.343266,0.254828,38.068314,0.096133,-10.787603,-0.415746,-5.737471,-0.746291,-11.965443,-0.363118,0.037477,1.153269,2.058662,0.399974,11.898176,0.11278,-10.514579,-0.534545,14.176907,0.205537,620.81,0.145958,1574.2892,0.174015,-0.85477,-34.453777,1369.0472,0.211109,1670.6656,0.171746,-3.811663,-7.658443,2634.5579,0.091821,1106.5784,0.173242,-20.957756,-1.247029,-19.087164,-0.092369,40.76043,0.053933,-0.070826,-0.034959,-0.023815,-0.10164,0.347858,0.166691,38.45159,0.062165,-11.362943,-0.257343,-6.020252,-0.559592,-12.544798,-0.197109,-43.191647,58.813446,-0.119213,-0.003953,0.000908,0.208768,0.212314,4.6,0.0,0.09,0.0,-4.140578,0
9,9,9,/content/drive/MyDrive/neuro140/vfp/data/VOICE...,0 days,0 days 00:00:04.760000,30.11433,0.002636,30.078041,30.118309,30.163399,0.085358,0.777915,0.377245,0.975632,0.598027,2.296844,0.130976,2.272999,2.375695,2.416687,0.143687,1.229326,0.416339,0.293502,0.166271,0.249762,0.340022,37.686813,0.079299,-17.642561,-0.269183,2.378913,0.853408,-6.576218,-0.53796,0.000853,0.66878,0.117392,0.529681,11.244315,0.004369,-0.928878,-1.032473,7.501792,0.214691,474.19098,0.07535,1040.1292,0.051134,0.278679,98.505936,1376.4232,0.03942,1002.47156,0.075888,0.951758,29.022131,2411.729,0.02419,685.6438,0.130977,-7.987405,-3.301029,-19.65977,-0.081831,32.10185,0.067725,-0.05293,-0.071657,-0.024816,-0.108717,0.250277,0.191073,38.038563,0.039267,-18.248281,-0.126282,2.237939,0.665701,-6.91278,-0.343313,-43.187252,58.808155,-0.119868,-0.004556,0.001107,0.421053,0.21322,4.59,0.0,0.08,0.0,-3.263012,0


# How we trained these models

In [None]:
# We chose Random Forest because it tends to have the highest median classification performance across analyses
# model = RandomForestClassifier(n_estimators= 100)

# Others:
# LogisticRegressionCV(solver='liblinear', penalty = 'l1', max_iter = 100)
# MLPClassifier(alpha = 1, max_iter= 1000)
# SGDClassifier(loss='log', penalty="elasticnet", early_stopping=True, max_iter = 5000)

# XGBoost Model

In [7]:
training_model_name = 'xgboost'

from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import roc_auc_score
import pandas as pd
import pickle

# Train one default-parameter XGBoost classifier per (task, feature set) pair:
#   1. estimate performance with 5-fold stratified cross-validation (ROC AUC),
#   2. refit on the whole dataset,
#   3. pickle the refit model to output_dir.
# Relies on df_voiced, all_features, uncorrelated_features and output_dir
# being defined by earlier cells.

# The splitter is unshuffled and identical for every model, so build it once.
# NOTE(review): the downloaded table has a `sid` column (see the column diff
# earlier in the notebook). If it identifies speakers with several recordings
# each, plain StratifiedKFold can put one speaker in both train and test folds.
# Confirm, and consider a grouped splitter.
cv = StratifiedKFold(n_splits=5)

for task in ['speech', 'vowel']:
    df = pd.read_csv(f'https://github.com/danielmlow/vfp/raw/main/data/input/features/egemaps_vector_{task}_cpp.csv', index_col=0)

    if task == 'speech':
        # Only the speech task is augmented with the extra df_voiced rows.
        df = pd.concat([df, df_voiced])

    for feature_set, model_name in zip([all_features, uncorrelated_features],
                                       ['biased', 'less-biased']):

        X = df[feature_set].values
        y = df['target'].values

        # NOTE(review): `use_label_encoder` is deprecated in recent xgboost
        # releases; confirm against the installed version before removing it.
        model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

        # cross_val_score fits clones of `model`; `model` itself stays unfitted here.
        cv_scores = cross_val_score(model, X, y, cv=cv, scoring='roc_auc')
        mean_cv_auc = cv_scores.mean()

        print(f'5-Fold CV ROC AUC Scores for {task} - {model_name} - {training_model_name}:', cv_scores)
        print(f'Mean CV ROC AUC for {task} - {model_name} - {training_model_name} :', mean_cv_auc)

        # Final model: refit on every row. The CV score above is the only
        # performance estimate; this refit model is not evaluated separately.
        model.fit(X, y)

        output_path = f'{output_dir}{task}_{model_name}_{training_model_name}.pkl'
        with open(output_path, 'wb') as f:
            pickle.dump(model, f)

        # Optional: Load model example (commented)
        # with open(output_path, 'rb') as f:
        #     model = pickle.load(f)

        # Fixed: the original message lacked a space before "with" ("xgboostwith").
        print(f'Completed training and saving for {task} - {model_name} - {training_model_name} with mean CV ROC AUC: {mean_cv_auc}')


5-Fold CV ROC AUC Scores for speech - biased - xgboost: [0.94865865 0.93293247 0.92962963 0.91016548 0.89598109]
Mean CV ROC AUC for speech - biased - xgboost : 0.9234734642135198
Completed training and saving for speech - biased - xgboostwith mean CV ROC AUC: 0.9234734642135198
5-Fold CV ROC AUC Scores for speech - less-biased - xgboost: [0.88482886 0.83718779 0.87546296 0.9248227  0.88652482]
Mean CV ROC AUC for speech - less-biased - xgboost : 0.8817654263884606
Completed training and saving for speech - less-biased - xgboostwith mean CV ROC AUC: 0.8817654263884606
5-Fold CV ROC AUC Scores for vowel - biased - xgboost: [0.9705314  0.86183575 0.88309179 0.88599034 0.89651838]
Mean CV ROC AUC for vowel - biased - xgboost : 0.8995935301208198
Completed training and saving for vowel - biased - xgboostwith mean CV ROC AUC: 0.8995935301208198
5-Fold CV ROC AUC Scores for vowel - less-biased - xgboost: [0.95555556 0.8705314  0.88454106 0.84347826 0.84526112]
Mean CV ROC AUC for vowel - les

In [9]:
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV
import pandas as pd
import pickle

# Hyperparameter search: for each (task, feature set) pair, grid-search an
# XGBoost classifier on ROC AUC with 5-fold stratified CV, then pickle both the
# best refit estimator and its parameters to output_dir.
# Relies on df_voiced, all_features, uncorrelated_features, output_dir and
# training_model_name being defined by earlier cells.

# Search space; the commented-out entries are left out of the grid.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 6, 9],
    # 'learning_rate': [0.01, 0.1, 0.2],
    # 'min_child_weight': [1, 3, 5],
    # 'subsample': [0.6, 0.8, 1.0],
    # 'colsample_bytree': [0.6, 0.8, 1.0]
}

# One splitter shared by every grid search below.
cv = StratifiedKFold(n_splits=5)

# (feature columns, label used in printed messages and file names)
feature_variants = [
    (all_features, 'biased'),
    (uncorrelated_features, 'less-biased'),
]

for task in ['speech', 'vowel']:
    df = pd.read_csv(f'https://github.com/danielmlow/vfp/raw/main/data/input/features/egemaps_vector_{task}_cpp.csv', index_col=0)
    if task == 'speech':
        # Only the speech task is augmented with the extra df_voiced rows.
        df = pd.concat([df, df_voiced])

    for feature_set, model_name in feature_variants:
        X = df[feature_set].values
        y = df['target'].values

        # Base estimator that GridSearchCV clones for every candidate.
        model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

        grid_search = GridSearchCV(
            model,
            param_grid,
            scoring='roc_auc',
            cv=cv,
            verbose=1,
            n_jobs=-1,
        )
        grid_search.fit(X, y)

        # best_estimator_ is the winning candidate refit on all of X, y.
        best_model = grid_search.best_estimator_
        best_params = grid_search.best_params_

        print(f'Best parameters for {task} - {model_name}:', best_params)
        print(f'Best CV score for {task} - {model_name}:', grid_search.best_score_)

        # Persist the tuned model ...
        output_path = f'{output_dir}{task}_{model_name}_{training_model_name}_hypertuned.pkl'
        with open(output_path, 'wb') as f:
            pickle.dump(best_model, f)

        # ... and, separately, the parameters that produced it.
        output_path = f'{output_dir}{task}_{model_name}_{training_model_name}_hypertunedparameters.pkl'
        with open(output_path, 'wb') as f:
            pickle.dump(best_params, f)

        print(f'Model saved for {task} - {model_name} with parameters {best_params}')


Fitting 5 folds for each of 9 candidates, totalling 45 fits
Best parameters for speech - biased: {'max_depth': 3, 'n_estimators': 100}
Best CV score for speech - biased: 0.9299737897008942
Model saved for speech - biased with parameters {'max_depth': 3, 'n_estimators': 100}
Fitting 5 folds for each of 9 candidates, totalling 45 fits
Best parameters for speech - less-biased: {'max_depth': 3, 'n_estimators': 100}
Best CV score for speech - less-biased: 0.8912711138520575
Model saved for speech - less-biased with parameters {'max_depth': 3, 'n_estimators': 100}
Fitting 5 folds for each of 9 candidates, totalling 45 fits
Best parameters for vowel - biased: {'max_depth': 6, 'n_estimators': 50}
Best CV score for vowel - biased: 0.9022030667451574
Model saved for vowel - biased with parameters {'max_depth': 6, 'n_estimators': 50}
Fitting 5 folds for each of 9 candidates, totalling 45 fits
Best parameters for vowel - less-biased: {'max_depth': 3, 'n_estimators': 50}
Best CV score for vowel - l

# Other features

In [None]:
# The four cpp_* summary columns. The column diff earlier in the notebook shows
# them in `df` but not in df_voiced; they are not part of all_features.
cpp_features = [
    'cpp_amean',
    'cpp_stddevNorm',
    'cpp_percentile20',
    'cpp_percentile80',
]