<a href="https://colab.research.google.com/github/karlychann/neuro140/blob/main/uvfp_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Final models trained on the entire dataset, with and without biased features

In [2]:
import os
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from io import BytesIO
import pickle
import requests
pd.set_option('display.max_columns', None)

In [3]:
# Names of the eGeMAPS feature columns read from the feature CSVs.
# This complete list is the feature set paired with the 'biased' models.
# The order of the names is the order of the columns handed to the model.
all_features = [
    # F0semitoneFrom27.5Hz
    'F0semitoneFrom27.5Hz_sma3nz_amean',
    'F0semitoneFrom27.5Hz_sma3nz_stddevNorm',
    'F0semitoneFrom27.5Hz_sma3nz_percentile20.0',
    'F0semitoneFrom27.5Hz_sma3nz_percentile50.0',
    'F0semitoneFrom27.5Hz_sma3nz_percentile80.0',
    'F0semitoneFrom27.5Hz_sma3nz_pctlrange0-2',
    'F0semitoneFrom27.5Hz_sma3nz_meanRisingSlope',
    'F0semitoneFrom27.5Hz_sma3nz_stddevRisingSlope',
    'F0semitoneFrom27.5Hz_sma3nz_meanFallingSlope',
    'F0semitoneFrom27.5Hz_sma3nz_stddevFallingSlope',
    # loudness
    'loudness_sma3_amean',
    'loudness_sma3_stddevNorm',
    'loudness_sma3_percentile20.0',
    'loudness_sma3_percentile50.0',
    'loudness_sma3_percentile80.0',
    'loudness_sma3_pctlrange0-2',
    'loudness_sma3_meanRisingSlope',
    'loudness_sma3_stddevRisingSlope',
    'loudness_sma3_meanFallingSlope',
    'loudness_sma3_stddevFallingSlope',
    # spectralFlux and mfcc1-4
    'spectralFlux_sma3_amean', 'spectralFlux_sma3_stddevNorm',
    'mfcc1_sma3_amean', 'mfcc1_sma3_stddevNorm',
    'mfcc2_sma3_amean', 'mfcc2_sma3_stddevNorm',
    'mfcc3_sma3_amean', 'mfcc3_sma3_stddevNorm',
    'mfcc4_sma3_amean', 'mfcc4_sma3_stddevNorm',
    # jitter, shimmer, HNR
    'jitterLocal_sma3nz_amean', 'jitterLocal_sma3nz_stddevNorm',
    'shimmerLocaldB_sma3nz_amean', 'shimmerLocaldB_sma3nz_stddevNorm',
    'HNRdBACF_sma3nz_amean', 'HNRdBACF_sma3nz_stddevNorm',
    # logRelF0 harmonic differences
    'logRelF0-H1-H2_sma3nz_amean', 'logRelF0-H1-H2_sma3nz_stddevNorm',
    'logRelF0-H1-A3_sma3nz_amean', 'logRelF0-H1-A3_sma3nz_stddevNorm',
    # F1
    'F1frequency_sma3nz_amean', 'F1frequency_sma3nz_stddevNorm',
    'F1bandwidth_sma3nz_amean', 'F1bandwidth_sma3nz_stddevNorm',
    'F1amplitudeLogRelF0_sma3nz_amean', 'F1amplitudeLogRelF0_sma3nz_stddevNorm',
    # F2
    'F2frequency_sma3nz_amean', 'F2frequency_sma3nz_stddevNorm',
    'F2bandwidth_sma3nz_amean', 'F2bandwidth_sma3nz_stddevNorm',
    'F2amplitudeLogRelF0_sma3nz_amean', 'F2amplitudeLogRelF0_sma3nz_stddevNorm',
    # F3
    'F3frequency_sma3nz_amean', 'F3frequency_sma3nz_stddevNorm',
    'F3bandwidth_sma3nz_amean', 'F3bandwidth_sma3nz_stddevNorm',
    'F3amplitudeLogRelF0_sma3nz_amean', 'F3amplitudeLogRelF0_sma3nz_stddevNorm',
    # "V" columns
    'alphaRatioV_sma3nz_amean', 'alphaRatioV_sma3nz_stddevNorm',
    'hammarbergIndexV_sma3nz_amean', 'hammarbergIndexV_sma3nz_stddevNorm',
    'slopeV0-500_sma3nz_amean', 'slopeV0-500_sma3nz_stddevNorm',
    'slopeV500-1500_sma3nz_amean', 'slopeV500-1500_sma3nz_stddevNorm',
    'spectralFluxV_sma3nz_amean', 'spectralFluxV_sma3nz_stddevNorm',
    'mfcc1V_sma3nz_amean', 'mfcc1V_sma3nz_stddevNorm',
    'mfcc2V_sma3nz_amean', 'mfcc2V_sma3nz_stddevNorm',
    'mfcc3V_sma3nz_amean', 'mfcc3V_sma3nz_stddevNorm',
    'mfcc4V_sma3nz_amean', 'mfcc4V_sma3nz_stddevNorm',
    # "UV" columns
    'alphaRatioUV_sma3nz_amean',
    'hammarbergIndexUV_sma3nz_amean',
    'slopeUV0-500_sma3nz_amean',
    'slopeUV500-1500_sma3nz_amean',
    'spectralFluxUV_sma3nz_amean',
    # per-recording rate, segment-length and level columns
    'loudnessPeaksPerSec',
    'VoicedSegmentsPerSec',
    'MeanVoicedSegmentLengthSec',
    'StddevVoicedSegmentLengthSec',
    'MeanUnvoicedSegmentLength',
    'StddevUnvoicedSegmentLength',
    'equivalentSoundLevel_dBp',
]


# Subset of the feature columns that correlate least with the biased features
# (selected in classification_wo_correlated_features.ipynb).
# This list is the feature set paired with the 'less-biased' models.
uncorrelated_features = [
    'mfcc4V_sma3nz_amean',
    'F0semitoneFrom27.5Hz_sma3nz_meanFallingSlope',
    'mfcc1_sma3_amean',
    'F3bandwidth_sma3nz_stddevNorm',
    'F0semitoneFrom27.5Hz_sma3nz_stddevRisingSlope',
    'F1frequency_sma3nz_stddevNorm',
    'jitterLocal_sma3nz_stddevNorm',
    'F0semitoneFrom27.5Hz_sma3nz_stddevFallingSlope',
    'alphaRatioV_sma3nz_stddevNorm',
    'mfcc1_sma3_stddevNorm',
    'F0semitoneFrom27.5Hz_sma3nz_meanRisingSlope',
    'mfcc4_sma3_amean',
    'F3frequency_sma3nz_amean',
    'mfcc2_sma3_amean',
    'VoicedSegmentsPerSec',
    'F1bandwidth_sma3nz_amean',
    'mfcc2V_sma3nz_amean',
    'F3frequency_sma3nz_stddevNorm',
    'hammarbergIndexV_sma3nz_stddevNorm',
    'logRelF0-H1-H2_sma3nz_amean',
    'slopeV500-1500_sma3nz_stddevNorm',
    'F2bandwidth_sma3nz_amean',
    'mfcc3_sma3_amean',
    'F2bandwidth_sma3nz_stddevNorm',
    'alphaRatioV_sma3nz_amean',
    'mfcc2_sma3_stddevNorm',
    'mfcc1V_sma3nz_amean',
    'slopeUV0-500_sma3nz_amean',
    'mfcc1V_sma3nz_stddevNorm',
    'mfcc3V_sma3nz_amean',
    'F2frequency_sma3nz_amean',
    'logRelF0-H1-A3_sma3nz_amean',
    'hammarbergIndexV_sma3nz_amean',
    'F1bandwidth_sma3nz_stddevNorm',
    'mfcc3_sma3_stddevNorm',
    'mfcc2V_sma3nz_stddevNorm',
    'F1frequency_sma3nz_amean',
    'F2frequency_sma3nz_stddevNorm',
    'logRelF0-H1-H2_sma3nz_stddevNorm',
    'mfcc4V_sma3nz_stddevNorm',
    'mfcc4_sma3_stddevNorm',
    'F3bandwidth_sma3nz_amean',
    'mfcc3V_sma3nz_stddevNorm',
    'slopeV0-500_sma3nz_stddevNorm',
]

In [4]:
# load pretrained model
# model_name = 'less-biased'
# training_model_name = 'rf'
# task = 'speech'
# feature_set = uncorrelated_features

# url_path = f'https://github.com/danielmlow/vfp/blob/main/data/output/{training_model_name}_{model_name}_{task}.pkl?raw=true' # speech models trained on reading task
# mfile = BytesIO(requests.get(url_path).content) # load from url
# model = pickle.load(mfile)

In [12]:
# Load the published feature table for one task and build the model inputs.
task = 'speech'
feature_set = uncorrelated_features

features_url = f'https://github.com/danielmlow/vfp/raw/main/data/input/features/egemaps_vector_{task}_cpp.csv'
df = pd.read_csv(features_url, index_col=0)

# X holds, for every recording (row), the values of the columns the model uses.
# y holds the target value of every recording.
# Both are plain numpy arrays pulled out of the pandas DataFrame wrapper.
X = df[feature_set].to_numpy()
y = df['target'].to_numpy()

# Optional check with a pretrained model (needs `model` to be loaded first):
# y_pred = model.predict(X)
# performance = roc_auc_score(y, y_pred) # Should get perfect performance, since it is testing on the training set
# performance
print(X.shape, y.shape, df.shape)

(453, 44) (453,) (453, 96)


# How to extract features on your own wav files using egemaps

To test on your own data, the test set must match our features (eGeMAPS): the same variables, extracted at the same sampling rate (16 kHz).

In [None]:
# from os.path import exists
# # config: depends whether you're on Google Colab or local


# # Get URL from github csv by clicking on Download > Copy Link Address

# load_from_google_drive = False

# if load_from_google_drive:
#       # On google colab
#       # Mount GDrive and attach it to the colab for data I/O
#     from google.colab import drive
#     drive.mount('/content/drive')
#     input_dir = '/content/drive/My Drive/datum/vfp/data/input/'
#     output_dir = '/content/drive/My Drive/datum/vfp/data/output/'
#     os.makedirs(output_dir, exist_ok=True)

# else:
#   # If using jupyter-lab or jupyter notebook, load locally:
#   input_dir = './data/input/'
#   output_dir = './data/output/'



In [14]:
# Mount Google Drive in the Colab runtime; the folders below live on that mount.
from google.colab import drive

drive.mount('/content/drive')

# Project data folders on the drive. Only the output folder is created here.
input_dir, output_dir = (
    '/content/drive/My Drive/neuro140/vfp/data/input/',
    '/content/drive/My Drive/neuro140/vfp/data/output/',
)
os.makedirs(output_dir, exist_ok=True)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!unzip /content/drive/MyDrive/neuro140/vfp/data/archive.zip -d /content/drive/MyDrive/neuro140/vfp/data

In [None]:
! grep Diagnosis: /content/drive/MyDrive/neuro140/vfp/data/VOICEDDATASET/.txt

In [None]:
pip install wfdb scipy

In [21]:
import wfdb
from scipy.io.wavfile import write

def wfdb_to_wav(input_path, output_path, channel=0):
    """
    Convert one channel of a WFDB record to a 16-bit PCM WAV file.

    The channel is min-max rescaled so that its smallest sample maps to
    -32768 and its largest to 32767, i.e. the full int16 range. A constant
    channel (max == min) is written as silence instead of dividing by zero.

    Parameters:
    - input_path: Path of the WFDB record, handed to wfdb.rdrecord.
    - output_path: Path of the WAV file to write.
    - channel: Column of the record's p_signal array to convert (default is 0).
    """
    # Read the WFDB record and pick the requested channel
    record = wfdb.rdrecord(input_path)
    signal = record.p_signal[:, channel]

    lowest = signal.min()
    span = signal.max() - lowest

    if span == 0:
        # Flat signal: nothing to scale, and (signal - lowest) / span would be 0/0.
        pcm = np.zeros(signal.shape, dtype='int16')
    else:
        # Scale [lowest, highest] onto [-2**15, 2**15 - 1]. The factor must be
        # 2**16 - 1 (the width of the int16 range); a factor of 2**15 - 1 would
        # squeeze the whole signal into the negative half, [-32768, -1].
        pcm = ((signal - lowest) / span * (2**16 - 1) - 2**15).astype('int16')

    # The WAV sample rate is the record's own sampling frequency
    write(output_path, record.fs, pcm)

# Example: convert the first record of the dataset.
# Point input_path at your own WFDB record (no file extension);
# the WAV file is written next to it under the same name.
input_path = '/content/drive/MyDrive/neuro140/vfp/data/VOICEDDATASET/voice001'
output_path = input_path + '.wav'
wfdb_to_wav(input_path=input_path, output_path=output_path)


In [None]:
# !pip install -q opensmile

In [None]:
# import glob
# import opensmile

In [None]:
# wav_dir = './../../../data/blake_private/audio/blake_16khz/*'
# wav_paths = glob.glob(wav_dir)

In [None]:
# smile = opensmile.Smile(
#             feature_set=opensmile.FeatureSet.eGeMAPSv02, #or path to conf: 'gemaps/eGeMAPSv02.conf'
#             feature_level=feature_level,
#             sampling_rate=16000,
#             resample=True,
#             # num_workers = 4,
#             verbose=True,
#         )
# feature_vectors = smile.process_files([
#     wav_path


# ])
# df = feature_vectors.reset_index()


# How we trained these models

In [16]:
# Random Forest is the classifier used for the final models, as it tends to
# have the highest median classification across analyses.
#
# Other classifiers:
# LogisticRegressionCV(solver='liblinear', penalty = 'l1', max_iter = 100)
# MLPClassifier(alpha = 1, max_iter= 1000)
# SGDClassifier(loss='log', penalty="elasticnet", early_stopping=True, max_iter = 5000)

# Short tag that becomes part of each saved model's file name
training_model_name = 'rf'
model = RandomForestClassifier(n_estimators=100)

In [17]:
# Train one Random Forest per (task, feature set) on the entire dataset,
# pickle it into output_dir, and print its ROC AUC on the training data.

# Fixed seed so the saved forests are identical from run to run.
RANDOM_SEED = 0

for task in ['speech', 'vowel']:
    df = pd.read_csv(f'https://github.com/danielmlow/vfp/raw/main/data/input/features/egemaps_vector_{task}_cpp.csv', index_col = 0)

    for feature_set, model_name in zip([all_features, uncorrelated_features],
                                       ['biased', 'less-biased']
                                      ):

        X = df[feature_set].values
        y = df['target'].values
        print(X.shape, y.shape)

        # train and save model
        output_path = os.path.join(output_dir, f'{training_model_name}_{model_name}_{task}.pkl')

        model = RandomForestClassifier(n_estimators=100, random_state=RANDOM_SEED)
        model.fit(X, y) # train

        # save
        with open(output_path, 'wb') as f:
            pickle.dump(model, f)

        # load
        # with open(output_path, 'rb') as f:
        #     model = pickle.load(f)

        # Score the training set itself. ROC AUC needs a continuous score, so
        # use the predicted probability of the positive class (second column of
        # predict_proba, i.e. model.classes_[1]) rather than hard 0/1 labels.
        y_score = model.predict_proba(X)[:, 1]
        performance = roc_auc_score(y, y_score) # Should get perfect performance, since it is testing on the training set
        print('performance:', task, model_name, performance)



(453, 88) (453,)
performance: speech biased 1.0
(453, 44) (453,)
performance: speech less-biased 1.0
(455, 88) (455,)
performance: vowel biased 1.0
(455, 44) (455,)
performance: vowel less-biased 1.0


# Other features

In [None]:
# Names of the additional "cpp" summary columns.
cpp_features = [
    'cpp_amean',
    'cpp_stddevNorm',
    'cpp_percentile20',
    'cpp_percentile80',
]