In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import torch
from transformers import AutoFeatureExtractor, WhisperModel, WhisperProcessor
from transformers import pipeline
import torchaudio
from torchinfo import summary

In [115]:
all_df = pd.read_csv('final_metadata_acoustic_features.csv', sep=',',  decimal=',')
print(all_df['Dataset'].unique())
#all_df = all_df.sample(frac=1).reset_index(drop=True) # shuffle
df = all_df[all_df['Dataset'].isin(['VOC-ALS', 'MINSK'])] #[all_df['Phoneme'] == 'A'] 

['VOC-ALS' 'PD_dataset_2' 'MSA' 'PSP' 'PD_dataset_1' 'PD_dataset_3'
 'MINSK' 'Italian']


In [116]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 893 entries, 0 to 1249
Data columns (total 95 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Unnamed: 0           893 non-null    int64 
 1   subjectID            893 non-null    object
 2   voiced_file_path     893 non-null    object
 3   file_path            893 non-null    object
 4   Age                  893 non-null    object
 5   Sex                  893 non-null    object
 6   Severity             510 non-null    object
 7   Phoneme              893 non-null    object
 8   label                893 non-null    object
 9   Dataset              893 non-null    object
 10  rapJitter            893 non-null    object
 11  localJitter          893 non-null    object
 12  localabsoluteJitter  893 non-null    object
 13  ppq5Jitter           893 non-null    object
 14  ddpJitter            893 non-null    object
 15  localShimmer         893 non-null    object
 16  localdbShimm

In [117]:
df

Unnamed: 0.1,Unnamed: 0,subjectID,voiced_file_path,file_path,Age,Sex,Severity,Phoneme,label,Dataset,...,intensity_mean_dB,intensity_sd_dB,intensity_cv_dB,F0_F1_comp,F0_F2_comp,F0_F3_comp,F1_F2_comp,F1_F3_comp,F2_F3_comp,F0_IN_comp
0,0,PZ094,data/VOC-ALS/ALS/A/voiced/PZ094_phonationA_Onl...,data/VOC-ALS/ALS/A/PZ094_phonationA.wav,51.0,M,12.0,A,ALS,VOC-ALS,...,65.26295571057149,1.94010716278506,0.0297275405574497,12.0,13.0,8.0,4.0,1.0,2.0,10.0
1,1,PZ081,data/VOC-ALS/ALS/A/voiced/PZ081_phonationA_Onl...,data/VOC-ALS/ALS/A/PZ081_phonationA.wav,70.0,F,12.0,A,ALS,VOC-ALS,...,67.60580442378135,3.4645237854345767,0.0512459516599713,4.0,4.0,3.0,5.0,2.0,2.0,3.0
2,2,PZ111,data/VOC-ALS/ALS/A/voiced/PZ111_phonationA_Onl...,data/VOC-ALS/ALS/A/PZ111_phonationA.wav,55.0,M,7.0,A,ALS,VOC-ALS,...,71.84654410250067,2.264392188318661,0.0315170648303993,5.0,3.0,2.0,4.0,0.0,0.0,3.0
3,3,PZ104,data/VOC-ALS/ALS/A/voiced/PZ104_phonationA_Onl...,data/VOC-ALS/ALS/A/PZ104_phonationA.wav,75.0,M,10.0,A,ALS,VOC-ALS,...,70.45599641527039,2.3516386292654468,0.0333774092896905,3.0,14.0,2.0,2.0,0.0,7.0,5.0
4,4,PZ089,data/VOC-ALS/ALS/A/voiced/PZ089_phonationA_Onl...,data/VOC-ALS/ALS/A/PZ089_phonationA.wav,50.0,M,11.0,A,ALS,VOC-ALS,...,68.09601971758357,2.819326131724274,0.0414022162149409,6.0,10.0,11.0,5.0,4.0,7.0,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1245,1245,025_a,data/Minsk2020_ALS_database/ALS/voiced/025_a_O...,data/Minsk2020_ALS_database/ALS/025_a.wav,51.0,M,,a,ALS,MINSK,...,82.23856789679324,1.1420189223590804,0.0138866586756748,5.0,5.0,6.0,5.0,7.0,6.0,3.0
1246,1246,094_a,data/Minsk2020_ALS_database/ALS/voiced/094_a_O...,data/Minsk2020_ALS_database/ALS/094_a.wav,55.0,F,,a,ALS,MINSK,...,65.84191679670987,3.310521520218331,0.0502798472656822,11.0,15.0,15.0,19.0,19.0,16.0,5.0
1247,1247,096_a,data/Minsk2020_ALS_database/ALS/voiced/096_a_O...,data/Minsk2020_ALS_database/ALS/096_a.wav,52.0,F,,a,ALS,MINSK,...,70.04316575358658,2.487346025447005,0.0355116162824156,5.0,4.0,6.0,1.0,4.0,14.0,4.0
1248,1248,102_i,data/Minsk2020_ALS_database/ALS/voiced/102_i_O...,data/Minsk2020_ALS_database/ALS/102_i.wav,53.0,F,,i,ALS,MINSK,...,71.07391073515156,2.3903785600808547,0.0336322925720003,2.0,6.0,4.0,5.0,6.0,9.0,2.0


In [118]:
processor = WhisperProcessor.from_pretrained("openai/whisper-base")
model = WhisperModel.from_pretrained("openai/whisper-base")

In [136]:
def process_file(file_path):
    waveform, sample_rate = torchaudio.load(file_path)
    if sample_rate != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
        waveform = resampler(waveform)
    waveform = torch.mean(waveform, dim=0, keepdim=False)
    #waveform = torchaudio.transforms.Vad(sample_rate=16000)(waveform).squeeze(0)
    print(waveform.shape)
    
    inputs = processor(waveform, sampling_rate=16000, return_tensors="pt")
   
    
    
    with torch.no_grad():
        encoder_outputs = model.encoder(inputs.input_features).last_hidden_state
    #print(encoder_outputs.shape)
    return encoder_outputs

process_file("data/voc-als-data-wav/CT001_phonationE.wav")

torch.Size([283520])


tensor([[[-1.4901,  0.4910, -0.6511,  ..., -0.6344,  1.0671,  0.7626],
         [-0.2755,  0.2858,  0.6054,  ..., -0.2588,  0.7666,  0.3234],
         [-0.2191,  1.0572,  0.2114,  ..., -0.7902,  0.6914,  0.3232],
         ...,
         [-0.4014, -1.0865,  0.3372,  ...,  0.5190, -0.4958,  0.6408],
         [-0.7207, -0.9906,  0.2412,  ...,  0.4102, -0.0178,  0.4921],
         [-0.7329, -0.5356,  0.9796,  ...,  0.1833, -0.8070, -1.0586]]])

In [137]:
files = df['voiced_file_path']
embeddings = []
for file in files:
    embeddings.append(process_file(file))
embeddings_t = torch.cat(embeddings)

torch.Size([101776])
torch.Size([105616])
torch.Size([149776])
torch.Size([75376])
torch.Size([122896])
torch.Size([53296])
torch.Size([299056])
torch.Size([26416])
torch.Size([15856])
torch.Size([43696])
torch.Size([88816])
torch.Size([96976])
torch.Size([168496])
torch.Size([39856])
torch.Size([191056])
torch.Size([135856])
torch.Size([184816])
torch.Size([132496])
torch.Size([94576])
torch.Size([273616])
torch.Size([240976])
torch.Size([69616])
torch.Size([59536])
torch.Size([239536])
torch.Size([101776])
torch.Size([309136])
torch.Size([147856])
torch.Size([277456])
torch.Size([352336])
torch.Size([188656])
torch.Size([96496])
torch.Size([172336])
torch.Size([168016])
torch.Size([56656])
torch.Size([200176])
torch.Size([57616])
torch.Size([149296])
torch.Size([207856])
torch.Size([18736])
torch.Size([33136])
torch.Size([26896])
torch.Size([93136])
torch.Size([85936])
torch.Size([256336])
torch.Size([179056])
torch.Size([518896])
torch.Size([153616])
torch.Size([289936])
torch.Size(

In [138]:
y = df['label']
# HC=0, ALS=1
y = pd.get_dummies(y, drop_first=True)
y = np.logical_not(y).astype(float).to_numpy().ravel()
print(y.shape)
y

(893,)


array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1.

In [139]:
subjects = df['subjectID'].unique()
train_subjects, test_subjects = train_test_split(subjects, test_size=0.3, random_state=0)
X = embeddings_t.mean(dim=1).detach().numpy()
X_train, X_test, y_train, y_test = X[df['subjectID'].isin(train_subjects)], X[df['subjectID'].isin(test_subjects)], y[df['subjectID'].isin(train_subjects)], y[df['subjectID'].isin(test_subjects)]
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((632, 512), (261, 512), (632,), (261,))

In [140]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test, np.ones_like(y_test))

0.5862068965517241

In [156]:
#fitting the model
model = LogisticRegression(max_iter=10000)
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.6513409961685823

In [157]:
def mean_majority_accuracy(model):
    accuracy_mean = 0
    accuracy_majority = 0
    accuracy_first = 0
    for subject in test_subjects:
        X_test_subject = X[df['subjectID'] == subject]
        y_test_subject = y[df['subjectID'] == subject]
        y_pred = model.predict_proba(X_test_subject)[:, -1]
        y_mean = y_pred.mean() > 0.5
        y_pred_hard = y_pred > 0.5
        y_majority = y_pred_hard.mean() > 0.5
        accuracy_mean += y_mean == y_test_subject.mean()
        accuracy_majority += y_majority == y_test_subject.mean()
        accuracy_first += y_pred_hard[0] == y_test_subject[0]

    accuracy_mean /= len(test_subjects)
    accuracy_majority /= len(test_subjects)
    accuracy_first /= len(test_subjects)
    return accuracy_mean.item(), accuracy_majority.item(), accuracy_first.item()

mean_majority_accuracy(model)

(0.6506024096385542, 0.6506024096385542, 0.6626506024096386)

In [143]:

#fitting random forest
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()
model.fit(X_train, y_train)
model.score(X_test, y_test), mean_majority_accuracy(model)

(0.6168582375478927,
 (0.6144578313253012, 0.6144578313253012, 0.6506024096385542))

In [145]:
#fitting K
from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier()
model.fit(X_train, y_train)
model.score(X_test, y_test), mean_majority_accuracy(model)

(0.6053639846743295,
 (0.5903614457831325, 0.5783132530120482, 0.6626506024096386))

In [149]:
# fitting decision tree
from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier()
model.fit(X_train, y_train)
model.score(X_test, y_test), mean_majority_accuracy(model)

(0.6245210727969349,
 (0.6506024096385542, 0.6506024096385542, 0.7108433734939759))

In [150]:
# fitting naive bayes
from sklearn.naive_bayes import GaussianNB

model = GaussianNB()
model.fit(X_train, y_train)
model.score(X_test, y_test), mean_majority_accuracy(model)

(0.5440613026819924,
 (0.4939759036144578, 0.4939759036144578, 0.5301204819277109))

In [151]:
# fitting neural network
from sklearn.neural_network import MLPClassifier

model = MLPClassifier(max_iter=10000)
model.fit(X_train, y_train)
model.score(X_test, y_test), mean_majority_accuracy(model)

(0.6551724137931034,
 (0.6746987951807228, 0.6506024096385542, 0.6265060240963856))

In [152]:
# fitting gradient boosting
from sklearn.ensemble import GradientBoostingClassifier

model = GradientBoostingClassifier()
model.fit(X_train, y_train)
model.score(X_test, y_test), mean_majority_accuracy(model)

(0.6819923371647509,
 (0.6746987951807228, 0.6867469879518072, 0.7228915662650602))

In [153]:
# fitting adaboost
from sklearn.ensemble import AdaBoostClassifier

model = AdaBoostClassifier()
model.fit(X_train, y_train)
model.score(X_test, y_test), mean_majority_accuracy(model)



(0.6436781609195402,
 (0.6506024096385542, 0.6506024096385542, 0.6144578313253012))

In [154]:
# fitting bagging
from sklearn.ensemble import BaggingClassifier

model = BaggingClassifier()
model.fit(X_train, y_train)
model.score(X_test, y_test), mean_majority_accuracy(model)

(0.6513409961685823,
 (0.6506024096385542, 0.6385542168674698, 0.6385542168674698))

In [155]:
# fitting extra trees
from sklearn.ensemble import ExtraTreesClassifier

model = ExtraTreesClassifier()
model.fit(X_train, y_train)
model.score(X_test, y_test), mean_majority_accuracy(model)

(0.6283524904214559,
 (0.5903614457831325, 0.6265060240963856, 0.6506024096385542))