In [40]:
import torch
import torchaudio
import matplotlib.pyplot as plt
import os, glob, re
from models.stutterModel import StutterDetectionModel_FC, StutterDetectionModel_LSTM
import pandas as pd
from sklearn.metrics import confusion_matrix, classification_report
import numpy as np
from tqdm import tqdm


device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


# Lendo e Processando as labels para o SEP

In [25]:

labels_csv_caminho = '/home/filhoij/Documents/CEIA/disfluency/projeto_residual_bi_lstm/data/sep_28/'
sep_28_lab = 'SEP-28k_labels.csv'
fbank_lab = 'fluencybank_labels.csv'
df_sep28k = pd.read_csv(labels_csv_caminho + sep_28_lab)
df_fluencybank = pd.read_csv(labels_csv_caminho + fbank_lab)

# Combinando os dfs
df_combined = pd.concat([df_sep28k, df_fluencybank], ignore_index=True)
stuttering_labels = ['Block', 'Prolongation', 'SoundRep', 'WordRep', 'Interjection']
df_combined['Stuttering'] = df_combined[stuttering_labels].sum(axis=1).apply(lambda x: 1 if x > 0 else 0)

# Preparar os nomes para o dicionário
df_combined['EpId'] = df_combined.apply(lambda row: f'{int(row.EpId):03}' if row.Show == 'FluencyBank' and int(row.EpId) < 100 else row.EpId, axis=1)
df_combined['filename'] = df_combined['Show'] + '_' + df_combined['EpId'].astype(str) + '_' + df_combined['ClipId'].astype(str)

# Criar o dicionário de gagueira
stuttering_dict = df_combined.set_index('filename')['Stuttering'].to_dict()

stuttering_dict['FluencyBank_010_0']

1

# Lendo os modelos a partir dos checkpoints salvos

In [37]:
model_fc_name = 'FC_Model'
model_lstm_name = 'LSTM_Model'
epoch = '12'  
# Caminho para o checkpoint
checkpoint_dir = 'checkpoints'
checkpoint_path_fc = os.path.join(checkpoint_dir, f'{model_fc_name}_epoch_{epoch}.pth')
checkpoint_path_lstm = os.path.join(checkpoint_dir, f'{model_lstm_name}_epoch_{epoch}.pth')

# Inicialize o modelo

num_disfluencies = 1  # Saída binária
model_lstm = StutterDetectionModel_LSTM(num_disfluencies)
model_fc = StutterDetectionModel_FC(num_disfluencies)

# Carregue o checkpoint
checkpoint_1 = torch.load(checkpoint_path_fc)
checkpoint_2 = torch.load(checkpoint_path_lstm)
model_fc.load_state_dict(checkpoint_1['model_state_dict'])
model_lstm.load_state_dict(checkpoint_2['model_state_dict'])
model_fc = model_fc.to(device)
model_lstm = model_lstm.to(device)
model_fc.eval() 
model_lstm.eval()

  checkpoint_1 = torch.load(checkpoint_path_fc)
  checkpoint_2 = torch.load(checkpoint_path_lstm)


StutterDetectionModel_LSTM(
  (resnet): ResNet18(
    (conv1): Conv2d(1, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (layer1): Sequential(
      (0): ResidualBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (shortcut): Sequential()
      )
      (1): ResidualBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1

In [51]:
true_labels = []
predictions_fc = []
predictions_lstm = []

In [48]:
def process_file(wav_file):
    filename = os.path.basename(wav_file).replace('.wav', '')
    
    has_stuttering = stuttering_dict.get(filename, 0)
    
    try:
        waveform, sample_rate = torchaudio.load(wav_file)
        mel_spectrogram_transform = torchaudio.transforms.MelSpectrogram(
            sample_rate=sample_rate, 
            n_mels=100,
            n_fft=512,
            win_length=400,
            hop_length=160,
        )
        
        mel_spectrogram = mel_spectrogram_transform(waveform)
        mel_spectrogram = torch.log(mel_spectrogram + 1e-13)
        mel_spectrogram = mel_spectrogram.unsqueeze(0).to(device)
        
        with torch.no_grad():
            output_fc = model_fc(mel_spectrogram)
            output_lstm = model_lstm(mel_spectrogram)
            pred_fc = torch.sigmoid(output_fc).round().item()
            pred_lstm = torch.sigmoid(output_lstm).round().item()
        
        predictions_fc.append(pred_fc)
        predictions_lstm.append(pred_lstm)
        true_labels.append(has_stuttering)
    except Exception as e:
        print(f'Error processing file {filename}: {e}')
    

In [29]:
data_path = 'data/sep_28/clips/stuttering-clips/clips'
wav_files = glob.glob(os.path.join(data_path, '*.wav'))
wav_files

['data/sep_28/clips/stuttering-clips/clips/FluencyBank_010_0.wav',
 'data/sep_28/clips/stuttering-clips/clips/FluencyBank_010_1.wav',
 'data/sep_28/clips/stuttering-clips/clips/FluencyBank_010_10.wav',
 'data/sep_28/clips/stuttering-clips/clips/FluencyBank_010_11.wav',
 'data/sep_28/clips/stuttering-clips/clips/FluencyBank_010_12.wav',
 'data/sep_28/clips/stuttering-clips/clips/FluencyBank_010_13.wav',
 'data/sep_28/clips/stuttering-clips/clips/FluencyBank_010_14.wav',
 'data/sep_28/clips/stuttering-clips/clips/FluencyBank_010_15.wav',
 'data/sep_28/clips/stuttering-clips/clips/FluencyBank_010_16.wav',
 'data/sep_28/clips/stuttering-clips/clips/FluencyBank_010_17.wav',
 'data/sep_28/clips/stuttering-clips/clips/FluencyBank_010_18.wav',
 'data/sep_28/clips/stuttering-clips/clips/FluencyBank_010_19.wav',
 'data/sep_28/clips/stuttering-clips/clips/FluencyBank_010_2.wav',
 'data/sep_28/clips/stuttering-clips/clips/FluencyBank_010_20.wav',
 'data/sep_28/clips/stuttering-clips/clips/FluencyB

In [52]:
# Processamento dos arquivos
for wav_file in tqdm(wav_files):
    process_file(wav_file)

  1%|          | 334/32321 [00:04<02:56, 180.86it/s]

Error processing file FluencyBank_019_10: Failed to decode audio.
Error processing file FluencyBank_019_11: Failed to decode audio.
Error processing file FluencyBank_019_12: Failed to decode audio.
Error processing file FluencyBank_019_13: Failed to decode audio.
Error processing file FluencyBank_019_14: Failed to decode audio.
Error processing file FluencyBank_019_15: Failed to decode audio.
Error processing file FluencyBank_019_16: Failed to decode audio.
Error processing file FluencyBank_019_17: Failed to decode audio.
Error processing file FluencyBank_019_20: Failed to decode audio.
Error processing file FluencyBank_019_21: Failed to decode audio.
Error processing file FluencyBank_019_22: Failed to decode audio.
Error processing file FluencyBank_019_23: Failed to decode audio.
Error processing file FluencyBank_019_24: Failed to decode audio.
Error processing file FluencyBank_019_25: Failed to decode audio.
Error processing file FluencyBank_019_26: Failed to decode audio.
Error proc

  1%|          | 362/32321 [00:04<02:35, 205.37it/s]

Error processing file FluencyBank_019_78: Failed to decode audio.
Error processing file FluencyBank_019_79: Failed to decode audio.
Error processing file FluencyBank_019_8: Failed to decode audio.
Error processing file FluencyBank_019_9: Failed to decode audio.


  8%|▊         | 2654/32321 [00:35<04:38, 106.41it/s]

Error processing file FluencyBank_118_10: Failed to decode audio.
Error processing file FluencyBank_118_100: Failed to decode audio.
Error processing file FluencyBank_118_101: Failed to decode audio.
Error processing file FluencyBank_118_104: Failed to decode audio.
Error processing file FluencyBank_118_105: Failed to decode audio.
Error processing file FluencyBank_118_106: Failed to decode audio.
Error processing file FluencyBank_118_107: Failed to decode audio.
Error processing file FluencyBank_118_108: Failed to decode audio.
Error processing file FluencyBank_118_109: Failed to decode audio.
Error processing file FluencyBank_118_11: Failed to decode audio.
Error processing file FluencyBank_118_12: Failed to decode audio.
Error processing file FluencyBank_118_13: Failed to decode audio.
Error processing file FluencyBank_118_14: Failed to decode audio.
Error processing file FluencyBank_118_17: Failed to decode audio.
Error processing file FluencyBank_118_18: Failed to decode audio.
Er

  8%|▊         | 2747/32321 [00:36<01:46, 277.81it/s]

Error processing file FluencyBank_118_59: Failed to decode audio.
Error processing file FluencyBank_118_6: Failed to decode audio.
Error processing file FluencyBank_118_60: Failed to decode audio.
Error processing file FluencyBank_118_61: Failed to decode audio.
Error processing file FluencyBank_118_62: Failed to decode audio.
Error processing file FluencyBank_118_63: Failed to decode audio.
Error processing file FluencyBank_118_64: Failed to decode audio.
Error processing file FluencyBank_118_65: Failed to decode audio.
Error processing file FluencyBank_118_66: Failed to decode audio.
Error processing file FluencyBank_118_67: Failed to decode audio.
Error processing file FluencyBank_118_68: Failed to decode audio.
Error processing file FluencyBank_118_69: Failed to decode audio.
Error processing file FluencyBank_118_7: Failed to decode audio.
Error processing file FluencyBank_118_70: Failed to decode audio.
Error processing file FluencyBank_118_71: Failed to decode audio.
Error proces

 15%|█▌        | 4926/32321 [01:06<02:55, 156.36it/s]

Error processing file HeStutters_0_0: Failed to decode audio.
Error processing file HeStutters_0_1: Failed to decode audio.
Error processing file HeStutters_0_10: Failed to decode audio.
Error processing file HeStutters_0_11: Failed to decode audio.
Error processing file HeStutters_0_12: Failed to decode audio.
Error processing file HeStutters_0_13: Failed to decode audio.
Error processing file HeStutters_0_14: Failed to decode audio.
Error processing file HeStutters_0_15: Failed to decode audio.
Error processing file HeStutters_0_16: Failed to decode audio.
Error processing file HeStutters_0_2: Failed to decode audio.
Error processing file HeStutters_0_21: Failed to decode audio.
Error processing file HeStutters_0_22: Failed to decode audio.
Error processing file HeStutters_0_23: Failed to decode audio.
Error processing file HeStutters_0_24: Failed to decode audio.
Error processing file HeStutters_0_25: Failed to decode audio.
Error processing file HeStutters_0_27: Failed to decode au

 20%|█▉        | 6337/32321 [01:26<05:33, 77.90it/s] 

Error processing file HeStutters_1_100: Failed to decode audio.
Error processing file HeStutters_1_101: Failed to decode audio.
Error processing file HeStutters_1_102: Failed to decode audio.
Error processing file HeStutters_1_103: Failed to decode audio.
Error processing file HeStutters_1_104: Failed to decode audio.


 20%|█▉        | 6386/32321 [01:27<05:37, 76.86it/s]

Error processing file HeStutters_1_15: Failed to decode audio.
Error processing file HeStutters_1_16: Failed to decode audio.


 20%|█▉        | 6426/32321 [01:27<06:03, 71.22it/s]

Error processing file HeStutters_1_19: Failed to decode audio.
Error processing file HeStutters_1_20: Failed to decode audio.


 20%|█▉        | 6450/32321 [01:27<06:01, 71.56it/s]

Error processing file HeStutters_1_21: Failed to decode audio.
Error processing file HeStutters_1_22: Failed to decode audio.


 20%|██        | 6474/32321 [01:28<05:42, 75.57it/s]

Error processing file HeStutters_1_23: Failed to decode audio.
Error processing file HeStutters_1_24: Failed to decode audio.


 20%|██        | 6498/32321 [01:28<05:46, 74.57it/s]

Error processing file HeStutters_1_25: Failed to decode audio.
Error processing file HeStutters_1_26: Failed to decode audio.


 20%|██        | 6535/32321 [01:28<03:18, 129.73it/s]

Error processing file HeStutters_1_27: Failed to decode audio.
Error processing file HeStutters_1_28: Failed to decode audio.
Error processing file HeStutters_1_29: Failed to decode audio.
Error processing file HeStutters_1_30: Failed to decode audio.
Error processing file HeStutters_1_31: Failed to decode audio.
Error processing file HeStutters_1_32: Failed to decode audio.
Error processing file HeStutters_1_33: Failed to decode audio.
Error processing file HeStutters_1_34: Failed to decode audio.
Error processing file HeStutters_1_35: Failed to decode audio.
Error processing file HeStutters_1_36: Failed to decode audio.
Error processing file HeStutters_1_38: Failed to decode audio.
Error processing file HeStutters_1_39: Failed to decode audio.
Error processing file HeStutters_1_40: Failed to decode audio.
Error processing file HeStutters_1_41: Failed to decode audio.
Error processing file HeStutters_1_42: Failed to decode audio.
Error processing file HeStutters_1_43: Failed to decode

 20%|██        | 6592/32321 [01:29<02:11, 196.16it/s]

Error processing file HeStutters_1_81: Failed to decode audio.
Error processing file HeStutters_1_82: Failed to decode audio.
Error processing file HeStutters_1_83: Failed to decode audio.
Error processing file HeStutters_1_84: Failed to decode audio.
Error processing file HeStutters_1_85: Failed to decode audio.
Error processing file HeStutters_1_86: Failed to decode audio.
Error processing file HeStutters_1_87: Failed to decode audio.
Error processing file HeStutters_1_88: Failed to decode audio.
Error processing file HeStutters_1_89: Failed to decode audio.
Error processing file HeStutters_1_90: Failed to decode audio.
Error processing file HeStutters_1_91: Failed to decode audio.
Error processing file HeStutters_1_92: Failed to decode audio.
Error processing file HeStutters_1_93: Failed to decode audio.
Error processing file HeStutters_1_94: Failed to decode audio.
Error processing file HeStutters_1_95: Failed to decode audio.
Error processing file HeStutters_1_98: Failed to decode

 40%|████      | 13067/32321 [03:00<02:00, 159.78it/s]

Error processing file StrongVoices_25_0: Failed to decode audio.
Error processing file StrongVoices_25_1: Failed to decode audio.
Error processing file StrongVoices_25_10: Failed to decode audio.
Error processing file StrongVoices_25_11: Failed to decode audio.
Error processing file StrongVoices_25_12: Failed to decode audio.
Error processing file StrongVoices_25_13: Failed to decode audio.
Error processing file StrongVoices_25_14: Failed to decode audio.
Error processing file StrongVoices_25_15: Failed to decode audio.
Error processing file StrongVoices_25_17: Failed to decode audio.
Error processing file StrongVoices_25_18: Failed to decode audio.
Error processing file StrongVoices_25_19: Failed to decode audio.
Error processing file StrongVoices_25_2: Failed to decode audio.
Error processing file StrongVoices_25_20: Failed to decode audio.
Error processing file StrongVoices_25_21: Failed to decode audio.
Error processing file StrongVoices_25_22: Failed to decode audio.
Error process

 53%|█████▎    | 17088/32321 [03:56<02:55, 86.63it/s] 

Error processing file StutterTalk_59_31: Failed to decode audio.
Error processing file StutterTalk_59_33: Failed to decode audio.
Error processing file StutterTalk_59_34: Failed to decode audio.
Error processing file StutterTalk_59_35: Failed to decode audio.
Error processing file StutterTalk_59_36: Failed to decode audio.
Error processing file StutterTalk_59_37: Failed to decode audio.
Error processing file StutterTalk_59_38: Failed to decode audio.


 72%|███████▏  | 23202/32321 [05:22<00:59, 152.03it/s]

Error processing file WomenWhoStutter_0_100: Failed to decode audio.
Error processing file WomenWhoStutter_0_101: Failed to decode audio.
Error processing file WomenWhoStutter_0_102: Failed to decode audio.
Error processing file WomenWhoStutter_0_105: Failed to decode audio.
Error processing file WomenWhoStutter_0_106: Failed to decode audio.
Error processing file WomenWhoStutter_0_107: Failed to decode audio.
Error processing file WomenWhoStutter_0_108: Failed to decode audio.
Error processing file WomenWhoStutter_0_109: Failed to decode audio.
Error processing file WomenWhoStutter_0_110: Failed to decode audio.
Error processing file WomenWhoStutter_0_111: Failed to decode audio.
Error processing file WomenWhoStutter_0_112: Failed to decode audio.
Error processing file WomenWhoStutter_0_113: Failed to decode audio.
Error processing file WomenWhoStutter_0_114: Failed to decode audio.
Error processing file WomenWhoStutter_0_115: Failed to decode audio.
Error processing file WomenWhoStut

 72%|███████▏  | 23410/32321 [05:24<00:53, 166.04it/s]

Error processing file WomenWhoStutter_0_30: Failed to decode audio.
Error processing file WomenWhoStutter_0_31: Failed to decode audio.
Error processing file WomenWhoStutter_0_32: Failed to decode audio.
Error processing file WomenWhoStutter_0_33: Failed to decode audio.
Error processing file WomenWhoStutter_0_36: Failed to decode audio.
Error processing file WomenWhoStutter_0_37: Failed to decode audio.
Error processing file WomenWhoStutter_0_38: Failed to decode audio.
Error processing file WomenWhoStutter_0_39: Failed to decode audio.
Error processing file WomenWhoStutter_0_40: Failed to decode audio.
Error processing file WomenWhoStutter_0_41: Failed to decode audio.
Error processing file WomenWhoStutter_0_42: Failed to decode audio.
Error processing file WomenWhoStutter_0_43: Failed to decode audio.
Error processing file WomenWhoStutter_0_44: Failed to decode audio.
Error processing file WomenWhoStutter_0_45: Failed to decode audio.
Error processing file WomenWhoStutter_0_46: Fail

100%|██████████| 32321/32321 [07:30<00:00, 71.73it/s] 


In [53]:
# Calculando as matrizes de confusão após processar todos os arquivos
conf_matrix_fc = confusion_matrix(true_labels, predictions_fc)
conf_matrix_lstm = confusion_matrix(true_labels, predictions_lstm)

print("Confusion Matrix for FC Model:")
print(conf_matrix_fc)
print("Confusion Matrix for LSTM Model:")
print(conf_matrix_lstm)

Confusion Matrix for FC Model:
[[   64  6838]
 [  114 24892]]
Confusion Matrix for LSTM Model:
[[  562  6340]
 [ 1464 23542]]


In [54]:
# Gerando relatórios de classificação
print("Classification Report for FC Model:")
print(classification_report(true_labels, predictions_fc, target_names=['Non-Stutter', 'Stutter']))

print("Classification Report for LSTM Model:")
print(classification_report(true_labels, predictions_lstm, target_names=['Non-Stutter', 'Stutter']))

Classification Report for FC Model:
              precision    recall  f1-score   support

 Non-Stutter       0.36      0.01      0.02      6902
     Stutter       0.78      1.00      0.88     25006

    accuracy                           0.78     31908
   macro avg       0.57      0.50      0.45     31908
weighted avg       0.69      0.78      0.69     31908

Classification Report for LSTM Model:
              precision    recall  f1-score   support

 Non-Stutter       0.28      0.08      0.13      6902
     Stutter       0.79      0.94      0.86     25006

    accuracy                           0.76     31908
   macro avg       0.53      0.51      0.49     31908
weighted avg       0.68      0.76      0.70     31908

