In [5]:
import numpy as np
import torch
import torch.nn as nn
from transformers import Wav2Vec2Processor
from transformers.models.wav2vec2.modeling_wav2vec2 import (
    Wav2Vec2Model,
    Wav2Vec2PreTrainedModel,
)

import librosa


class ModelHead(nn.Module):
    r"""Classification head."""

    def __init__(self, config, num_labels):

        super().__init__()

        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.final_dropout)
        self.out_proj = nn.Linear(config.hidden_size, num_labels)

    def forward(self, features, **kwargs):

        x = features
        x = self.dropout(x)
        x = self.dense(x)
        x = torch.tanh(x)
        x = self.dropout(x)
        x = self.out_proj(x)

        return x


class AgeGenderModel(Wav2Vec2PreTrainedModel):
    r"""Speech emotion classifier."""

    def __init__(self, config):

        super().__init__(config)

        self.config = config
        self.wav2vec2 = Wav2Vec2Model(config)
        self.age = ModelHead(config, 1)
        self.gender = ModelHead(config, 3)
        self.init_weights()

    def forward(
            self,
            input_values,
    ):

        outputs = self.wav2vec2(input_values)
        hidden_states = outputs[0]
        hidden_states = torch.mean(hidden_states, dim=1)
        logits_age = self.age(hidden_states)
        logits_gender = torch.softmax(self.gender(hidden_states), dim=1)

        return hidden_states, logits_age, logits_gender

In [7]:
# load model from hub
device = 'cpu'
model_name = 'audeering/wav2vec2-large-robust-24-ft-age-gender'
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = AgeGenderModel.from_pretrained(model_name)

# dummy signal
# sampling_rate = 16000
# signal = np.zeros((1, sampling_rate), dtype=np.float32)


def process_func(
    x: np.ndarray,
    sampling_rate: int,
    embeddings: bool = False,
) -> np.ndarray:
    r"""Predict age and gender or extract embeddings from raw audio signal."""

    # run through processor to normalize signal
    # always returns a batch, so we just get the first entry
    # then we put it on the device
    y = processor(x, sampling_rate=sampling_rate)
    y = y['input_values'][0]
    y = y.reshape(1, -1)
    y = torch.from_numpy(y).to(device)

    # run through model
    with torch.no_grad():
        y = model(y)
        if embeddings:
            y = y[0]
        else:
            y = torch.hstack([y[1], y[2]])

    # convert to numpy
    y = y.detach().cpu().numpy()

    return y

In [17]:
#Predicting ages

from os import path
from pydub import AudioSegment
import numpy as np
import soundfile as sf

#file_root = '../EMBRACE-data-analysis/data/summer23-pitt/Family_32/32_Chromebook Data/32_voice-recordings/voice-recording-1687549850446-1687550006814_no_silences/voice-recording-1687549850446-1687550006814_1'

#142 female .96 (female predicted)
#110 child .55 female .43 (child is the correct one)
#96 female .96238494e-01 child 0.00013884614e-04 (female is the correct one)
file_root = "/Users/jab464/Documents/GitHub/EMBRACE-data-analysis/audios/asu/mom-child-96"
# assign files
input_file = file_root+'.mp3'#"voice-recording-1684176308802-1684176427677.mp3"
output_file = file_root+'.wav'
output_file_16k = file_root+"_16k.wav"

# convert mp3 file to wav file
# sound = AudioSegment.from_mp3(input_file)
# sound.export(output_file, format="wav")
#
# # Load audio with librosa (originally at 48 kHz)
# audio, sr = librosa.load(output_file, sr=48000)
# # Resample to 16 kHz
# audio_16k = librosa.resample(audio, orig_sr=sr, target_sr=16000)
# # Save the resampled audio
# sf.write(output_file_16k, audio_16k, 16000)

# Define the sampling rate
sampling_rate = 16000  # Target sample rate for the model

# Load the audio file with librosa, setting `sr=sampling_rate` for consistent resampling
signal, _ = librosa.load(output_file, sr=sampling_rate)  # Resamples audio to 16kHz

# Reshape `signal` for the model input and ensure it matches the required dtype
signal = signal.astype(np.float32).reshape(1, -1)

print(signal)

[[0.00805298 0.02018604 0.0276726  ... 0.01285762 0.01374081 0.01713197]]


In [1]:
import parselmouth

import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns

from pydub import AudioSegment
from pydub.playback import play

import math

import time

#Danish you can comment this one
from pyannote.audio import Pipeline

#import set of functions I made for the automatic analysis of the audios
import sys

#here the path needs to be changed to your own local path
sys.path.insert(1, '/Users/jab464/Documents/Github/EMBRACE-data-analysis/')
from audio_analysis_functions import *

#import needed to create a folder
import os

import pandas as pd

import unidecode

def find(name, path):
    for root, dirs, files in os.walk(path):
        if name in files:
            return os.path.join(root, name)

  Referenced from: '/Users/jab464/miniconda3/envs/pyannote/lib/python3.8/site-packages/torchvision/image.so'
  Expected in: '/Users/jab464/miniconda3/envs/pyannote/lib/python3.8/site-packages/torch/lib/libc10.dylib'
  warn(f"Failed to load image Python extension: {e}")


In [11]:
def add_labeled_data_audeering_model(current_df,filename,labels_df):
    print(labels_df)
    print("those were the labels")
    filename_no_mp3 = filename[0:len(filename)-4]

    #only the name of the file withouth the whole path
    filename_no_mp3_no_folder = filename_no_mp3[filename_no_mp3.rfind('/')+1:len(filename_no_mp3)]

    root_folder = filename_no_mp3[0:filename_no_mp3.rfind('/')]

    #extract only the active voice audio
    pipeline_act_detection = Pipeline.from_pretrained("pyannote/voice-activity-detection",
                                            use_auth_token="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")
    #extracted audio
    output = pipeline_act_detection(filename)

    # Specify the name of the new folder
    folder_path = filename_no_mp3+"_no_silences"

    # if os.path.exists(folder_path):
    #     print(f"The path '{folder_path}' exists.")
    # else:
    #     print(f"The path '{folder_path}' does not exist.")
    #     # Create the new folder
    #     os.mkdir(folder_path)
    #
    # try:
    #     audio = AudioSegment.from_file(filename, "mp3")
    # except:
    #     audio = AudioSegment.from_file(filename, format="mp4")
    #
    # counter = 0
    # for speech in output.get_timeline().support():
    #
    #     # Extract the chunk
    #     start_time = speech.start
    #     end_time = speech.end
    #     chunk = audio[start_time*1000:end_time*1000]
    #     # Export the extracted chunk to a new audio file
    #     chunk.export(fol+"/"+filename_no_mp3_no_folder+"_"+str(counter)+".mp3", format="mp3")
    #     counter=counter+1
    #
    # # List of audio file paths to concatenate
    # audio_files = []
    #
    # #fill audio files list with all the active voice chunks generated in the last for loop
    # for i in range (0,counter):
    #     audio_files.append(root_folder+"/"+filename_no_mp3_no_folder+"_"+str(i)+".mp3")
    #
    # # Initialize an empty AudioSegment object to hold the concatenated audio
    # concatenated_audio = AudioSegment.empty()
    #
    # # Iterate through each audio file
    # for file in audio_files:
    #     # Load the audio file
    #     curr_audio = AudioSegment.from_file(file)
    #
    #     # Append the loaded audio to the concatenated audio
    #     concatenated_audio += curr_audio
    #
    # # Export the concatenated audio withouth silences to a new file
    # concatenated_audio.export(folder_path+"/"+filename_no_mp3_no_folder+"_no_silences.wav", format="wav")

    #Gets the file without silences (yes again) but in a different format
    voice_activity = get_voice_activity(filename,False,False)
    #it segments the data in 1 second segments
    custom_size_voice_activity = custom_size_segments(voice_activity)

    #Sound with no silences
    #snd = parselmouth.Sound(folder_path+"/"+filename_no_mp3_no_folder+"_no_silences.mp3")

    snd = parselmouth.Sound(filename)
    # Initialize lists to store data
    data = []
    #max_pitch_length = 97  # Maximum number of pitch values per segment
    #num_mfcc = 13  # Assuming 13 MFCC coefficients for simplicity

    labels = [f"Label_{i+1}" for i in range(len(custom_size_voice_activity))]



    # convert mp3 file to wav file
    sound = AudioSegment.from_mp3(filename)
    sound.export(root_folder+"/"+filename_no_mp3_no_folder+".wav", format="wav")

    # Define the sampling rate
    sampling_rate = 16000  # Target sample rate for the model

    # Load the audio file with librosa, setting `sr=sampling_rate` for consistent resampling
    signal, _ = librosa.load(root_folder+"/"+filename_no_mp3_no_folder+".wav", sr=sampling_rate)  # Resamples audio to 16kHz

    print("length audio: "+str(len(signal)))

    # Reshape `signal` for the model input and ensure it matches the required dtype
    signal = signal.astype(np.float32).reshape(1, -1)

    print(signal)
    print(len(signal))

    for i, segment in enumerate(custom_size_voice_activity):
        print(i)
        start, end = segment
        print(segment)
        #Sets metadata (filename,index,start,end,duration)
        metadata = []
        metadata.append(filename_no_mp3_no_folder)
        metadata.append(i)
        metadata.append(start)
        metadata.append(end)
        duration_segment = float(end)-float(start)
        metadata.append(duration_segment)

        # Convert start and end times to sample indices
        start_sample = int(start * sampling_rate)
        end_sample = int(end * sampling_rate)



        if(duration_segment>=.1):

            # Slice the audio array to get the specific segment
            sub_signal = signal[:,start_sample:end_sample]
            print(sub_signal)

            #detection probabilities -> age female male child
            detection_probs = process_func(sub_signal, sampling_rate)[0]
            print(detection_probs)

            highest_prob = detection_probs[1]
            highest_prob_index = 1
            for j in range(2,4):
                if(detection_probs[j]>highest_prob):
                    highest_prob = detection_probs[j]
                    highest_prob_index=j
            pred_label = 'c'
            if(highest_prob_index==1):
                pred_label='af'
            elif(highest_prob_index==2):
                pred_label='am'

            #find label from the labeled dataset
            label = labels_df.at[i,'label']
            labels[i]=label
            print("predicted label: "+pred_label)
            print("actual label: "+label)

            # Combine all the data for this segment into one list
            segment_data = list(metadata) + detection_probs.astype('<U43').tolist() + [pred_label,label]

            # Append the segment's data to the overall data list
            data.append(segment_data)


    # Column names
    column_names = ['filename','index','start','end','duration','age','female','male','child','pred_label','Label']

    # Create a DataFrame from the data list
    df = pd.DataFrame(data, columns=column_names)

    #Concatenate that df to the global df
    current_df = pd.concat([current_df, df])
    current_df.reset_index(drop=True, inplace=True)

    return current_df

def get_labels(filename):
    filename_no_mp3 = filename[0:len(filename)-4]

    #only the name of the file withouth the whole path
    filename_no_mp3_no_folder = filename_no_mp3[filename_no_mp3.rfind('/')+1:len(filename_no_mp3)]

    root_folder = filename_no_mp3[0:filename_no_mp3.rfind('/')]

    print(filename_no_mp3)
    print(filename_no_mp3_no_folder)
    print(root_folder)

    labels_list = []

    file_labels_danish = find(filename_no_mp3_no_folder+".mp3_labeled_danish.txt","../EMBRACE-data-analysis/data")
    file_labels_jordan = find(filename_no_mp3_no_folder+".mp3_labeled_jordan.txt","../EMBRACE-data-analysis/data")
    print(filename_no_mp3_no_folder+".mp3_labeled_danish.txt")
    print(filename_no_mp3_no_folder+".mp3_labeled_jordan.txt")
    print(file_labels_danish)
    print(file_labels_jordan)

    if(file_labels_danish!=None and file_labels_jordan==None):
        #file_labels_path = root_folder+"/"+filename_no_mp3_no_folder+".mp3_labeled_danish.txt"
        file_labels_path =  os.path.abspath(file_labels_danish)

    else:
        file_labels_path =  os.path.abspath(file_labels_jordan)
    print(file_labels_path)
    file_labels=open(file_labels_path,"r")
    counter = 0

    for line in file_labels:
        raw_labels = line.split(" ")
        labels = []
        labels.append(filename_no_mp3_no_folder)
        labels.append(raw_labels[0])
        labels.append(raw_labels[1][1:len(raw_labels[1])-1])
        labels.append(raw_labels[2][0:len(raw_labels[2])-1])
        labels.append(raw_labels[3][0:len(raw_labels[3])-1])
        counter = counter + 1
        labels_list.append(labels)

    labels_df = df = pd.DataFrame(labels_list, columns =['filename', 'index', 'start','end','label'])
    print(labels_df)

    return labels_df

In [12]:
all_labeled_files = open("/Users/jab464/Documents/GitHub/EMBRACE-data-analysis/data/pitt_2023_audios.txt","r")

lines = all_labeled_files.readlines()
last = lines[-1]

current_df = pd.DataFrame()
counter = 0
for line in lines:
    if line is last:
        path_labeled_file = line[0:len(line)]
        # do work on lst line
    else:
        # work on other lines
        path_labeled_file = line[0:len(line)-1]
    print(path_labeled_file)
    labels_df = get_labels(path_labeled_file)
    current_df = add_labeled_data_audeering_model(current_df,path_labeled_file,labels_df)
    counter = counter + 1
print("labeled files "+str(counter))

# Save the DataFrame to a CSV file
current_df.to_csv('audeering_predictions.csv', index=False)

../EMBRACE-data-analysis/data/summer23-pitt/Family_01/01_Chromebook Data/01_voice-recordings/voice-recording-1684287809686-1684287934521.mp3
../EMBRACE-data-analysis/data/summer23-pitt/Family_01/01_Chromebook Data/01_voice-recordings/voice-recording-1684287809686-1684287934521
voice-recording-1684287809686-1684287934521
../EMBRACE-data-analysis/data/summer23-pitt/Family_01/01_Chromebook Data/01_voice-recordings
voice-recording-1684287809686-1684287934521.mp3_labeled_danish.txt
voice-recording-1684287809686-1684287934521.mp3_labeled_jordan.txt
None
../EMBRACE-data-analysis/data/summer23-pitt/Family_01/01_Chromebook Data/01_voice-recordings/voice-recording-1684287809686-1684287934521.mp3_labeled_jordan.txt
/Users/jab464/Documents/GitHub/EMBRACE-data-analysis/data/summer23-pitt/Family_01/01_Chromebook Data/01_voice-recordings/voice-recording-1684287809686-1684287934521.mp3_labeled_jordan.txt
                                        filename index               start  \
0    voice-recording

In [57]:
filename = "../EMBRACE-data-analysis/data/summer23-pitt/Family_03/03_Chromebook Data/03_voice-recordings/voice-recording-1683505777195-1683505963551.mp3"

#extract only the active voice audio
pipeline_act_detection = Pipeline.from_pretrained("pyannote/voice-activity-detection",
                                        use_auth_token="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")

#extracted audio
output = pipeline_act_detection(filename)

try:
    audio = AudioSegment.from_file(filename, "mp3")
except:
    audio = AudioSegment.from_file(filename, format="mp4")

filename_no_mp3 = filename[0:len(filename)-4]
#
# #only the name of the file withouth the whole path
filename_no_mp3_no_folder = filename_no_mp3[filename_no_mp3.rfind('/')+1:len(filename_no_mp3)]

# Specify the name of the new folder
folder_path = filename_no_mp3+"_silences"
counter = 0
for speech in output.get_timeline().support():
    # Extract the chunk
    start_time = speech.start
    end_time = speech.end
    chunk = audio[start_time*1000:end_time*1000]
    # Export the extracted chunk to a new audio file
    chunk.export(folder_path+"/"+filename_no_mp3_no_folder+"_"+str(counter)+".wav", format="wav")
    counter=counter+1


In [33]:
import pandas as pd
from sklearn.metrics import confusion_matrix

df_audeering_predictions = pd.read_csv('audeering_predictions.csv', sep=',', header=0)

df_audeering_predictions['duration'] =  df_audeering_predictions['duration'].astype('float')

filtered_df = df_audeering_predictions[~df_audeering_predictions['Label'].str.contains('&|,|-', case=False, na=False)]
filtered_df = filtered_df[filtered_df['duration']>=.1]

y_true = filtered_df['Label']
y_pred = filtered_df['pred_label']

y_true = [str(label) for label in y_true]
y_pred = [str(label) for label in y_pred]

# Calculate confusion matrix
cm = confusion_matrix(y_true, y_pred, labels=['am', 'af', 'c'])

# Convert to a pandas DataFrame for better visualization
cm_df = pd.DataFrame(cm, index=['am', 'af', 'c'], columns=['am', 'af', 'c'])

print(cm_df)

     am    af    c
am  163    27    0
af   41  2200   27
c    19   514  331


In [18]:
print(process_func(signal, sampling_rate))
#    Age        female     male       child
# [[ 0.33793038 0.2715511  0.2275236  0.5009253 ]]

print(process_func(signal, sampling_rate, embeddings=True))
# Pooled hidden states of last transformer layer
# [[ 0.024444    0.0508722   0.04930823 ...  0.07247854 -0.0697901
#   -0.0170537 ]]

[[4.9168006e-01 9.6238494e-01 3.7476216e-02 1.3884614e-04]]
[[ 0.02255362  0.01676262  0.03831391 ...  0.05845425 -0.0053767
   0.07303516]]
