In [3]:
import numpy as np
import librosa
import matplotlib.pyplot as plt
import pandas as pd
import scipy.signal
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('cmudict')
from nltk import pos_tag

In [4]:
def area(y, sr):
    # Compute the F0 contour
    f0, voiced_flag, voiced_probs = librosa.pyin(y=y, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7'), sr=sr, frame_length=1024, hop_length=256)
    f0 = np.nan_to_num(f0)

    # Smooth the F0 contour
    f0_smoothed = scipy.signal.convolve(f0, scipy.signal.hann(41), mode='same') / sum(scipy.signal.hann(41))

    # Compute the duration of each segment in seconds
    durations = librosa.frames_to_time(range(len(f0)), sr=sr, hop_length=256)

    # Compute the AFC feature
    afc = sum(ti * fi for ti, fi in zip(durations, f0_smoothed))

    return afc

In [5]:
def area2(y, sr):
    # Compute the F0 contour
    f0, voiced_flag, voiced_probs = librosa.pyin(y=y, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7'), sr=sr, frame_length=1024, hop_length=256)
    f0 = np.nan_to_num(f0)

    # Smooth the F0 contour using a Hann window with a length of 41 frames
    f0_smoothed = scipy.signal.convolve(f0, scipy.signal.hann(41), mode='same') / sum(scipy.signal.hann(41))

    # Compute the root-mean-square (RMS) energy of each segment
    rms_energy = librosa.feature.rms(y=y, hop_length=128)[0]

    # Scale the RMS energy by a factor of 0.1
    rms_energy_scaled = 0.1 * (rms_energy)

    # Compute the duration of each segment in seconds
    durations = librosa.frames_to_time(range(len(f0)), sr=sr, hop_length=256)

    # Compute the EFI feature
    efi = sum(ti * fi * ei for ti, fi, ei in zip(durations, f0_smoothed, rms_energy_scaled))

    return efi

In [6]:

def calculate_vur(y, sr):
    # Compute the short-time Fourier transform (STFT) of the signal
    stft = librosa.stft(y)

    # Compute the power spectrogram
    power = librosa.power_to_db(np.abs(stft)**2)

    # Compute the voiced and unvoiced segments using the Mel-frequency cepstral coefficients (MFCCs)
    mfccs = librosa.feature.mfcc(S=power, sr=sr)
    voiced = np.where(mfccs[0] > np.median(mfccs[0]))[0]
    unvoiced = np.where(mfccs[0] <= np.median(mfccs[0]))[0]

    # Compute the duration of the voiced and unvoiced segments
    voiced_duration = librosa.frames_to_time(len(voiced), sr=sr)
    unvoiced_duration = librosa.frames_to_time(len(unvoiced), sr=sr)

    # Calculate the VUR
    vur = voiced_duration / (voiced_duration + unvoiced_duration)

    return vur


In [7]:
from sklearn.mixture import GaussianMixture
from scipy.signal import medfilt

def calculate_dlh(y, sr):

    spec = librosa.stft(y=y, n_fft=2048, hop_length=256, win_length=1024, window='hann', center=True, pad_mode='reflect')

    # Compute the power spectrogram
    power_spec = np.abs(spec) ** 2

    # Compute the mel spectrogram
    mel_spec = librosa.feature.melspectrogram(S=power_spec)

    # Compute the log mel spectrogram
    log_mel_spec = librosa.amplitude_to_db(mel_spec)

    # Compute the F0 contour
    f0, voiced_flag, voiced_probs = librosa.pyin(y=y, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7'), sr=sr, frame_length=1024, hop_length=256)
    f0 = np.nan_to_num(f0)

    # Smooth the F0 contour using a Hann window with a length of 41 frames
    f0_smoothed = scipy.signal.convolve(f0, scipy.signal.hann(41), mode='same') / sum(scipy.signal.hann(41))

    # Cluster the F0 values using a two-component Gaussian mixture model
    gmm = GaussianMixture(n_components=2, covariance_type='full')
    f0_reshaped = f0_smoothed.reshape(-1, 1)
    gmm.fit(f0_reshaped)
    labels = gmm.predict(f0_reshaped)

    # Compute the mean F0 values for the low and high frequency clusters
    mean_low = np.mean(f0_smoothed[labels == 0])
    mean_high = np.mean(f0_smoothed[labels == 1])

    # Compute the DLH feature as the difference between the mean F0 values of the high and low frequency clusters
    dlh = mean_high - mean_low

    return dlh


In [None]:
import librosa
import numpy as np
from sklearn.isotonic import IsotonicRegression

y, sr = librosa.load("output.wav", sr=None)

y1 = np.array(y, dtype=object)

print(np.count_nonzero(np.isnan(y)))

def calculate_f0_curve(audio_segment, sample_rate):
    f0, voiced_flag, voiced_probs = librosa.pyin(audio_segment, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7'))
    return f0

def perform_isotonic_regression(f0_curve):
    ir = IsotonicRegression(out_of_bounds='clip')
    f0_smoothed = ir.fit_transform(np.arange(len(f0_curve)), f0_curve)
    return f0_smoothed

def compute_likelihood_values(f0_curve):
    diff = np.diff(f0_curve)

    peaks = np.where((diff[:-1] > 0) & (diff[1:] < 0))[0] + 1
    valleys = np.where((diff[:-1] < 0) & (diff[1:] > 0))[0] + 1

    rising = len(peaks) == 0 and len(valleys) == 0
    falling = len(peaks) == 0 and len(valleys) > 0
    peak = len(peaks) > 0 and len(valleys) == 0
    valley = len(peaks) > 0 and len(valleys) > 0

    return rising, falling, peak, valley


In [None]:

def compute_shape_vector(audio_segment, sample_rate):
    f0_curve = calculate_f0_curve(audio_segment, sample_rate)
    f0_smoothed = perform_isotonic_regression(f0_curve)
    shapes = compute_likelihood_values(f0_smoothed)
    return shapes


In [8]:
import numpy as np
import librosa
from sklearn.mixture import GaussianMixture
from scipy.signal import medfilt

def get_f0_peak_valley(y, sr):
    f0_values, voiced_flag, _ = librosa.pyin(
        y, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7'), sr=sr)
    f0_values = np.nan_to_num(f0_values)
    # Smooth F0 curve using a median filter
    f0_smoothed = medfilt(f0_values, kernel_size=5)

    #count the no. of nan values in f0_values
    nan_count = np.count_nonzero(np.isnan(f0_values))

    # Compute location and amplitude of F0 peak or valley
    if f0_smoothed.size > 0:
        peak_loc, valley_loc = librosa.util.peak_pick(
        f0_smoothed, 3, 3, 3, 5, 0.5, 5)[:2]


    # else:
    # # handle the case where f0_smoothed is empty or has a size of 0
    #     print("Error: f0_smoothed array is empty or has a size of 0.")
    peak_loc, valley_loc = librosa.util.peak_pick(f0_smoothed, 3, 3, 3, 5, 0.5, 5)[:2]
    print(peak_loc, valley_loc)
    if peak_loc > 0:
        f_amp = f0_smoothed[peak_loc] - np.mean(f0_smoothed)

        f_loc = peak_loc / sr
    elif valley_loc > 0:
        f_amp = np.mean(f0_smoothed) - f0_smoothed[valley_loc]
        f_loc = valley_loc / sr
    else:
        f_amp = 0
        f_loc = 0

    return f_amp, f_loc

In [9]:
import numpy as np

def get_aggregate_stats(y, sr):
    f0, voiced_flag, voiced_probs = librosa.pyin(y=y, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7'), sr=sr, frame_length=1024, hop_length=256)
    f0 = np.nan_to_num(f0)
    rms_energy = librosa.feature.rms(y=y, hop_length=128)[0]
    rms_energy = np.nan_to_num(rms_energy)

    stats_list = [np.mean(f0), np.median(f0), np.max(f0), np.min(f0), np.var(f0),
                  np.mean(rms_energy), np.median(rms_energy), np.max(rms_energy), np.min(rms_energy), np.var(rms_energy)]

    return stats_list

In [None]:
from nltk.corpus import cmudict

d = cmudict.dict()

def count_syllables(word):
    try:
        return max([len(list(y for y in x if y[-1].isdigit())) for x in d[word.lower()]])
    except KeyError:
        # Word not found in the dictionary
        return 0

In [10]:
def generate_features(audio, sr, start, end, text):
    y = audio[start:end]
    feature_vector = []
    # area under fundamental frequency curve
    ar = area(y, sr)
    feature_vector.append(ar)
    # energy fundamental frequency integral
    ar2 = area2(y, sr)
    feature_vector.append(ar2) 

    # vur
    vur = calculate_vur(y, sr)
    feature_vector.append(vur)
    # dlh
    dlh = calculate_dlh(y, sr)
    feature_vector.append(dlh)

    #shape
    shape = compute_shape_vector(y,sr)
    feature_vector.append(shape)

    # duration
    duration_frames = int(len(y) / 10)
    feature_vector.append(duration_frames)

    # aggregate stats
    stats_l = get_aggregate_stats(y, sr)
    for i in range(len(stats_l)):
        feature_vector.append(stats_l[i])

    #pos_tag
    pos = pos_tag([text])[0][1]
    feature_vector.append(pos)

    #function word or content word
    function_words = {'DT', 'IN', 'PRP', 'PRP$', 'TO', 'CC', 'MD', 'WRB', 'WP', 'WDT'}
    if(pos in function_words):
        feature_vector.append(0)
    else:
        feature_vector.append(1)
    
    #syllable_count
    syllable_count = count_syllables(text)
    feature_vector.append(syllable_count)


    return feature_vector

In [15]:
def generate_features_for_all_words(file_number):
    audio, sr = librosa.load('../Files/'+str(file_number)+'.wav', sr=16000)

    # read txt file
    start = []
    duration = []
    transcripts = []
    with open('../Files/'+str(file_number)+'_A.txt', 'r') as f:
        for line in f:
            line = line.split()
            # convert to float
            start.append(float(line[2]))
            duration.append(float(line[3]))
            transcripts.append(line[4])

    # make start and end arrays with index
    start = np.array(start)
    duration = np.array(duration)
    end = start + duration

    #  convert start from seconds to indices of audio
    start = librosa.time_to_samples(start, sr=sr)
    end = librosa.time_to_samples(end, sr=sr)

    word_durations = []
    for i in range(len(start)):
        duration = (start[i], end[i])
        word_durations.append(duration)
    features = []
    for start, end in word_durations:
        features.append(generate_features(audio, sr, start, end, transcripts[len(features)]))
    return features

In [18]:
feature_vectors = []

for x in range(0,1000):
    feature_vectors.append(generate_features_for_all_words(x))

df = pd.DataFrame(feature_vectors)
df.to_csv('feature_vectors.csv', index=False)

  f0_smoothed = scipy.signal.convolve(f0, scipy.signal.hann(41), mode='same') / sum(scipy.signal.hann(41))
  f0_smoothed = scipy.signal.convolve(f0, scipy.signal.hann(41), mode='same') / sum(scipy.signal.hann(41))
  f0_smoothed = scipy.signal.convolve(f0, scipy.signal.hann(41), mode='same') / sum(scipy.signal.hann(41))
  f0_smoothed = scipy.signal.convolve(f0, scipy.signal.hann(41), mode='same') / sum(scipy.signal.hann(41))
  f0_smoothed = scipy.signal.convolve(f0, scipy.signal.hann(41), mode='same') / sum(scipy.signal.hann(41))
  f0_smoothed = scipy.signal.convolve(f0, scipy.signal.hann(41), mode='same') / sum(scipy.signal.hann(41))
  f0_smoothed = scipy.signal.convolve(f0, scipy.signal.hann(41), mode='same') / sum(scipy.signal.hann(41))
  f0_smoothed = scipy.signal.convolve(f0, scipy.signal.hann(41), mode='same') / sum(scipy.signal.hann(41))
  f0_smoothed = scipy.signal.convolve(f0, scipy.signal.hann(41), mode='same') / sum(scipy.signal.hann(41))
  f0_smoothed = scipy.signal.convolve