In [1]:
import librosa

In [27]:
import numpy as np

In [2]:
# Recording analysed throughout this notebook (absolute Colab-style path).
# The TSV loaded further down uses the same basename with a .tsv extension.
WAVFILE = "/content/spkslt_98.wav"

In [3]:
# Decode the recording into a sample array plus its sample rate. No sr= is
# passed, so whatever resampling librosa.load applies by default is in effect.
audio, sr = librosa.load(WAVFILE)

In [10]:
# Probabilistic YIN pitch tracking over the whole recording. The search range
# runs from note C2 up to note C7; frame and hop sizes are left at librosa's
# defaults. n_thresholds and max_transition_rate are set explicitly here.
# NOTE(review): the mean-F0 cell later on uses np.nanmean on f0, so f0 is
# presumably NaN for unvoiced frames — confirm against the librosa.pyin docs.
f0, voiced_flag, voiced_probs = librosa.pyin(y=audio,
                                             fmin=librosa.note_to_hz('C2'),
                                             fmax=librosa.note_to_hz('C7'),
                                             pad_mode='constant',
                                             n_thresholds = 10,
                                             max_transition_rate = 100,
                                             sr=sr)

In [24]:
# Onset detection with librosa's default settings.
# NOTE(review): `onsets` is not referenced by any later cell visible here.
onsets = librosa.onset.onset_detect(y=audio, sr=sr)

# Helpers

In [19]:
def load_tsv(filename):
    """Read a tab-separated file of timed labels into a list of tuples.

    Every non-blank line is expected to hold at least three tab-separated
    fields. The first two are parsed as floats and the third is kept as a
    string; any further fields are ignored. Blank or whitespace-only lines
    are skipped.

    Args:
        filename: Path of the TSV file to read.

    Returns:
        A list of ``(float, float, str)`` tuples, one per non-blank line,
        in file order.

    Raises:
        ValueError: If one of the first two fields is not a valid float.
        IndexError: If a non-blank line has fewer than three fields.
    """
    output = []
    with open(filename) as inf:
        # Iterate the file object directly instead of materialising every
        # line up front with readlines().
        for line in inf:
            stripped = line.strip()
            # A blank line (e.g. an extra newline at the end of the file)
            # carries no record; without this guard float("") would raise.
            if not stripped:
                continue
            parts = stripped.split("\t")
            output.append((float(parts[0]), float(parts[1]), parts[2]))
    return output

In [21]:
def get_detdem(tsvish, determiners=("this", "that", "these", "those")):
    """Select the entries whose label is a demonstrative determiner.

    Args:
        tsvish: Iterable of indexable records whose third element
            (index 2) is the label to test, e.g. the tuples returned by
            ``load_tsv``.
        determiners: Iterable of labels to keep. Defaults to the four
            English demonstratives. Matching is exact and case-sensitive.

    Returns:
        A list of the matching records, unchanged and in their original
        order.
    """
    # Materialise once so a one-shot iterable (e.g. a generator) can still be
    # tested against every record.
    wanted = tuple(determiners)
    return [part for part in tsvish if part[2] in wanted]

# TSV data

In [22]:
# Parse the TSV that accompanies the recording into (float, float, str) tuples.
tsvcontent = load_tsv("/content/spkslt_98.tsv")

In [23]:
get_detdem(tsvcontent)

[(0.78, 1.04, 'this'), (4.84, 5.05, 'that'), (21.58, 21.97, 'this')]

In [25]:
# Keep only the entries labelled "this", "that", "these" or "those".
detdem = get_detdem(tsvcontent)

In [28]:
# First field of every selected entry (its start time), as a NumPy array.
starts = np.array([entry[0] for entry in detdem])

In [29]:
# Second field of every selected entry (its end time), as a NumPy array.
ends = np.array([entry[1] for entry in detdem])

In [64]:
detdem

[(0.78, 1.04, 'this'), (4.84, 5.05, 'that'), (21.58, 21.97, 'this')]

# Frames vs. times

In [39]:
# Print the duration of the recording as reported by ffprobe (the tool must be
# available on the PATH of the notebook host).
!ffprobe -i {WAVFILE} 2>&1|grep Duration

  Duration: 00:00:27.44, bitrate: 1058 kb/s


In [40]:
# Sanity check: map a few timestamps (0 s, 24.62 s and the 27.44 s duration
# reported by ffprobe) to frame indices using the default hop length. The last
# index should line up with the final index of f0, i.e. len(f0) - 1.
librosa.time_to_frames(np.array([0.0, 24.62, 27.44]), sr=sr)

array([   0, 1060, 1181])

In [41]:
len(f0)

1182

# Mean F0 and audio clips for the demonstratives

In [30]:
# Frame index at which each demonstrative starts (default hop length).
frstarts = librosa.time_to_frames(starts, sr=sr)

In [32]:
# Frame index at which each demonstrative ends (default hop length).
frends = librosa.time_to_frames(ends, sr=sr)

In [33]:
frstarts, frends

(array([ 33, 208, 929]), array([ 44, 217, 946]))

In [48]:
# Mean F0 over each demonstrative's frame range. The slice excludes the end
# frame itself, and np.nanmean ignores NaN entries inside the slice.
for first_frame, stop_frame in zip(frstarts, frends):
    print(np.nanmean(f0[first_frame:stop_frame]))

193.80076757544543
172.74165512508745
201.8456084757203


In [50]:
%%capture
# Install pydub into the kernel's environment; %%capture hides the pip output.
# NOTE(review): the version is not pinned, so a re-run may pull a different release.
%pip install pydub

In [51]:
from pydub import AudioSegment

In [58]:
# Load the same WAV file through pydub so it can be cut into clips below.
seg = AudioSegment.from_wav(WAVFILE)

In [56]:
# Convert (start, end) from seconds to integer milliseconds for slicing the
# pydub segment. Round before converting: int() alone truncates, so a product
# that lands just below a whole number because of floating-point error would
# lose a millisecond.
pdframes = [(int(round(start * 1000)), int(round(end * 1000))) for start, end in zip(starts, ends)]

In [59]:
# One audio clip per demonstrative, cut from the full segment by millisecond range.
asegs = [seg[start_ms:end_ms] for start_ms, end_ms in pdframes]

In [63]:
[a for a in pdframes]

[(780, 1040), (4840, 5050), (21580, 21970)]

In [60]:
asegs[0]

In [61]:
asegs[1]

In [62]:
asegs[2]