### Librosa Code License:

---

ISC License  
Copyright (c) 2013--2023, librosa development team.

Permission to use, copy, modify, and/or distribute this software for any purpose with or without fee is hereby granted, provided that the above copyright notice and this permission notice appear in all copies.

THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

---

**Modifications**: The following code block(s) from the `librosa` library have been modified by Kahyun Choi.


In [None]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Beat and tempo
==============
.. autosummary::
   :toctree: generated/

   beat_track
   plp
"""

from typing import Any, Callable, Optional, Tuple

import numpy as np
import scipy
import scipy.signal
import scipy.stats

# from librosa_cache import cache
from librosa import core
from librosa import onset
from librosa import util
from librosa.feature import tempogram, fourier_tempogram
from librosa.feature import tempo as _tempo
from librosa.util.exceptions import ParameterError
from librosa.util.decorators import moved

# Public names exported by this module.
__all__ = ["beat_track", "tempo", "plp"]


# ``tempo`` is ``librosa.feature.tempo`` (imported above as ``_tempo``) wrapped
# by the ``moved`` decorator, which is given the old location
# "librosa.beat.tempo" and the version numbers below.
# NOTE(review): presumably ``moved`` emits a deprecation warning when the
# alias is called -- confirm against librosa.util.decorators.
tempo = moved(moved_from="librosa.beat.tempo", version="0.10.0", version_removed="1.0")(
    _tempo
)


def beat_track(
    *,
    y: Optional[np.ndarray] = None,
    sr: float = 22050,
    onset_envelope: Optional[np.ndarray] = None,
    hop_length: int = 512,
    start_bpm: float = 120.0,
    tightness: float = 100,
    trim: bool = True,
    bpm: Optional[float] = None,
    prior: Optional[scipy.stats.rv_continuous] = None,
    units: str = "frames",
) -> Tuple[Any, ...]:
    r"""Dynamic programming beat tracker.

    Beats are detected in three stages, following the method of [#]_:

      1. Measure onset strength
      2. Estimate tempo from onset correlation
      3. Pick peaks in onset strength approximately consistent with estimated
         tempo

    .. [#] Ellis, Daniel PW. "Beat tracking by dynamic programming."
           Journal of New Music Research 36.1 (2007): 51-60.
           http://labrosa.ee.columbia.edu/projects/beattrack/

    This is a modified copy of ``librosa.beat.beat_track``: when
    ``units='frames'`` (the default) the cumulative score array of the
    dynamic program is returned as a third element.

    Parameters
    ----------
    y : np.ndarray [shape=(n,)] or None
        audio time series
    sr : number > 0 [scalar]
        sampling rate of ``y``
    onset_envelope : np.ndarray [shape=(n,)] or None
        (optional) pre-computed onset strength envelope.
    hop_length : int > 0 [scalar]
        number of audio samples between successive ``onset_envelope`` values
    start_bpm : float > 0 [scalar]
        initial guess for the tempo estimator (in beats per minute)
    tightness : float [scalar]
        tightness of beat distribution around tempo
    trim : bool [scalar]
        trim leading/trailing beats with weak onsets
    bpm : float [scalar]
        (optional) If provided, use ``bpm`` as the tempo instead of
        estimating it from ``onsets``.
    prior : scipy.stats.rv_continuous [optional]
        An optional prior distribution over tempo.
        If provided, ``start_bpm`` will be ignored.
    units : {'frames', 'samples', 'time'}
        The units to encode detected beat events in.
        By default, 'frames' are used.

    Returns
    -------
    tempo : float [scalar, non-negative]
        estimated global tempo (in beats per minute)
    beats : np.ndarray [shape=(m,)]
        estimated beat event locations in the specified units
        (default is frame indices)
    cumscore : np.ndarray [shape=(n,)]
        cumulative score of the dynamic program, one value per onset
        envelope frame.  Only returned when ``units='frames'``; for
        ``'samples'`` and ``'time'`` a 2-tuple ``(tempo, beats)`` is returned.
    .. note::
        If no onset strength could be detected, beat_tracker estimates 0 BPM
        and returns an empty list of beats.  With ``units='frames'`` an
        all-zero ``cumscore`` with the shape of the onset envelope is
        returned as well, so the result always unpacks into three values.

    Raises
    ------
    ParameterError
        if neither ``y`` nor ``onset_envelope`` are provided,
        or if ``units`` is not one of 'frames', 'samples', or 'time'

    See Also
    --------
    librosa.onset.onset_strength

    Examples
    --------
    Track beats using time series input

    >>> y, sr = librosa.load(librosa.ex('choice'), duration=10)

    >>> tempo, beats, cumscore = beat_track(y=y, sr=sr)
    >>> tempo
    135.99917763157896

    Print the frames corresponding to beats

    >>> beats
    array([  3,  21,  40,  59,  78,  96, 116, 135, 154, 173, 192, 211,
           230, 249, 268, 287, 306, 325, 344, 363])

    Or print them as timestamps

    >>> librosa.frames_to_time(beats, sr=sr)
    array([0.07 , 0.488, 0.929, 1.37 , 1.811, 2.229, 2.694, 3.135,
           3.576, 4.017, 4.458, 4.899, 5.341, 5.782, 6.223, 6.664,
           7.105, 7.546, 7.988, 8.429])

    Track beats using a pre-computed onset envelope

    >>> onset_env = librosa.onset.onset_strength(y=y, sr=sr,
    ...                                          aggregate=np.median)
    >>> tempo, beats, cumscore = beat_track(onset_envelope=onset_env,
    ...                                     sr=sr)
    >>> tempo
    135.99917763157896
    >>> beats
    array([  3,  21,  40,  59,  78,  96, 116, 135, 154, 173, 192, 211,
           230, 249, 268, 287, 306, 325, 344, 363])

    Plot the beat events against the onset strength envelope

    >>> import matplotlib.pyplot as plt
    >>> hop_length = 512
    >>> fig, ax = plt.subplots(nrows=2, sharex=True)
    >>> times = librosa.times_like(onset_env, sr=sr, hop_length=hop_length)
    >>> M = librosa.feature.melspectrogram(y=y, sr=sr, hop_length=hop_length)
    >>> librosa.display.specshow(librosa.power_to_db(M, ref=np.max),
    ...                          y_axis='mel', x_axis='time', hop_length=hop_length,
    ...                          ax=ax[0])
    >>> ax[0].label_outer()
    >>> ax[0].set(title='Mel spectrogram')
    >>> ax[1].plot(times, librosa.util.normalize(onset_env),
    ...          label='Onset strength')
    >>> ax[1].vlines(times[beats], 0, 1, alpha=0.5, color='r',
    ...            linestyle='--', label='Beats')
    >>> ax[1].legend()
    """
    # First, get the frame->beat strength profile if we don't already have one
    if onset_envelope is None:
        if y is None:
            raise ParameterError("y or onset_envelope must be provided")

        onset_envelope = onset.onset_strength(
            y=y, sr=sr, hop_length=hop_length, aggregate=np.median
        )

    # Do we have any onsets to grab?
    if not onset_envelope.any():
        no_beats = np.array([], dtype=int)
        if units == "frames":
            # Keep the same arity as the regular 'frames' return below so
            # that three-value unpacking also works for silent input.
            return (0.0, no_beats, np.zeros_like(onset_envelope, dtype=float))
        return (0.0, no_beats)

    # Estimate BPM if one was not provided
    if bpm is None:
        bpm = _tempo(
            onset_envelope=onset_envelope,
            sr=sr,
            hop_length=hop_length,
            start_bpm=start_bpm,
            prior=prior,
        )[0]

    # Then, run the tracker
    beats, cumscore = __beat_tracker(
        onset_envelope, bpm, float(sr) / hop_length, tightness, trim
    )

    if units == "frames":
        # Only the frame-based result carries the cumulative score.
        return (bpm, beats, cumscore)
    elif units == "samples":
        return (bpm, core.frames_to_samples(beats, hop_length=hop_length))
    elif units == "time":
        return (bpm, core.frames_to_time(beats, hop_length=hop_length, sr=sr))
    else:
        raise ParameterError(f"Invalid unit type: {units}")



def plp(
    *,
    y: Optional[np.ndarray] = None,
    sr: float = 22050,
    onset_envelope: Optional[np.ndarray] = None,
    hop_length: int = 512,
    win_length: int = 384,
    tempo_min: Optional[float] = 30,
    tempo_max: Optional[float] = 300,
    prior: Optional[scipy.stats.rv_continuous] = None,
) -> np.ndarray:
    """Predominant local pulse (PLP) estimation. [#]_

    The PLP method analyzes the onset strength envelope in the frequency domain
    to find a locally stable tempo for each frame.  These local periodicities
    are used to synthesize local half-waves, which are combined such that peaks
    coincide with rhythmically salient frames (e.g. onset events on a musical time grid).
    The local maxima of the pulse curve can be taken as estimated beat positions.

    This method may be preferred over the dynamic programming method of `beat_track`
    when the tempo is expected to vary significantly over time.  Additionally,
    since `plp` does not require the entire signal to make predictions, it may be
    preferable when beat-tracking long recordings in a streaming setting.

    .. [#] Grosche, P., & Muller, M. (2011).
        "Extracting predominant local pulse information from music recordings."
        IEEE Transactions on Audio, Speech, and Language Processing, 19(6), 1688-1701.

    Parameters
    ----------
    y : np.ndarray [shape=(..., n)] or None
        audio time series. Multi-channel is supported.
        Only used when ``onset_envelope`` is not provided.

    sr : number > 0 [scalar]
        sampling rate of ``y``

    onset_envelope : np.ndarray [shape=(..., n)] or None
        (optional) pre-computed onset strength envelope

    hop_length : int > 0 [scalar]
        number of audio samples between successive ``onset_envelope`` values

    win_length : int > 0 [scalar]
        number of frames to use for tempogram analysis.
        By default, 384 frames (at ``sr=22050`` and ``hop_length=512``) corresponds
        to about 8.9 seconds.

    tempo_min, tempo_max : numbers > 0 [scalar], optional
        Minimum and maximum permissible tempo values.  ``tempo_max`` must be
        strictly greater than ``tempo_min``.

        Set either (or both) to `None` to disable this constraint.

    prior : scipy.stats.rv_continuous [optional]
        A prior distribution over tempo (in beats per minute).
        By default, a uniform prior over ``[tempo_min, tempo_max]`` is used.

    Returns
    -------
    pulse : np.ndarray, shape=[(..., n)]
        The estimated pulse curve.  Maxima correspond to rhythmically salient
        points of time.

        If input is multi-channel, one pulse curve per channel is computed.

    Raises
    ------
    ParameterError
        if both ``tempo_min`` and ``tempo_max`` are given and
        ``tempo_max <= tempo_min``

    See Also
    --------
    beat_track
    librosa.onset.onset_strength
    librosa.feature.fourier_tempogram

    Examples
    --------
    Visualize the PLP compared to an onset strength envelope.
    Both are normalized here to make comparison easier.

    >>> y, sr = librosa.load(librosa.ex('brahms'))
    >>> onset_env = librosa.onset.onset_strength(y=y, sr=sr)
    >>> pulse = librosa.beat.plp(onset_envelope=onset_env, sr=sr)
    >>> # Or compute pulse with an alternate prior, like log-normal
    >>> import scipy.stats
    >>> prior = scipy.stats.lognorm(loc=np.log(120), scale=120, s=1)
    >>> pulse_lognorm = librosa.beat.plp(onset_envelope=onset_env, sr=sr,
    ...                                  prior=prior)
    >>> melspec = librosa.feature.melspectrogram(y=y, sr=sr)

    >>> import matplotlib.pyplot as plt
    >>> fig, ax = plt.subplots(nrows=3, sharex=True)
    >>> librosa.display.specshow(librosa.power_to_db(melspec,
    ...                                              ref=np.max),
    ...                          x_axis='time', y_axis='mel', ax=ax[0])
    >>> ax[0].set(title='Mel spectrogram')
    >>> ax[0].label_outer()
    >>> ax[1].plot(librosa.times_like(onset_env),
    ...          librosa.util.normalize(onset_env),
    ...          label='Onset strength')
    >>> ax[1].plot(librosa.times_like(pulse),
    ...          librosa.util.normalize(pulse),
    ...          label='Predominant local pulse (PLP)')
    >>> ax[1].set(title='Uniform tempo prior [30, 300]')
    >>> ax[1].label_outer()
    >>> ax[2].plot(librosa.times_like(onset_env),
    ...          librosa.util.normalize(onset_env),
    ...          label='Onset strength')
    >>> ax[2].plot(librosa.times_like(pulse_lognorm),
    ...          librosa.util.normalize(pulse_lognorm),
    ...          label='Predominant local pulse (PLP)')
    >>> ax[2].set(title='Log-normal tempo prior, mean=120', xlim=[5, 20])
    >>> ax[2].legend()

    PLP local maxima can be used as estimates of beat positions.

    >>> tempo, beats = librosa.beat.beat_track(onset_envelope=onset_env)
    >>> beats_plp = np.flatnonzero(librosa.util.localmax(pulse))
    >>> import matplotlib.pyplot as plt
    >>> fig, ax = plt.subplots(nrows=2, sharex=True, sharey=True)
    >>> times = librosa.times_like(onset_env, sr=sr)
    >>> ax[0].plot(times, librosa.util.normalize(onset_env),
    ...          label='Onset strength')
    >>> ax[0].vlines(times[beats], 0, 1, alpha=0.5, color='r',
    ...            linestyle='--', label='Beats')
    >>> ax[0].legend()
    >>> ax[0].set(title='librosa.beat.beat_track')
    >>> ax[0].label_outer()
    >>> # Limit the plot to a 15-second window
    >>> times = librosa.times_like(pulse, sr=sr)
    >>> ax[1].plot(times, librosa.util.normalize(pulse),
    ...          label='PLP')
    >>> ax[1].vlines(times[beats_plp], 0, 1, alpha=0.5, color='r',
    ...            linestyle='--', label='PLP Beats')
    >>> ax[1].legend()
    >>> ax[1].set(title='librosa.beat.plp', xlim=[5, 20])
    >>> ax[1].xaxis.set_major_formatter(librosa.display.TimeFormatter())
    """
    # Step 1: get the onset envelope
    # NOTE: ``y`` is forwarded as-is; there is no ``y is None`` check here.
    if onset_envelope is None:
        onset_envelope = onset.onset_strength(
            y=y, sr=sr, hop_length=hop_length, aggregate=np.median
        )

    # The tempo range must be non-degenerate: equal bounds are rejected too.
    if tempo_min is not None and tempo_max is not None and tempo_max <= tempo_min:
        raise ParameterError(
            f"tempo_max={tempo_max} must be larger than tempo_min={tempo_min}"
        )

    # Step 2: get the fourier tempogram
    ftgram = fourier_tempogram(
        onset_envelope=onset_envelope,
        sr=sr,
        hop_length=hop_length,
        win_length=win_length,
    )

    # Step 3: pin to the feasible tempo range
    tempo_frequencies = core.fourier_tempo_frequencies(
        sr=sr, hop_length=hop_length, win_length=win_length
    )

    # Zero out (in place) every tempo bin outside [tempo_min, tempo_max];
    # the tempo axis is the second-to-last axis of ``ftgram``.
    if tempo_min is not None:
        ftgram[..., tempo_frequencies < tempo_min, :] = 0
    if tempo_max is not None:
        ftgram[..., tempo_frequencies > tempo_max, :] = 0

    # reshape lengths to match dimension properly
    tempo_frequencies = util.expand_to(tempo_frequencies, ndim=ftgram.ndim, axes=-2)

    # Step 4: Discard everything below the peak
    # Peaks are picked on the log-compressed magnitude, optionally shifted by
    # the log-density of the tempo prior.
    ftmag = np.log1p(1e6 * np.abs(ftgram))
    if prior is not None:
        ftmag += prior.logpdf(tempo_frequencies)

    # Keep only the bins that attain the maximum along the tempo axis.
    peak_values = ftmag.max(axis=-2, keepdims=True)
    ftgram[ftmag < peak_values] = 0

    # Normalize to keep only phase information
    # NOTE(review): the ``util.tiny(...) ** 0.5`` term presumably guards
    # against division by zero -- confirm against librosa.util.tiny.
    ftgram /= util.tiny(ftgram) ** 0.5 + np.abs(ftgram.max(axis=-2, keepdims=True))

    # Step 5: invert the Fourier tempogram to get the pulse
    pulse = core.istft(
        ftgram, hop_length=1, n_fft=win_length, length=onset_envelope.shape[-1]
    )

    # Step 6: retain only the positive part of the pulse cycle
    # (the fourth positional argument of np.clip is ``out``, so this clips
    # ``pulse`` in place)
    pulse = np.clip(pulse, 0, None, pulse)

    # Return the normalized pulse
    return util.normalize(pulse, axis=-1)



def __beat_tracker(
    onset_envelope: np.ndarray, bpm: float, fft_res: float, tightness: float, trim: bool
) -> Tuple[np.ndarray, np.ndarray]:
    """Tracks beats in an onset strength envelope.

    Parameters
    ----------
    onset_envelope : np.ndarray [shape=(n,)]
        onset strength envelope
    bpm : float [scalar]
        tempo estimate
    fft_res : float [scalar]
        resolution of the fft (sr / hop_length)
    tightness : float [scalar]
        how closely do we adhere to bpm?
    trim : bool [scalar]
        trim leading/trailing beats with weak onsets?

    Returns
    -------
    beats : np.ndarray [shape=(m,)]
        frame numbers of beat events
    cumscore : np.ndarray [shape=(n,)]
        cumulative score of the dynamic program, one value per frame of
        the local score

    Raises
    ------
    ParameterError
        if ``bpm`` is not strictly positive
    """
    if bpm <= 0:
        raise ParameterError("bpm must be strictly positive")

    # convert bpm to a sample period for searching
    period = round(60.0 * fft_res / bpm)

    # localscore is a smoothed version of AGC'd onset envelope
    localscore = __beat_local_score(onset_envelope, period)

    # run the DP
    backlink, cumscore = __beat_track_dp(localscore, period, tightness)

    # get the position of the last beat
    beats = [__last_beat(cumscore)]

    # Reconstruct the beat path from backlinks; a negative backlink marks
    # the start of the path.
    while backlink[beats[-1]] >= 0:
        beats.append(backlink[beats[-1]])

    # Put the beats in ascending order
    # Convert into an array of frame numbers
    beats = np.array(beats[::-1], dtype=int)

    # Discard spurious trailing beats
    beats = __trim_beats(localscore, beats, trim)

    # The cumulative score is returned alongside the beats so that callers
    # can inspect how well the beat path scored.
    return beats, cumscore


# -- Helper functions for beat tracking
def __normalize_onsets(onsets):
    """Scale an onset strength envelope by its sample standard deviation.

    The envelope is divided by ``onsets.std(ddof=1)``.  If that value is not
    strictly positive, the input is returned unchanged.
    """
    scale = onsets.std(ddof=1)
    return onsets / scale if scale > 0 else onsets


def __beat_local_score(onset_envelope, period):
    """Construct the local score for an onset envelope and given period.

    The normalized onset envelope is convolved ("same" mode) with a
    Gaussian-shaped window spanning ``[-period, period]`` frames.
    """
    offsets = np.arange(-period, period + 1)
    smoothing_window = np.exp(-0.5 * (offsets * 32.0 / period) ** 2)
    scaled_onsets = __normalize_onsets(onset_envelope)
    return scipy.signal.convolve(scaled_onsets, smoothing_window, "same")


def __beat_track_dp(localscore, period, tightness):
    """Core dynamic program for beat tracking

    Parameters
    ----------
    localscore : np.ndarray
        local score, one value per frame
    period : number
        beat period in frames
    tightness : number > 0
        weight of the penalty for deviating from ``period``

    Returns
    -------
    backlink : np.ndarray of int
        for every frame, the frame index of the best preceding beat,
        or -1 for frames before the first sufficiently strong local score
    cumscore : np.ndarray
        cumulative score of the best beat path ending at each frame

    Raises
    ------
    ParameterError
        if ``tightness`` is not strictly positive
    """
    backlink = np.zeros_like(localscore, dtype=int)
    cumscore = np.zeros_like(localscore)

    # Search range for previous beat
    # (frame indices from 2 periods back up to half a period back, relative
    # to frame 0; shifted by one after every frame below)
    window = np.arange(-2 * period, -np.round(period / 2) + 1, dtype=int)

    # Make a score window, which begins biased toward start_bpm and skewed
    if tightness <= 0:
        raise ParameterError("tightness must be strictly positive")

    # Transition penalty: zero when the gap equals ``period`` and growing
    # with the squared log-ratio of gap to period.
    txwt = -tightness * (np.log(-window / period) ** 2)

    # Are we on the first beat?
    first_beat = True
    for i, score_i in enumerate(localscore):
        # Are we reaching back before time 0?
        # z_pad is the number of leading window entries that are still
        # negative, i.e. that point before frame 0.
        z_pad = np.maximum(0, min(-window[0], len(window)))

        # Search over all possible predecessors
        # (entries before time 0 keep only their transition penalty)
        candidates = txwt.copy()
        candidates[z_pad:] = candidates[z_pad:] + cumscore[window[z_pad:]]

        # Find the best preceding beat
        beat_location = np.argmax(candidates)

        # Add the local score
        cumscore[i] = score_i + candidates[beat_location]

        # Special case the first onset.  Stop if the localscore is small
        if first_beat and score_i < 0.01 * localscore.max():
            backlink[i] = -1
        else:
            backlink[i] = window[beat_location]
            first_beat = False

        # Update the time range
        window = window + 1

    return backlink, cumscore


def __last_beat(cumscore):
    """Locate the final beat in the cumulative score array.

    Returns the largest index that is a local maximum of ``cumscore`` and
    whose doubled score exceeds the median score over all local maxima.
    """
    is_peak = util.localmax(cumscore)
    peak_scores = cumscore[np.argwhere(is_peak)]
    median_peak = np.median(peak_scores)

    # The last qualifying peak is the last beat (since score generally increases)
    qualifies = cumscore * is_peak * 2 > median_peak
    return np.argwhere(qualifies).max()


def __trim_beats(localscore: np.ndarray, beats: np.ndarray, trim: bool) -> np.ndarray:
    """Remove spurious leading and trailing beats.

    Parameters
    ----------
    localscore : np.ndarray
        local score, one value per frame
    beats : np.ndarray of int
        frame indices of the detected beats
    trim : bool
        if True, the threshold is half the root-mean-square of the smoothed
        beat scores; otherwise the threshold is 0

    Returns
    -------
    beats : np.ndarray
        the slice of ``beats`` between the first and the last beat whose
        smoothed score exceeds the threshold; empty if no beat exceeds it
    """
    # ``scipy.signal.hann`` is deprecated and missing from newer SciPy
    # releases; the window functions live in ``scipy.signal.windows``.
    smooth_boe = scipy.signal.convolve(
        localscore[beats], scipy.signal.windows.hann(5), "same"
    )

    if trim:
        threshold = 0.5 * ((smooth_boe**2).mean() ** 0.5)
    else:
        threshold = 0.0

    valid = np.argwhere(smooth_boe > threshold)

    # No beat is above the threshold: return an empty selection instead of
    # failing on ``min()`` / ``max()`` of an empty array.
    if valid.size == 0:
        return beats[:0]

    # NOTE(review): the upper bound is exclusive, so the last beat above the
    # threshold is dropped as well.  Kept as in the original implementation.
    return beats[valid.min() : valid.max()]

In [None]:
import numpy as np

def calculate_beat_stats(y, sr, tightness_score, onset_type, start_bpm):
    """Run the beat tracker on ``y`` using the selected envelope feature.

    Parameters
    ----------
    y : audio time series
    sr : sampling rate of ``y``
    tightness_score : passed to ``beat_track`` as ``tightness``
    onset_type : one of "rms", "zcr", "speccent", "onsetstrength"
    start_bpm : passed to ``beat_track`` as ``start_bpm``

    Returns
    -------
    (tempo, beats, cumscore) as returned by ``beat_track``

    Raises
    ------
    ValueError
        if ``onset_type`` is not one of the known techniques
    """
    # One lazily evaluated builder per technique; add new techniques here.
    envelope_builders = (
        ("rms", lambda: librosa.feature.rms(y=y, frame_length=2048, hop_length=512)[0]),
        ("zcr", lambda: librosa.feature.zero_crossing_rate(y, frame_length=2048, hop_length=512)[0]),
        ("speccent", lambda: librosa.feature.spectral_centroid(y=y, sr=sr)[0]),
        ("onsetstrength", lambda: librosa.onset.onset_strength(y=y, sr=sr, aggregate=np.median)),
    )

    for technique, build_envelope in envelope_builders:
        if onset_type == technique:
            onset_env = build_envelope()
            break
    else:
        raise ValueError(f"Unknown technique: {onset_type}")

    tempo, beats, cumscore = beat_track(
        onset_envelope=onset_env,
        sr=sr,
        tightness=tightness_score,
        start_bpm=start_bpm,
    )
    return (tempo, beats, cumscore)

def _count_long_silences(window, min_run):
    """Count runs of zeros in ``window`` that are strictly longer than ``min_run``."""
    silent = np.asarray(window) == 0
    # Pad with False on both sides so every silent run has a start and an end,
    # including runs touching either edge of the window.
    padded = np.concatenate(([False], silent, [False]))
    flips = np.flatnonzero(padded[1:] != padded[:-1])
    run_lengths = flips[1::2] - flips[0::2]
    return int(np.count_nonzero(run_lengths > min_run))


def select_best_30sec_chunk(index_array_y, sr):
    """Pick the 30-second window with the most voiced samples.

    Windows are evaluated at one-second steps.  The window with the largest
    voiced time wins; ties are broken in favour of the window with fewer
    long silences (runs of zeros longer than two seconds), and remaining
    ties keep the earliest window.

    Parameters
    ----------
    index_array_y : sequence of numbers
        per-sample mask, non-zero where the sample is voiced
        (assumed to hold 0/1 values, as built by the calling cells)
    sr : int
        sampling rate of the mask

    Returns
    -------
    (start_idx, end_idx) : tuple of int
        sample range of the selected window.  If the audio is shorter than
        30 seconds a warning is printed and ``(0, len(index_array_y))`` is
        returned.
    """
    total_samples = len(index_array_y)
    audio_duration = total_samples / sr
    if audio_duration < 30:
        print(f"Warning: Audio is shorter than 30 seconds ({audio_duration} seconds).")
        return 0, total_samples

    window_size = 30 * sr  # 30 seconds window
    long_silence = 2 * sr  # silences longer than 2 seconds count as "long"

    # Prefix sums make the voiced time of any window an O(1) lookup.
    voiced_prefix = np.concatenate(([0], np.cumsum(index_array_y)))

    best_start_idx = 0
    best_voiced_time = 0
    least_long_silence = float('inf')

    # ``+ 1`` so that the last full window (ending exactly at the end of the
    # audio) is considered as well.  Step size is 1 second.
    for start_idx in range(0, total_samples - window_size + 1, sr):
        end_idx = start_idx + window_size
        voiced_time = (voiced_prefix[end_idx] - voiced_prefix[start_idx]) / sr

        # A window with less voiced time can never win; skip the silence scan.
        if voiced_time < best_voiced_time:
            continue

        # Runs of silence are counted wherever they occur in the window,
        # including a run that lasts until the end of the window.
        long_silences = _count_long_silences(
            index_array_y[start_idx:end_idx], long_silence
        )

        if voiced_time > best_voiced_time or long_silences < least_long_silence:
            best_start_idx = start_idx
            best_voiced_time = voiced_time
            least_long_silence = long_silences

    best_end_idx = best_start_idx + window_size
    return best_start_idx, best_end_idx

# Poetry Reading

In [None]:
import librosa
from librosa.core import audio
import time
import numpy as np
import pickle
import os
import pandas as pd

# Assign 'row' with your own dataframe that contains the columns 'Matching_MP3_File', 'transcriptions_list'
# row =

# Assign the folder path where 'Matching_MP3_File' is located
# audio_path =

# Build the path with os.path.join so that audio_path works with or without a
# trailing separator (the narration and singing-voice cells do the same).
mp3_file_path = os.path.join(audio_path, row['Matching_MP3_File'])
file_base_name, _ = os.path.splitext(os.path.basename(mp3_file_path))
audio_type = 'poetry'


# Load the recording resampled to 16 kHz
y, sr = librosa.load(mp3_file_path, sr=16000)
# Per-sample voiced mask, initialized with zeros
index_array_y = np.zeros(len(y))

transcriptions = row['transcriptions_list']

# Mark every sample covered by an aligned word as voiced
for segment in transcriptions["segments_after_alignment"]:
    for word_info in segment["words"]:
        # Words without both timestamps are skipped
        if "start" in word_info and "end" in word_info:
            start_time = word_info["start"]
            end_time = word_info["end"]

            # Find the sample range corresponding to the word's time interval
            start_sample = int(start_time * sr)
            end_sample = int(end_time * sr)

            # Set to 1 in the index array
            index_array_y[start_sample:end_sample] = 1

best_start_idx, best_end_idx = select_best_30sec_chunk(index_array_y, sr)
print("Best 30-second chunk starts at index:", best_start_idx, "and ends at index:", best_end_idx)

# Check for various parameter combinations
for tightness_score in [1, 1000]:
    for onset_type in ['onsetstrength']:
        for start_bpm in [120]:
            tempo, beats, cumscore = calculate_beat_stats(y[best_start_idx:best_end_idx], sr, tightness_score, onset_type, start_bpm)
            # Calculate and print statistics
            print(f"tightness_score: {tightness_score}, Tempo: {tempo}, Cumulative Score / Length: {np.max(cumscore) / len(beats)}")

Best 30-second chunk starts at index: 80000 and ends at index: 560000
tightness_score: 1, Tempo: 110.29411764705883, Cumulative Score / Length: 2.3650420909328327
tightness_score: 1000, Tempo: 110.29411764705883, Cumulative Score / Length: 0.941578580463269


  smooth_boe = scipy.signal.convolve(localscore[beats], scipy.signal.hann(5), "same")


# Narration

In [None]:
import librosa
from librosa.core import audio
import time
import numpy as np
import pickle
import os
import pandas as pd

# Assign 'row' with your own dataframe that contains the columns 'ID', 'READER','BOOK ID', and 'transcriptions_list'
# row =

# Assign the folder path where a mp3 file is located
# audio_path =


# Pull the ID, reader and book ID out of the row
# (ID and READER are coerced to str in case they are stored as numbers)
_id = str(row['ID'])
reader = str(row['READER'])
bookid = row['BOOK ID']

file_name = f"{_id}_{reader}_{bookid}_combined.mp3"
mp3_file_path = os.path.join(audio_path, file_name)

file_base_name, _ = os.path.splitext(os.path.basename(mp3_file_path))
audio_type = 'narration'


# Load the recording resampled to 16 kHz and start from an all-zero voiced mask
y, sr = librosa.load(mp3_file_path, sr=16000)
index_array_y = np.zeros(y.shape[0])

transcriptions = row['transcriptions_list']

# Flag the samples of every aligned word in the mask
for segment in transcriptions["segments_after_alignment"]:
    for word_info in segment["words"]:
        # Skip words that lack one of the two timestamps
        if "start" not in word_info or "end" not in word_info:
            continue

        start_time = word_info["start"]
        end_time = word_info["end"]

        # Convert the word's time interval into a sample range
        start_sample = int(start_time * sr)
        end_sample = int(end_time * sr)

        index_array_y[start_sample:end_sample] = 1

best_start_idx, best_end_idx = select_best_30sec_chunk(index_array_y, sr)
print("Best 30-second chunk starts at index:", best_start_idx, "and ends at index:", best_end_idx)

# Run the beat statistics for every parameter combination
for tightness_score in [1, 1000]:
    for onset_type in ['onsetstrength']:
        for start_bpm in [120]:
            tempo, beats, cumscore = calculate_beat_stats(
                y[best_start_idx:best_end_idx],
                sr,
                tightness_score,
                onset_type,
                start_bpm,
            )
            # Report tempo and the cumulative score normalized by beat count
            print(f"tightness_score: {tightness_score}, Tempo: {tempo}, Cumulative Score / Length: {np.max(cumscore) / len(beats)}")

Best 30-second chunk starts at index: 1200000 and ends at index: 1680000
tightness_score: 1, Tempo: 98.6842105263158, Cumulative Score / Length: 2.9124290948515013
tightness_score: 1000, Tempo: 98.6842105263158, Cumulative Score / Length: 1.125259321502782


  smooth_boe = scipy.signal.convolve(localscore[beats], scipy.signal.hann(5), "same")


# Singing Voice

In [None]:
import librosa
from librosa.core import audio
import time
import numpy as np
import pickle
import os
import pandas as pd

# Assign 'row' with your own dataframe that contains the columns 'performance_id', 'transcriptions_list'
# row =

# Assign the folder path where the mp3 files are located
# audio_path =

performance_id = row['performance_id']
# The audio file is expected to be named <performance_id>.mp3
mp3_file_path = os.path.join(audio_path, f"{performance_id}.mp3")
file_base_name, _ = os.path.splitext(os.path.basename(mp3_file_path))
audio_type = 'vocal'

# Load at 16 kHz; the voiced mask has one entry per audio sample
y, sr = librosa.load(mp3_file_path, sr=16000)
index_array_y = np.zeros(y.shape[0])

transcriptions = row['transcriptions_list']

# Walk all aligned words and mark their sample ranges as voiced
for segment in transcriptions["segments_after_alignment"]:
    for word_info in segment["words"]:
        has_both_times = "start" in word_info and "end" in word_info
        if not has_both_times:
            continue

        start_time = word_info["start"]
        end_time = word_info["end"]

        # Sample range covered by this word
        start_sample = int(start_time * sr)
        end_sample = int(end_time * sr)

        index_array_y[start_sample:end_sample] = 1

best_start_idx, best_end_idx = select_best_30sec_chunk(index_array_y, sr)
print("Best 30-second chunk starts at index:", best_start_idx, "and ends at index:", best_end_idx)

# Evaluate each parameter combination on the selected chunk
for tightness_score in [1, 1000]:
    for onset_type in ['onsetstrength']:
        for start_bpm in [120]:
            tempo, beats, cumscore = calculate_beat_stats(
                y[best_start_idx:best_end_idx], sr, tightness_score, onset_type, start_bpm
            )
            # Print the tempo and the per-beat cumulative score
            print(f"tightness_score: {tightness_score}, Tempo: {tempo}, Cumulative Score / Length: {np.max(cumscore) / len(beats)}")

Best 30-second chunk starts at index: 768000 and ends at index: 1248000
tightness_score: 1, Tempo: 170.45454545454547, Cumulative Score / Length: 2.020721899959782
tightness_score: 1000, Tempo: 170.45454545454547, Cumulative Score / Length: 0.7784624634142701


  smooth_boe = scipy.signal.convolve(localscore[beats], scipy.signal.hann(5), "same")
