# Tango Phrase & Section Toolkit
Fast, modular playground for Argentine tango.

In [1]:
# Install dependencies (Linux Docker friendly)
!pip -q install librosa soundfile scikit-learn ipywidgets

import numpy as np, pandas as pd, librosa, soundfile as sf
from sklearn.cluster import KMeans
from IPython.display import Audio, display
import ipywidgets as widgets

# Patch deprecated NumPy aliases used by librosa
for alias, actual in [('complex', np.complex128), ('float', float)]:
    if not hasattr(np, alias):
        setattr(np, alias, actual)


[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
from abc import ABC, abstractmethod

class PhraseFinderInterface(ABC):
    @abstractmethod
    def getPhraseOnSet(self): ...
    @abstractmethod
    def getSectionOnSet(self): ...


In [16]:
class VisionClusterPhraseFinder(PhraseFinderInterface):
    def __init__(self, audio_path, sr=22_050, n_mels=128,
                 hop=1024, tol=0.25, max_tries=10):
        # load & features
        self.y, self.sr = librosa.load(audio_path, sr=sr)
        self.total = len(self.y) / sr
        S = librosa.feature.melspectrogram(y=self.y, sr=sr,
                                           n_fft=2048, hop_length=hop,
                                           n_mels=n_mels, power=2.0)
        F = librosa.power_to_db(S, ref=np.max)[:20].T          # (frames × 20)
        self.times = librosa.frames_to_time(
            np.arange(F.shape[0]), sr=sr, hop_length=hop)
        self.hop = hop

        # ---- sections (5) ----
        cuts = self._tier_frames(F, k=5, exp=self.total/5,
                                 tol=tol, max_tries=max_tries, offset=0)
        sec_start_frames = np.insert(cuts, 0, 0)               # 0 + 4 cuts = 5
        sec_end_frames   = np.append(cuts, len(self.times)-1)
        self.sec_on = self.times[sec_start_frames]             # len == 5

        # ---- phrases (20) ----
        ph_on = [0.0]
        for s, e in zip(sec_start_frames, sec_end_frames):
            seg_dur = self.times[e] - self.times[s]
            ph_cuts = self._tier_frames(
                F[s:e], k=4, exp=seg_dur/4,
                tol=tol, max_tries=max_tries, offset=s)
            ph_on.extend(self.times[ph_cuts])

        # ensure exactly 20
        self.ph_on = np.array(sorted(np.unique(ph_on)))
        if len(self.ph_on) > 20:
            self.ph_on = self.ph_on[:20]
        elif len(self.ph_on) < 20:
            need = 20 - len(self.ph_on)
            extra = np.linspace(self.ph_on[-1], self.total - 0.001, need+2)[1:-1]
            self.ph_on = np.append(self.ph_on, extra)

    # ---------------- helpers ----------------
    def getSectionOnSet(self): return self.sec_on
    def getPhraseOnSet(self):  return self.ph_on

    def _tier_frames(self, feats, k, exp, tol, max_tries, offset):
        """Try K-Means up to max_tries; if durations not ~equal, fall back to equal grid."""
        if feats.shape[0] <= k:                    # very short segment
            step = feats.shape[0] // k
            return (np.arange(step, feats.shape[0], step)[:k-1] + offset)

        for seed in range(max_tries):
            labels = KMeans(k, random_state=seed, n_init=5).fit(feats).labels_
            cuts = np.where(np.diff(labels, prepend=labels[0]))[0]
            if len(cuts) != k-1:            # empty cluster
                continue
            frames = cuts + offset
            durations = np.diff(np.append([offset], frames)) * self.hop / self.sr
            if np.all((durations > exp*(1-tol)) & (durations < exp*(1+tol))):
                return frames

        # fallback: even split
        step = feats.shape[0] // k
        return (np.arange(step, feats.shape[0], step)[:k-1] + offset)

In [4]:
class AudioSplitter:
    def __init__(self, y, sr): self.y, self.sr = y, sr
    def split(self, onsets):
        onsets = np.append(onsets, len(self.y)/self.sr)
        return [self.y[int(s*self.sr):int(e*self.sr)] for s, e in zip(onsets[:-1], onsets[1:])]


In [5]:
class SegmentPlayer:
    """Simple dropdown widget to play segments."""
    def __init__(self, segments, sr):
        self.segments, self.sr = segments, sr
        self._ui()
    def _ui(self):
        dd = widgets.Dropdown(options=[(f'Seg {i+1}', i) for i in range(len(self.segments))],
                              description='Play:')
        out = widgets.Output()
        def on_change(ch):
            with out:
                out.clear_output()
                display(Audio(self.segments[ch["new"]], rate=self.sr))
        dd.observe(on_change, names='value')
        display(dd, out)


In [6]:
class StatsReporter:
    def __init__(self, onsets, total_dur):
        self.onsets = np.array(onsets)
        self.total_dur = total_dur
        self.durs = np.diff(np.append(self.onsets, total_dur))
    def summary(self):
        print("Count:", len(self.onsets))
        print("Durations (s):", np.round(self.durs, 2))
        print(f"Mean: {self.durs.mean():.2f}s  Std: {self.durs.std():.2f}s")


In [17]:
# 🔥 Quick demo (adjust path)
audio_path = 'data/Carlos_di_Sarli_Instrumental_Cara.mp3'
finder = VisionClusterPhraseFinder(audio_path)
print("Sections:", np.round(finder.getSectionOnSet(), 2))
print("Phrases:", np.round(finder.getPhraseOnSet()[:12], 2))


Sections: [  0.    33.76  67.52 101.29 135.05]
Phrases: [  0.     8.41  16.81  25.22  42.17  50.57  58.98  75.93  84.33  92.74
 109.69 118.1 ]


In [18]:
splitter = AudioSplitter(finder.y, finder.sr)


In [19]:
phrases = splitter.split(finder.getPhraseOnSet())

In [20]:
sections = splitter.split(finder.getSectionOnSet())

In [21]:

SegmentPlayer(phrases, finder.sr)


Dropdown(description='Play:', options=(('Seg 1', 0), ('Seg 2', 1), ('Seg 3', 2), ('Seg 4', 3), ('Seg 5', 4), (…

Output()

<__main__.SegmentPlayer at 0x722706f60090>

In [22]:
SegmentPlayer(sections, finder.sr)


Dropdown(description='Play:', options=(('Seg 1', 0), ('Seg 2', 1), ('Seg 3', 2), ('Seg 4', 3), ('Seg 5', 4)), …

Output()

<__main__.SegmentPlayer at 0x7227028ae9d0>

In [23]:
StatsReporter(finder.getPhraseOnSet(), len(finder.y)/finder.sr).summary()

Count: 20
Durations (s): [ 8.41  8.41  8.41 16.95  8.41  8.41 16.95  8.41  8.41 16.95  8.41  8.41
 17.    8.45  8.45  1.71  1.71  1.71  1.71  1.71]
Mean: 8.45s  Std: 5.09s


In [None]:
StatsReporter(finder.getSectionOnSet(), len(finder.y)/finder.sr).summary()