In [3]:
!pip -q install torch torchaudio librosa soundfile scikit-learn matplotlib

In [39]:
!git clone https://huggingface.co/datasets/mteb/free-spoken-digit-dataset fsdd
!ls -l fsdd/data

fatal: destination path 'fsdd' already exists and is not an empty directory.
total 19176
-rw-r--r-- 1 root root  1980617 Sep  6 01:06 test-00000-of-00001.parquet
-rw-r--r-- 1 root root 17651752 Sep  6 01:06 train-00000-of-00001.parquet


In [41]:
import pandas as pd
df_train = pd.read_parquet('fsdd/data/train-00000-of-00001.parquet')
df_test = pd.read_parquet('fsdd/data/test-00000-of-00001.parquet')

In [54]:
%%writefile digits_rnn_local_fsdd.py

import os, re, math, argparse, random, warnings
import numpy as np
import soundfile as sf
import librosa
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from tqdm import tqdm
from glob import glob
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
from glob import glob
import io
import soundfile as sf
import pandas as pd

CFG = dict(
    sr = 16000, sec = 1.0,
    n_mfcc=40, add_deltas=True,
    n_fft=512, hop_length=160, win_length=400, fmin=20.0, fmax=8000.0,
    batch_size=128, epochs=30, lr=2e-4,
    rnn_hidden=128, rnn_layers=2, rnn_bidirectional=True,
    dropout=0.2, seed=42, patience=6
)

DIGITS = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine']
random.seed(CFG['seed'])
np.random.seed(CFG['seed'])
torch.manual_seed(CFG['seed'])
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

def sec_to_frames(sec, hop, sr):
    target_length = int(sr * sec)
    return max(1, math.ceil((target_length - CFG["win_length"]) / hop) + 1)

def read_mono(audio):
    audio_io = io.BytesIO(audio)
    y, sr = sf.read(audio_io)
    if y.ndim == 2: y = y.mean(axis=1)
    return y, int(sr)

class FSDDDataset(Dataset):
    def __init__(self, df, labels, extractor, augment=False):
      self.df = df
      self.labels = labels
      self.extractor = extractor
      self.augment = augment
    def __len__(self): return len(self.df)
    def _aug(self, y):
      if np.random.rand()<0.5: y = y * (10**(np.random.uniform(-6,6)/20.0))
      if np.random.rand()<0.5:
        s = np.random.randint(-int(0.1*len(y)), int(0.1*len(y)))
        y = np.roll(y,s)
      if np.random.rand()<0.3:
        snr = np.random.choice([20,10,5]);
        sigp = (y**2).mean();
        y = y + np.random.randn(*y.shape)*np.sqrt(sigp/(10**(snr/10))+1e-9)
      return y
    def __getitem__(self, i):
      y, sr = read_mono(self.df[i]['bytes'])
      if self.augment: y = self._aug(y)
      mfcc = self.extractor.extract(y, sr)
      mfcc = self.extractor.transform(mfcc)
      return torch.from_numpy(mfcc.T), torch.tensor(self.labels[i], dtype=torch.long)

class MFCCExtractor:
    def __init__(self, target_frames):
      self.target_frames = target_frames
      self.mean_, self.std_ = None, None
    def _fix_len(self, y):
      T = int(CFG["sr"]*CFG["sec"])
      if len(y) < T:
        y = np.pad(y, (0, T-len(y)))
      else:
        y = y[:T]
      return y
    def extract(self, y, sr):
      if sr != CFG["sr"]:
          y = librosa.resample(y, orig_sr = sr, target_sr=CFG["sr"])
      y = self._fix_len(y)
      mfcc = librosa.feature.mfcc(y=y, sr=CFG["sr"], n_mfcc=CFG["n_mfcc"], n_fft=CFG["n_fft"], hop_length=CFG["hop_length"], win_length=CFG["win_length"],
                                  fmin=CFG["fmin"], fmax=CFG["fmax"])
      feats = [mfcc]
      if CFG["add_deltas"]:
        feats += [librosa.feature.delta(mfcc), librosa.feature.delta(mfcc, order = 2)]
      M = np.concatenate(feats, axis=0)
      t = M.shape[1]
      if t < self.target_frames:
        M = np.pad(M, ((0,0),(0,self.target_frames - t)))
      else:
        M = M[:, :self.target_frames]
      return M.astype(np.float32)
    def fit_norm(self, feats):
      X = np.concatenate([f.reshape(f.shape[0], -1) for f in feats], axis=1)
      self.mean_ = X.mean(axis=1, keepdims=True)
      self.std_ = X.std(axis=1, keepdims=True) + 1e-8
    def transform(self, M):
      if self.mean_ is None: return M
      return (M - self.mean_) / self.std_
    def save(self, path):
      np.savez(path, mean=self.mean_, std=self.std_)
    def load(self, path):
      data = np.load(path)
      self.mean_ = data['mean']
      self.std_ = data['std']

def collate_pad(batch):
  xs, ys = zip(*batch)
  lens = [x.shape[0] for x in xs]
  D = xs[0].shape[1]
  L = max(lens)
  out = torch.zeros(len(xs), L, D, dtype=torch.float32)
  for i,x in enumerate(xs): out[i, :x.shape[0],:] = x
  return out, torch.stack(ys), torch.tensor(lens, dtype=torch.long)

class BiLSTMClassifier(nn.Module):
  def __init__(self, input_dim, hidden=128, num_layers=2, n_classes=10, dropout=0.2):
    super().__init__()
    self.rnn = nn.LSTM(input_dim, hidden, num_layers=num_layers, batch_first=True, bidirectional=True,
                       dropout=dropout if num_layers>1 else 0.0)
    self.drop = nn.Dropout(dropout)
    self.fc = nn.Linear(hidden*2, n_classes)
  def forward(self, x, lengths):
    packed = nn.utils.rnn.pack_padded_sequence(x, lengths.cpu(), batch_first=True, enforce_sorted=False)
    out, _ = self.rnn(packed)
    out, _ = nn.utils.rnn.pad_packed_sequence(out, batch_first=True)
    mask = (torch.arange(out.size(1), device=out.device)[None,:]<lengths[:,None]).float().unsqueeze(-1)
    out = (out*mask).sum(1) / mask.sum(1).clamp_min(1.0)
    out = self.drop(out)
    return self.fc(out)

@torch.no_grad()
def evaluate(model, loader, device):
    model.eval()
    ys, ps = [], []
    for xb, yb, lengths, in loader:
      xb, yb, lengths = xb.to(device), yb.to(device), lengths.to(device)
      ps.extend(model(xb, lengths).argmax(dim=1).cpu().tolist())
      ys.extend(yb.cpu().tolist())
    return accuracy_score(ys, ps), np.array(ys), np.array(ps)

def train(args):
    os.makedirs(args.out, exist_ok=True)
    df_tr = pd.read_parquet('fsdd/data/train-00000-of-00001.parquet')
    df_te = pd.read_parquet('fsdd/data/test-00000-of-00001.parquet')

    trP = df_tr["audio"]
    trY = df_tr['label']
    teP = df_te["audio"]
    teY = df_te['label']
    print(f"Split -> train: {len(trP)} | test: {len(teP)}")

    target_frames = sec_to_frames(CFG["sec"], CFG["hop_length"], CFG["sr"])
    extractor = MFCCExtractor(target_frames)
    feat_list = []
    for p in trP[:2000]:
      y, sr = read_mono(p['bytes'])
      feat_list.append(extractor.extract(y, sr))
    extractor.fit_norm(feat_list)
    extractor.save(os.path.join(args.out, "extractor.pt"))

    full_tr = FSDDDataset(trP, trY, extractor, augment=True)
    full_te = FSDDDataset(teP, teY, extractor, augment=False)
    n = len(full_tr)
    n_val = max(1, int(0.1*n))
    n_tr = n - n_val
    gen = torch.Generator().manual_seed(CFG["seed"])
    train_ds, val_ds = random_split(full_tr, [n_tr, n_val], generator=gen)

    dl_tr = DataLoader(train_ds, batch_size=CFG["batch_size"], shuffle=True, collate_fn=collate_pad, num_workers=2)
    dl_val = DataLoader(val_ds, batch_size=CFG["batch_size"], shuffle=False, collate_fn=collate_pad, num_workers=2)
    dl_te = DataLoader(full_te, batch_size=CFG["batch_size"], shuffle=False, collate_fn=collate_pad, num_workers=2)

    input_dim = CFG["n_mfcc"]*(3 if CFG["add_deltas"] else 1)
    model = BiLSTMClassifier(input_dim, CFG["rnn_hidden"], CFG["rnn_layers"], 10, CFG["dropout"])
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    opt = torch.optim.Adam(model.parameters(), lr=CFG["lr"])
    ce = nn.CrossEntropyLoss()
    best_val_acc = 0.0
    patience = CFG["patience"]
    best_state = None
    for ep in range(1, CFG["epochs"]+1):
      model.train()
      losses=[]
      for xb, yb, lengths in dl_tr:
        xb, yb, lengths = xb.to(device), yb.to(device), lengths.to(device)
        opt.zero_grad()
        loss = ce(model(xb, lengths), yb)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        opt.step()
        losses.append(loss.item())
      va_acc,_,_ = evaluate(model, dl_val, device)
      print(f"[E{ep:02d}] loss ={np.mean(losses):.4f} val_acc={va_acc:.4f}")
      if va_acc > best_val_acc:
        best_val_acc = va_acc
        best_state = {k:v.detach().cpu() for k,v in model.state_dict().items()}
        patience = CFG["patience"]
        torch.save(best_state, os.path.join(args.out, "best_bilstm.pt"))
      else:
        patience -= 1
        if patience == 0:
          print(f"Early stop at epoch {ep}. Best val acc: {best_val_acc:.4f}")
          break
    if best_state is not None:
      model.load_state_dict(best_state)
    te_acc, ys, ps = evaluate(model, dl_te, device)
    print(f"Test acc: {te_acc:.4f}")
    print(classification_report(ys, ps, target_names=DIGITS, digits=4))
    cm = confusion_matrix(ys, ps)
    # plot_confusion(cm, DIGITS, os.path.join(args.out, "cm.png"))
    print("Artifacts saved ->", args.out)


# def inter_file(args):
#   device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#   target_frames = sec_to_frames(CFG["sec"], CFG["hop_length"], CFG["sr"])
#   extractor = MFCCExtractor(target_frames)
#   extractor.load(os.path.join(os.path.dirname(os.path.abspath(__file__)), "extractor.pt"))
#   inpur_dim = CFG["n_mfcc"]*(3 if CFG["add_deltas"] else 1)
#   state = torch.load(os.path.abspath)
#   model = BiLSTMClassifier(input_dim, CFG["rnn_hidden"], CFG["rnn_layers"], CFG["rnn_bidirectional"], CFG["dropout"])
#   model.load_state_dict(state["model_state_dict"])
#   model.to(device)
#   model.eval()

if __name__=="__main__":
  ap = argparse.ArgumentParser()
  ap.add_argument("--mode", choices=["train", "inter_file"], required=True)
  ap.add_argument("--data_root", default="fsdd")
  ap.add_argument("--out", default="results")
  ap.add_argument("--ckpt", default="results/best_bilstm.pt")
  ap.add_argument("--wav")
  args = ap.parse_args()
  if args.mode=="train":
    train(args)
  else:
    inter_file(args)


Overwriting digits_rnn_local_fsdd.py


In [55]:
!python digits_rnn_local_fsdd.py --mode train --out result

Split -> train: 2700 | test: 300
[E01] loss =2.2952 val_acc=0.2259
[E02] loss =2.2697 val_acc=0.3259
[E03] loss =2.2098 val_acc=0.3370
[E04] loss =2.0352 val_acc=0.4889
[E05] loss =1.7222 val_acc=0.6296
[E06] loss =1.3763 val_acc=0.7074
[E07] loss =1.0677 val_acc=0.7444
[E08] loss =0.7858 val_acc=0.7593
[E09] loss =0.5544 val_acc=0.8556
[E10] loss =0.4516 val_acc=0.8852
[E11] loss =0.3929 val_acc=0.9037
[E12] loss =0.3235 val_acc=0.9111
[E13] loss =0.3046 val_acc=0.9148
[E14] loss =0.2794 val_acc=0.9185
[E15] loss =0.2622 val_acc=0.9407
[E16] loss =0.2372 val_acc=0.9074
[E17] loss =0.2263 val_acc=0.9296
[E18] loss =0.2001 val_acc=0.9519
[E19] loss =0.1733 val_acc=0.9444
[E20] loss =0.1596 val_acc=0.9259
[E21] loss =0.1593 val_acc=0.9519
[E22] loss =0.1491 val_acc=0.9519
[E23] loss =0.1225 val_acc=0.9667
[E24] loss =0.1349 val_acc=0.9593
[E25] loss =0.1398 val_acc=0.9481
[E26] loss =0.1285 val_acc=0.9704
[E27] loss =0.1079 val_acc=0.9630
[E28] loss =0.1158 val_acc=0.9481
[E29] loss =0.1