In [1]:
import os, sys, glob, random, numpy as np
from tqdm import tqdm
import torch, torchaudio
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import fairseq

In [2]:
import content_encoder_utils as utils

In [3]:
SEED = 1337
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)

<torch._C.Generator at 0x109100b10>

In [4]:
#where data is stored
base_path = "/Users/arjun/Documents/NUS/CS5647/accentdb/accentdb_extended/data"

In [5]:
train_set, validation_set, test_set = utils.scan_and_split_by_accent(base_path)
print(len(train_set), len(validation_set), len(test_set))

5396 1356 5214


In [6]:
accents = sorted({lab for _, lab in train_set})
label2id = {lab:i for i,lab in enumerate(accents)}  

In [7]:
print(accents)
print(label2id)

['american', 'australian', 'bangla', 'british', 'indian', 'malayalam', 'odiya', 'telugu', 'welsh']
{'american': 0, 'australian': 1, 'bangla': 2, 'british': 3, 'indian': 4, 'malayalam': 5, 'odiya': 6, 'telugu': 7, 'welsh': 8}


In [8]:
#baseline hubert for initial experimentation (not required to run)
bundle = torchaudio.pipelines.HUBERT_BASE
hubert = bundle.get_model().eval()   # outputs [B, T', 768]

@torch.no_grad()
def ssl_embed(wav16):  # wav16: [B,1,T] or [B,T] or [1,T] @16k mono
    x = wav16.squeeze(1) if wav16.dim()==3 and wav16.size(1)==1 else wav16
    if x.dim()==1: x = x.unsqueeze(0)
    feats, _ = hubert(x, None)      # [B, T', 768]
    return feats

In [9]:
def train_epoch(model, opt, X, y, bs=256):
    model.train()
    idx = torch.randperm(X.size(0))
    total = 0.0
    for i in range(0, len(idx), bs):
        b = idx[i:i+bs]
        xb, yb = X[b], y[b]
        loss = F.cross_entropy(model(xb), yb)
        opt.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        opt.step()
        total += loss.item() * xb.size(0)
    return total / len(idx)

@torch.no_grad()
def evaluate(model, X, y, labels, tag="VAL"):
    model.eval()
    pred = model(X).argmax(1).numpy()
    true = y.numpy()
    acc = accuracy_score(true, pred)
    print(f"[{tag}] ACC: {acc:.4f}")
    try:
        print(classification_report(true, pred, target_names=labels, digits=4))
        print("Confusion:\n", confusion_matrix(true, pred))
    except Exception: pass
    return acc

In [10]:
TARGET_SR = 16000

In [11]:
#import path variables for loading contentvec model
PROJECT_ROOT = "/Users/arjun/Lab/cs5647-labs/project_cv"

# 1. Update CONTENTVEC_ROOT to point to the Fairseq clone directory
# This directory contains the installed Fairseq package and the custom code.
CONTENTVEC_ROOT = os.path.join(PROJECT_ROOT, "contentvec", "fairseq") 

# 2. Update MODEL_PATH as needed
MODEL_PATH = os.path.join(PROJECT_ROOT, "model", "checkpoint_best_500.pt")
#MODEL_PATH = os.path.join(PROJECT_ROOT, "model", "checkpoint_best_legacy_500.pt")

In [12]:
class ContentVecEncoder(nn.Module):
    def __init__(self):
        super().__init__()
        self.out_dim = 768

        # Load the ContentVec / HuBERT model from checkpoint
        models, cfg, task = fairseq.checkpoint_utils.load_model_ensemble_and_task(
        [MODEL_PATH], 
        arg_overrides={
            "data": os.path.dirname(MODEL_PATH),
            "user_dir": CONTENTVEC_ROOT, # Tells Fairseq where to find the custom code
        }
        )
        self.model = models[0]

    @torch.no_grad()
    def extract(self, wav16):  # [1, T] or [B, T]
        self.model.eval()

        x = wav16
        # convert [B,1,T] → [B,T]
        if x.dim() == 3 and x.size(1) == 1:
            x = x.squeeze(1)
        # convert [T] → [1,T]
        elif x.dim() == 1:
            x = x.unsqueeze(0)

        
        out = self.model(
            x,                   # Your main audio input
            #source_2=None,              # Pass None for the second audio view
            #spk_emb=None,               # Pass None for speaker embedding
            features_only=True, 
            mask=False
        )

        return out["x"]

    @torch.no_grad()
    def extract_layer(self, wav16, layer_idx):
        x = wav16
        if x.dim() == 3 and x.size(1) == 1:
            x = x.squeeze(1)
        elif x.dim() == 1:
            x = x.unsqueeze(0)

        out = self.model(
            x,
            features_only=True,
            mask=False,
            output_layer=layer_idx + 1,   # <-- IMPORTANT
        )

        return out["x"] 
    @torch.no_grad()
    def extract_with_embeddings(self, wav16, spk_emb_input=None):  # [1, T] or [B, T], spk_emb_input: [B, 192]
        self.model.eval()

        x = wav16
        # Convert audio input to [B, T]
        if x.dim() == 3 and x.size(1) == 1:
            x = x.squeeze(1)
        elif x.dim() == 1:
            x = x.unsqueeze(0)
            
        B = x.size(0)
        SPK_EMB_DIM = 192 # CONFIRMED DIMENSION
        
        # Use the provided speaker embedding if available, otherwise create a zero tensor
        if spk_emb_input is not None:
            # Ensure the provided embedding is correctly shaped
            if spk_emb_input.dim() == 1:
                 # Case: [192] -> [1, 192]
                spk_emb = spk_emb_input.unsqueeze(0)
            elif spk_emb_input.dim() == 2:
                # Case: [B, 192]
                spk_emb = spk_emb_input
            else:
                raise ValueError(f"spk_emb_input must be [192] or [B, 192], got shape {spk_emb_input.shape}")
        else:
            # Create a dummy speaker embedding tensor (all zeros) if none is provided
            # This is what you would do if you only wanted content features
            print("Generating dummy embeddings")
            spk_emb = torch.zeros(B, SPK_EMB_DIM, device=x.device, dtype=x.dtype)

        # Call the dedicated extraction method (the cleanest way)
        # Note: If you want a specific intermediate layer, set output_layer=N (1-based index)
        out, _ = self.model.extract_features(
            source=x,
            spk_emb=spk_emb, # Pass the correctly sized speaker embedding
            padding_mask=None,
            mask=False,
            ret_conv=False,
            output_layer=None, # Use the final layer
            tap=False
        )
        
        # 'out' is the final representation tensor: [B, T', 768]
        return out   

In [13]:
content_encoder = ContentVecEncoder()

  state = torch.load(f, map_location=torch.device("cpu"))
2025-11-04 15:55:05 | INFO | fairseq.tasks.contentvec_pretraining | current directory is /Users/arjun/Lab/cs5647-labs/project_cv
2025-11-04 15:55:05 | INFO | fairseq.tasks.contentvec_pretraining | ContentvecPretrainingTask Config {'_name': 'contentvec_pretraining', 'data': '/Users/arjun/Lab/cs5647-labs/project_cv/model', 'fine_tuning': False, 'labels': ['km'], 'label_dir': 'label', 'label_rate': 50, 'sample_rate': 16000, 'normalize': False, 'enable_padding': False, 'max_keep_size': None, 'max_sample_size': 250000, 'min_sample_size': 32000, 'single_target': False, 'random_crop': True, 'crop': True, 'pad_audio': False, 'spk2info': 'spk2info.dict'}
2025-11-04 15:55:05 | INFO | fairseq.models.hubert.contentvec | ContentvecModel Config: {'_name': 'contentvec', 'label_rate': 50, 'extractor_mode': default, 'encoder_layers': 12, 'encoder_layers_1': 3, 'encoder_embed_dim': 768, 'encoder_ffn_embed_dim': 3072, 'encoder_attention_heads': 12

In [14]:
cv = content_encoder.model
print(type(cv))
print("has encoder:", hasattr(cv, "encoder"))
print("has hubert:", hasattr(cv, "hubert"))
print("has extract_features:", hasattr(cv, "extract_features"))

<class 'fairseq.models.hubert.contentvec.ContentvecModel'>
has encoder: True
has hubert: False
has extract_features: True


In [15]:
import speechEncoder

2025-11-04 15:55:07 | DEBUG | speechbrain.utils.checkpoints | Registered checkpoint save hook for _speechbrain_save
2025-11-04 15:55:07 | DEBUG | speechbrain.utils.checkpoints | Registered checkpoint load hook for _speechbrain_load
2025-11-04 15:55:07 | DEBUG | speechbrain.utils.checkpoints | Registered checkpoint save hook for save
2025-11-04 15:55:07 | DEBUG | speechbrain.utils.checkpoints | Registered checkpoint load hook for load
2025-11-04 15:55:07 | INFO | speechbrain.utils.quirks | Applied quirks (see `speechbrain.utils.quirks`): [disable_jit_profiling, allow_tf32]
2025-11-04 15:55:07 | INFO | speechbrain.utils.quirks | Excluded quirks specified by the `SB_DISABLE_QUIRKS` environment (comma-separated list): []
2025-11-04 15:55:07 | DEBUG | speechbrain.utils.checkpoints | Registered checkpoint save hook for _save
2025-11-04 15:55:07 | DEBUG | speechbrain.utils.checkpoints | Registered checkpoint load hook for _recover
  from speechbrain.pretrained import EncoderClassifier


In [16]:
s_encoder = speechEncoder.SpeakerEncoder()

2025-11-04 15:55:09 | INFO | speechbrain.utils.fetching | Fetch hyperparams.yaml: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
2025-11-04 15:55:09 | INFO | speechbrain.utils.fetching | Fetch custom.py: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
2025-11-04 15:55:10 | DEBUG | speechbrain.utils.checkpoints | Registered checkpoint save hook for _save
2025-11-04 15:55:10 | DEBUG | speechbrain.utils.checkpoints | Registered checkpoint load hook for _load
2025-11-04 15:55:10 | DEBUG | speechbrain.utils.checkpoints | Registered parameter transfer hook for _load
  wrapped_fwd = torch.cuda.amp.custom_fwd(fwd, cast_inputs=cast_inputs)
2025-11-04 15:55:10 | DEBUG | speechbrain.utils.checkpoints | Registered checkpoint save hook for save
2025-11-04 15:55:10 | DEBUG | speechbrain.utils.checkpoints | Registered checkpoint load hook for load_if_possible
2025-11-04 15:55:10 | DEBUG | speechbrain.utils.parameter_transfer | Fetching 

In [17]:
def build_cached_split(pairs, label2id, encoder, speaker_encoder, cache_dir, layer = -1):
    os.makedirs(cache_dir, exist_ok=True)
    X, y = [], []
    for path, label in tqdm(pairs, desc=f"Extracting to {cache_dir}"):
        # stable key per file
        key = os.path.splitext(os.path.basename(path))[0]
        npy = os.path.join(cache_dir, key + ".npy")
        if os.path.exists(npy):
            emb = np.load(npy)
        else:
            wav = utils.load_and_resample_16k(path)          # 22k -> 16k here
            speaker_emb = speaker_encoder.extract_embedding(path, wav, TARGET_SR)
            speaker_emb = torch.from_numpy(speaker_emb)
            speaker_emb = F.pad(speaker_emb, (0, 256-192), 'constant', 0)
            with torch.no_grad():
                if layer >= 0:
                    feats = encoder.extract_layer(wav,layer)
                else:
                    feats = encoder.extract_with_embeddings(wav, speaker_emb)           # [1, T', 768]
                    
            #pooled = pool_mean_std(feats).cpu().numpy() # [1536]
            np.save(npy, feats)
            emb = feats
        X.append(emb)
        y.append(label2id[label])
    #X = torch.tensor(np.stack(X), dtype=torch.float32)
    y = torch.tensor(y, dtype=torch.long)
    return X, y

In [18]:
test_data, test_label = build_cached_split(test_set, label2id, content_encoder, s_encoder, cache_dir="cache_cv/with_s_embs_no_pool/test")

Extracting to cache_cv/with_s_embs_no_pool/test: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5214/5214 [00:04<00:00, 1286.40it/s]


In [19]:
validation_data, validation_label = build_cached_split(validation_set, label2id, content_encoder, s_encoder, cache_dir="cache_cv/with_s_embs_no_pool/train")

Extracting to cache_cv/with_s_embs_no_pool/train: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1356/1356 [00:01<00:00, 1238.50it/s]


In [20]:
train_data, train_label = build_cached_split(train_set, label2id, content_encoder, s_encoder, cache_dir="cache_cv/with_s_embs_no_pool/train")

Extracting to cache_cv/with_s_embs_no_pool/train: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5396/5396 [00:05<00:00, 1015.28it/s]


In [21]:
train_frame_features, train_frame_labels = utils.generate_frame_level_dataset(train_data, train_label, 10)

In [22]:
validation_frame_features, validation_frame_labels = utils.generate_frame_level_dataset(validation_data, validation_label, 10)

In [23]:
test_frame_features, test_frame_labels = utils.generate_frame_level_dataset(test_data, test_label, 10)

In [24]:
train_frame_features.size()

torch.Size([53960, 768])

In [25]:
class LeakageTester(nn.Module):
    def __init__(self, input_dim, num_classes, hidden=None):
        super().__init__()
        self.net = (nn.Linear(input_dim, num_classes) if hidden is None else
                    nn.Sequential(nn.Linear(input_dim, hidden), nn.ReLU(), nn.Dropout(0.2),
                                  nn.Linear(hidden, num_classes)))
    def forward(self, x): 
        return self.net(x)

In [26]:
lt = LeakageTester(input_dim=768, num_classes=len(accents))  # linear probe
opt = torch.optim.AdamW(lt.parameters(), lr=1e-3, weight_decay=1e-4)

In [27]:
best, state = 0.0, None
for ep in range(10):
    #y_train = torch.from_numpy(np.random.permutation(y_train))
    tr_loss = train_epoch(lt, opt, train_frame_features, train_frame_labels)
    val_acc = evaluate(lt, validation_frame_features, validation_frame_labels, accents, "VAL")
    print(f"Epoch {ep+1:02d} | train loss {tr_loss:.4f}")
    if val_acc > best: best, state = val_acc, {k:v.cpu() for k,v in lt.state_dict().items()}
if state: lt.load_state_dict(state)

[VAL] ACC: 0.9955
              precision    recall  f1-score   support

    american     0.9933    1.0000    0.9967      1490
  australian     0.9933    0.9926    0.9930      1490
      bangla     0.9867    1.0000    0.9933      1560
     british     1.0000    0.9993    0.9997      1490
      indian     1.0000    1.0000    1.0000      1490
   malayalam     1.0000    0.9934    0.9967      1510
       odiya     1.0000    1.0000    1.0000      1500
      telugu     1.0000    0.9812    0.9905      1540
       welsh     0.9867    0.9933    0.9900      1490

    accuracy                         0.9955     13560
   macro avg     0.9956    0.9955    0.9955     13560
weighted avg     0.9955    0.9955    0.9955     13560

Confusion:
 [[1490    0    0    0    0    0    0    0    0]
 [  10 1479    0    0    0    0    0    0    1]
 [   0    0 1560    0    0    0    0    0    0]
 [   0    0    1 1489    0    0    0    0    0]
 [   0    0    0    0 1490    0    0    0    0]
 [   0    0   10    0    

In [28]:
print("\n=== TEST ===")
evaluate(lt, test_frame_features, test_frame_labels, accents, "TEST")


=== TEST ===
[TEST] ACC: 0.1316
              precision    recall  f1-score   support

    american     0.1945    0.1208    0.1490      7420
  australian     0.0159    0.0101    0.0124      7420
      bangla     0.1904    0.1061    0.1363      7500
     british     0.3780    0.3361    0.3558      7420
      indian     0.2525    0.2838    0.2673      7420
   malayalam     0.0153    0.0155    0.0154      7470
       odiya     0.0000    0.0000    0.0000         0
      telugu     0.1497    0.0507    0.0758      7490
       welsh     0.0000    0.0000    0.0000         0

    accuracy                         0.1316     52140
   macro avg     0.1329    0.1026    0.1124     52140
weighted avg     0.1708    0.1316    0.1443     52140

Confusion:
 [[ 896 3502  470  266  178  293 1053   44  718]
 [ 442   75  640 1704  537 2412   10  807  793]
 [ 108  111  796  463 3121  318 2110  355  118]
 [1626  515  362 2494  182  463 1062  701   15]
 [  19  293    4  150 2106  942 3463  230  213]
 [  93  14

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


0.13162639048714997

In [30]:
utils.near_duplicate_pairs(train_frame_features[:20000], validation_frame_features[:10000])

NameError: name 'F' is not defined

In [51]:
print(len(X_train), len(X_test))

27695 3462
