In [11]:
from pyannote.database import registry, FileFinder

registry.load_database("diarization_dataset/pyannote/database.yml")
dataset = registry.get_protocol('VOC.SpeakerDiarization.mini',
                                preprocessors={"audio": FileFinder()})

'VOC.SpeakerDiarization.mini' found in /finetune/diarization_dataset/pyannote/database.yml does not define the 'scope' of speaker labels (file, database, or global). Setting it to 'file'.


In [12]:
from pyannote.audio import Model
from pyannote.audio import Pipeline
from pyannote.audio.tasks import Segmentation

### Check pipeline

In [8]:
# Load pipeline
pretrained_pipeline = Pipeline.from_pretrained('diarization_model/config.yaml')

In [9]:
for file in dataset.test():
  print(file)

<pyannote.database.protocol.protocol.ProtocolFile object at 0x7f0af6b29c60>
<pyannote.database.protocol.protocol.ProtocolFile object at 0x7f0af6b19db0>
<pyannote.database.protocol.protocol.ProtocolFile object at 0x7f0af6b0bdc0>


In [13]:
# Validate the pipeline on test set
from tqdm import tqdm
from pyannote.metrics.diarization import DiarizationErrorRate
metric = DiarizationErrorRate()

for file in tqdm(dataset.test()):
    # apply pretrained pipeline
    file["pretrained pipeline"] = pretrained_pipeline(file)

    # evaluate its performance
    metric(file["annotation"], file["pretrained pipeline"], uem=file["annotated"])

print(f"The pretrained pipeline reaches a Diarization Error Rate (DER) of {100 * abs(metric):.1f}% on {dataset.name} test set.")

3it [02:40, 53.64s/it]

The pretrained pipeline reaches a Diarization Error Rate (DER) of 17.2% on VOC.SpeakerDiarization.mini test set.





### Fine-tune Segmentation

In [15]:
# Load model
model = Model.from_pretrained('diarization_model/segmentation-3.0.bin')

# Set up for fine-tuning on training set
task = Segmentation(
    dataset,
    duration=model.specifications.duration,
    max_num_speakers=len(model.specifications.classes),
    batch_size=32,
    num_workers=2,
    loss="bce",
    vad_loss="bce"
)
model.task = task
model.prepare_data()
model.setup()

Protocol VOC.SpeakerDiarization.mini does not precompute the output of torchaudio.info(): adding a 'torchaudio.info' preprocessor for you to speed up dataloaders. See pyannote.database documentation on how to do that yourself.


In [16]:
# train with lightning
from types import MethodType
from torch.optim import Adam
from pytorch_lightning.callbacks import (
    EarlyStopping,
    ModelCheckpoint,
    RichProgressBar,
)

# we use Adam optimizer with 1e-4 learning rate
def configure_optimizers(self):
    return Adam(self.parameters(), lr=1e-4)

model.configure_optimizers = MethodType(configure_optimizers, model)

# we monitor diarization error rate on the validation set
# and use to keep the best checkpoint and stop early
monitor, direction = task.val_monitor
checkpoint = ModelCheckpoint(
    monitor=monitor,
    mode=direction,
    save_top_k=1,
    every_n_epochs=1,
    save_last=False,
    save_weights_only=False,
    filename="{epoch}",
    verbose=False,
)
early_stopping = EarlyStopping(
    monitor=monitor,
    mode=direction,
    min_delta=0.0,
    patience=10,
    strict=True,
    verbose=False,
)

callbacks = [RichProgressBar(), checkpoint, early_stopping]

# we train for at most 20 epochs (might be shorter in case of early stopping)
from pytorch_lightning import Trainer
trainer = Trainer(accelerator="auto",
                  callbacks=callbacks,
                  max_epochs=20,
                  gradient_clip_val=0.5)
trainer.fit(model)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
Missing logger folder: /finetune/lightning_logs


`Trainer.fit` stopped: `max_epochs=20` reached.


In [17]:
checkpoint

<pytorch_lightning.callbacks.model_checkpoint.ModelCheckpoint at 0x7f0af6b83010>

In [18]:
# save path to the best checkpoint for later use
finetuned_model = checkpoint.best_model_path

In [None]:
files.download(finetuned_model)

### Optimize hyper-params of pipeline

In [19]:
pretrained_hyperparameters = pretrained_pipeline.parameters(instantiated=True) # as in config.yaml
pretrained_hyperparameters

{'segmentation': {'min_duration_off': 0.0},
 'clustering': {'method': 'centroid',
  'min_cluster_size': 12,
  'threshold': 0.7045654963945799}}

The above hyper-parameters might be no longer optimal for the newly finetuned speaker segmentation model.

`segmentation.threshold` ($\theta$, between 0 and 1) controls the aggressiveness of speaker activity detection (VAD) (a higher value will result in less detected speech)

`clustering.threshold` ($\delta$, between 0 and 2) controls the number of speakers (a higher value will result in less speakers).

`segmentation.min_duration_off` ($\Delta$, in seconds) controls whether intra-speaker pauses are filled. This usually depends on the downstream application so it is better to first force it to zero (i.e. never fill intra-speaker pauses) during optimization.

`clustering.centroid` is the linkage used by the agglomerative clustering step. `centroid` has been found to be slightly better than `average`.

`clustering.min_cluster_size` controls what to do with small speaker clusters. Clusters smaller than that are assigned to the most similar large cluster. `15` is a good default value.

In [20]:
# Optimizing `segmentation.threshold` by assuming that the subsequent clustering step is perfect (`OracleClustering`).
from pyannote.audio.pipelines import SpeakerDiarization
from pyannote.pipeline import Optimizer

pipeline = SpeakerDiarization(
    segmentation=finetuned_model,
    clustering="OracleClustering",
)
# as reported in the technical report, min_duration_off can safely be set to 0.0
pipeline.freeze({"segmentation": {"min_duration_off": 0.0}})

optimizer = Optimizer(pipeline)
dev_set = list(dataset.development())

iterations = optimizer.tune_iter(dev_set, show_progress=False)
best_loss = 1.0
for i, iteration in enumerate(iterations):
    print(f"Best segmentation threshold so far: {iteration['params']['segmentation']['threshold']}")
    if i > 20: break

Best segmentation threshold so far: 0.14532902063694833
Best segmentation threshold so far: 0.630678329814278
Best segmentation threshold so far: 0.5764273103219858
Best segmentation threshold so far: 0.5764273103219858
Best segmentation threshold so far: 0.5764273103219858
Best segmentation threshold so far: 0.5764273103219858
Best segmentation threshold so far: 0.5764273103219858
Best segmentation threshold so far: 0.5764273103219858
Best segmentation threshold so far: 0.5764273103219858
Best segmentation threshold so far: 0.5764273103219858
Best segmentation threshold so far: 0.5764273103219858
Best segmentation threshold so far: 0.5764273103219858
Best segmentation threshold so far: 0.5764273103219858
Best segmentation threshold so far: 0.5764273103219858
Best segmentation threshold so far: 0.5764273103219858
Best segmentation threshold so far: 0.5764273103219858
Best segmentation threshold so far: 0.5764273103219858
Best segmentation threshold so far: 0.5764273103219858
Best segme

In [21]:
# Then, use optimized value of segmentation.threshold to optimize clustering.threshold
best_segmentation_threshold = optimizer.best_params["segmentation"]["threshold"]
print(f"Best segmentation threshold: {best_segmentation_threshold}")

pipeline = SpeakerDiarization(
    segmentation=finetuned_model,
    embedding=pretrained_pipeline.embedding,
    embedding_exclude_overlap=pretrained_pipeline.embedding_exclude_overlap,
    clustering=pretrained_pipeline.klustering,
)

pipeline.freeze({
    "segmentation": {
        "threshold": best_segmentation_threshold,
        "min_duration_off": 0.0,
    },
    "clustering": {
        "method": "centroid",
        "min_cluster_size": 15,
    },
})

optimizer = Optimizer(pipeline)
iterations = optimizer.tune_iter(dev_set, show_progress=False)
best_loss = 1.0
for i, iteration in enumerate(iterations):
    print(f"Best clustering threshold so far: {iteration['params']['clustering']['threshold']}")
    if i > 20: break

Best segmentation threshold: 0.5764273103219858
Best clustering threshold so far: 1.672845723270305
Best clustering threshold so far: 1.672845723270305
Best clustering threshold so far: 1.672845723270305
Best clustering threshold so far: 1.672845723270305
Best clustering threshold so far: 0.7365351775467037
Best clustering threshold so far: 0.7365351775467037
Best clustering threshold so far: 0.7365351775467037
Best clustering threshold so far: 0.7365351775467037
Best clustering threshold so far: 0.7365351775467037
Best clustering threshold so far: 0.7365351775467037
Best clustering threshold so far: 0.7365351775467037
Best clustering threshold so far: 0.7365351775467037
Best clustering threshold so far: 0.7365351775467037
Best clustering threshold so far: 0.7365351775467037
Best clustering threshold so far: 0.7365351775467037
Best clustering threshold so far: 0.7365351775467037
Best clustering threshold so far: 0.7365351775467037
Best clustering threshold so far: 0.7365351775467037
Be

In [22]:
# Evaluate performance of the fine-tuned pipeline
best_clustering_threshold = optimizer.best_params['clustering']['threshold']
print(f"Best segmentation threshold: {best_segmentation_threshold}")
print(f"Best clustering threshold: {best_clustering_threshold}")

finetuned_pipeline = SpeakerDiarization(
    segmentation=finetuned_model,
    embedding=pretrained_pipeline.embedding,
    embedding_exclude_overlap=pretrained_pipeline.embedding_exclude_overlap,
    clustering=pretrained_pipeline.klustering,
)

finetuned_pipeline.instantiate({
    "segmentation": {
        "threshold": best_segmentation_threshold,
        "min_duration_off": 0.0,
    },
    "clustering": {
        "method": "centroid",
        "min_cluster_size": 15,
        "threshold": best_clustering_threshold,
    },
})

metric = DiarizationErrorRate()

for file in dataset.test():
    # apply finetuned pipeline
    file["finetuned pipeline"] = finetuned_pipeline(file)

    # evaluate its performance
    metric(file["annotation"], file["finetuned pipeline"], uem=file["annotated"])

print(f"The finetuned pipeline reaches a Diarization Error Rate (DER) of {100 * abs(metric):.1f}% on {dataset.name} test set.")

Best segmentation threshold: 0.5764273103219858
Best clustering threshold: 0.7365351775467037
The finetuned pipeline reaches a Diarization Error Rate (DER) of 66.7% on VOC.SpeakerDiarization.mini test set.
