juanmc2005 · juanmc2005 · Nov 9, 2023 · Oct 16, 2023 · Oct 16, 2023 · Oct 18, 2023
diff --git a/requirements.txt b/requirements.txt
@@ -9,7 +9,7 @@ pandas>=1.4.2
 torch>=1.12.1
 torchvision>=0.14.0
 torchaudio>=2.0.2
-pyannote.audio>=2.1.1
+pyannote.audio>=3.0.0
 pyannote.core>=4.5
 pyannote.database>=4.1.1
 pyannote.metrics>=3.2

diff --git a/setup.cfg b/setup.cfg
@@ -31,7 +31,7 @@ install_requires=
     torch>=1.12.1
     torchvision>=0.14.0
     torchaudio>=2.0.2
-    pyannote.audio>=2.1.1
+    pyannote.audio>=3.0.0
     pyannote.core>=4.5
     pyannote.database>=4.1.1
     pyannote.metrics>=3.2

diff --git a/src/diart/blocks/clustering.py b/src/diart/blocks/clustering.py
@@ -140,6 +140,10 @@ def identify(
         long_speakers = np.where(np.mean(segmentation.data, axis=0) >= self.rho_update)[
             0
         ]
+        # Remove speakers that have NaN embeddings
+        no_nan_embeddings = np.where(~np.isnan(embeddings).any(axis=1))[0]
+        active_speakers = np.intersect1d(active_speakers, no_nan_embeddings)
+
         num_local_speakers = segmentation.data.shape[1]
 
         if self.centers is None:

diff --git a/src/diart/models.py b/src/diart/models.py
@@ -3,9 +3,13 @@
 
 import torch
 import torch.nn as nn
+from requests import HTTPError
 
 try:
-    import pyannote.audio.pipelines.utils as pyannote_loader
+    from pyannote.audio import Inference, Model
+    from pyannote.audio.pipelines.speaker_verification import (
+        PretrainedSpeakerEmbedding,
+    )
 
     _has_pyannote = True
 except ImportError:
@@ -18,15 +22,20 @@ def __init__(self, model_info, hf_token: Union[Text, bool, None] = True):
         self.model_info = model_info
         self.hf_token = hf_token
 
-    def __call__(self) -> nn.Module:
-        return pyannote_loader.get_model(self.model_info, self.hf_token)
+    def __call__(self) -> Callable:
+        try:
+            return Model.from_pretrained(self.model_info, use_auth_token=self.hf_token)
+        except HTTPError:
+            return PretrainedSpeakerEmbedding(
+                self.model_info, use_auth_token=self.hf_token
+            )
 
 
-class LazyModel(nn.Module, ABC):
-    def __init__(self, loader: Callable[[], nn.Module]):
+class LazyModel(ABC):
+    def __init__(self, loader: Callable[[], Callable]):
         super().__init__()
         self.get_model = loader
-        self.model: Optional[nn.Module] = None
+        self.model: Optional[Callable] = None
 
     def is_in_memory(self) -> bool:
         """Return whether the model has been loaded into memory"""
@@ -38,11 +47,17 @@ def load(self):
 
     def to(self, *args, **kwargs) -> nn.Module:
         self.load()
-        return super().to(*args, **kwargs)
+        return self.model.to(*args, **kwargs)
 
     def __call__(self, *args, **kwargs):
         self.load()
-        return super().__call__(*args, **kwargs)
+        return self.model(*args, **kwargs)
+
+    def eval(self) -> "LazyModel":
+        self.load()
+        if isinstance(self.model, nn.Module):
+            self.model.eval()
+        return self
 
 
 class SegmentationModel(LazyModel):
@@ -83,21 +98,6 @@ def sample_rate(self) -> int:
     def duration(self) -> float:
         pass
 
-    @abstractmethod
-    def forward(self, waveform: torch.Tensor) -> torch.Tensor:
-        """
-        Forward pass of the segmentation model.
-
-        Parameters
-        ----------
-        waveform: torch.Tensor, shape (batch, channels, samples)
-
-        Returns
-        -------
-        speaker_segmentation: torch.Tensor, shape (batch, frames, speakers)
-        """
-        pass
-
 
 class PyannoteSegmentationModel(SegmentationModel):
     def __init__(self, model_info, hf_token: Union[Text, bool, None] = True):
@@ -113,8 +113,17 @@ def duration(self) -> float:
         self.load()
         return self.model.specifications.duration
 
-    def forward(self, waveform: torch.Tensor) -> torch.Tensor:
-        return self.model(waveform)
+    def __call__(self, waveform: torch.Tensor) -> torch.Tensor:
+        """
+        Call the forward pass of the segmentation model.
+        Parameters
+        ----------
+        waveform: torch.Tensor, shape (batch, channels, samples)
+        Returns
+        -------
+        speaker_segmentation: torch.Tensor, shape (batch, frames, speakers)
+        """
+        return super().__call__(waveform)
 
 
 class EmbeddingModel(LazyModel):
@@ -143,33 +152,33 @@ def from_pyannote(
         assert _has_pyannote, "No pyannote.audio installation found"
         return PyannoteEmbeddingModel(model, use_hf_token)
 
-    @abstractmethod
-    def forward(
+
+class PyannoteEmbeddingModel(EmbeddingModel):
+    def __init__(self, model_info, hf_token: Union[Text, bool, None] = True):
+        super().__init__(PyannoteLoader(model_info, hf_token))
+
+    def __call__(
         self, waveform: torch.Tensor, weights: Optional[torch.Tensor] = None
     ) -> torch.Tensor:
         """
-        Forward pass of an embedding model with optional weights.
-
+        Call the forward pass of an embedding model with optional weights.
         Parameters
         ----------
         waveform: torch.Tensor, shape (batch, channels, samples)
         weights: Optional[torch.Tensor], shape (batch, frames)
             Temporal weights for each sample in the batch. Defaults to no weights.
-
         Returns
         -------
         speaker_embeddings: torch.Tensor, shape (batch, embedding_dim)
         """
-        pass
-
-
-class PyannoteEmbeddingModel(EmbeddingModel):
-    def __init__(self, model_info, hf_token: Union[Text, bool, None] = True):
-        super().__init__(PyannoteLoader(model_info, hf_token))
-
-    def forward(
-        self,
-        waveform: torch.Tensor,
-        weights: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
-        return self.model(waveform, weights=weights)
+        # Min-max scale each row to [0, 1]; constant rows (0/0 = NaN) become 0
+        if weights is not None:
+            min_values = weights.min(dim=1, keepdim=True).values
+            max_values = weights.max(dim=1, keepdim=True).values
+            weights = (weights - min_values) / (max_values - min_values)
+            weights.nan_to_num_(0.0)
+
+        embeddings = super().__call__(waveform, weights)
+        # The wrapped model may hand back a numpy array instead of a tensor;
+        # as_tensor converts it without copying and passes tensors through.
+        return torch.as_tensor(embeddings)