Encoder-decoder based attractors (#43)
* [fix] Y_pad-->np.ascontiguous(Y_pad) in case of transpose
* [add] features of encoder-decoder attractors
* [add] EDA training scripts
* [update] avoid using 'if not flag'
* [update] description of the repo
* [fix] typos & bugs
* [fix] sample-wise copy to GPU to deal with variable number of speakers
* [fix] reference to the INTERSPEECH 2020 paper
shota-horiguchi committed Oct 28, 2020
1 parent 9a0f211 commit ddf4df4
Showing 18 changed files with 1,186 additions and 95 deletions.
30 changes: 24 additions & 6 deletions README.md
@@ -1,8 +1,14 @@
# EEND (End-to-End Neural Diarization)

EEND (End-to-End Neural Diarization) is a neural-network-based speaker diarization method.
-- https://www.isca-speech.org/archive/Interspeech_2019/abstracts/2899.html
-- https://arxiv.org/abs/1909.06247 (to appear at ASRU 2019)
- BLSTM EEND (INTERSPEECH 2019)
  - https://www.isca-speech.org/archive/Interspeech_2019/abstracts/2899.html
- Self-attentive EEND (ASRU 2019)
  - https://ieeexplore.ieee.org/abstract/document/9003959/

The EEND extension for various numbers of speakers is also provided in this repository.
- Self-attentive EEND with encoder-decoder based attractors
  - https://arxiv.org/abs/2005.09921

## Install tools
### Requirements
@@ -48,6 +54,7 @@ cd egs/mini_librispeech/v1
```bash
./run.sh
```
- If you use encoder-decoder based attractors [3], modify `run.sh` to use `config/eda/{train,infer}.yaml`; a sketch follows this list.
- See `RESULT.md` and compare with your result.
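For instance, the switch might look like this (a sketch only: the variable names are assumptions, so check how your copy of `run.sh` actually selects its configs):

```bash
# Hypothetical excerpt from run.sh: point training and inference at the
# EDA configs. Variable names are illustrative, not taken from the script.
train_config=config/eda/train.yaml
infer_config=config/eda/infer.yaml
```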

## CALLHOME two-speaker experiment
@@ -57,28 +64,39 @@ If you use your local machine, use "run.pl".
If you use Grid Engine, use "queue.pl".
If you use SLURM, use "slurm.pl".
For more information about cmd.sh, see http://kaldi-asr.org/doc/queue.html.
-- Modify `egs/callhome/v1/run_prepare_shared.sh` according to storage paths of your copora.
- Modify `egs/callhome/v1/run_prepare_shared.sh` according to storage paths of your corpora.
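For reference, a Kaldi-style `cmd.sh` often reduces to a couple of exports (a sketch following the common Kaldi convention; the variable names in this recipe's `cmd.sh` may differ):

```bash
# Hypothetical cmd.sh for running all jobs on the local machine.
# Swap "run.pl" for "queue.pl" (Grid Engine) or "slurm.pl" (SLURM),
# adding whatever queue options your cluster requires.
export train_cmd="run.pl"
export infer_cmd="run.pl"
```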

### Data preparation
```bash
cd egs/callhome/v1
./run_prepare_shared.sh
# If you want to conduct 1-4 speaker experiments, run the script below.
# You also have to set paths to your corpora properly.
./run_prepare_shared_eda.sh
```
-### Self-attention-based model (latest configuration)
### Self-attention-based model using 2-speaker mixtures
```bash
./run.sh
```
-### BLSTM-based model (old configuration)
### BLSTM-based model using 2-speaker mixtures
```bash
local/run_blstm.sh
```
### Self-attention-based model with EDA using 1-4-speaker mixtures
```bash
./run_eda.sh
```

## References
[1] Yusuke Fujita, Naoyuki Kanda, Shota Horiguchi, Kenji Nagamatsu, Shinji Watanabe, "
End-to-End Neural Speaker Diarization with Permutation-free Objectives," Proc. Interspeech, pp. 4300-4304, 2019

[2] Yusuke Fujita, Naoyuki Kanda, Shota Horiguchi, Yawen Xue, Kenji Nagamatsu, Shinji Watanabe, "
-End-to-End Neural Speaker Diarization with Self-attention," arXiv preprints arXiv:1909.06247, 2019
End-to-End Neural Speaker Diarization with Self-attention," Proc. ASRU, pp. 296-303, 2019

[3] Shota Horiguchi, Yusuke Fujita, Shinji Watanabe, Yawen Xue, Kenji Nagamatsu, "
End-to-End Speaker Diarization for an Unknown Number of Speakers with Encoder-Decoder Based Attractors," Proc. INTERSPEECH, 2020



## Citation
11 changes: 11 additions & 0 deletions eend/bin/infer.py
@@ -45,6 +45,17 @@
parser.add_argument('--transformer-encoder-n-heads', default=4, type=int)
parser.add_argument('--transformer-encoder-n-layers', default=2, type=int)
parser.add_argument('--save-attention-weight', default=0, type=int)

attractor_args = parser.add_argument_group('attractor')
attractor_args.add_argument('--use-attractor', action='store_true',
help='Enable encoder-decoder attractor mode')
attractor_args.add_argument('--shuffle', action='store_true',
help='Shuffle the order in time-axis before input to the network')
attractor_args.add_argument('--attractor-loss-ratio', default=1.0, type=float,
help='weighting parameter')
attractor_args.add_argument('--attractor-encoder-dropout', default=0.1, type=float)
attractor_args.add_argument('--attractor-decoder-dropout', default=0.1, type=float)
attractor_args.add_argument('--attractor-threshold', default=0.5, type=float)
args = parser.parse_args()

system_info.print_system_info()
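With the new options, inference could be invoked along these lines (a sketch: the flags are the ones added above, while the positional arguments are placeholders; the recipes normally pass all of this via `config/eda/infer.yaml`):

```bash
# Hypothetical direct invocation of infer.py in attractor mode.
# <data_dir>, <model_file>, and <out_dir> are placeholders for the
# script's actual positional arguments.
python eend/bin/infer.py \
    --use-attractor \
    --attractor-threshold 0.5 \
    <data_dir> <model_file> <out_dir>
```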
12 changes: 11 additions & 1 deletion eend/bin/train.py
@@ -34,7 +34,7 @@
help='input transform')
parser.add_argument('--lr', default=0.001, type=float)
parser.add_argument('--optimizer', default='adam', type=str)
-parser.add_argument('--num-speakers', default=2, type=int)
parser.add_argument('--num-speakers', type=int)
parser.add_argument('--gradclip', default=-1, type=int,
help='gradient clipping. if < 0, no clipping')
parser.add_argument('--num-frames', default=2000, type=int,
@@ -63,6 +63,16 @@
parser.add_argument('--transformer-encoder-dropout', default=0.1, type=float)
parser.add_argument('--gradient-accumulation-steps', default=1, type=int)
parser.add_argument('--seed', default=777, type=int)

attractor_args = parser.add_argument_group('attractor')
attractor_args.add_argument('--use-attractor', action='store_true',
help='Enable encoder-decoder attractor mode')
attractor_args.add_argument('--shuffle', action='store_true',
help='Shuffle the order in time-axis before input to the network')
attractor_args.add_argument('--attractor-loss-ratio', default=1.0, type=float,
help='weighting parameter')
attractor_args.add_argument('--attractor-encoder-dropout', default=0.1, type=float)
attractor_args.add_argument('--attractor-decoder-dropout', default=0.1, type=float)
args = parser.parse_args()

system_info.print_system_info()
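Similarly, training with the new options could look like this (a sketch: only the `--use-attractor`, `--shuffle`, and `--attractor-*` flags are taken from the diff above; the positional arguments are placeholders):

```bash
# Hypothetical direct invocation of train.py with the new EDA options.
# <train_data_dir>, <valid_data_dir>, and <model_save_dir> are
# placeholders for the script's actual positional arguments.
python eend/bin/train.py \
    --use-attractor \
    --shuffle \
    --attractor-loss-ratio 1.0 \
    --attractor-encoder-dropout 0.1 \
    --attractor-decoder-dropout 0.1 \
    <train_data_dir> <valid_data_dir> <model_save_dir>
```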
23 changes: 21 additions & 2 deletions eend/chainer_backend/diarization_dataset.py
@@ -26,6 +26,7 @@ def _gen_frame_indices(


class KaldiDiarizationDataset(chainer.dataset.DatasetMixin):

def __init__(
self,
data_dir,
@@ -40,7 +41,8 @@ def __init__(
use_last_samples=False,
label_delay=0,
n_speakers=None,
            shuffle=False,
            ):
self.data_dir = data_dir
self.dtype = dtype
self.chunk_size = chunk_size
@@ -64,9 +66,11 @@ def __init__(
label_delay=self.label_delay,
subsampling=self.subsampling):
self.chunk_indices.append(
                (rec, st * self.subsampling, ed * self.subsampling))
print(len(self.chunk_indices), " chunks")

self.shuffle = shuffle

def __len__(self):
return len(self.chunk_indices)

@@ -83,4 +87,19 @@ def get_example(self, i):
Y = feature.transform(Y, self.input_transform)
Y_spliced = feature.splice(Y, self.context_size)
Y_ss, T_ss = feature.subsample(Y_spliced, T, self.subsampling)

# If the sample contains more than "self.n_speakers" speakers,
# extract top-(self.n_speakers) speakers
if self.n_speakers and T_ss.shape[1] > self.n_speakers:
selected_speakers = np.argsort(T_ss.sum(axis=0))[::-1][:self.n_speakers]
T_ss = T_ss[:, selected_speakers]

# If self.shuffle is True, shuffle the order in time-axis
# This operation improves the performance of EEND-EDA
if self.shuffle:
order = np.arange(Y_ss.shape[0])
np.random.shuffle(order)
Y_ss = Y_ss[order]
T_ss = T_ss[order]

return Y_ss, T_ss
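A short usage sketch of the extended dataset (constructor arguments are the ones visible in the diff; `data/train` is a placeholder, and the remaining arguments are assumed to keep their defaults):

```python
# Hypothetical usage of the extended dataset class. "data/train" is a
# placeholder for a Kaldi-style data directory.
from eend.chainer_backend.diarization_dataset import KaldiDiarizationDataset

dataset = KaldiDiarizationDataset(
    data_dir="data/train",
    n_speakers=4,   # keep at most the 4 most active speakers per chunk
    shuffle=True,   # shuffle frames along the time axis (helps EEND-EDA)
)
Y_ss, T_ss = dataset.get_example(0)  # features (T, D), labels (T, n_speakers)
```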
67 changes: 67 additions & 0 deletions eend/chainer_backend/encoder_decoder_attractor.py
@@ -0,0 +1,67 @@
#!/usr/bin/env python3

# Copyright 2020 Hitachi, Ltd. (author: Shota Horiguchi)
# Licensed under the MIT license.

from chainer import Chain, cuda
import chainer.functions as F
import chainer.links as L


class EncoderDecoderAttractor(Chain):

def __init__(self, n_units, encoder_dropout=0.1, decoder_dropout=0.1):
super(EncoderDecoderAttractor, self).__init__()
with self.init_scope():
self.encoder = L.NStepLSTM(1, n_units, n_units, encoder_dropout)
self.decoder = L.NStepLSTM(1, n_units, n_units, decoder_dropout)
self.counter = L.Linear(n_units, 1)
self.n_units = n_units

def forward(self, xs, zeros):
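        # Encode each (T, D) embedding sequence; the encoder's final hidden
        # and cell states initialize the decoder, which then reads zero
        # vectors and emits one attractor per decoding step.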
hx, cx, _ = self.encoder(None, None, xs)
_, _, attractors = self.decoder(hx, cx, zeros)
return attractors

def estimate(self, xs, max_n_speakers=15):
"""
Calculate attractors from embedding sequences
without prior knowledge of the number of speakers
Args:
xs: List of (T,D)-shaped embeddings
max_n_speakers (int)
Returns:
attractors: List of (N,D)-shaped attractors
probs: List of attractor existence probabilities
"""

xp = cuda.get_array_module(xs[0])
zeros = [xp.zeros((max_n_speakers, self.n_units), dtype=xp.float32) for _ in xs]
attractors = self.forward(xs, zeros)
probs = [F.sigmoid(F.flatten(self.counter(att))) for att in attractors]
return attractors, probs

def __call__(self, xs, n_speakers):
"""
        Calculate attractors from embedding sequences with a given number of speakers
        Args:
          xs: List of (T,D)-shaped embeddings
          n_speakers: List of numbers of speakers, or None if the number of speakers is unknown (e.g., at test time)
Returns:
loss: Attractor existence loss
attractors: List of (N,D)-shaped attractors
"""

xp = cuda.get_array_module(xs[0])
zeros = [xp.zeros((n_spk + 1, self.n_units), dtype=xp.float32) for n_spk in n_speakers]
attractors = self.forward(xs, zeros)
labels = F.concat([xp.array([[1] * n_spk + [0]], xp.int32) for n_spk in n_speakers], axis=1)
logit = F.concat([F.reshape(self.counter(att), (-1, n_spk + 1)) for att, n_spk in zip(attractors, n_speakers)], axis=1)
loss = F.sigmoid_cross_entropy(logit, labels)

        # The final attractor does not correspond to a speaker, so remove it
# attractors = [att[:-1] for att in attractors]
attractors = [att[slice(0, att.shape[0] - 1)] for att in attractors]
return loss, attractors
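A minimal usage sketch of the module above (shapes and values are illustrative; assumes CPU execution with NumPy inputs):

```python
import numpy as np
from eend.chainer_backend.encoder_decoder_attractor import EncoderDecoderAttractor

eda = EncoderDecoderAttractor(n_units=256)
# One batch entry: 500 frames of 256-dim embeddings.
xs = [np.random.randn(500, 256).astype(np.float32)]

# Training: the speaker count is known, so the existence loss is returned.
loss, attractors = eda(xs, n_speakers=[2])  # attractors[0]: shape (2, 256)

# Inference: decode up to max_n_speakers attractors, then threshold their
# existence probabilities to decide how many speakers are present.
attractors, probs = eda.estimate(xs, max_n_speakers=15)
```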
46 changes: 33 additions & 13 deletions eend/chainer_backend/infer.py
@@ -11,7 +11,7 @@
from chainer import serializers
from scipy.ndimage import shift
from eend.chainer_backend.models import BLSTMDiarization
-from eend.chainer_backend.models import TransformerDiarization
from eend.chainer_backend.models import TransformerDiarization, TransformerEDADiarization
from eend.chainer_backend.utils import use_single_gpu
from eend import feature
from eend import kaldi_data
@@ -32,26 +39,39 @@ def infer(args):

# Prepare model
in_size = feature.get_input_dim(
        args.frame_size,
        args.context_size,
        args.input_transform)

if args.model_type == "BLSTM":
model = BLSTMDiarization(
            in_size=in_size,
            n_speakers=args.num_speakers,
            hidden_size=args.hidden_size,
            n_layers=args.num_lstm_layers,
            embedding_layers=args.embedding_layers,
-            embedding_size=args.embedding_size)
            embedding_size=args.embedding_size
            )
elif args.model_type == 'Transformer':
-        model = TransformerDiarization(
if args.use_attractor:
model = TransformerEDADiarization(
in_size,
n_units=args.hidden_size,
n_heads=args.transformer_encoder_n_heads,
n_layers=args.transformer_encoder_n_layers,
dropout=0,
attractor_encoder_dropout=args.attractor_encoder_dropout,
attractor_decoder_dropout=args.attractor_decoder_dropout,
)
else:
model = TransformerDiarization(
args.num_speakers,
in_size,
n_units=args.hidden_size,
n_heads=args.transformer_encoder_n_heads,
n_layers=args.transformer_encoder_n_layers,
-            dropout=0)
dropout=0
)
else:
raise ValueError('Unknown model type.')

@@ -75,7 +88,12 @@ def infer(args):
Y_chunked = Variable(Y[start:end])
if args.gpu >= 0:
Y_chunked.to_gpu(gpuid)
-            hs, ys = model.estimate_sequential(hs, [Y_chunked])
hs, ys = model.estimate_sequential(
hs, [Y_chunked],
n_speakers=args.num_speakers,
th=args.attractor_threshold,
shuffle=args.shuffle
)
if args.gpu >= 0:
ys[0].to_cpu()
out_chunks.append(ys[0].data)
@@ -88,6 +106,8 @@
if hasattr(model, 'label_delay'):
outdata = shift(np.vstack(out_chunks), (-model.label_delay, 0))
else:
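            # Chunks may contain different numbers of estimated speakers;
            # zero-pad every chunk's output to the widest one before stacking.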
max_n_speakers = max([o.shape[1] for o in out_chunks])
out_chunks = [np.insert(o, o.shape[1], np.zeros((max_n_speakers - o.shape[1], o.shape[0])), axis=1) for o in out_chunks]
outdata = np.vstack(out_chunks)
with h5py.File(outpath, 'w') as wf:
wf.create_dataset('T_hat', data=outdata)
