# Reproducible experimental protocol

This notebook builds the database with all the information we need to perform domain-adversarial speech activity detection.

## Requirements

### Python packages

- pyannote.audio
- pyannote.core
- pyannote.database
- pandas

### Datasets

- `AMI`: [A multi-modal data set consisting of 100 hours of meeting recordings](http://groups.inf.ed.ac.uk/ami/corpus/)
- `ldc2019e31`: [Second DIHARD Challenge Development Data](https://coml.lscp.ens.fr/dihard/)
- `ldc2019e32`: [Second DIHARD Challenge Evaluation Data](https://coml.lscp.ens.fr/dihard/)
- `musan`: [A corpus of MUsic, Speech, And Noise](https://www.openslr.org/17/) 

In [64]:
# Root of the AMI corpus, as downloaded from http://groups.inf.ed.ac.uk/ami/corpus/
# Only the Mix-Headset recordings are used: cells below read
# `{ami}/annotations/...` and `{ami}/{meeting}/audio/{meeting}.Mix-Headset.wav`.
ami = '/export/corpora4/ami/amicorpus'

# Root of the ldc2019e31 dataset (DIHARD II development data).
# Cells below read `docs/sources.tbl` and `data/single_channel/{flac,rttm,uem}` from it.
ldc2019e31 = '/vol/corpora1/data/ldc/ldc2019e31/LDC2019E31_Second_DIHARD_Challenge_Development_Data'

# Root of the ldc2019e32 dataset (DIHARD II evaluation data), same layout as above.
ldc2019e32 = '/vol/corpora1/data/ldc/ldc2019e32/LDC2019E32_Second_DIHARD_Challenge_Evaluation_Data_V1.1'

# Root of MUSAN, as downloaded from https://www.openslr.org/17/
# (only referenced in the 'Databases' section of database.yml)
musan = '/vol/corpora4/musan'

# Where github.com/hbredin/DomainAdversarialVoiceActivityDetection has been cloned.
# Every generated file goes to `{ROOT}/database/...` and `{ROOT}/database.yml`.
ROOT = '/vol/work1/bredin/jsalt/DomainAdversarialVoiceActivityDetection'

In [65]:
# create the 'database' sub-directories (one per corpus) that are meant to store
# audio and reference files; `-p` also creates missing parent directories
!mkdir -p {ROOT}/database/AMI
!mkdir -p {ROOT}/database/DIHARD

In [77]:
# define utility functions

from pyannote.core import Timeline
from pyannote.core import Annotation
from typing import TextIO

def write_rttm(file: TextIO, reference: Annotation):
    """Dump a reference annotation as "rttm" lines

    One `SPEAKER` line is written per track of `reference`; onset and
    duration are formatted with three decimals.

    Parameters
    ----------
    file : file object
        Text stream (opened for writing) that receives the lines.
    reference : `pyannote.core.Annotation`
        Reference annotation. Its `uri` is used as file identifier.
    """

    for segment, _, label in reference.itertracks(yield_label=True):
        onset = f'{segment.start:.3f}'
        duration = f'{segment.duration:.3f}'
        file.write(
            f'SPEAKER {reference.uri} 1 {onset} {duration} '
            f'<NA> <NA> {label} <NA> <NA>\n'
        )

def write_uem(file: TextIO, uem: Timeline):
    """Dump an evaluation map as "uem" lines

    One `{uri} 1 {start} {end}` line is written per segment of `uem`;
    boundaries are formatted with three decimals.

    Parameters
    ----------
    file : file object
        Text stream (opened for writing) that receives the lines.
    uem : `pyannote.core.Timeline`
        Evaluation timeline. Its `uri` is used as file identifier.
    """

    file.writelines(
        f'{uem.uri} 1 {segment.start:.3f} {segment.end:.3f}\n'
        for segment in uem
    )

## AMI

First, we convert AMI annotations into the `.rttm` format. This piece of code will create a `rttm` folder in `{ROOT}/database/AMI`:

In [78]:
import xml.etree.ElementTree as ET
import os, glob
import shutil

def get_participants_per_meeting(annotations_folder):
    """
    Parse corpusResources/meetings.xml and return a dictionary whose keys are
    the meeting ids ("observation" attribute) and whose values are dictionaries
    mapping the letter of each speaker ("nxt_agent") to its speaker id
    ("global_name").
    """
    meetings_xml = os.path.join(annotations_folder, 'corpusResources/meetings.xml')
    tree = ET.parse(meetings_xml)

    mapping = {}
    for meeting in tree.getroot():
        meeting_id = meeting.attrib["observation"]
        mapping[meeting_id] = letter_to_speaker = {}
        for participant in meeting:
            global_name = participant.attrib["global_name"]
            letter_to_speaker[participant.attrib["nxt_agent"]] = global_name

    return mapping


def ami_xml_to_rttm(annotations_folder, participants_per_meeting, rttm_folder=None):
    """Convert AMI segment annotations into one "rttm" file per meeting

    Every `segments/{meeting_id}.{speaker_letter}.*.xml` file found in
    `annotations_folder` is parsed, and its segments are appended to
    `{rttm_folder}/{meeting_id}.rttm`, sorted by onset. The uri written in the
    rttm lines is `{meeting_id}.Mix-Headset`.

    Parameters
    ----------
    annotations_folder : str
        Path to the AMI "annotations" folder.
    participants_per_meeting : dict
        {meeting_id: {speaker_letter: speaker_id}} mapping, as returned by
        `get_participants_per_meeting`.
    rttm_folder : str, optional
        Output folder. It is deleted and re-created on every call.
        Defaults to `{ROOT}/database/AMI/rttm`.

    Notes
    -----
    When no speaker is registered for a (meeting, letter) pair, a message is
    printed and the segments are kept under the `{meeting_id}.{speaker_letter}`
    label, so that speech regions are not lost and are never attributed to the
    speaker of a previously processed file.
    """
    if rttm_folder is None:
        rttm_folder = os.path.join(ROOT, 'database', 'AMI', 'rttm')

    # always start from an empty output folder
    shutil.rmtree(rttm_folder, ignore_errors=True)
    os.makedirs(rttm_folder)

    # content of the rttm files (key = rttm path, value = list of (onset, line))
    rttm_dic = {}

    for f in glob.iglob(os.path.join(annotations_folder, "segments/*.xml")):
        # file names look like {meeting_id}.{speaker_letter}.segments.xml
        meeting_id, speaker_letter = os.path.basename(f).split(".")[:2]
        rttm_filepath = os.path.join(rttm_folder, meeting_id + ".rttm")
        entries = rttm_dic.setdefault(rttm_filepath, [])

        speakers = participants_per_meeting.get(meeting_id, {})
        if speaker_letter in speakers:
            speaker_id = speakers[speaker_letter]
        else:
            print("Can't find which speaker is associated to letter %s in %s" % (speaker_letter, f))
            # explicit fallback label (instead of an undefined or stale speaker_id)
            speaker_id = f'{meeting_id}.{speaker_letter}'

        uri = str(meeting_id) + '.Mix-Headset'
        for seg in ET.parse(f).getroot():
            onset = float(seg.attrib["transcriber_start"])
            duration = float(seg.attrib["transcriber_end"]) - onset
            line = ' '.join(["SPEAKER", uri, "1", "%.6f" % onset,
                             "%.6f" % duration, "<NA>", "<NA>",
                             speaker_id, "<NA>", "<NA>\n"])
            entries.append((onset, line))

    # Write each rttm file with its lines sorted by onset
    for rttm_filepath, entries in rttm_dic.items():
        with open(rttm_filepath, 'w') as output_rttm:
            for _, line in sorted(entries, key=lambda entry: entry[0]):
                output_rttm.write(line)

# Path to the annotations folder such as provided by the download
# (it is read for `corpusResources/meetings.xml` and `segments/*.xml`)
annotations_folder = os.path.join(ami, "annotations")
# {meeting id: {speaker letter: speaker id}}
participants_per_meeting = get_participants_per_meeting(annotations_folder)
# writes one rttm file per meeting into {ROOT}/database/AMI/rttm
ami_xml_to_rttm(annotations_folder, participants_per_meeting)

Can't find which speaker is associated to letter D in /home/engaclew/Documents/DATA_Marvin/amicorpus/annotations/segments/EN2002c.D.segments.xml
Can't find which speaker is associated to letter D in /home/engaclew/Documents/DATA_Marvin/amicorpus/annotations/segments/EN2003a.D.segments.xml
Can't find which speaker is associated to letter D in /home/engaclew/Documents/DATA_Marvin/amicorpus/annotations/segments/EN2009b.D.segments.xml
Can't find which speaker is associated to letter D in /home/engaclew/Documents/DATA_Marvin/amicorpus/annotations/segments/IN1001.D.segments.xml
Can't find which speaker is associated to letter D in /home/engaclew/Documents/DATA_Marvin/amicorpus/annotations/segments/EN2009c.D.segments.xml


If everything went as expected, you should get the messages shown above.

The listed meetings are the ones for which there were only 3 speakers (instead of the usual 4).

This corpus is originally provided without a train/dev/test split. However, we followed the "Full-corpus partition" proposed [here](http://groups.inf.ed.ac.uk/ami/corpus/datasets.shtml).
You can use the same partition by running the following piece of code:

In [79]:
import os, glob
import shutil

# Meeting id prefixes of the training subset. Each entry is later matched
# against rttm file names with a trailing wildcard ('{prefix}*.rttm'), so that
# e.g. "EN2002" selects every rttm file whose name starts with "EN2002".
trn = ["ES2002", "ES2005", "ES2006", "ES2007", "ES2008", "ES2009",
         "ES2010", "ES2012", "ES2013", "ES2015", "ES2016", "IS1000",
         "IS1001", "IS1002", "IS1003", "IS1004", "IS1005",
         "IS1006", "IS1007", "TS3005", "TS3008", "TS3009", "TS3010", "TS3011",
         "TS3012", "EN2001", "EN2003", "EN2004a", "EN2005a", "EN2006", "EN2009",
         "IN1001", "IN1002", "IN1005", "IN1007", "IN1008", "IN1009", "IN1012", "IN1013",
         "IN1014", "IN1016"]

# Meeting id prefixes of the development subset
dev = ["ES2003", "ES2011", "IS1008", "TS3004", "TS3006",
       "IB4001", "IB4002", "IB4003", "IB4004", "IB4010", "IB4011"]

# Meeting id prefixes of the test subset
tst = ["ES2004", "ES2014", "IS1009", "TS3003", "TS3007", "EN2002"]

# (Re)create one empty folder per subset: any previous content is removed first
AMI_folder = f'{ROOT}/database/AMI'

for fold in ("trn", "dev", "tst"):
    fold_dir = f'{AMI_folder}/{fold}'
    shutil.rmtree(fold_dir, ignore_errors=True)
    os.makedirs(fold_dir)

# Get all the rttm files in the right location: every rttm file of
# {AMI_folder}/rttm is copied to the folder of the subset its meeting belongs to,
# and renamed from {meeting}.rttm to {meeting}.Mix-Headset.rttm
# NOTE: the partition lists are named `trn`, `dev` and `tst` (`train` and `test`
# are not defined in this notebook).
fold_names = ["trn", "dev", "tst"]
for fold_name, fold_meeting_ids in zip(fold_names, [trn, dev, tst]):
    for m_id in fold_meeting_ids:
        # `m_id` is a prefix: e.g. "EN2002" matches EN2002c.rttm
        files = glob.glob(os.path.join(AMI_folder, 'rttm', '%s*.rttm' % m_id))
        if not files:
            print("Can't find files whose name matches with the regular experession %s*.rttm" % m_id)
            print("Something bad happened. You should consider restarting from the beginning.")
        for rttm_file in files:
            target = os.path.basename(rttm_file).replace('.rttm', '.Mix-Headset.rttm')
            shutil.copyfile(rttm_file, f'{AMI_folder}/{fold_name}/{target}')
print("Done.")

Done.


Now that we have `trn`/`dev`/`tst` folders containing the reference `.rttm` files, we can bring in the corresponding audio files by creating symbolic links (and generate one `all.{subset}.uem` file per subset):

In [80]:
import glob, os
import shutil
import wave
import contextlib

def get_wav_duration(wav_path):
    """Return the duration (in seconds) of a wav file.

    The duration is computed as the number of frames divided by the frame
    rate, both read through the `wave` module.
    """
    wav_file = wave.open(wav_path, 'r')
    try:
        return wav_file.getnframes() / float(wav_file.getframerate())
    finally:
        # always release the file handle, even if reading the header fails
        wav_file.close()
    
folds = ["trn", "dev", "tst"]
for fold in folds:
    # sorted, so that all.{fold}.uem is written in a reproducible order
    files = sorted(glob.glob(f'{ROOT}/database/AMI/{fold}/*.Mix-Headset.rttm'))

    with open(f'{ROOT}/database/AMI/all.{fold}.uem', "w") as uem_out:
        for rttm in files:
            basename = os.path.basename(rttm)           #EN2001a.Mix-Headset.rttm
            meeting_id = basename.split('.')[0]         #EN2001a
            filename = basename.replace(".rttm", "")    #EN2001a.Mix-Headset

            # the whole file is annotated: one uem line from 0 to the wav duration
            wav = f'{ami}/{meeting_id}/audio/{filename}.wav'
            uem_out.write("%s 1 0.0 %s\n" % (filename, get_wav_duration(wav)))

            # Symlink to the .wav file (any previous link is replaced, so that
            # this cell can be re-run without raising FileExistsError)
            link = "%s/database/AMI/%s/%s.wav" % (ROOT, fold, filename)
            if os.path.lexists(link):
                os.remove(link)
            os.symlink(wav, link)
    print(f'Audio files have been symlinked and all.{fold}.uem has been generated')



Audio files have been symlinked and all.trn.uem has been generated
Audio files have been symlinked and all.dev.uem has been generated
Audio files have been symlinked and all.tst.uem has been generated


Finally, we can create four files per (domain, subset) pair:
- `{domain}.{subset}.txt` contains list of files
- `{domain}.{subset}.rttm` contains manual annotation
- `{domain}.{subset}.uem` contains unpartitioned evaluation map (uem)
- `{domain}.domain.{subset}.txt` contains file-to-domain mapping

In [83]:
import pandas as pd

def get_df_file_domain(folder):
    """Build the file-to-domain table of a folder of rttm files

    Parameters
    ----------
    folder : str
        Folder containing `*.rttm` files.

    Returns
    -------
    `pandas.DataFrame`
        One row per rttm file, sorted by file name, with columns
        'uri' (file name without the ".rttm" part) and
        'domain' (first character of the uri).
        The two columns exist even when no rttm file is found.
    """
    list_dict = []
    # sorted, so that the row order does not depend on the file system
    for rttm in sorted(glob.glob(os.path.join(folder, "*.rttm"))):
        basename = os.path.basename(rttm).replace(".rttm", "")
        list_dict.append({'uri': basename, 'domain': basename[0]})
    return pd.DataFrame(list_dict, columns=['uri', 'domain'])
  
# One (uri, domain) table per subset.
# NOTE: this rebinds `trn`, `dev` and `tst`, which previously held the lists
# of meeting ids of the partition.
trn = get_df_file_domain(f'{ROOT}/database/AMI/trn')
dev = get_df_file_domain(f'{ROOT}/database/AMI/dev') 
tst = get_df_file_domain(f'{ROOT}/database/AMI/tst')

# subset name -> table, used to generate the per-domain files
corpora = {'trn':trn, 'dev':dev, 'tst':tst}
# domains are taken from the training subset only
ami_domains = trn.domain.unique()

In [82]:
from pyannote.database.util import load_rttm
from pyannote.database.util import load_uem
from pyannote.audio.features.utils import get_audio_duration
from pyannote.core import Segment

for fold in ['trn', 'dev', 'tst']:
    fold_dict = corpora[fold]

    # all.{fold}.uem covers every file of the subset: load it once per subset
    # rather than once per domain
    uems = load_uem(f'{ROOT}/database/AMI/all.{fold}.uem')

    for domain, files in fold_dict.groupby('domain'):

        # create four files per (domain, subset) pair
        # {domain}.{fold}.txt contains list of files
        # {domain}.{fold}.rttm contains manual annotation
        # {domain}.{fold}.uem contains unpartitioned evaluation map (uem)
        # {domain}.domain.{fold}.txt contains file-to-domain mapping
        with open(f'{ROOT}/database/AMI/{domain}.{fold}.txt', 'w') as f_uris, \
             open(f'{ROOT}//database/AMI/{domain}.{fold}.rttm', 'w') as f_rttm, \
             open(f'{ROOT}/database/AMI/{domain}.{fold}.uem', 'w') as f_uem, \
             open(f'{ROOT}/database/AMI/{domain}.domain.{fold}.txt', 'w') as f_domain:

            # every row of `files` belongs to `domain` (groupby key)
            for uri in files['uri']:
                duration = get_audio_duration({'audio': f'{ROOT}/database/AMI/{fold}/{uri}.wav'})
                # the last millisecond of each file is not considered
                duration -= 0.001
                support = Segment(0, duration)

                f_uris.write(f'{fold}/{uri}\n')

                f_domain.write(f'{fold}/{uri} {domain}\n')

                # load reference, prefix its uri with the subset, crop it to `support`
                reference = load_rttm(f'{ROOT}/database/AMI/{fold}/{uri}.rttm')[uri]
                reference.uri = f'{fold}/{uri}'
                reference = reference.crop(support, mode='intersection')

                write_rttm(f_rttm, reference)

                # same for the evaluation map
                uem = uems[uri]
                uem.uri = f'{fold}/{uri}'
                uem = uem.crop(support, mode='intersection')

                write_uem(f_uem, uem)

Create database.yml

In [84]:
import os
import yaml

config_path = f'{ROOT}/database.yml'

# Start from the existing configuration (if any) so that entries written by
# other cells of this notebook (e.g. the DIHARD protocols) are kept instead of
# being silently overwritten.
database_yml = {}
if os.path.exists(config_path):
    with open(config_path) as f:
        database_yml = yaml.safe_load(f) or {}

# where to find audio files of each database ({uri} is a literal placeholder)
database_yml.setdefault('Databases', {}).update({
    'MUSAN': f'{musan}/{{uri}}.wav',
    'AMI': f'{ROOT}/database/AMI/{{uri}}.wav',
})

protocols = database_yml.setdefault('Protocols', {})
# AMI protocols are always regenerated from scratch
protocols['AMI'] = {'SpeakerDiarization': {}}
ami_protocols = protocols['AMI']['SpeakerDiarization']
# meta-protocols: entries generated by other cells are kept
x_protocols = protocols.setdefault('X', {}).setdefault('SpeakerDiarization', {})

subsets = {'train': 'trn', 'development': 'dev', 'test': 'tst'}

for domain in ami_domains:
    # one protocol per domain, pointing to the four generated files of each subset
    ami_protocols[f'{domain}'] = {
        subset: {
            'uris': f'{ROOT}/database/AMI/{domain}.{short}.txt',
            'annotation': f'{ROOT}/database/AMI/{domain}.{short}.rttm',
            'annotated': f'{ROOT}/database/AMI/{domain}.{short}.uem',
            'domain': f'{ROOT}/database/AMI/{domain}.domain.{short}.txt',
        } for subset, short in subsets.items()
    }

    # leave-one-domain-out: train and development on every other domain,
    # test on the held-out domain only
    all_but_domain = sorted(set(ami_domains) - {domain})
    leave_one_out = {
        subset: {
            f'AMI.SpeakerDiarization.{other_domain}': [subset] for other_domain in all_but_domain
        } for subset in ['train', 'development']
    }
    leave_one_out['test'] = {
        f'AMI.SpeakerDiarization.{domain}': ['test']
    }
    x_protocols[f'AMI_LeaveOneDomainOut_{domain}'] = leave_one_out

# every domain in every subset
x_protocols['AMI_Official'] = {
    subset: {
        f'AMI.SpeakerDiarization.{domain}': [subset] for domain in ami_domains
    } for subset in ['train', 'development', 'test']
}

with open(config_path, 'w') as f:
    yaml.dump(database_yml, f, default_flow_style=False)

## DIHARD

For some reason, development and evaluation subsets share the same names: `DH_0001` to `DH_0192` exist in both subsets.  
To avoid any confusion in `pyannote.database`, we create symbolic links so we can distinguish `dev/DH_0001` from `tst/DH_0001`.

In [10]:
!ln --symbolic {ldc2019e31}/data/single_channel/flac {ROOT}/database/DIHARD/dev
!ln --symbolic {ldc2019e32}/data/single_channel/flac {ROOT}/database/DIHARD/tst

ln: impossible de créer le lien symbolique '/home/lavechin/Bureau/DomainAdversarialVoiceActivityDetection/database/DIHARD/dev/flac': Le fichier existe
ln: impossible de créer le lien symbolique '/home/lavechin/Bureau/DomainAdversarialVoiceActivityDetection/database/DIHARD/tst/flac': Le fichier existe


In [21]:
from pandas import read_csv

# load list of test files (and their domain)
# (`sep=r'\s+'` splits on any run of whitespace; it replaces the deprecated
# `delim_whitespace=True`)
# only rows whose uri contains 'DH' are kept
tst = read_csv(f'{ldc2019e32}/docs/sources.tbl',
               sep=r'\s+',
               names=['uri', 'language', 'domain', 'source'],
               index_col='uri').filter(like='DH', axis=0)
# load list of development files (and their domain)
dev = read_csv(f'{ldc2019e31}/docs/sources.tbl',
               sep=r'\s+',
               names=['uri', 'language', 'domain', 'source'],
               index_col='uri').filter(like='DH', axis=0)

# obtain list of domains
dihard_domains = sorted(dev.domain.unique())

The next cell will create four files per (domain, subset) pair:
- `{domain}.{subset}.txt` contains list of files
- `{domain}.{subset}.rttm` contains manual annotation
- `{domain}.{subset}.uem` contains unpartitioned evaluation map (uem)
- `{domain}.domain.{subset}.txt` contains file-to-domain mapping

In [25]:
from pyannote.database.util import load_rttm
from pyannote.database.util import load_uem
from pyannote.audio.features.utils import get_audio_duration
from pyannote.core import Segment

# split ldc2019e31 into training set (two thirds) and development set (one third)

# for each domain in ldc2019e31
for domain, files in dev.groupby('domain'):

    # unpartitioned evaluation map (uem) of the files of this domain
    uems = load_uem(f'{ldc2019e31}/data/single_channel/uem/{domain}.uem')

    # four files are created per (domain, subset) pair:
    #   {domain}.{subset}.txt         list of files
    #   {domain}.{subset}.rttm        manual annotation
    #   {domain}.{subset}.uem         unpartitioned evaluation map (uem)
    #   {domain}.domain.{subset}.txt  file-to-domain mapping
    with open(f'{ROOT}/database/DIHARD/{domain}.dev.txt', 'w') as uris_dev, \
         open(f'{ROOT}/database/DIHARD/{domain}.trn.txt', 'w') as uris_trn, \
         open(f'{ROOT}/database/DIHARD/{domain}.dev.rttm', 'w') as rttm_dev, \
         open(f'{ROOT}/database/DIHARD/{domain}.trn.rttm', 'w') as rttm_trn, \
         open(f'{ROOT}/database/DIHARD/{domain}.dev.uem', 'w') as uem_dev, \
         open(f'{ROOT}/database/DIHARD/{domain}.trn.uem', 'w') as uem_trn, \
         open(f'{ROOT}/database/DIHARD/{domain}.domain.dev.txt', 'w') as domain_dev, \
         open(f'{ROOT}/database/DIHARD/{domain}.domain.trn.txt', 'w') as domain_trn:

        # files are indexed by uri
        for i, uri in enumerate(files.index):

            # one file out of three (i = 0, 3, 6, ...) goes to the development
            # set, the two others (i = 1, 2, 4, 5, ...) go to the training set
            if i % 3 == 0:
                f_uris, f_domain, f_rttm, f_uem = uris_dev, domain_dev, rttm_dev, uem_dev
            else:
                f_uris, f_domain, f_rttm, f_uem = uris_trn, domain_trn, rttm_trn, uem_trn

            # both subsets come from ldc2019e31, hence the 'dev/' prefix for all
            dev_uri = f'dev/{uri}'

            # ugly hack to avoid rounding errors: this has the effect of not
            # considering the last millisecond of each file
            audio = {'audio': f'{ROOT}/database/DIHARD/dev/{uri}.flac'}
            support = Segment(0, get_audio_duration(audio) - 0.001)

            # dump uri and domain to disk
            f_uris.write(f'{dev_uri}\n')
            f_domain.write(f'{dev_uri} {domain}\n')

            # load, rename and crop reference (cf above hack), then dump it to disk
            reference = load_rttm(f'{ldc2019e31}/data/single_channel/rttm/{uri}.rttm')[uri]
            reference.uri = dev_uri
            write_rttm(f_rttm, reference.crop(support, mode='intersection'))

            # same for the unpartitioned evaluation map
            uem = uems[uri]
            uem.uri = dev_uri
            write_uem(f_uem, uem.crop(support, mode='intersection'))

# same as above but applied to ldc2019e32, which is used entirely for test
for domain, files in tst.groupby('domain'):

    # unpartitioned evaluation map (uem) of the files of this domain
    uems = load_uem(f'{ldc2019e32}/data/single_channel/uem/{domain}.uem')

    with open(f'{ROOT}/database/DIHARD/{domain}.tst.txt', 'w') as f_uris, \
         open(f'{ROOT}//database/DIHARD/{domain}.tst.rttm', 'w') as f_rttm, \
         open(f'{ROOT}/database/DIHARD/{domain}.tst.uem', 'w') as f_uem, \
         open(f'{ROOT}/database/DIHARD/{domain}.domain.tst.txt', 'w') as f_domain:

        # files are indexed by uri
        for uri in files.index:

            tst_uri = f'tst/{uri}'

            # the last millisecond of each file is not considered
            audio = {'audio': f'{ROOT}/database/DIHARD/tst/{uri}.flac'}
            support = Segment(0, get_audio_duration(audio) - 0.001)

            # dump uri and domain to disk
            f_uris.write(f'{tst_uri}\n')
            f_domain.write(f'{tst_uri} {domain}\n')

            # load, rename and crop reference, then dump it to disk
            reference = load_rttm(f'{ldc2019e32}/data/single_channel/rttm/{uri}.rttm')[uri]
            reference.uri = tst_uri
            write_rttm(f_rttm, reference.crop(support, mode='intersection'))

            # same for the unpartitioned evaluation map
            uem = uems[uri]
            uem.uri = tst_uri
            write_uem(f_uem, uem.crop(support, mode='intersection'))

Create `database.yml`:

In [30]:
import os
import yaml

config_path = f'{ROOT}/database.yml'

# Start from the existing configuration (if any) so that entries written by
# other cells of this notebook (e.g. the AMI protocols) are kept instead of
# being overwritten.
database_yml = {}
if os.path.exists(config_path):
    with open(config_path) as f:
        database_yml = yaml.safe_load(f) or {}

# where to find audio files of each database ({uri} is a literal placeholder)
database_yml.setdefault('Databases', {}).update({
    'DIHARD': f'{ROOT}/database/DIHARD/{{uri}}.flac',
    'MUSAN': f'{musan}/{{uri}}.wav',
    'AMI': f'{ROOT}/database/AMI/{{uri}}.wav',
})

protocols = database_yml.setdefault('Protocols', {})
# drop the 'TODO' placeholder that earlier versions of this cell wrote in place
# of actual AMI protocols: it is not a protocol definition
if protocols.get('AMI') == {'SpeakerDiarization': 'TODO'}:
    del protocols['AMI']
# DIHARD protocols are always regenerated from scratch
protocols['DIHARD'] = {'SpeakerDiarization': {}}
dihard_protocols = protocols['DIHARD']['SpeakerDiarization']
# meta-protocols: entries generated by other cells are kept
x_protocols = protocols.setdefault('X', {}).setdefault('SpeakerDiarization', {})

subsets = {'train': 'trn', 'development': 'dev', 'test': 'tst'}

for domain in dihard_domains:
    # one protocol per domain, pointing to the four generated files of each subset
    dihard_protocols[f'{domain}'] = {
        subset: {
            'uris': f'{ROOT}/database/DIHARD/{domain}.{short}.txt',
            'annotation': f'{ROOT}/database/DIHARD/{domain}.{short}.rttm',
            'annotated': f'{ROOT}/database/DIHARD/{domain}.{short}.uem',
            'domain': f'{ROOT}/database/DIHARD/{domain}.domain.{short}.txt',
        } for subset, short in subsets.items()
    }

    # leave-one-domain-out: train and development on every other domain,
    # test on the held-out domain only
    all_but_domain = sorted(set(dihard_domains) - {domain})
    leave_one_out = {
        subset: {
            f'DIHARD.SpeakerDiarization.{other_domain}': [subset] for other_domain in all_but_domain
        } for subset in ['train', 'development']
    }
    leave_one_out['test'] = {
        f'DIHARD.SpeakerDiarization.{domain}': ['test']
    }
    x_protocols[f'DIHARD_LeaveOneDomainOut_{domain}'] = leave_one_out

# every domain in every subset
x_protocols['DIHARD_Official'] = {
    subset: {
        f'DIHARD.SpeakerDiarization.{domain}': [subset] for domain in dihard_domains
    } for subset in ['train', 'development', 'test']
}

with open(config_path, 'w') as f:
    yaml.dump(database_yml, f, default_flow_style=False)

Setting `PYANNOTE_DATABASE_CONFIG` environment variable to `{ROOT}/database.yml` will give you a bunch of `pyannote.database` protocols:

- `X.SpeakerDiarization.DIHARD_Official` is the official protocol for `DIHARD2` 
- `X.SpeakerDiarization.DIHARD_LeaveOneDomainOut_{domain}` uses all domains but {domain} in the training and development sets, and only {domain} in the test set.