In [1]:
def to_seconds(t):
    """Turn a BBC XML timestamp into a number of seconds.

    Parameters
    ----------
    t : str
        Timestamp made of exactly three ':'-separated fields:
        integer hours, integer minutes, and (possibly fractional) seconds.

    Returns
    -------
    float
        Offset in seconds, i.e. 3600 * hours + 60 * minutes + seconds.

    Raises
    ------
    ValueError
        If `t` does not split into exactly three fields, or if a field
        cannot be parsed as a number.
    """
    h, m, s = t.split(':')
    # hours and minutes must be integers; only seconds may be fractional
    return 3600 * int(h) + 60 * int(m) + float(s)

In [2]:
import string
def is_description(text):
    """Tell whether `text` is (probably) a description rather than speech
    (e.g. SHE SOBS, PHONE RINGS).

    The heuristic flags a text as a description when it contains no ASCII
    digit, is unchanged by upper-casing, and does not end with a full stop.

    Parameters
    ----------
    text : str
        Subtitle text to classify.

    Returns
    -------
    bool
        True if `text` looks like a description, False otherwise.
        An empty string is never a description.
    """
    # an empty string has no last character to inspect; it describes nothing
    if not text:
        return False

    # any ASCII digit rules the text out
    if any(char in string.digits for char in text):
        return False

    # descriptions are written in capitals and do not end with a full stop
    return text.upper() == text and not text.endswith('.')

In [8]:
import numpy as np
import xml.dom.minidom
from pyannote.core.util import pairwise
from pyannote.core import Segment, Annotation

def _span_text(span):
    """Concatenate the text found directly inside a <span> element.

    <br> children are ignored. Every other child must expose a `data`
    attribute; its stripped content is kept when non-empty. Fragments are
    joined with single spaces, so whitespace-only nodes (e.g. between two
    consecutive <br/>) do not introduce double spaces.

    Raises
    ------
    NotImplementedError
        If a child is neither a <br> element nor a node carrying `data`
        (for instance a nested element).
    """
    fragments = []
    for node in span.childNodes:
        if hasattr(node, 'tagName') and node.tagName == 'br':
            # line breaks carry no text
            continue
        if not hasattr(node, 'data'):
            raise NotImplementedError('Missing corner case')
        fragment = node.data.strip()
        if fragment:
            fragments.append(fragment)
    return " ".join(fragments)


def load_xml(path):
    """Load BBC XML file

    Every <p> element found in the first <body> is treated as one subtitle
    spanning its 'begin'..'end' attributes. Inside a subtitle, consecutive
    <span> elements sharing the same 'tts:color' attribute are merged into
    one speech turn; spans that are empty or look like descriptions
    (see `is_description`) are dropped. The subtitle timespan is then split
    between its speech turns proportionally to their number of characters.

    Parameters
    ----------
    path : str
        Path to the XML file (anything `xml.dom.minidom.parse` accepts).

    Returns
    -------
    annotation : Annotation
        One track per speech turn, keyed by (segment, subtitle index) and
        labelled by color, after `rename_labels(generator='int')`.
    transcription : dict
        Maps (segment, subtitle index) to the text of the speech turn.

    Raises
    ------
    NotImplementedError
        If a <span> contains a child that is neither text nor <br>.
    """

    annotation = Annotation()
    transcription = dict()

    dom = xml.dom.minidom.parse(path)
    body = dom.getElementsByTagName('body')[0]

    for s, subtitle in enumerate(body.getElementsByTagName('p')):

        # subtitle timespan
        begin = to_seconds(subtitle.getAttribute('begin'))
        end = to_seconds(subtitle.getAttribute('end'))
        duration = end - begin

        # each subtitle may contain several speech turns.
        # we try to infer them from the "tts:color" attribute of <span> tags
        speech_turns = []
        pending, pending_color = [], None
        for chunk in subtitle.getElementsByTagName('span'):

            text = _span_text(chunk)

            # skip descriptions and empty texts
            if not text or is_description(text):
                continue

            color = chunk.getAttribute('tts:color')

            # a change of color closes the speech turn being accumulated
            if pending and color != pending_color:
                speech_turns.append((" ".join(pending), pending_color))
                pending = []

            pending.append(text)
            pending_color = color

        # close the last speech turn of the subtitle
        if pending:
            speech_turns.append((" ".join(pending), pending_color))

        if not speech_turns:
            continue

        # split subtitle timespan based on number of characters:
        # alphas are the cumulative character fractions, from 0 to 1
        lengths = [len(turn) for turn, _ in speech_turns]
        alphas = [0] + list(np.cumsum(lengths) / sum(lengths))
        for (b, e), (speech_turn, color) in zip(pairwise(alphas), speech_turns):
            segment = Segment(begin + b * duration, begin + e * duration)
            annotation[segment, s] = color
            transcription[segment, s] = speech_turn

    annotation = annotation.rename_labels(generator='int')
    return annotation, transcription

In [12]:
# Export the speech turns inferred from the subtitles of each subset,
# one line per track: <uri> <start> <end> <track> <label>
for subset in ['dev', 'tst']:
    # one URI per line; blank lines are skipped so that they do not turn
    # into an attempt to load 'xml/.xml'
    with open(f'{subset}.lst', 'r') as fp:
        uris = [uri for uri in (line.strip() for line in fp) if uri]
    with open(f'../Eastenders/data/subtitles.{subset}.txt', 'w') as fp:
        for uri in uris:
            annotation, transcription = load_xml(f'xml/{uri}.xml')
            for segment, s, label in annotation.itertracks(yield_label=True):
                fp.write(f'{uri} {segment.start:.3f} {segment.end:.3f} {s} {label}\n')