In [20]:
def to_seconds(t):
    """Convert a BBC XML timestamp to a number of seconds.

    Parameters
    ----------
    t : str
        Timestamp made of exactly three ':'-separated fields
        (hours, minutes, seconds). Hours and minutes are parsed as
        integers, seconds as a float (so fractional seconds are kept).

    Returns
    -------
    float
        Total number of seconds represented by `t`.

    Raises
    ------
    ValueError
        If `t` does not have exactly three fields or a field is not numeric.
    """
    hh, mm, ss = t.split(':')
    # accumulate whole minutes first, then add the (possibly fractional) seconds
    whole_minutes = 60 * int(hh) + int(mm)
    return 60 * whole_minutes + float(ss)

In [21]:
import string
def is_description(text):
    """Tell whether `text` is (probably) a description rather than speech
    (e.g. SHE SOBS, PHONE RINGS).

    Heuristic: a description contains no ASCII digit, is written entirely
    in upper case, and does not end with a full stop.

    Parameters
    ----------
    text : str
        Subtitle text to classify.

    Returns
    -------
    bool
        True if `text` looks like a description, False otherwise.
        An empty string is never a description.
    """
    # guard: an empty string carries no description (and has no last character)
    if not text:
        return False

    # any digit disqualifies the text
    if any(character in string.digits for character in text):
        return False

    return text.upper() == text and not text.endswith('.')

In [22]:
import numpy as np
import xml.dom.minidom
from pyannote.core.util import pairwise
from pyannote.core import Segment, Annotation

def _span_text(span):
    """Concatenate the text found directly inside a <span> element.

    <br> children are skipped, every other child must expose a `data`
    attribute. Each piece is stripped and pieces are joined with a single
    space; pieces that are empty once stripped (e.g. whitespace-only text
    nodes) are dropped so that they do not introduce double spaces.

    Raises
    ------
    NotImplementedError
        If a child is neither a <br> element nor a node with `data`.
    """
    fragments = []
    for node in span.childNodes:
        if hasattr(node, 'tagName') and node.tagName == 'br':
            continue
        elif hasattr(node, 'data'):
            fragment = node.data.strip()
            if fragment:
                fragments.append(fragment)
        else:
            raise NotImplementedError('Missing corner case')
    return " ".join(fragments)


def load_xml(path):
    """Load BBC XML file

    Every <p> element found in the first <body> is a subtitle with 'begin'
    and 'end' timestamps. Consecutive <span> elements sharing the same
    'tts:color' attribute are merged into one speech turn; a change of
    color starts a new speech turn. Empty spans and spans recognised by
    `is_description` are ignored. The subtitle timespan is then shared
    between its speech turns proportionally to their number of characters.

    Parameters
    ----------
    path : str
        Path to the XML file (passed to `xml.dom.minidom.parse`).

    Returns
    -------
    annotation : Annotation
        One track per speech turn, keyed by (segment, subtitle index) and
        labelled with the span color, after
        `rename_labels(generator='int')` has been applied.
    transcription : dict
        Maps (segment, subtitle index) to the text of the speech turn.

    Raises
    ------
    NotImplementedError
        If a <span> contains a child that is neither <br> nor a text-like node.
    """

    annotation = Annotation()
    transcription = dict()

    dom = xml.dom.minidom.parse(path)

    body = dom.getElementsByTagName('body')[0]
    for s, subtitle in enumerate(body.getElementsByTagName('p')):

        # subtitle timespan
        begin = to_seconds(subtitle.getAttribute('begin'))
        end = to_seconds(subtitle.getAttribute('end'))
        duration = end - begin

        # each subtitle may contain several speech turns.
        # we try to infer them from the "tts:color" attribute of <span> tags
        speech_turns = []      # finished turns, as (text, color) pairs
        current_texts = []     # texts of the turn being built (always a list)
        previous_color = None  # color of the turn being built
        for chunk in subtitle.getElementsByTagName('span'):

            text = _span_text(chunk)

            # skip descriptions and empty texts
            if not text or is_description(text):
                continue
            color = chunk.getAttribute('tts:color')

            # a change of color closes the turn being built
            if previous_color is not None and color != previous_color:
                speech_turns.append((" ".join(current_texts), previous_color))
                current_texts = []

            current_texts.append(text)
            previous_color = color

        # close the last turn, if any
        if current_texts:
            speech_turns.append((" ".join(current_texts), previous_color))

        if not speech_turns:
            continue

        # split subtitle timespan based on number of characters:
        # alphas are the cumulative fractions of characters, from 0 to 1
        lengths = [len(turn_text) for turn_text, _ in speech_turns]
        alphas = [0] + list(np.cumsum(lengths) / sum(lengths))
        for (b, e), (speech_turn, color) in zip(pairwise(alphas), speech_turns):
            segment = Segment(begin + b * duration, begin + e * duration)
            annotation[segment, s] = color
            transcription[segment, s] = speech_turn

    annotation = annotation.rename_labels(generator='int')
    return annotation, transcription

In [23]:
# Export one text file per subset, one line per speech turn:
#   uri|start|end|subtitle index|label|text
# NOTE(review): the text field is written as-is; a '|' inside a subtitle
# would not be escaped -- confirm that consumers can cope with it.
for subset in ['dev', 'tst']:
    with open(f'{subset}.lst', 'r', encoding='utf-8') as fp:
        # skip blank lines: an empty uri would make load_xml look for 'xml/.xml'
        uris = [line.strip() for line in fp if line.strip()]
    # explicit encoding so that the export does not depend on the locale
    # (subtitle text may contain non-ASCII characters)
    with open(f'../Eastenders/data/subtitles.{subset}.txt', 'w', encoding='utf-8') as fp:
        for uri in uris:
            annotation, transcription = load_xml(f'xml/{uri}.xml')
            for segment, s, label in annotation.itertracks(yield_label=True):
                fp.write(f'{uri}|{segment.start:.3f}|{segment.end:.3f}|{s}|{label}|{transcription[segment, s]}\n')

In [24]:
!head ../Eastenders/data/subtitles.dev.txt
!gzip ../Eastenders/data/subtitles.dev.txt

5082189274976367100|2.489|3.063|0|1|I can do that.
5082189274976367100|3.063|4.046|0|3|But you didn't, did you?
5082189274976367100|28.179|31.377|1|3|I said put her down, May.
5082189274976367100|31.377|34.813|2|1|It's OK.  I'm not going to hurt her.
5082189274976367100|34.813|36.341|3|1|I'm her mother.
5082189274976367100|36.341|37.970|3|3|No.  You're not.
5082189274976367100|37.970|41.527|4|1|I can take over now, Dawn, it's what we planned.
5082189274976367100|41.527|43.204|5|1|Give you back your life.
5082189274976367100|43.204|45.762|6|3|She is my life now.
5082189274976367100|45.762|47.959|7|1|That's the endorphins talking,


In [25]:
!head ../Eastenders/data/subtitles.tst.txt
!gzip ../Eastenders/data/subtitles.tst.txt

5084819083455904024|29.900|31.779|0|1|"Je m'appelle Bradley.
5084819083455904024|36.093|38.333|1|1|"Je m'appelle Bradley.
5084819083455904024|40.650|43.886|2|1|"Je m'appelle Bradley Branning."
5084819083455904024|43.886|45.760|3|0|You've decided to go then.
5084819083455904024|45.760|49.678|4|1|Yeah yeah. You do understand, don't you?
5084819083455904024|49.678|52.195|5|3|Yeah, course we understand.
5084819083455904024|52.195|56.910|6|3|I mean, we'll miss you but you've gotta do what's best for you, ain't that right, Jim?
5084819083455904024|56.910|58.470|7|0|Yeah, course.
5084819083455904024|58.470|61.266|8|3|What about Stacey?
5084819083455904024|64.347|66.307|9|1|What you got there?
