In [3]:
# Imports

import pympi
import re
from collections import defaultdict
import shutil


In [4]:
# Utils

def get_all_time_offsets(eaf_file):
    
    offsets = []
    eaf = pympi.Elan.Eaf(eaf_file)    

    for media_descriptor in eaf.media_descriptors:
        TIME_ORIGIN='TIME_ORIGIN'
        if (TIME_ORIGIN in media_descriptor):
            offsets.append(int((media_descriptor['TIME_ORIGIN'])))
    
    return offsets if offsets else None


def format_time(seconds, offset=0):
    """Helper function to format time in SRT format (HH:MM:SS,ms)"""
    seconds += offset / 1000  # Convert milliseconds to seconds
    hours = int(seconds // 3600)
    minutes = int((seconds % 3600) // 60)
    whole_seconds = int(seconds % 60)
    milliseconds = int((seconds % 1) * 1000)  
    return f"{hours:02}:{minutes:02}:{whole_seconds:02},{milliseconds:03}"


def get_tier_names(eaf_filename):    
    try:
        # Load the EAF file
        eaf = pympi.Elan.Eaf(eaf_filename)
        
        # Get tier names
        tier_names = eaf.get_tier_names()
        
        return tier_names
    
    except Exception as e:
        print(f"Error reading EAF file: {str(e)}")
        return []

    
def printable_time(start_time, end_time):
    seconds = end_time-start_time
    if (seconds < 60):
        return f"{seconds:.0f} seconds"
    else:
        minutes = seconds/60.0
        return  f"{minutes:.2f} minutes"


In [33]:
def replace_pronoun(annotation, sentence, pronoun_gloss, replacement_options):
    # e.g. "PT:PRO1SG" is replaced by "I" or "me" depending
    # which is present in the main sentence
    gloss_present = None 
    if ("PT:"+pronoun_gloss) in annotation: 
        gloss_present = "PT:"+pronoun_gloss
    elif pronoun_gloss in annotation: 
        gloss_present = pronoun_gloss
    
    if gloss_present is not None:
        replacement_to_use = replacement_options[0]
        for replacement in replacement_options:
            if replacement in sentence:
                replacement_to_use = replacement
                break
        return annotation.replace(gloss_present, replacement)
    return annotation
    

def format_annotation(annotation, sentence):
    # Use regex to find and replace ADD-TO-SIGNBANK(...) wrapper
    pattern = r'ADD-TO-SIGNBANK\((.*?)\)'    
    def replacement(match):
        return match.group(1).strip()
    annotation = re.sub(pattern, replacement, annotation)
    
    # Replace pronouns with more english translations
    # Personal Pronouns:
    
    annotation = replace_pronoun(annotation, sentence, "PRO1SG", ["I", "me"])

    annotation = replace_pronoun(annotation, sentence, "PRO1SG", ["I","me"])
    annotation = replace_pronoun(annotation, sentence, "PRO2SG", ["you",])
    annotation = replace_pronoun(annotation, sentence, "PRO3SG", ["he","she","it"])
    annotation = replace_pronoun(annotation, sentence, "PRO1PL", ["we","us"])
    annotation = replace_pronoun(annotation, sentence, "PRO2PL", ["you(pl)",])
    annotation = replace_pronoun(annotation, sentence, "PRO3PL", ["they/them",])

    annotation = replace_pronoun(annotation, sentence, "POSS1SG", ["my","mine",])
    annotation = replace_pronoun(annotation, sentence, "POSS2SG", ["your","yours",])
    annotation = replace_pronoun(annotation, sentence, "POSS3SG", ["his","her","it"])
    annotation = replace_pronoun(annotation, sentence, "POSS1PL", ["our","ours",])
    annotation = replace_pronoun(annotation, sentence, "POSS2PL", ["your","yours",])
    annotation = replace_pronoun(annotation, sentence, "POSS3PL", ["their","theirs",])

    annotation = replace_pronoun(annotation, sentence, "BODY",["(points to body)",]) #  Point to a body part
    annotation = replace_pronoun(annotation, sentence, "LBUOY",["(points to list)",]) # Point to a list buoy
    annotation = replace_pronoun(annotation, sentence, "FBUOY",["(points to fragment)",]) # Point to a fragment buoy
    annotation = replace_pronoun(annotation, sentence, "BUOY",["(points)",]) # Point to a buoy (of unspecified type)*
    annotation = replace_pronoun(annotation, sentence, "PT:", ["(points)",])

   # annotation = annotation.replace("LBUOY-ONE", "(list)")
   # annotation = annotation.replace("LBUOY-TWO", "(list)")
   # annotation = annotation.replace("LBUOY-THREE", "(list)")

#    if "BUOY" in annotation:
 #       print(annotation)
    

    # Remove number suffixes
    annotation = re.sub(r'\d+$', '', annotation)
   
    return annotation 

def process_bsl(lh_gloss, rh_gloss, sentence):
#     if ("G:" in lh_gloss):
#         print(lh_gloss)
#     if ("G:" in rh_gloss):
#         print(rh_gloss)
    lh_gloss = lh_gloss.strip()
    rh_gloss = rh_gloss.strip()
    
    lh_gloss = format_annotation(lh_gloss, sentence)
    rh_gloss = format_annotation(rh_gloss, sentence)
    print(sentence)
    combined = ""
    if (lh_gloss == rh_gloss):
        combined = lh_gloss
    elif lh_gloss and rh_gloss:
        combined = lh_gloss + " | " + rh_gloss
    elif lh_gloss:
        combined = lh_gloss
    elif rh_gloss:
        combined = rh_gloss
    
    print(combined)
    return combined
    

In [40]:
def eaf_to_srt_combined(eaf_file, srt_file, offset):
    eaf = pympi.Elan.Eaf(eaf_file)
    
    # Collect all annotations from RH and LH tiers
    bsl_annotations = []
    en_annotations = []
    
    for tier_name in ['RH-IDgloss', 'LH-IDgloss']:
        for annotation in eaf.get_annotation_data_for_tier(tier_name):
            bsl_annotations.append({
                'tier': tier_name,
                'start': annotation[0],
                'end': annotation[1],
                'text': annotation[2]
            })
    
    for tier_name in ['Free Translation',]:
        for annotation in eaf.get_annotation_data_for_tier(tier_name):
            en_annotations.append({
                'tier': tier_name,
                'start': annotation[0],
                'end': annotation[1],
                'text': annotation[2]
            })
    
    # Sort annotations by start time
    en_annotations.sort(key=lambda x: x['start'])
    bsl_annotations.sort(key=lambda x: x['start'])
    
    # Combine L and R BSL annotations, with reference to english translation
    merged_annotations = []
    for en_ann in en_annotations:
        overlapping_bsl = [bsl_ann for bsl_ann in bsl_annotations 
                           if bsl_ann['start'] < en_ann['end'] and bsl_ann['end'] > en_ann['start']]
        
        current_annotation = None
        for bsl_ann in overlapping_bsl:
            if bsl_ann['tier'] == 'RH-IDgloss':
                rh_gloss = bsl_ann['text']
                lh_gloss = next((ann['text'] for ann in overlapping_bsl 
                                 if ann['tier'] == 'LH-IDgloss' and ann['start'] == bsl_ann['start']), '')
            else:
                lh_gloss = bsl_ann['text']
                rh_gloss = next((ann['text'] for ann in overlapping_bsl 
                                 if ann['tier'] == 'RH-IDgloss' and ann['start'] == bsl_ann['start']), '')
            
            if current_annotation is None or (lh_gloss, rh_gloss) != (current_annotation['lh_gloss'], current_annotation['rh_gloss']):
                if current_annotation:
                    merged_annotations.append(current_annotation)
                    
                current_annotation = {
                    'start': bsl_ann['start'],
                    'end': bsl_ann['end'],
                    'en_text': en_ann['text'],
                    'lh_gloss': lh_gloss,
                    'rh_gloss': rh_gloss
                }
            else:
                current_annotation['end'] = max(current_annotation['end'], bsl_ann['end'])
        
        if current_annotation:
            merged_annotations.append(current_annotation)
    
    
    # Remove duplicates
    unique_annotations = []
    for ann in merged_annotations:
        if not unique_annotations or (ann['lh_gloss'], ann['rh_gloss']) != unique_annotations[-1]:
            if unique_annotations:
                print(ann, unique_annotations[-1])
            unique_annotations.append(ann)
            
    # Write to SRT file
    with open(srt_file, 'w', encoding='utf-8') as f:
        for index, ann in enumerate(unique_annotations, 1):
            start_time = ann['start'] / 1000
            end_time = ann['end'] / 1000
            
            f.write(f"{index}\n")
            f.write(f"{format_time(start_time, offset)} --> {format_time(end_time, offset)}\n")
            parsed_bsl = process_bsl(ann['lh_gloss'], ann['rh_gloss'], ann['en_text'])
            f.write(f"{parsed_bsl}\n\n")
    
    print(f"Created SRT file: {srt_file}")

    
def process_file(eaf_file):

    srt_file_bsl = eaf_file.replace(".eaf", ".bsl.srt")
    srt_file_en = eaf_file.replace(".eaf", ".en.srt")

    tiers = get_tier_names(eaf_file)
    offsets = get_all_time_offsets(eaf_file)
    if (offsets is None):
        offset = 0
    elif (len(offsets)>1):
        offset = offsets[0]
    else:
        offset = offsets[0]

    print("Using time offset: "+str(offset))

    print("Extracting BSL subtitles")
    eaf_to_srt_combined(eaf_file, srt_file_bsl, offset)

process_file('inputs/BF10n.eaf')

Using time offset: 2960
Extracting BSL subtitles
{'start': 163, 'end': 598, 'en_text': 'Okay?', 'lh_gloss': '', 'rh_gloss': 'G:HEY'} {'start': 14, 'end': 159, 'en_text': 'Okay?', 'lh_gloss': '', 'rh_gloss': 'GOOD'}
{'start': 1206, 'end': 1394, 'en_text': 'Right, hey.', 'lh_gloss': '', 'rh_gloss': 'G:HEY'} {'start': 163, 'end': 598, 'en_text': 'Okay?', 'lh_gloss': '', 'rh_gloss': 'G:HEY'}
{'start': 1403, 'end': 1620, 'en_text': 'Right, hey.', 'lh_gloss': '', 'rh_gloss': 'GOOD'} {'start': 1206, 'end': 1394, 'en_text': 'Right, hey.', 'lh_gloss': '', 'rh_gloss': 'G:HEY'}
{'start': 1629, 'end': 1916, 'en_text': 'Right, hey.', 'lh_gloss': '', 'rh_gloss': 'PT:PRO1SG'} {'start': 1403, 'end': 1620, 'en_text': 'Right, hey.', 'lh_gloss': '', 'rh_gloss': 'GOOD'}
{'start': 1927, 'end': 2278, 'en_text': 'Right, hey.', 'lh_gloss': '', 'rh_gloss': 'G:HEY'} {'start': 1629, 'end': 1916, 'en_text': 'Right, hey.', 'lh_gloss': '', 'rh_gloss': 'PT:PRO1SG'}
{'start': 2285, 'end': 2883, 'en_text': 'Right, hey

In [34]:
import glob, os

for file in glob.glob('inputs/*.eaf'):
    print("-"*16)
    print(file)
    process_file(file)
    

----------------
inputs/BF24n.eaf
Using time offset: 4720
Extracting english subtitles
Found 5.80 minutes of subtitles from 00:00:04,723 to 00:05:52,610 
Extracting BSL subtitles
Found 48 seconds of subtitles from 00:00:04,723 to 00:00:52,710 
----------------
inputs/BF12n.eaf
Using time offset: 1635
Extracting english subtitles
Found 3.45 minutes of subtitles from 00:01:32,001 to 00:04:58,711 
Extracting BSL subtitles
Found 2.48 minutes of subtitles from 00:00:01,696 to 00:02:30,742 
----------------
inputs/BF13n.eaf
Using time offset: 18106
Extracting english subtitles
Found 5.48 minutes of subtitles from 00:01:55,972 to 00:07:25,046 
Extracting BSL subtitles
Found 3.65 minutes of subtitles from 00:00:19,706 to 00:03:58,448 
----------------
inputs/BF25n.eaf
Using time offset: 88
Extracting english subtitles
Found 3.85 minutes of subtitles from 00:00:01,598 to 00:03:52,567 
Extracting BSL subtitles
LBUOY-TWO (points to list)
Found 54 seconds of subtitles from 00:00:00,852 to 00:00:54

In [28]:


# Example usage
annotations = [
    {'start': 1.0, 'end': 2.0, 'rh_text': 'Hello', 'lh_text': 'World'},
    {'start': 2.5, 'end': 3.5, 'rh_text': 'How', 'lh_text': 'are you?'},
    {'start': 5, 'end': 5.01, 'rh_text': 'wibble', 'lh_text': ''},
    {'start': 6.0, 'end': 7.0, 'rh_text': 'Good', 'lh_text': 'morning'}
]

extended_annotations = extend_annotations_with_priority(annotations)
for annotation in extended_annotations:
    print(annotation)


{'start': 0.8999999999999999, 'end': 2.45, 'rh_text': 'Hello', 'lh_text': 'World'}
{'start': 2.45, 'end': 4.0, 'rh_text': 'How', 'lh_text': 'are you?'}
{'start': 4.655, 'end': 5.755, 'rh_text': 'wibble', 'lh_text': ''}
{'start': 5.9, 'end': 7.5, 'rh_text': 'Good', 'lh_text': 'morning'}


In [29]:
import re
from datetime import datetime, timedelta

def parse_srt(file_path):
    subtitles = []
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()
        for i in range(0, len(lines), 4):
            if i + 2 < len(lines):
                time_range = lines[i + 1].strip()
                text = lines[i + 2].strip()
                start, end = time_range.split(' --> ')
                start_time = datetime.strptime(start, '%H:%M:%S,%f')
                end_time = datetime.strptime(end, '%H:%M:%S,%f')
                subtitles.append({
                    'start': start_time,
                    'end': end_time,
                    'text': text
                })
    return subtitles

def process_subtitles(english_srt, bsl_srt):
    english_subtitles = parse_srt(english_srt)
    bsl_subtitles = parse_srt(bsl_srt)
    
    output = []
    bsl_index = 0
    
    for english_sub in english_subtitles:
        output.append(f"- {english_sub['text']}")
        
        while bsl_index < len(bsl_subtitles):
            bsl_sub = bsl_subtitles[bsl_index]
            if bsl_sub['start'] < english_sub['end'] and bsl_sub['end'] > english_sub['start']:
                output.append(f"- {bsl_sub['text']}")
                bsl_index += 1
            else:
                break
    
    # Add any remaining BSL subtitles
    while bsl_index < len(bsl_subtitles):
        output.append(f"- {bsl_subtitles[bsl_index]['text']}")
        bsl_index += 1
    
    return '\n'.join(output)

# Usage
english_srt = 'inputs/BF1n.en.srt'
bsl_srt = 'inputs/BF1n.bsl.srt'
result = process_subtitles(english_srt, bsl_srt)
print(result)


- Are we ready?
- GOOD
- I want to tell you about my puppy.
- I/me
- EXPLAIN
- ABOUT
- my/mine
- FS:PUPPY
- DSEW(FLAT)-BE:ANIMAL
- My family got a puppy last year.
- my/mine
- WANT
- FAMILY
- AT-LAST
- HAVE
- DSEW(FLAT)-BE:ANIMAL
- ?LAST-WEEK
- GOOD
- A new puppy, it's lovely.
- NEW
- DSEW(FLAT)-BE:ANIMAL
- LOOK-GOOD
- DSEW(FLAT)-BE:ANIMAL
- G:WELL
- My Dad had wanted a dog for a very long time
- ?LAST-WEEK
- TRUE
- my/mine
- FATHER
- ALWAYS
- WANT
- | WANT
- WANT
- | WANT
- DOG
- WANT
- | WANT
- SINCE
- Mum had said "no, no, no, no".
- my/mine
- MOTHER
- ALWAYS
- NO
- NO
- NO
- NO
- NO
- FATHER
- My Dad had been very patient
- BEHAVIOUR
- My Sister said to our Mum, "It's not fair, Dad wants a dog"
- my/mine
- SISTER
- SAY
- NO
- EQUAL
- my/mine
- FATHER
- WANT
- DOG
- G:WELL
- My Mum still wasn't sure but then things settled down.
- my/mine
- MOTHER
- G:ERM
- G:WELL
- SAME
- SETTLE
- SETTLE
- my/mine
- My sister got married and moved to England.
- SISTER
- MARRY
- MOVE
- SN:ENGLAND(RO