In [33]:
import pympi
import re
from collections import defaultdict
import shutil



def get_all_time_offsets(eaf_file):
    """Collect the TIME_ORIGIN value of every media descriptor in an EAF file.

    Parameters:
    eaf_file (str): Path to the EAF file.

    Returns:
    list | None: The TIME_ORIGIN values converted to int, in the order the
    media descriptors appear, or None when no descriptor carries one.
    """
    document = pympi.Elan.Eaf(eaf_file)
    origins = [
        int(descriptor['TIME_ORIGIN'])
        for descriptor in document.media_descriptors
        if 'TIME_ORIGIN' in descriptor
    ]
    # An empty result is reported as None rather than as an empty list.
    return origins or None


def format_time(seconds, offset=0):
    """Format a time as an SRT timestamp (``HH:MM:SS,mmm``).

    Parameters:
    seconds (float): Time in seconds.
    offset (int | float): Additional offset in milliseconds, added to ``seconds``.

    Returns:
    str: The timestamp, rounded to the nearest millisecond. Totals below
    zero are clamped to ``00:00:00,000`` because SRT has no negative times.
    """
    # Work in whole milliseconds. Splitting a float with `% 1` and truncating
    # drops a millisecond for values such as 1.001 (stored as 1.000999...).
    total_ms = max(0, round(seconds * 1000 + offset))
    total_seconds, milliseconds = divmod(total_ms, 1000)
    total_minutes, whole_seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02}:{minutes:02}:{whole_seconds:02},{milliseconds:03}"


def extend_annotations_with_priority(annotations, min_duration=0.5, start_buffer=0.1, end_buffer=0.5):
    """Lengthen annotations in three passes without overlapping their neighbours.

    The list is sorted and modified in place and the same list object is
    returned. A temporary 'duration' key is written to every annotation and
    removed again before returning.

    Parameters:
    annotations (list): Dicts with at least 'start' and 'end' keys.
    min_duration: Annotations shorter than this are widened symmetrically.
    start_buffer: Lead-in added before each start, applied as two halves.
    end_buffer: Tail added after each end.
    All three are in the same units as 'start' and 'end'.

    Returns:
    list: The same ``annotations`` list, sorted by start time.
    """
    annotations.sort(key=lambda entry: entry['start'])
    for entry in annotations:
        entry['duration'] = entry['end'] - entry['start']

    last_index = len(annotations) - 1
    half_lead = start_buffer * 0.5

    # Pass 1: widen entries below min_duration, half on each side. Only the
    # end is clamped here, against the start of the following entry.
    for pos, entry in enumerate(annotations):
        if entry['duration'] < min_duration:
            shortfall = min_duration - entry['duration']
            entry['start'] = max(0, entry['start'] - shortfall * 0.5)
            entry['end'] += shortfall * 0.5
            if pos < last_index:
                entry['end'] = min(entry['end'], annotations[pos + 1]['start'])

    # Pass 2: first half of the lead-in, never reaching into the previous entry.
    for pos, entry in enumerate(annotations):
        entry['start'] = max(0, entry['start'] - half_lead)
        if pos > 0:
            entry['start'] = max(entry['start'], annotations[pos - 1]['end'])

    # Pass 3: full tail plus the second half of the lead-in, clamped against
    # both neighbours (end first, then start).
    for pos, entry in enumerate(annotations):
        entry['end'] += end_buffer
        entry['start'] = max(0, entry['start'] - half_lead)
        if pos < last_index:
            entry['end'] = min(entry['end'], annotations[pos + 1]['start'])
        if pos > 0:
            entry['start'] = max(entry['start'], annotations[pos - 1]['end'])

    # Drop the helper key added at the top.
    for entry in annotations:
        entry.pop('duration')

    return annotations

def format_annotation(annotation):
    """Turn a gloss annotation into more readable subtitle text.

    Steps, in order: unwrap ``ADD-TO-SIGNBANK(...)``, apply the fixed list of
    pronoun / pointing substitutions, then strip a trailing run of digits.

    Parameters:
    annotation (str): The raw annotation value.

    Returns:
    str: The rewritten annotation.
    """
    # Keep only the (stripped) contents of any ADD-TO-SIGNBANK(...) wrapper.
    annotation = re.sub(
        r'ADD-TO-SIGNBANK\((.*?)\)',
        lambda found: found.group(1).strip(),
        annotation,
    )

    # Applied strictly top to bottom: longer / prefixed glosses must be
    # handled before the shorter ones they contain.
    substitutions = (
        # Personal pronouns, with and without the PT: prefix
        ("PT:PRO1SG", "I/me"),
        ("PT:PRO2SG", "you"),
        ("PT:PRO3SG", "he/she/it"),
        ("PT:PRO1PL", "we/us"),
        ("PT:PRO2PL", "you(pl)"),
        ("PT:PRO3PL", "they/them"),
        ("PRO1SG", "I/me"),
        ("PRO2SG", "you"),
        ("PRO3SG", "he/she/it"),
        ("PRO1PL", "we/us"),
        ("PRO2PL", "you(pl)"),
        ("PRO3PL", "they/them"),
        # Possessive pronouns
        ("POSS1SG", "my/mine"),
        ("POSS2SG", "your/yours"),
        ("POSS3SG", "his/her/it"),
        ("POSS1PL", "our/ours"),
        ("POSS2PL", "your/yours"),
        ("POSS3PL", "their/theirs"),
        # Left over when the gloss was PT:POSS1SG
        ("PT:my/mine", "my/mine"),
        # Pointing signs: body part, list buoy, fragment buoy, unspecified buoy
        ("PT:BODY", "(points to body)"),
        ("PT:LBUOY", "(points to list)"),
        ("PT:FBUOY", "(points to fragment)"),
        ("PT:BUOY", "(points)"),
        # Any remaining PT: prefix
        ("PT:", "(points)"),
    )
    for gloss, readable in substitutions:
        annotation = annotation.replace(gloss, readable)

    # Remove number suffixes
    return re.sub(r'\d+$', '', annotation)

            
def get_tier_names(eaf_filename):
    """
    Return the tier names of an EAF file.

    Parameters:
    eaf_filename (str): Path to the EAF file

    Returns:
    The tier names as returned by pympi's ``get_tier_names()``, or an empty
    list when loading or reading the file raises any exception (the error
    is printed, not re-raised).
    """
    try:
        return pympi.Elan.Eaf(eaf_filename).get_tier_names()
    except Exception as error:
        # Best effort: report the problem and let the caller carry on.
        print(f"Error reading EAF file: {str(error)}")
        return []

    
def eaf_to_srt_combined(eaf_file, srt_file, offset):
    """Write one SRT file combining the 'RH-IDgloss' and 'LH-IDgloss' tiers.

    Annotations from both tiers are pooled, sorted by start time and merged
    into one cue whenever they overlap or touch. Each cue shows the
    right-hand text, followed by ``' | '`` and the left-hand text when the
    left-hand text is non-empty and differs. A one-line summary is printed.

    Parameters:
    eaf_file (str): Path to the EAF file to read.
    srt_file (str): Path of the SRT file to write (overwritten).
    offset: Offset in milliseconds passed to format_time for every timestamp.

    Returns:
    None
    """
    eaf = pympi.Elan.Eaf(eaf_file)
    available_tiers = set(eaf.get_tier_names())

    # Collect all annotations from RH and LH tiers
    annotations = []
    for tier_name in ['RH-IDgloss', 'LH-IDgloss']:
        # Asking pympi for a tier that does not exist raises KeyError, so a
        # file with only one of the two tiers is handled by skipping.
        if tier_name not in available_tiers:
            print(f"Tier {tier_name} not found, skipping")
            continue
        for annotation in eaf.get_annotation_data_for_tier(tier_name):
            annotations.append({
                'tier': tier_name,
                'start': annotation[0],
                'end': annotation[1],
                'text': format_annotation(annotation[2])
            })

    # Sort annotations by start time
    annotations.sort(key=lambda x: x['start'])

    # Merge overlapping annotations: a new cue starts only when an annotation
    # begins strictly after the current cue ends.
    merged_annotations = []
    current_annotation = None
    for annotation in annotations:
        if current_annotation is None or annotation['start'] > current_annotation['end']:
            if current_annotation:
                merged_annotations.append(current_annotation)
            current_annotation = {
                'start': annotation['start'],
                'end': annotation['end'],
                'rh_text': '',
                'lh_text': ''
            }
        current_annotation['end'] = max(current_annotation['end'], annotation['end'])
        # A later annotation of the same tier inside one cue replaces the
        # earlier text for that hand.
        if annotation['tier'] == 'RH-IDgloss':
            current_annotation['rh_text'] = annotation['text']
        else:
            current_annotation['lh_text'] = annotation['text']

    if current_annotation:
        merged_annotations.append(current_annotation)

    # Write to SRT file
    overall_start_time = None
    overall_end_time = None
    with open(srt_file, 'w', encoding='utf-8') as f:
        for index, annotation in enumerate(merged_annotations, 1):
            # annotation is: 'start', 'end', 'rh_text', 'lh_text'
            start_time = annotation['start'] / 1000
            end_time = annotation['end'] / 1000

            if overall_start_time is None:
                overall_start_time = start_time
            overall_end_time = end_time

            # Combine RH and LH texts, with RH first
            # NOTE(review): when rh_text is empty the cue reads " | <lh_text>";
            # confirm whether that leading separator is intended.
            text = annotation['rh_text']

            if "BUOY" in annotation['lh_text'] or "BUOY" in annotation['rh_text']:
                print(annotation['lh_text'], annotation['rh_text'])
            if annotation['lh_text'] and annotation['lh_text'] != annotation['rh_text']:
                text += ' | ' + annotation['lh_text']

            f.write(f"{index}\n")
            f.write(f"{format_time(start_time, offset)} --> {format_time(end_time, offset)}\n")
            f.write(f"{text}\n\n")

    # With no cues there is no time span to summarise; the summary line
    # would otherwise fail on None - None.
    if overall_start_time is None:
        print("No subtitles found")
        return

    print(f"Found {printable_time(overall_start_time, overall_end_time)} of subtitles from {format_time(overall_start_time, offset)} to {format_time(overall_end_time, offset)} ")

def printable_time(start_time, end_time):
    """Describe the span between two times (both in seconds) for display.

    Returns:
    str: ``"<n> seconds"`` (no decimals) for spans under a minute,
    otherwise ``"<n.nn> minutes"``.
    """
    elapsed = end_time - start_time
    return f"{elapsed:.0f} seconds" if elapsed < 60 else f"{elapsed / 60.0:.2f} minutes"

def eaf_to_srt(eaf_file, srt_file, tier_name, offset):
    """Write the annotations of a single tier to an SRT file.

    Cues are written in order of start time and a one-line summary of the
    covered time span is printed.

    Parameters:
    eaf_file (str): Path to the EAF file to read.
    srt_file (str): Path of the SRT file to write (overwritten).
    tier_name (str): Name of the tier to export.
    offset: Offset in milliseconds passed to format_time for every timestamp.

    Returns:
    None
    """
    eaf = pympi.Elan.Eaf(eaf_file)

    # Sort explicitly so the cues are numbered chronologically regardless of
    # the order in which pympi hands the annotations back.
    annotations = sorted(
        eaf.get_annotation_data_for_tier(tier_name),
        key=lambda item: item[0],
    )

    overall_start_time = None
    overall_end_time = None

    with open(srt_file, 'w', encoding='utf-8') as f:
        for index, annotation in enumerate(annotations, 1):
            start_time = annotation[0] / 1000
            end_time = annotation[1] / 1000
            text = format_annotation(annotation[2])

            if overall_start_time is None:
                overall_start_time = start_time
                overall_end_time = end_time
            else:
                # An earlier cue may outlast a later one, so track the max.
                overall_end_time = max(overall_end_time, end_time)

            f.write(f"{index}\n")
            f.write(f"{format_time(start_time, offset)} --> {format_time(end_time, offset)}\n")
            f.write(f"{text}\n\n")

    # An empty tier has no time span; the summary would fail on None - None.
    if overall_start_time is None:
        print(f"No subtitles found in tier '{tier_name}'")
        return

    print(f"Found {printable_time(overall_start_time, overall_end_time)} of subtitles from {format_time(overall_start_time, offset)} to {format_time(overall_end_time, offset)} ")


def process_file(eaf_file):
    """Export the subtitles of one EAF file to SRT files next to it.

    Writes ``<name>.en.srt`` from the 'Free Translation' tier when that tier
    exists, and always writes ``<name>.bsl.srt`` via eaf_to_srt_combined.
    The first TIME_ORIGIN found in the file (or 0 when there is none) is
    used as the time offset for both.

    Parameters:
    eaf_file (str): Path to the EAF file.

    Returns:
    None
    """
    # Local import so this function does not depend on other cells' imports.
    import os

    # Replace only the final extension. A plain str.replace(".eaf", ...)
    # rewrites every occurrence in the path, and leaves the path unchanged
    # when the extension differs (e.g. ".EAF"), so the outputs would be
    # written over the input file.
    base_name, _extension = os.path.splitext(eaf_file)
    srt_file_bsl = base_name + ".bsl.srt"
    srt_file_en = base_name + ".en.srt"

    tiers = get_tier_names(eaf_file)

    # When several media descriptors carry an offset, the first one wins.
    offsets = get_all_time_offsets(eaf_file)
    offset = offsets[0] if offsets else 0

    print("Using time offset: "+str(offset))

    english = 'Free Translation'
    if english in tiers:
        print("Extracting english subtitles")
        eaf_to_srt(eaf_file, srt_file_en, english, offset)

    print("Extracting BSL subtitles")
    eaf_to_srt_combined(eaf_file, srt_file_bsl, offset)

# Convert a single EAF file; the SRT outputs are written alongside the input.
process_file('inputs/BF10n.eaf')

Using time offset: 2960
Extracting english subtitles
Found 2.26 minutes of subtitles from 00:00:02,960 to 00:02:18,597 
Extracting BSL subtitles
LBUOY-ONE (points to list)
LBUOY-FIVE (points to list)
LBUOY-FIVE (points to list)
LBUOY-ONE (points to list)
LBUOY-FIVE (points to list)
LBUOY-FIVE (points to list)
Found 1.15 minutes of subtitles from 00:00:02,973 to 00:01:12,040 


In [34]:
import glob, os

# Run the conversion for every .eaf file directly inside inputs/, printing a
# separator line and the file path before each one.
for file in glob.glob('inputs/*.eaf'):
    print("-"*16)
    print(file)
    process_file(file)
    

----------------
inputs/BF24n.eaf
Using time offset: 4720
Extracting english subtitles
Found 5.80 minutes of subtitles from 00:00:04,723 to 00:05:52,610 
Extracting BSL subtitles
Found 48 seconds of subtitles from 00:00:04,723 to 00:00:52,710 
----------------
inputs/BF12n.eaf
Using time offset: 1635
Extracting english subtitles
Found 3.45 minutes of subtitles from 00:01:32,001 to 00:04:58,711 
Extracting BSL subtitles
Found 2.48 minutes of subtitles from 00:00:01,696 to 00:02:30,742 
----------------
inputs/BF13n.eaf
Using time offset: 18106
Extracting english subtitles
Found 5.48 minutes of subtitles from 00:01:55,972 to 00:07:25,046 
Extracting BSL subtitles
Found 3.65 minutes of subtitles from 00:00:19,706 to 00:03:58,448 
----------------
inputs/BF25n.eaf
Using time offset: 88
Extracting english subtitles
Found 3.85 minutes of subtitles from 00:00:01,598 to 00:03:52,567 
Extracting BSL subtitles
LBUOY-TWO (points to list)
Found 54 seconds of subtitles from 00:00:00,852 to 00:00:54

In [28]:


# Example usage
# Four sample annotations, the third only 0.01 long so that it falls below
# the default min_duration of extend_annotations_with_priority.
annotations = [
    {'start': 1.0, 'end': 2.0, 'rh_text': 'Hello', 'lh_text': 'World'},
    {'start': 2.5, 'end': 3.5, 'rh_text': 'How', 'lh_text': 'are you?'},
    {'start': 5, 'end': 5.01, 'rh_text': 'wibble', 'lh_text': ''},
    {'start': 6.0, 'end': 7.0, 'rh_text': 'Good', 'lh_text': 'morning'}
]

# The list is modified in place; the return value is the same list.
extended_annotations = extend_annotations_with_priority(annotations)
for annotation in extended_annotations:
    print(annotation)


{'start': 0.8999999999999999, 'end': 2.45, 'rh_text': 'Hello', 'lh_text': 'World'}
{'start': 2.45, 'end': 4.0, 'rh_text': 'How', 'lh_text': 'are you?'}
{'start': 4.655, 'end': 5.755, 'rh_text': 'wibble', 'lh_text': ''}
{'start': 5.9, 'end': 7.5, 'rh_text': 'Good', 'lh_text': 'morning'}


In [29]:
import re
from datetime import datetime, timedelta

def parse_srt(file_path):
    """Parse an SRT file into a list of subtitle dicts.

    Cues are read as blank-line-separated blocks rather than at a fixed
    four-line stride, so multi-line cue text and extra or missing blank
    lines do not shift the parser out of step.

    Parameters:
    file_path (str): Path to the SRT file (read as UTF-8).

    Returns:
    list: One dict per cue with keys 'start' and 'end' (datetime objects
    parsed with '%H:%M:%S,%f') and 'text' (the cue's text lines, stripped
    and joined with newlines; '' when the cue has no text).

    Raises:
    ValueError: If a timestamp does not match '%H:%M:%S,%f'.
    """
    def build_entry(block_lines):
        # The timing line is the first one containing the arrow; whatever
        # precedes it (the cue number) is ignored, whatever follows is text.
        timing_at = next(
            (pos for pos, line in enumerate(block_lines) if '-->' in line),
            None,
        )
        if timing_at is None:
            return None
        start, end = (part.strip() for part in block_lines[timing_at].split('-->', 1))
        return {
            'start': datetime.strptime(start, '%H:%M:%S,%f'),
            'end': datetime.strptime(end, '%H:%M:%S,%f'),
            'text': '\n'.join(block_lines[timing_at + 1:])
        }

    subtitles = []
    block_lines = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for raw_line in file:
            line = raw_line.strip()
            if line:
                block_lines.append(line)
                continue
            # Blank line: close the current block, if any.
            if block_lines:
                entry = build_entry(block_lines)
                if entry is not None:
                    subtitles.append(entry)
                block_lines = []

    # The last cue may not be followed by a blank line.
    if block_lines:
        entry = build_entry(block_lines)
        if entry is not None:
            subtitles.append(entry)

    return subtitles

def process_subtitles(english_srt, bsl_srt):
    """Interleave English and BSL subtitles into one bulleted text listing.

    Each English cue is emitted as ``- <text>``, followed by the BSL cues
    that overlap it in time. BSL cues that finish before the next English
    cue begins are emitted just ahead of that English line, and any BSL cues
    left after the last English cue are appended at the end. Both files are
    expected to be in chronological order.

    Parameters:
    english_srt (str): Path to the English SRT file.
    bsl_srt (str): Path to the BSL SRT file.

    Returns:
    str: The listing, one ``- ``-prefixed line per cue, joined by newlines.
    """
    english_subtitles = parse_srt(english_srt)
    bsl_subtitles = parse_srt(bsl_srt)

    output = []
    bsl_index = 0
    bsl_count = len(bsl_subtitles)

    for english_sub in english_subtitles:
        # Flush BSL cues that are already over when this English cue starts.
        # They can never overlap this or any later English cue, so leaving
        # them in place would stall the index and push every remaining BSL
        # cue to the very end of the listing.
        while bsl_index < bsl_count and bsl_subtitles[bsl_index]['end'] <= english_sub['start']:
            output.append(f"- {bsl_subtitles[bsl_index]['text']}")
            bsl_index += 1

        output.append(f"- {english_sub['text']}")

        # Emit the BSL cues overlapping this English cue; stop at the first
        # one that does not, leaving it for a later English cue.
        while bsl_index < bsl_count:
            bsl_sub = bsl_subtitles[bsl_index]
            if bsl_sub['start'] < english_sub['end'] and bsl_sub['end'] > english_sub['start']:
                output.append(f"- {bsl_sub['text']}")
                bsl_index += 1
            else:
                break

    # Add any remaining BSL subtitles
    while bsl_index < bsl_count:
        output.append(f"- {bsl_subtitles[bsl_index]['text']}")
        bsl_index += 1

    return '\n'.join(output)

# Usage
# Interleave the English and BSL SRT files of one recording and print the
# resulting listing.
english_srt = 'inputs/BF1n.en.srt'
bsl_srt = 'inputs/BF1n.bsl.srt'
result = process_subtitles(english_srt, bsl_srt)
print(result)


- Are we ready?
- GOOD
- I want to tell you about my puppy.
- I/me
- EXPLAIN
- ABOUT
- my/mine
- FS:PUPPY
- DSEW(FLAT)-BE:ANIMAL
- My family got a puppy last year.
- my/mine
- WANT
- FAMILY
- AT-LAST
- HAVE
- DSEW(FLAT)-BE:ANIMAL
- ?LAST-WEEK
- GOOD
- A new puppy, it's lovely.
- NEW
- DSEW(FLAT)-BE:ANIMAL
- LOOK-GOOD
- DSEW(FLAT)-BE:ANIMAL
- G:WELL
- My Dad had wanted a dog for a very long time
- ?LAST-WEEK
- TRUE
- my/mine
- FATHER
- ALWAYS
- WANT
- | WANT
- WANT
- | WANT
- DOG
- WANT
- | WANT
- SINCE
- Mum had said "no, no, no, no".
- my/mine
- MOTHER
- ALWAYS
- NO
- NO
- NO
- NO
- NO
- FATHER
- My Dad had been very patient
- BEHAVIOUR
- My Sister said to our Mum, "It's not fair, Dad wants a dog"
- my/mine
- SISTER
- SAY
- NO
- EQUAL
- my/mine
- FATHER
- WANT
- DOG
- G:WELL
- My Mum still wasn't sure but then things settled down.
- my/mine
- MOTHER
- G:ERM
- G:WELL
- SAME
- SETTLE
- SETTLE
- my/mine
- My sister got married and moved to England.
- SISTER
- MARRY
- MOVE
- SN:ENGLAND(RO