In [48]:
import os
import re
import shutil
from collections import defaultdict

import pympi



def get_all_time_offsets(eaf_file):
    """
    Collects the TIME_ORIGIN value of every media descriptor in an EAF file.

    Parameters:
    eaf_file (str): Path to the EAF file

    Returns:
    list or None: The TIME_ORIGIN values converted with int(), in descriptor
    order, or None when no media descriptor carries a TIME_ORIGIN entry
    """
    origin_key = 'TIME_ORIGIN'
    document = pympi.Elan.Eaf(eaf_file)

    # Descriptors without a TIME_ORIGIN entry are simply left out
    found = [
        int(descriptor[origin_key])
        for descriptor in document.media_descriptors
        if origin_key in descriptor
    ]

    # Callers get None (not an empty list) when nothing was found
    return found or None


def format_time(seconds, offset=0):
    """
    Helper function to format time in SRT format (HH:MM:SS,ms)

    Parameters:
    seconds (float): Time in seconds
    offset (int or float): Extra time in milliseconds that is added to `seconds`

    Returns:
    str: The shifted time formatted as HH:MM:SS,mmm
    """
    # Work in whole milliseconds. Splitting a float number of seconds with
    # `% 1` and truncating drops a millisecond whenever the value is not
    # exactly representable (1.001 s used to be rendered as ",000").
    total_ms = int(round(seconds * 1000 + offset))

    total_seconds, milliseconds = divmod(total_ms, 1000)
    total_minutes, whole_seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02}:{minutes:02}:{whole_seconds:02},{milliseconds:03}"

def format_annotation(annotation):
    """
    Turns a raw gloss annotation into more readable subtitle text.

    Unwraps ADD-TO-SIGNBANK(...), substitutes English wording for the pronoun
    gloss codes, then strips any digits at the very end of the result.

    Parameters:
    annotation (str): The annotation text

    Returns:
    str: The rewritten annotation text
    """
    # Unwrap ADD-TO-SIGNBANK(...), keeping the trimmed inner text
    text = re.sub(
        r'ADD-TO-SIGNBANK\((.*?)\)',
        lambda found: found.group(1).strip(),
        annotation,
    )

    # Gloss code -> English wording, applied top to bottom
    substitutions = (
        # Personal Pronouns:
        ("PRO1SG", "I/me"),
        ("PRO2SG", "you"),
        ("PRO3SG", "he/she/it"),
        ("PRO1PL", "we/us"),
        ("PRO2PL", "you(pl)"),
        ("PRO3PL", "they/them"),
        # Possessive Pronouns:
        ("POSS1SG", "my/mine"),
        ("POSS2SG", "your/yours"),
        ("POSS3SG", "his/her/it"),
        ("POSS1PL", "our/ours"),
        ("POSS2PL", "your/yours"),
        ("POSS3PL", "their/theirs"),
    )
    for code, english in substitutions:
        text = text.replace(code, english)

    # Remove number suffixes
    return re.sub(r'\d+$', '', text)

            
def get_tier_names(eaf_filename):
    """
    Lists the tier names defined in an EAF file.

    Parameters:
    eaf_filename (str): Path to the EAF file

    Returns:
    The tier names as reported by the loaded EAF object, or an empty list
    when anything goes wrong while reading the file (the error is printed)
    """
    try:
        # Load the file and ask it for its tier names in one go
        return pympi.Elan.Eaf(eaf_filename).get_tier_names()
    except Exception as error:
        print(f"Error reading EAF file: {str(error)}")
        return []

def _merge_overlapping_glosses(annotations):
    """Groups start-sorted gloss annotations into runs that overlap or touch in time, keeping every gloss per hand."""
    merged = []
    current = None
    for annotation in annotations:
        # Start a new group only when this annotation begins after the
        # current group has ended
        if current is None or annotation['start'] > current['end']:
            current = {
                'start': annotation['start'],
                'end': annotation['end'],
                'rh_glosses': [],
                'lh_glosses': []
            }
            merged.append(current)
        current['end'] = max(current['end'], annotation['end'])
        hand = 'rh_glosses' if annotation['tier'] == 'RH-IDgloss' else 'lh_glosses'
        # Append rather than overwrite so that several glosses of the same
        # hand inside one group are all kept
        if annotation['text']:
            current[hand].append(annotation['text'])
    return merged


def eaf_to_srt_combined(eaf_file, srt_file, offset):
    """
    Writes the RH-IDgloss and LH-IDgloss tiers of an EAF file to one SRT file.

    Annotations of both tiers are sorted by start time and merged into one
    subtitle per run of overlapping annotations. The right-hand text comes
    first; the left-hand text follows after ' | ' when it differs.

    Parameters:
    eaf_file (str): Path to the EAF file
    srt_file (str): Path of the SRT file to write (overwritten if present)
    offset (int): Time offset in milliseconds added to every timestamp
    """
    eaf = pympi.Elan.Eaf(eaf_file)
    available_tiers = eaf.get_tier_names()

    # Collect all annotations from RH and LH tiers
    annotations = []
    for tier_name in ['RH-IDgloss', 'LH-IDgloss']:
        # A file may lack one of the hand tiers; skip it instead of letting
        # the tier lookup fail and abort the conversion
        if tier_name not in available_tiers:
            print(f"Tier {tier_name} not found, skipping")
            continue
        for annotation in eaf.get_annotation_data_for_tier(tier_name):
            annotations.append({
                'tier': tier_name,
                'start': annotation[0],
                'end': annotation[1],
                'text': format_annotation(annotation[2])
            })

    # Sort annotations by start time
    annotations.sort(key=lambda x: x['start'])

    # Merge overlapping annotations
    merged_annotations = _merge_overlapping_glosses(annotations)

    # Write to SRT file
    with open(srt_file, 'w', encoding='utf-8') as f:
        for index, annotation in enumerate(merged_annotations, 1):
            # Annotation times are divided by 1000 to get the seconds
            # that format_time expects
            start_time = annotation['start'] / 1000
            end_time = annotation['end'] / 1000

            # Combine RH and LH texts, with RH first
            rh_text = ' '.join(annotation['rh_glosses'])
            lh_text = ' '.join(annotation['lh_glosses'])
            text = rh_text
            if lh_text and lh_text != rh_text:
                text += ' | ' + lh_text

            f.write(f"{index}\n")
            f.write(f"{format_time(start_time, offset)} --> {format_time(end_time, offset)}\n")
            f.write(f"{text}\n\n")

    # Without any annotation there is no time span to report
    if not merged_annotations:
        print("Found no subtitles")
        return

    # Groups are sorted and do not overlap, so the first start and the last
    # end delimit the whole subtitle range
    overall_start_time = merged_annotations[0]['start'] / 1000
    overall_end_time = merged_annotations[-1]['end'] / 1000
    print(f"Found {printable_time(overall_start_time, overall_end_time)} of subtitles from {format_time(overall_start_time, offset)} to {format_time(overall_end_time, offset)} ")

def printable_time(start_time, end_time):
    """
    Describes the span between two times for display.

    Parameters:
    start_time (float): Start of the span in seconds
    end_time (float): End of the span in seconds

    Returns:
    str: Seconds with no decimals for spans under 60 seconds, otherwise
    minutes with two decimals
    """
    span = end_time - start_time
    return f"{span:.0f} seconds" if span < 60 else f"{span / 60.0:.2f} minutes"

def eaf_to_srt(eaf_file, srt_file, tier_name, offset):
    """
    Writes the annotations of a single tier of an EAF file to an SRT file.

    Parameters:
    eaf_file (str): Path to the EAF file
    srt_file (str): Path of the SRT file to write (overwritten if present)
    tier_name (str): Name of the tier to export
    offset (int): Time offset in milliseconds added to every timestamp
    """
    eaf = pympi.Elan.Eaf(eaf_file)

    # Write cues in chronological order, like the combined writer does; the
    # summary below also relies on the first entry being the earliest one
    annotations = sorted(
        eaf.get_annotation_data_for_tier(tier_name),
        key=lambda annotation: annotation[0]
    )

    with open(srt_file, 'w', encoding='utf-8') as f:
        for index, annotation in enumerate(annotations, 1):
            # Annotation times are divided by 1000 to get the seconds
            # that format_time expects
            start_time = annotation[0] / 1000
            end_time = annotation[1] / 1000
            text = format_annotation(annotation[2])

            f.write(f"{index}\n")
            f.write(f"{format_time(start_time, offset)} --> {format_time(end_time, offset)}\n")
            f.write(f"{text}\n\n")

    # Without any annotation there is no time span to report
    if not annotations:
        print("Found no subtitles")
        return

    overall_start_time = annotations[0][0] / 1000
    # An earlier cue may outlast the last one, so take the latest end
    overall_end_time = max(annotation[1] for annotation in annotations) / 1000
    print(f"Found {printable_time(overall_start_time, overall_end_time)} of subtitles from {format_time(overall_start_time, offset)} to {format_time(overall_end_time, offset)} ")


def process_file(eaf_file):
    """
    Extracts subtitles from one EAF file into SRT files next to it.

    Writes <name>.en.srt from the 'Free Translation' tier when that tier
    exists, and always writes <name>.bsl.srt from the gloss tiers. All
    timestamps are shifted by the first time offset found in the file
    (0 when the file has none).

    Parameters:
    eaf_file (str): Path to the EAF file
    """
    # Swap only the file extension. A plain str.replace(".eaf", ...) also
    # rewrites ".eaf" inside directory names and, for a path without a
    # lowercase ".eaf" (e.g. "X.EAF"), returns the input path unchanged, so
    # the SRT output would overwrite the EAF file itself.
    base_name, _ = os.path.splitext(eaf_file)
    srt_file_bsl = base_name + ".bsl.srt"
    srt_file_en = base_name + ".en.srt"

    tiers = get_tier_names(eaf_file)

    # get_all_time_offsets returns None when there is no offset; when there
    # are several, the first one is used
    offsets = get_all_time_offsets(eaf_file)
    offset = offsets[0] if offsets else 0

    print("Using time offset: "+str(offset))

    english = 'Free Translation'
    if english in tiers:
        print("Extracting english subtitles")
        eaf_to_srt(eaf_file, srt_file_en, english, offset)

    print("Extracting BSL subtitles")
    eaf_to_srt_combined(eaf_file, srt_file_bsl, offset)

#process_file('BF10n.eaf')

In [49]:
import glob, os

# Convert every EAF file in the inputs folder. glob returns matches in
# arbitrary order, so sort them to keep the processing order reproducible.
for file in sorted(glob.glob('inputs/*.eaf')):
    print("-"*16)
    print(file)
    process_file(file)
    

----------------
inputs\BF10n.eaf
Using time offset: 2960
Extracting english subtitles
Found 2.26 minutes of subtitles from 00:00:02,960 to 00:02:18,597 
Extracting BSL subtitles
Found 1.15 minutes of subtitles from 00:00:02,973 to 00:01:12,040 
----------------
inputs\BF11n.eaf
Using time offset: 0
Extracting english subtitles
Found 1.46 minutes of subtitles from 00:00:02,217 to 00:01:29,832 
Extracting BSL subtitles
Found 46 seconds of subtitles from 00:00:01,104 to 00:00:46,777 
----------------
inputs\BF12n.eaf
Using time offset: 1635
Extracting english subtitles
Found 3.45 minutes of subtitles from 00:01:32,001 to 00:04:58,711 
Extracting BSL subtitles
Found 2.48 minutes of subtitles from 00:00:01,696 to 00:02:30,742 
----------------
inputs\BF13n.eaf
Using time offset: 18106
Extracting english subtitles
Found 5.48 minutes of subtitles from 00:01:55,972 to 00:07:25,046 
Extracting BSL subtitles
Found 3.65 minutes of subtitles from 00:00:19,706 to 00:03:58,448 
----------------
inp