# Parsing the file content to a dictionary

From the elan converted texts

```
defaultdict(<function __main__.parse_file_content.<locals>.<lambda>()>,
            {'Simeon': defaultdict(list,
                         {'po': ['Sentence 1 in po', 'Sentence 2 in po'],
                          'tn': ['Sentence 1 in tn', 'Sentence 2 in tn']})})

```

In [31]:
from collections import defaultdict
import re

def parse_file_content(file_content):
    # Initialize data structure to hold parsed content
    parsed_data = defaultdict(lambda: defaultdict(list))
    
    # Split the content by lines and initialize variables
    lines = file_content.strip().split("\n")
    current_speaker = None
    current_tier = None
    
    # Regular expression patterns for extracting information
    speaker_pattern = r"Speaker: (.+)"
    tier_pattern = r"Tier Type: (\w+), Annotations Count: \d+"
    annotation_pattern = r"Annotation: (.+)"
    
    # Parse the lines one by one
    for line in lines:
        speaker_match = re.search(speaker_pattern, line)
        if speaker_match:
            current_speaker = speaker_match.group(1)
            continue
        tier_match = re.search(tier_pattern, line)
        if tier_match:
            current_tier = tier_match.group(1)
            continue
        annotation_match = re.search(annotation_pattern, line)
        if annotation_match and current_speaker and current_tier:
            annotation = annotation_match.group(1)
            if annotation.lower() != "none":  # Skip "None" annotations
                parsed_data[current_speaker][current_tier].append(annotation)
            
    return parsed_data

# Join annotations from the parsed data

In [32]:
def join_annotations_by_index(parsed_data):
    joined_data = defaultdict(list)
    
    for speaker, tiers in parsed_data.items():
        # Get the number of annotations in the 'po' tier for comparison
        po_count = len(tiers.get('po', []))
        tn_count = len(tiers.get('tn', []))

        print(f"Speaker: {speaker}, po: {po_count}, tn: {tn_count}")

        # Skip the speaker if 'po' annotations are empty
        if po_count == 0:
            continue

        # Initialize a list to hold the joined annotations for this speaker
        joined_annotations = [{} for _ in range(po_count)]
        
        # First, handle the special case for 'po' and 'tn'
        # They should be identical, so we can just copy them directly
        for i in range(po_count):
            joined_annotations[i]['po'] = tiers.get('po', [None])[i]
            joined_annotations[i]['tn'] = tiers.get('tn', [None])[min(i, tn_count - 1)] if tn_count > 0 else "none"
        
        # Loop through each tier type and its annotations, excluding 'po' and 'tn'
        for tier, annotations in {k: v for k, v in tiers.items() if k not in ['po', 'tn']}.items():
            # If the annotation counts match 'po', join by index
            if len(annotations) == po_count:
                for i, annotation in enumerate(annotations):
                    joined_annotations[i][tier] = annotation
            else:
                # If counts don't match, mark it for later processing
                for i, annotation in enumerate(annotations):
                    if 'unmatched' not in joined_annotations[min(i, po_count-1)]:
                        joined_annotations[min(i, po_count-1)]['unmatched'] = {}
                    joined_annotations[min(i, po_count-1)]['unmatched'][tier] = annotation
                    
        # Store the joined annotations for this speaker
        joined_data[speaker] = joined_annotations

    return joined_data

## Join annotations in case they don't match by indexing using levenshtein distance

In [33]:
def levenshtein_distance(s1, s2):
    """
    Calculate the Levenshtein distance between two strings.
    """
    if len(s1) < len(s2):
        return levenshtein_distance(s2, s1)

    if len(s2) == 0:
        return len(s1)

    previous_row = range(len(s2) + 1)
    for i, c1 in enumerate(s1):
        current_row = [i + 1]
        for j, c2 in enumerate(s2):
            insertions = previous_row[j + 1] + 1
            deletions = current_row[j] + 1
            substitutions = previous_row[j] + (c1 != c2)
            current_row.append(min(insertions, deletions, substitutions))
        previous_row = current_row

    return previous_row[-1]

In [34]:
from itertools import chain

def match_unmatched_annotations(joined_data):
    # Initialize data structure to hold the final matched annotations
    matched_data = defaultdict(list)
    
    for speaker, annotations in joined_data.items():
        # Initialize a list to hold the matched annotations for this speaker
        matched_annotations = []
        
        for annotation_set in annotations:
            # Initialize a dictionary to hold the matched annotation for this set
            matched_annotation = {}
            
            # Copy over the annotations that were already matched by index
            for tier, annotation in chain(annotation_set.items(), annotation_set.get('unmatched', {}).items()):
                if tier != 'unmatched':
                    matched_annotation[tier] = annotation
            
            # Handle unmatched annotations, if any
            unmatched_tiers = annotation_set.get('unmatched', {})
            if unmatched_tiers:
                po_annotation = annotation_set['po']
                for tier, annotation in unmatched_tiers.items():
                    # Calculate the Levenshtein distance to the 'po' annotation
                    distance = levenshtein_distance(po_annotation, annotation)
                    
                    # If the distance is below a certain threshold, match it
                    # Here we use a threshold of 5 as an example; you can adjust this as needed
                    if distance < 5:
                        matched_annotation[tier] = annotation
                    else:
                        matched_annotation[tier] = "none"  # No good match found
                        
            # Add the matched annotation set to the list for this speaker
            matched_annotations.append(matched_annotation)
        
        # Store the matched annotations for this speaker
        matched_data[speaker] = matched_annotations

    return matched_data

# Join everything together

In [35]:
import os
import json

def process_and_save_annotations(input_dir, output_dir):
    # Ensure the output directory exists
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    
    # Loop through each file in the input directory
    for filename in os.listdir(input_dir):
        if filename.endswith(".txt"):
            filepath = os.path.join(input_dir, filename)
            
            # Step 2: Read the content of the file
            with open(filepath, 'r', encoding='utf-8') as f:
                print(f"Current file: {filename}\n")
                file_content = f.read()
            
            # Step 3: Parse the content
            parsed_data = parse_file_content(file_content)
            
            # Step 4: Join annotations by index
            joined_data = join_annotations_by_index(parsed_data)
            
            # Step 5: Match unmatched annotations
            matched_data = match_unmatched_annotations(joined_data)
            
            # Step 6: Save the final matched annotations to a file in the output directory
            output_filepath = os.path.join(output_dir, f"{filename.split('.')[0]}_processed.json")
            with open(output_filepath, 'w', encoding='utf-8') as f:
                json.dump(matched_data, f, ensure_ascii=False, indent=4)
            
            print("\n\n")

In [36]:

# Let's test the function with the uploaded file and save the output to a test directory
input_dir = 'Data/Simeon/Floyd_merged/'
output_dir = 'Data/Simeon/TestResults/'
process_and_save_annotations(input_dir, output_dir)

# Check if the output file has been generated
os.listdir(output_dir)

Current file: QUSF2018_02_09S1_pub.txt

Speaker: señora de saco azul, po: 81, tn: 81
Speaker: señora de chalinaa, po: 38, tn: 38
Speaker: señora de saco blanco, po: 111, tn: 111
Speaker: Simeon, po: 115, tn: 115
Speaker: señora con sombrero, po: 27, tn: 27



Current file: QUSF2018_10_09S1_pub.txt

Speaker: HOMBRE 1 A, po: 987, tn: 987
Speaker: HOMBRE 2 B, po: 1466, tn: 1466
Speaker: MUJER 1 C, po: 16, tn: 0
Speaker: MUJERV 1 C, po: 0, tn: 16
Speaker: Simeon D, po: 33, tn: 33



Current file: QUSF2018_02_24S2_pub.txt

Speaker: SIMEON A, po: 720, tn: 0
Speaker: SIMEON  A, po: 0, tn: 720
Speaker: mujer 1 B, po: 1166, tn: 1166
Speaker: hombre 1 C, po: 798, tn: 797
Speaker: D, po: 80, tn: 80



Current file: QUSF2019_03_03S1_pub.txt

Speaker: Entrevistador, po: 94, tn: 94
Speaker: Mujer joven, po: 107, tn: 107
Speaker: Abuelita, po: 64, tn: 64
Speaker: Simeon, po: 32, tn: 32



Current file: QUSF2018_08_25S2_pub.txt

Speaker: Hombre, po: 245, tn: 244
Speaker: Niño, po: 3, tn: 3
Speaker: Mu

['QUSF2018_10_14S1_pub_processed.json',
 'QUSF2018_09_16S2_pub_processed.json',
 'QUSF2018_10_10S4_pub_processed.json',
 'QUSF2018_03_11S1_pub_processed.json',
 'QUSF2019_12_19S1_pub_processed.json',
 'QUSF2018_10_09S2_pub_processed.json',
 'QUSF2020_01_04S2_pub_processed.json',
 'QUSF2018_02_24S1_pub_processed.json',
 'QUSF2020_01_04S1_pub_processed.json',
 'QUSF2018_02_24S2_pub_processed.json',
 '.DS_Store',
 'QUSF2018_10_09S1_pub_processed.json',
 'QUSF2018_09_16S1_pub_processed.json',
 'QUSF2019_03_22S1_pub_processed.json',
 'QUSF2019_03_03S1_pub_processed.json',
 'QUSF2019_12_17S1_pub_processed.json',
 'QUSF2019_03_23S2_pub_processed.json',
 'QUSF2018_10_10S2_pub_processed.json',
 'QUSF2018_02_18S2_pub_processed.json',
 'QUSF2018_02_09S1_pub_processed.json',
 'QUSF2019_01_09S1_pub_processed.json',
 'QUSF2018_02_03S1 _pub_processed.json',
 'QUSF2018_09_09S1_pub_processed.json',
 'QUSF2019_01_09S2_pub_processed.json',
 'QUSF2018_02_03S2_pub_processed.json',
 'QUSF2018_12_02S1_pub_pr