# Explore Radio Broadcasts and Radio Bulletins Schemas

We are now integrating Radio data into Impresso. 
Radio data is more complex and diverse than the Newspaper data we have handled until now, and we are now looking at how we could define schemas for the various sources that work with our internal formats and pipelines.

### Imports

In [1]:
from tqdm import tqdm
from pathlib import Path
import os
import glob
import json
from datetime import datetime, date
import time
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from collections import Counter
from impresso_essentials.utils import IssueDir, SourceType, SourceMedium
from text_preparation.utils import coords_to_xywh
from text_preparation.importers.classes import CanonicalIssue, CanonicalPage
from text_preparation.importers.swissinfo.detect import SwissInfoIssueDir
from text_preparation.importers.ina.classes import INABroadcastAudioRecord, INABroadcastIssue
from text_preparation.importers.ina.detect import INAIssueDir
from impresso_essentials.utils import ALL_MEDIA, PARTNER_TO_MEDIA, id_to_issuedir, timestamp
from impresso_essentials.io.fs_utils import canonical_path
from text_preparation.importers.swissinfo.classes import SwissInfoRadioBulletinPage, SwissInfoRadioBulletinIssue
from impresso_essentials.versioning import helpers
from text_preparation.utils import validate_page_schema, validate_issue_schema
from mutagen.mp3 import MP3

## 1. Small debugs and checks

In [None]:
SourceType.RB.value

In [None]:
SourceMedium.TPS.value

In [5]:
s3_mft_path = "s3://10-canonical-sandbox/canonical_v6-0-0.json"
basename = os.path.basename(s3_mft_path)
version = basename.replace(".json", "").split("_")[-1]

version, int(version[1:].replace("-", "")), version.replace("-", ".")

('v6-0-0', 600, 'v6.0.0')

In [57]:
def extract_version(name_or_path: str, as_int: bool = False) -> str | int:
    """Extract the version from a string filename or path.

    This function is in particular mean to extract the version from paths or filenames
    of manifests: structured as [data-stage]_vM-m-p.json.

    Args:
        name_or_path (str): Filename or path from which to extract the version.
        as_int (bool, optional): Whether to return the extracted version as int or str.
            Defaults to False.

    Returns:
        Union[str, int]: Extracted version, as int or str based on `as_int`.
    """
    # in the case it's a path
    basename = os.path.basename(name_or_path)
    version = basename.replace(".json", "").split("_")[-1]

    if as_int:
        ind_nums = version[1:].split('-')
        # multiply each part of the version with a larger multiple of 10
        as_ints = [int(n)*(10**(2*i)) for i,n in enumerate(ind_nums[::-1])][::-1]
        return sum(as_ints)
    return version.replace("-", ".")


In [31]:
ind_nums = version[1:].split('-')
ind_nums

['6', '0', '0']

In [45]:
[int(n)*(10**(2*i)) for i,n in enumerate(ind_nums[::-1])][::-1]

[60000, 0, 0]

In [63]:
def find_s3_data_manifest_path(
    bucket_name: str, data_stage: str, partition: str|None = None
) -> str|None:
    """Find and return the latest data manifest in a given S3 bucket.

    On S3, different Data stages will be stored in different ways.
    In particular, data stages corresponding to enrichments are all placed in the
    same bucket but in different partitions.
    Data stages "canonical", "rebuilt", "evenized-rebuilt" & ones related to Solr
    are the ones where each stage has its own bucket.

    Args:
        bucket_name (str): Name of the bucket in which to look.
        data_stage (str): Data stage corresponding to the manifest to fetch.
        partition (Optional[str], optional): Partition within the bucket to look
            into. Defaults to None.

    Returns:
        Optional[str]: S3 path of the latest manifest in the bucket, None if no
            manifests were found inside.
    """
    # fetch the data stage as the naming value
    if isinstance(data_stage, helpers.DataStage):
        stage_value = data_stage.value
    else:
        stage_value = helpers.validate_stage(data_stage, return_value_str=True)

    print(stage_value)

    # manifests have a json extension and are named after the format (value)
    path_filter = f"{stage_value}_v*.json"

    print(path_filter)

    if partition is None and stage_value in [
        helpers.DataStage.CANONICAL.value,  # "canonical"
        helpers.DataStage.REBUILT.value,  # "rebuilt"
        helpers.DataStage.PASSIM.value,  # "passim"
        helpers.DataStage.SOLR_TEXT.value,  # "solr-ingestion-text"
    ]:
        # manifest in top-level partition of bucket
        bucket = helpers.get_bucket(bucket_name)
        matches = helpers.fixed_s3fs_glob(path_filter, boto3_bucket=bucket)

        print(f"matches 1: {matches}")
    else:
        assert partition is not None, "partition should be provided for processed data"
        # processed data are all in the same bucket,
        # manifest should be directly fetched from path
        full_s3_path = os.path.join(bucket_name, partition, path_filter)
        print(full_s3_path)
        # print(full_s3_path)
        matches = helpers.fixed_s3fs_glob(full_s3_path)
        print(f"matches 2: {matches}")

    # matches will always be a list
    if len(matches) == 1:
        return matches[0]
    if len(matches) == 0:
        # no matches means it's the first manifest for the stage or bucket
        return None

    #print(list(map(lambda x: extract_version(x, as_int=True), matches)))
    print(sorted(list(map(lambda x: extract_version(x, as_int=True), matches))))
    print(sorted(matches, key=lambda x: extract_version(x, as_int=True)))
    print(matches[-1])
    # if multiple versions exist, return the latest one
    return sorted(matches, key=lambda x: extract_version(x, as_int=True))[-1]


In [64]:
find_s3_data_manifest_path("10-canonical-sandbox", "canonical")

canonical
canonical_v*.json
matches 1: ['s3://10-canonical-sandbox/canonical_v0-0-1.json', 's3://10-canonical-sandbox/canonical_v0-0-2.json', 's3://10-canonical-sandbox/canonical_v0-0-3.json', 's3://10-canonical-sandbox/canonical_v0-2-0.json', 's3://10-canonical-sandbox/canonical_v0-3-0.json', 's3://10-canonical-sandbox/canonical_v1-0-0.json', 's3://10-canonical-sandbox/canonical_v1-0-1.json', 's3://10-canonical-sandbox/canonical_v1-0-2.json', 's3://10-canonical-sandbox/canonical_v1-0-3.json', 's3://10-canonical-sandbox/canonical_v2-0-0.json', 's3://10-canonical-sandbox/canonical_v3-0-0.json', 's3://10-canonical-sandbox/canonical_v4-0-0.json', 's3://10-canonical-sandbox/canonical_v4-1-0.json', 's3://10-canonical-sandbox/canonical_v4-2-0.json', 's3://10-canonical-sandbox/canonical_v4-2-1.json', 's3://10-canonical-sandbox/canonical_v5-0-0.json', 's3://10-canonical-sandbox/canonical_v5-1-0.json', 's3://10-canonical-sandbox/canonical_v5-10-0.json', 's3://10-canonical-sandbox/canonical_v5-1

's3://10-canonical-sandbox/canonical_v6-0-0.json'

## 2. SWISSINFO

### 1. Issue implementation

In [6]:
base_swissinfo_path = "/mnt/project_impresso/original/"
eg_issue_path = "SWISSINFO/WW2-SOC-bulletins-json/SOC_CJ/1940/07/22/a"
og_data_base_dir = os.path.join(base_swissinfo_path, "SWISSINFO/WW2-SOC-bulletins/ww2-PDF")
metadata_file_path = os.path.join(base_swissinfo_path, "SWISSINFO/WW2-SOC-bulletins-json/SOC_rb_metadata.json")

debug_issueDir = SwissInfoIssueDir("SOC_CJ", date(1940, 7, 22), 'a', os.path.join(base_swissinfo_path,eg_issue_path), metadata_file_path)
debug_issueDir

IssueDirectory(alias='SOC_CJ', date=datetime.date(1940, 7, 22), edition='a', path='/mnt/project_impresso/original/SWISSINFO/WW2-SOC-bulletins-json/SOC_CJ/1940/07/22/a', metadata_file='/mnt/project_impresso/original/SWISSINFO/WW2-SOC-bulletins-json/SOC_rb_metadata.json')

In [None]:
IssueDir("SOC_CJ", datetime.date(1940, 7, 22), 'a', eg_issue_path)

IssueDir(alias='SOC_CJ', date=datetime.date(1940, 7, 22), edition='a', path='SWISSINFO/WW2-SOC-bulletins-json/SOC_CJ/1940/07/22/a')

In [None]:
test_issue = SwissInfoRadioBulletinIssue(debug_issueDir)
test_issue

<text_preparation.importers.swissinfo.classes.SwissInfoRadioBulletinIssue at 0x7f53b7c28810>

In [None]:
test_issue.issue_data

{'id': 'SOC_CJ-1940-07-22-a',
 'cdt': '2025-04-30 15:40:31',
 'st': 'radio_broadcast',
 'sm': 'typescript',
 'i': [{'m': {'id': 'SOC_CJ-1940-07-22-a-i0001',
    'lg': 'de',
    'pp': [1, 2],
    'tp': 'chronicle',
    'ro': 1,
    't': 'Tageschronik 22.07.1940'},
   'l': {'source': 'WW2-SOC-bulletins/ww2-PDF/SRI_XY_CJ_19400722_DE.pdf'}}],
 'pp': ['SOC_CJ-1940-07-22-a-p0001', 'SOC_CJ-1940-07-22-a-p0002'],
 'rc': 'SOC (KWD)',
 'rp': 'Tageschronik',
 'n': ['Page 1: page size within OCR before coord rescaling: [595.9199829101562, 837.1199951171875]',
  'Page 2: page size within OCR before coord rescaling: [607.6799926757812, 848.6400146484375]']}

In [None]:
validate_issue_schema(test_issue.issue_data)

SOC_CJ-1940-07-22-a - Validating against issue schema


In [None]:
Path('WW2-SOC-bulletins/ww2-PDF/SRI_XY_CJ_19400722_DE.pdf').stem

NameError: name 'Path' is not defined

In [None]:
test_issue.path.split("/")[:-5]

['',
 'mnt',
 'project_impresso',
 'original',
 'SWISSINFO',
 'WW2-SOC-bulletins-json']

In [None]:
test_issue.pages[1].page_data

In [None]:
with open(test_issue.json_file, encoding="utf-8") as f:
    bulletin_json = json.load(f)

test_issue.bulletin_lang = bulletin_json['lang']

test_issue.page_jsons = []

for page in bulletin_json["ocr_pages"]:
    
    page_img_file = bulletin_json["jp2_full_paths"][page["page_num"]]
    page_no = int(page["page_num"])+1
    page_id = "{}-p{}".format(test_issue.id, str(page_no).zfill(4))
    page_img_name = page_img_file.split("/")[-1].split(".")[0]
    # ensure the page numbering is correct
    assert page_img_name == page_id, f"{test_issue.id} problem with page numbering/naming, page_img_name ({page_img_name}) != page_id ({page_id})"
    
    # format the page json for future use
    #test_issue.page_jsons.append(test_issue.construct_page_json(page_id, page))

    # create page object and add it to the list of pages
    page = SwissInfoRadioBulletinPage(page_id, page_no)
    test_issue.pages.append(page)

    # TODO maybe - extract fonts

In [None]:
ocr_p1 = bulletin_json["ocr_pages"][0]
ocr_p1

In [None]:
round(ocr_p1['ocr_page_size'][0], 4)

In [None]:
ocr_p1['blocks_with_lines'][0]

In [None]:
json_pages = []
for page_orc in bulletin_json["ocr_pages"]:
    page_json = {
        "id": page_id,
        "fw": page_orc['jp2_img_size'][0],
        "fh": page_orc['jp2_img_size'][1],
    }
    blocks = []
    for block in page['blocks_with_lines']:
        block = {
            coords
        }

#### 1.a Check and process swi.xml to fetch the channel and program name metadata
see
https://docs.google.com/presentation/d/1NrbgjwLVhUFr2NRSEun6e2NPHp_AAEnA7dKVMUMvSWc/edit#slide=id.g2d8c9a6b5e0_0_16

In [None]:
swi_metadata_path = "/mnt/project_impresso/original/SWISSINFO/WW2-SOC-bulletins/swi.xml"

with open(swi_metadata_path, "r") as f:
    raw_xml = f.read()

pb_idx = 134436754

In [None]:
swi_metadata = BeautifulSoup(raw_xml, "html")
swi_metadata

Loading this xml takes very long, so the best is to preprocess it into a json file with all required info for them to be ready during ingestion

In [None]:
def keep_only_radio_bulletins(full_metadata, out_path, base_dir = og_data_base_dir):
    """
    Process the contents of the SWI metadata XML file to keep only the radio bulletins we have at our disposal.
    """
    existing_bulletins = os.listdir(base_dir)
    programs = []
    skipped = 0
    processed = 0
    for p_idx, program in tqdm(enumerate(full_metadata.find_all("program"))):

        archive_keys = [k.text for k in program.find_all("archivekey")]

        if not any ([f"{ak}.pdf" in existing_bulletins for ak in archive_keys]):

            skipped += 1
            if skipped % 500 == 0:
                print(f"Skipping program {p_idx+1}/19332 (no bulletin archives) - skipped {skipped} programs so far.")

        else:
            elems = []
            for elem in program.find_all("item"):
                
                seg_forms = [f.text for f_term in elem.find_all("form") for f in f_term.find_all("formterm")]
                originators = [
                    {
                        "last_name": None if o.find("originatorlastname") is None else o.find("originatorlastname").text,
                        "first_name": None if o.find("originatorfirstname") is None else o.find("originatorfirstname").text, 
                        "role": None if o.find("originatorrole") is None else o.find("originatorrole").text,
                    }
                    for ors in elem.find_all("originators") for o in ors.find_all("originator")
                ]
                persons = [{
                        "last_name": None if p.find("personlastname") is None else p.find("personlastname").text,
                        "first_name": None if p.find("personfirstname") is None else p.find("personfirstname").text,
                        "language": None if p.find("sprache") is None else p.find("sprache").text,
                        "is_portrait": None if p.find("portrait") is None else p.find("portrait").text,
                    } for pers in elem.find_all("persons") for p in pers.find_all("person")]

                    
                elems.append({
                    "archive_key": None if elem.find("archivekey") is None else elem.find("archivekey").text,
                    "segment_title": None if elem.find("itembeitragstitel") is None else elem.find("itembeitragstitel").text,
                    "abstract": None if elem.find("itemabstract") is None else elem.find("itemabstract").text,
                    "additional_notes": [n.text for n in elem.find_all("itembemerkung") if n is not None],
                    "segment_form": seg_forms,
                    "producers": [p.text for p in elem.find_all("producer") if p is not None],
                    "originators": originators,
                    "persons": persons,
                })
            
            program = {
                "program_title": None if program.find("sgef") is None else program.find("sgef").text,
                "program_subtitle": None if program.find("ptit") is None else program.find("ptit").text,
                "transmission": [{
                    "date": None if t.find("transmissiondatestart") is None else t.find("transmissiondatestart").text,
                    "channel": None if t.find("transmissionchannel") is None else t.find("transmissionchannel").text,
                } for t in program.find_all("transmission")],
            }
            if len(elems) == 1:
                program.update(elems[0])
            else:
                print(f"Program {p_idx+1}/19332 - More than one item!.")
                program["items"] = elems

            programs.append(program) 
            processed += 1
            if processed % 500 == 0:
                print(f"Finished processing program {p_idx+1}/19332 - processed {processed} programs so far.")
            # save the current metadata to a file
            if processed % 50 == 0:
                with open(out_path, "w", encoding="utf-8") as f:
                    json.dump(programs, f, ensure_ascii=False, indent=4)

    # save once more at the end               
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(programs, f, ensure_ascii=False, indent=4)


    return programs

In [None]:
program_metadata = keep_only_radio_bulletins(swi_metadata, "/home/piconti/impresso-text-acquisition/text_preparation/data/sample_data/SWISSINFO/SOC_rb_metadata.json")

In [None]:
len(program_metadata)

6718

In [None]:
sample_prgrams = swi_metadata.find_all("program")[100:110]
sample_prgrams

In [None]:
for p in sample_prgrams:
    archive_keys = [k.text for k in p.find_all("archivekey")]
    print(archive_keys)
    for elem in p.find_all("item"):
        seg_forms = [f.text for f_term in elem.find_all("form") for f in f_term.find_all("formterm")]
        originators = [
            {
                "last_name": o.find("originatorlastname").text,
                "first_name": o.find("originatorfirstname").text, 
                "role": o.find("originatorrole").text,
            }
            for ors in elem.find_all("originators") for o in ors.find_all("originator")
        ]
        persons = [{
                "last_name": None if p.find("personlastname") is None else p.find("personlastname").text,
                "first_name": None if p.find("personfirstname") is None else p.find("personfirstname").text,
                "language": None if p.find("sprache") is None else p.find("sprache").text,
                "is_portrait": None if p.find("portrait") is None else p.find("portrait").text,
            } for pers in elem.find_all("persons") for p in pers.find_all("person")]

        print(seg_forms, originators, persons)

['SRI_KAS_BRP_199612_Track12']
['Bericht'] [{'last_name': 'LÃ¼thi', 'first_name': 'Andreas', 'role': 'AUT'}] [{'last_name': 'Conedera', 'first_name': 'Marco', 'language': 'dt', 'is_portrait': '0'}, {'last_name': 'LÃ¼thi', 'first_name': 'Andreas', 'language': 'dt', 'is_portrait': '0'}, {'last_name': 'Scheccia', 'first_name': 'Carlo', 'language': 'it', 'is_portrait': '0'}, {'last_name': 'Lendi', 'first_name': 'Peter', 'language': 'dt', 'is_portrait': '0'}]
['SRI_KAS_BRP_199609_Track03']
['Bericht', 'GerÃ¤usch'] [{'last_name': 'LÃ¼thi', 'first_name': 'Andreas', 'role': 'AUT'}] [{'last_name': 'LÃ¼thi', 'first_name': 'Andreas', 'language': 'dt', 'is_portrait': '0'}]
['SRI_KAS_BRP_199708_Track09']
['Bericht'] [{'last_name': 'LÃ¼thi', 'first_name': 'Andreas', 'role': 'AUT'}] [{'last_name': 'LÃ¼thi', 'first_name': 'Andreas', 'language': 'dt', 'is_portrait': '0'}]
['SRI_KAS_BRP_199308_Track05']
['Trailer'] [{'last_name': 'Zimmermann', 'first_name': 'StÃ©phanie', 'role': 'AUT'}] [{'last_name': '

In [None]:
programs_with_mult_archives = any([len(p.find_all("archivekey"))>1 for p in swi_metadata.find_all("program")])
programs_with_mult_archives

True

#### Checking out what possible values exist in the data

In [None]:
possible_program_types = set([t for p in program_metadata for t in p['segment_form']])
possible_program_types

{'Chronik'}

In [None]:
possible_producers = set([t for p in program_metadata for t in p['producers']])
possible_producers

{'KWD (Radio)'}

In [None]:
rb_metadata_path = os.path.join(base_swissinfo_path, "SWISSINFO/WW2-SOC-bulletins-json/SOC_rb_metadata.json")
with open(rb_metadata_path, encoding="utf-8") as f:
    program_metadata = json.load(f)

In [None]:
possible_channels = set([t['channel'] for p in program_metadata for t in p['transmission']])
possible_channels

{'KWD', 'SOC (KWD)'}

In [None]:
possible_titles = list(set([t['segment_title'] for t in program_metadata]))
len(possible_titles), possible_titles[:10]

(6051,
 ['Tageschronik 20.10.1942',
  'Chroniques du jour 30.05.1944',
  'Law and Freedom: The basis of the Swiss conception of state',
  'El problema de los abastecimientos suizos',
  'Tageschronik 28.11.1943',
  'Chroniques du jour 05.02.1945',
  'Tageschronik 28.10.1940',
  'Verfeinerte Rationierung',
  'Tageschronik 11.02.1944',
  'Today at Home and Abroad 12.09.1944'])

Only one type - chronique/chronicle and only one channel/producer: SOC (KWD).
This will be fixed in the importer

### 2. Page implementation

In [None]:
test_pg1 = test_issue.pages[0]
test_pg1.add_issue(test_issue)
test_pg1.page_data

{'id': 'SOC_CJ-1940-07-22-a-p0001',
 'cdt': '2025-04-30 15:40:31',
 'r': [],
 'iiif_img_base_uri': 'https://impresso-project.ch/api/proxy/iiif/SOC_CJ-1940-07-22-a-p0001/info.json',
 'st': 'radio_broadcast',
 'sm': 'typescript',
 'cc': True}

In [None]:
test_pg1.issue.content_items[0]['m']['id']

'SOC_CJ-1940-07-22-a-i0001'

In [None]:
#def parse_line
ocr_json = test_pg1.issue.page_jsons[test_pg1.number - 1]
line = ocr_json["blocks_with_lines"][0]['lines'][0]
line_tokens = line["spans"]
line_coords = coords_to_xywh(line["rescaled_bbox"])
len(line_tokens),line_tokens

In [None]:
coords_to_xywh([
                       152.72815326551716,
                                2112.2831453229055,
                                1520.289302060428,
                                2157.3988528493646
                                    ])

[152, 2112, 1368, 45]

In [None]:
def compute_agg_coords(all_coords):
    """
    Compute the coordinates of a paragraph from the coordinates of its lines.
    """
    x1 = min([l[0] for l in all_coords])
    y1 = min([l[1] for l in all_coords])
    x2 = max([l[2] for l in all_coords])
    y2 = max([l[3] for l in all_coords])
    return [x1, y1, x2, y2]
    #return coords_to_xywh([x1, y1, x2, y2])

In [None]:
def parse_lines(blocks_with_lines, pg_id, pg_notes):

    all_blocks_xy_coords = []
    paragraphs = []
    hyphen_at_last = False
    par_sizes = []
    for block_id, block in enumerate(blocks_with_lines):
        all_blocks_xy_coords.append(block["rescaled_bbox"])
        # there is usually only one paragraph per paragraph
        block_lines = []
        for line_id, line in enumerate(block["lines"]):
            tokens = []
            # tokens are in this "spans" object
            for t_id, token in enumerate(line["spans"]):

                # Skip tokens which are only spaces
                if token["text"] == " ":
                    continue

                curr_token = {
                    "c": coords_to_xywh(token["rescaled_bbox"]),
                    "tx": token["text"],
                    "gn": False,
                }

                # second half of a hyphen should be at the start of a line/block (which is not the first)
                if (block_id != 0 or line_id != 0) and t_id == 0 and hyphen_at_last:
                    if (
                        line_id != 0
                        and len(paragraphs) == 0
                        and not ("hy" in block_lines[-1]["t"][-1])
                    ):
                        msg = f"{pg_id} - Warning! problem 1 with hyphen_at_last!: curr_token: {curr_token}, block_lines[-1]['t'][-1]: {block_lines[-1]['t'][-1]}"
                        #logger.info(msg)
                        print(msg)
                        # saving in the notes
                        pg_notes.append(
                            f"block {block_id} ('number' {block['number']}), line {line_id}, token {t_id} - problem with hyphenation: hyphen_at_last is true but no 'hy' in previous token."
                        )
                    elif (
                        block_id != 0 and line_id == 0 and not ("hy" in paragraphs[-1]["l"][-1]['t'][-1])
                    ):
                        msg = f"{pg_id} - Warning! problem 2 with hyphen_at_last!: curr_token: {curr_token}, all_lines[-1]['l'][-1]['t'][-1]: {paragraphs[-1]['l'][-1]['t'][-1]}"
                        #logger.info(msg)
                        print(msg)
                        # saving in the notes
                        pg_notes.append(
                            f"block {block_id} ('number' {block['number']}), line {line_id}, token {t_id} - problem with hyphenation: hyphen_at_last is true but no 'hy' in previous token."
                        )

                    # if the first token of the line is a the second part of a hyphen,
                    # we need to merge it with the last token (after removing the hyphen)
                    if len(paragraphs) == 0:
                        full_word = (
                            block_lines[-1]["t"][-1]["tx"].split("-")[0] + token["text"]
                        )
                    else:
                        full_word = paragraphs[-1]['l'][-1]["t"][-1]["tx"].split("-")[0] + token["text"]
                    curr_token["nf"] = full_word

                # reset the hyphenation flag
                hyphen_at_last = False

                tokens.append(curr_token)

            # handle hyphenation
            if len(tokens) > 1 and tokens[-1]["tx"].endswith("-"):
                tokens[-1]["hy"] = True
                hyphen_at_last = True
            else:
                hyphen_at_last = False

            block_lines.append({"c": coords_to_xywh(line["rescaled_bbox"]), "t": tokens})
            #block_xy_coords.append(line["rescaled_bbox"])

        par_sizes.append(len(block_lines))
        paragraphs.append({"c": coords_to_xywh(block["rescaled_bbox"]), "l": block_lines})
        # there is usually only one line per block
        """if len(block_lines) == 1:
            all_lines.append(block_lines[0])
        else:
            # cases where there were more than one line seemed to be errors - to be checked.
            # msg = f"{pg_id} - Warning! {len(block_lines)} lines in this paragraph, adding them separately!! block coords: {[b['c'] for b in block_lines]}"
            pg_notes.append(
                f"block {block_id} ('number' {block['number']}), lines {len(all_lines)}-{len(all_lines)+len(block_lines)} were in the same block initially."
            )
            # print(msg)
            # logger.info(msg)
            all_lines.extend(block_lines)"""

    return all_blocks_xy_coords, paragraphs, par_sizes

In [None]:
def parse_page(pg):
    ocr_json = pg.issue.page_jsons[pg.number - 1]

    all_line_xy_coords, lines = parse_lines(ocr_json["blocks_with_lines"])

    # easier to merge all the coords if they stay in x1yx2y2 format
    para_coords = compute_paragraph_coords(all_line_xy_coords)
    paragraph = {
        "c": para_coords,
        "l": lines,
    }
    region = {"c": para_coords, "p": [paragraph], "pOf": pg.issue.content_items[0]['m']['id']}
    return region

In [None]:
parsed_pg1_r = parse_page(test_pg1)
parsed_pg1_r



{'c': [132, 287, 1498, 1870],
 'p': [{'c': [132, 287, 1498, 1870],
   'l': [{'c': [384, 287, 872, 45],
     't': [{'c': [384, 287, 135, 45], 'tx': 'UuTnrrlnl', 'gn': False},
      {'c': [532, 287, 8, 45], 'tx': 'i', 'gn': False},
      {'c': [552, 287, 34, 45], 'tx': 'rlii', 'gn': False},
      {'c': [608, 287, 37, 45], 'tx': 'Uli', 'gn': False},
      {'c': [677, 287, 12, 45], 'tx': 'r', 'gn': False},
      {'c': [711, 287, 136, 45], 'tx': 'Montag,', 'gn': False},
      {'c': [877, 287, 59, 45], 'tx': 'den', 'gn': False},
      {'c': [958, 287, 51, 45], 'tx': '22.', 'gn': False},
      {'c': [1038, 287, 79, 45], 'tx': 'Juli', 'gn': False},
      {'c': [1141, 287, 93, 45], 'tx': '1940.', 'gn': False}]},
    {'c': [372, 441, 1133, 51],
     't': [{'c': [372, 447, 6, 45], 'tx': 'i', 'gn': False},
      {'c': [610, 447, 40, 45], 'tx': 'Im', 'gn': False},
      {'c': [668, 447, 224, 45], 'tx': 'Vordergrund', 'gn': False},
      {'c': [917, 447, 56, 45], 'tx': 'der', 'gn': False},
      {'c

In [None]:
test_pg1.parse()
test_pg1.page_data



{'id': 'SOC_CJ-1940-07-22-a-p0001',
 'cdt': '2025-04-25 17:01:23',
 'r': {'c': [132, 287, 1498, 1870],
  'p': [{'c': [132, 287, 1498, 1870],
    'l': [{'c': [384, 287, 872, 45],
      't': [{'c': [384, 287, 135, 45], 'tx': 'UuTnrrlnl', 'gn': False},
       {'c': [532, 287, 8, 45], 'tx': 'i', 'gn': False},
       {'c': [552, 287, 34, 45], 'tx': 'rlii', 'gn': False},
       {'c': [608, 287, 37, 45], 'tx': 'Uli', 'gn': False},
       {'c': [677, 287, 12, 45], 'tx': 'r', 'gn': False},
       {'c': [711, 287, 136, 45], 'tx': 'Montag,', 'gn': False},
       {'c': [877, 287, 59, 45], 'tx': 'den', 'gn': False},
       {'c': [958, 287, 51, 45], 'tx': '22.', 'gn': False},
       {'c': [1038, 287, 79, 45], 'tx': 'Juli', 'gn': False},
       {'c': [1141, 287, 93, 45], 'tx': '1940.', 'gn': False}]},
     {'c': [372, 441, 1133, 51],
      't': [{'c': [372, 447, 6, 45], 'tx': 'i', 'gn': False},
       {'c': [610, 447, 40, 45], 'tx': 'Im', 'gn': False},
       {'c': [668, 447, 224, 45], 'tx': 'Vorderg

#### Test with the various configurations

In [None]:
case_1_issue_path = "SWISSINFO/WW2-SOC-bulletins-json/SOC_CJ/1940/07/23/a"
case_1_issueDir = SwissInfoIssueDir("SOC_CJ", datetime.date(1940, 7, 23), 'a', os.path.join(base_swissinfo_path,case_1_issue_path), metadata_file_path)

case_2_issue_path = "SWISSINFO/WW2-SOC-bulletins-json/SOC_CP/1942/02/28/a"
case_2_issueDir = SwissInfoIssueDir("SOC_CP", datetime.date(1942, 2, 28), 'a', os.path.join(base_swissinfo_path,case_2_issue_path), metadata_file_path)

case_3_issue_1_path = "SWISSINFO/WW2-SOC-bulletins-json/SOC_TH/1945/03/07/a"
case_3_issueDir_1 = SwissInfoIssueDir("SOC_TH", datetime.date(1945, 3, 7), 'a', os.path.join(base_swissinfo_path,case_3_issue_1_path), metadata_file_path)

case_3_issue_2_path = "SWISSINFO/WW2-SOC-bulletins-json/SOC_VS/1944/11/29/a"
case_3_issueDir_2 = SwissInfoIssueDir("SOC_VS", datetime.date(1944, 11, 29), 'a', os.path.join(base_swissinfo_path,case_3_issue_2_path), metadata_file_path)

case_1_issue = SwissInfoRadioBulletinIssue(case_1_issueDir)
case_2_issue = SwissInfoRadioBulletinIssue(case_2_issueDir)
case_3_issue_1 = SwissInfoRadioBulletinIssue(case_3_issueDir_1)
case_3_issue_2 = SwissInfoRadioBulletinIssue(case_3_issueDir_2)

case_1_issueDir, case_2_issueDir, case_3_issueDir_1, case_3_issueDir_2

SOC_CJ-1940-07-23-a, some of the pages ([2]) had no OCR but not all!


SOC_CJ-1940-07-23-a, some of the pages ([2]) had no OCR but not all!


(IssueDirectory(alias='SOC_CJ', date=datetime.date(1940, 7, 23), edition='a', path='/mnt/project_impresso/original/SWISSINFO/WW2-SOC-bulletins-json/SOC_CJ/1940/07/23/a', metadata_file='/mnt/project_impresso/original/SWISSINFO/WW2-SOC-bulletins-json/SOC_rb_metadata.json'),
 IssueDirectory(alias='SOC_CP', date=datetime.date(1942, 2, 28), edition='a', path='/mnt/project_impresso/original/SWISSINFO/WW2-SOC-bulletins-json/SOC_CP/1942/02/28/a', metadata_file='/mnt/project_impresso/original/SWISSINFO/WW2-SOC-bulletins-json/SOC_rb_metadata.json'),
 IssueDirectory(alias='SOC_TH', date=datetime.date(1945, 3, 7), edition='a', path='/mnt/project_impresso/original/SWISSINFO/WW2-SOC-bulletins-json/SOC_TH/1945/03/07/a', metadata_file='/mnt/project_impresso/original/SWISSINFO/WW2-SOC-bulletins-json/SOC_rb_metadata.json'),
 IssueDirectory(alias='SOC_VS', date=datetime.date(1944, 11, 29), edition='a', path='/mnt/project_impresso/original/SWISSINFO/WW2-SOC-bulletins-json/SOC_VS/1944/11/29/a', metadata_fi

In [None]:
first_page = case_2_issue.pages[0]
all_blocks_xy_coords, paragraphs, par_sizes = parse_lines(case_2_issue.page_jsons[0]["blocks_with_lines"], first_page.id, first_page.notes)
len(paragraphs), sum(len(p['l']) for p in paragraphs), paragraphs

(6,
 42,
 [{'c': [129, 54, 1330, 53],
   'l': [{'c': [129, 54, 1330, 53],
     't': [{'c': [129, 55, 180, 51], 'tx': 'Australie', 'gn': False},
      {'c': [319, 55, 22, 51], 'tx': '  ', 'gn': False},
      {'c': [351, 55, 21, 51], 'tx': '  ', 'gn': False},
      {'c': [372, 55, 36, 51], 'tx': '28', 'gn': False},
      {'c': [408, 55, 43, 51], 'tx': '    ', 'gn': False},
      {'c': [451, 55, 137, 51], 'tx': 'février', 'gn': False},
      {'c': [611, 55, 77, 51], 'tx': '1942', 'gn': False},
      {'c': [1327, 55, 122, 51], 'tx': 'Perrin', 'gn': False}]}]},
  {'c': [132, 197, 436, 51],
   'l': [{'c': [132, 197, 436, 51],
     't': [{'c': [132, 197, 97, 51], 'tx': 'Ghers', 'gn': False},
      {'c': [253, 197, 252, 51], 'tx': 'compatriotes,', 'gn': False},
      {'c': [505, 197, 20, 51], 'tx': '  ', 'gn': False},
      {'c': [556, 197, 2, 51], 'tx': '.', 'gn': False}]}]},
  {'c': [132, 248, 1524, 88],
   'l': [{'c': [491, 248, 1165, 52],
     't': [{'c': [491, 248, 117, 52], 'tx': 'Durant

In [None]:
merged_paragraph_lines = []
merged_paragraph_coords = []
for p in paragraphs:
    merged_paragraph_lines.extend(p['l'])

region_coords = coords_to_xywh(compute_agg_coords(all_blocks_xy_coords))
new_p = {'c': region_coords, 'l': merged_paragraph_lines}
len(merged_paragraph_lines), merged_paragraph_lines

(42,
 [{'c': [129, 54, 1330, 53],
   't': [{'c': [129, 55, 180, 51], 'tx': 'Australie', 'gn': False},
    {'c': [319, 55, 22, 51], 'tx': '  ', 'gn': False},
    {'c': [351, 55, 21, 51], 'tx': '  ', 'gn': False},
    {'c': [372, 55, 36, 51], 'tx': '28', 'gn': False},
    {'c': [408, 55, 43, 51], 'tx': '    ', 'gn': False},
    {'c': [451, 55, 137, 51], 'tx': 'février', 'gn': False},
    {'c': [611, 55, 77, 51], 'tx': '1942', 'gn': False},
    {'c': [1327, 55, 122, 51], 'tx': 'Perrin', 'gn': False}]},
  {'c': [132, 197, 436, 51],
   't': [{'c': [132, 197, 97, 51], 'tx': 'Ghers', 'gn': False},
    {'c': [253, 197, 252, 51], 'tx': 'compatriotes,', 'gn': False},
    {'c': [505, 197, 20, 51], 'tx': '  ', 'gn': False},
    {'c': [556, 197, 2, 51], 'tx': '.', 'gn': False}]},
  {'c': [491, 248, 1165, 52],
   't': [{'c': [491, 248, 117, 52], 'tx': 'Durant', 'gn': False},
    {'c': [631, 248, 37, 52], 'tx': 'ïé', 'gn': False},
    {'c': [688, 248, 77, 52], 'tx': 'mtois', 'gn': False},
    {'c': [

### 3. Detect/Select functions

In [None]:
args = {
    "access_rights":"",
    "chunk_size":5,
    "config_file":"/home/piconti/impresso-text-acquisition/text_preparation/config/importer_config/import_swissinfo_debug.json",
    "input_dir":"/mnt/project_impresso/original/SWISSINFO/",
    "log_file":"/home/piconti/impresso-text-acquisition/text_preparation/data/logs/test_logs/debug_swissinfo_importer_g.log",
    "output_dir":"/scratch/piconti/impresso/SWISSINFO",
    "s3_bucket":"10-canonical-sandbox",
    "temp_dir":"/scratch/piconti/impresso/temp_dump",
    "git_repo":"/home/piconti/impresso-text-acquisition",
}

access_rights=""
chunk_size=5
config_file="/home/piconti/impresso-text-acquisition/text_preparation/config/importer_config/import_swissinfo_debug.json"
input_dir="/mnt/project_impresso/original/SWISSINFO/"
log_file="/home/piconti/impresso-text-acquisition/text_preparation/data/logs/test_logs/debug_swissinfo_importer_g.log"
output_dir="/scratch/piconti/impresso/SWISSINFO"
s3_bucket="10-canonical-sandbox"
temp_dir="/scratch/piconti/impresso/temp_dump"
git_repo="/home/piconti/impresso-text-acquisition"


In [None]:
from text_preparation.importers.swissinfo.detect import detect_issues as SI_detect_issues
from text_preparation.importers.swissinfo.detect import select_issues as SI_select_issues
from text_preparation.importers import generic_importer as gi
from impresso_essentials.utils import init_logger
from collections import Counter
import logging

In [None]:
issue_class = SwissInfoRadioBulletinIssue
detect_func = SI_detect_issues
select_func = SI_select_issues

In [None]:
logger = logging.getLogger()
init_logger(logger, logging.DEBUG, args['log_file'])

<RootLogger root (DEBUG)>

In [None]:
client = gi.get_dask_client(None, args['log_file'], logging.DEBUG, 8)

In [None]:
client

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 8
Total threads: 16,Total memory: 251.79 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:33943,Workers: 8
Dashboard: http://127.0.0.1:8787/status,Total threads: 16
Started: Just now,Total memory: 251.79 GiB

0,1
Comm: tcp://127.0.0.1:39139,Total threads: 2
Dashboard: http://127.0.0.1:46389/status,Memory: 31.47 GiB
Nanny: tcp://127.0.0.1:46877,
Local directory: /scratch/piconti/impresso/dask_tmp/dask-scratch-space/worker-yiwqmv9i,Local directory: /scratch/piconti/impresso/dask_tmp/dask-scratch-space/worker-yiwqmv9i

0,1
Comm: tcp://127.0.0.1:38109,Total threads: 2
Dashboard: http://127.0.0.1:41801/status,Memory: 31.47 GiB
Nanny: tcp://127.0.0.1:35213,
Local directory: /scratch/piconti/impresso/dask_tmp/dask-scratch-space/worker-o1tz6z7d,Local directory: /scratch/piconti/impresso/dask_tmp/dask-scratch-space/worker-o1tz6z7d

0,1
Comm: tcp://127.0.0.1:43723,Total threads: 2
Dashboard: http://127.0.0.1:39511/status,Memory: 31.47 GiB
Nanny: tcp://127.0.0.1:43791,
Local directory: /scratch/piconti/impresso/dask_tmp/dask-scratch-space/worker-2932e_77,Local directory: /scratch/piconti/impresso/dask_tmp/dask-scratch-space/worker-2932e_77

0,1
Comm: tcp://127.0.0.1:36133,Total threads: 2
Dashboard: http://127.0.0.1:35825/status,Memory: 31.47 GiB
Nanny: tcp://127.0.0.1:37095,
Local directory: /scratch/piconti/impresso/dask_tmp/dask-scratch-space/worker-9qcv90qw,Local directory: /scratch/piconti/impresso/dask_tmp/dask-scratch-space/worker-9qcv90qw

0,1
Comm: tcp://127.0.0.1:36917,Total threads: 2
Dashboard: http://127.0.0.1:44721/status,Memory: 31.47 GiB
Nanny: tcp://127.0.0.1:40585,
Local directory: /scratch/piconti/impresso/dask_tmp/dask-scratch-space/worker-orr62onk,Local directory: /scratch/piconti/impresso/dask_tmp/dask-scratch-space/worker-orr62onk

0,1
Comm: tcp://127.0.0.1:44463,Total threads: 2
Dashboard: http://127.0.0.1:36719/status,Memory: 31.47 GiB
Nanny: tcp://127.0.0.1:40555,
Local directory: /scratch/piconti/impresso/dask_tmp/dask-scratch-space/worker-9ujk_l5u,Local directory: /scratch/piconti/impresso/dask_tmp/dask-scratch-space/worker-9ujk_l5u

0,1
Comm: tcp://127.0.0.1:38323,Total threads: 2
Dashboard: http://127.0.0.1:43531/status,Memory: 31.47 GiB
Nanny: tcp://127.0.0.1:41537,
Local directory: /scratch/piconti/impresso/dask_tmp/dask-scratch-space/worker-alt7cq0n,Local directory: /scratch/piconti/impresso/dask_tmp/dask-scratch-space/worker-alt7cq0n

0,1
Comm: tcp://127.0.0.1:33929,Total threads: 2
Dashboard: http://127.0.0.1:46527/status,Memory: 31.47 GiB
Nanny: tcp://127.0.0.1:33105,
Local directory: /scratch/piconti/impresso/dask_tmp/dask-scratch-space/worker-7z4uz5ww,Local directory: /scratch/piconti/impresso/dask_tmp/dask-scratch-space/worker-7z4uz5ww


In [None]:
client.close()

In [None]:
client.shutdown()

In [None]:
p = '/mnt/project_impresso/original/SWISSINFO/WW2-SOC-bulletins-json/SOC_CJ/1940/08/19/a'

p.split("/")[-5]

'SOC_CJ'

In [None]:
if config_file and os.path.isfile(config_file):
    logger.info("Found config file: %s", os.path.realpath(config_file))
    with open(config_file, "r", encoding="utf-8") as f:
        config = json.load(f)
    issues = gi.apply_select_func(
        issue_class,
        config,
        input_dir=input_dir,
        access_rights=access_rights,
        select_func=select_func,
        tmp_dir=temp_dir,
    )
    logger.info(
        "%s issues remained to import after applying filter: %s", len(issues), issues
    )
else:
    logger.info("No config file found.")
    issues = gi.apply_detect_func(
        issue_class,
        input_dir,
        access_rights,
        detect_func=detect_func,
        tmp_dir=temp_dir,
    )
    logger.info("%s  issues to import detected", len(issues))

In [None]:
len(issues), sorted(issues, key=lambda i: i.date)

(362,
 [IssueDirectory(alias='SOC_CJ', date=datetime.date(1940, 8, 2), edition='a', path='/mnt/project_impresso/original/SWISSINFO/WW2-SOC-bulletins-json/SOC_CJ/1940/08/02/a', metadata_file='/mnt/project_impresso/original/SWISSINFO/WW2-SOC-bulletins-json/SOC_rb_metadata.json'),
  IssueDirectory(alias='SOC_CJ', date=datetime.date(1940, 8, 3), edition='a', path='/mnt/project_impresso/original/SWISSINFO/WW2-SOC-bulletins-json/SOC_CJ/1940/08/03/a', metadata_file='/mnt/project_impresso/original/SWISSINFO/WW2-SOC-bulletins-json/SOC_rb_metadata.json'),
  IssueDirectory(alias='SOC_CJ', date=datetime.date(1940, 8, 5), edition='a', path='/mnt/project_impresso/original/SWISSINFO/WW2-SOC-bulletins-json/SOC_CJ/1940/08/05/a', metadata_file='/mnt/project_impresso/original/SWISSINFO/WW2-SOC-bulletins-json/SOC_rb_metadata.json'),
  IssueDirectory(alias='SOC_CJ', date=datetime.date(1940, 8, 6), edition='a', path='/mnt/project_impresso/original/SWISSINFO/WW2-SOC-bulletins-json/SOC_CJ/1940/08/06/a', metad

In [None]:
soc_cj_issues = list(filter(lambda i: i.alias=='SOC_CJ', issues))
soc_vs_issues = list(filter(lambda i: i.alias=='SOC_VS', issues))

len(soc_cj_issues), len(soc_vs_issues)

(30, 332)

Issues detection and filtering works! 
Check to see what we detect for ingestion

In [None]:
all_issues = gi.apply_detect_func(
        issue_class,
        input_dir,
        access_rights,
        detect_func=detect_func,
        
        tmp_dir=temp_dir,
    )

len(all_issues), Counter([i.alias for i in all_issues])

(6666,
 Counter({'SOC_CJ': 3924,
          'SOC_CP': 1481,
          'SOC_SO': 746,
          'SOC_VS': 332,
          'SOC_TH': 183}))

In [None]:
issues_per_alias = {}
for i in all_issues:
    if i.alias in issues_per_alias:
        issues_per_alias[i.alias].append(i)
    else:
        issues_per_alias[i.alias] = [i]

min_max_dates = {
    alias: (str(min(a_issues, key=lambda i: i.date).date), str(max(a_issues , key=lambda i: i.date).date)) 
    for alias, a_issues in issues_per_alias.items()
}
min_max_dates

{'SOC_TH': ('1941-03-30', '1945-03-29'),
 'SOC_CP': ('1939-03-18', '1945-08-29'),
 'SOC_VS': ('1943-01-31', '1944-12-30'),
 'SOC_CJ': ('1940-07-22', '1945-12-31'),
 'SOC_SO': ('1940-06-12', '1945-07-30')}

## 3. INA

For INA we currently only have a very small sample on the european construction

In [2]:
metadata_filepath = "/home/piconti/impresso-text-acquisition/text_preparation/data/sample_data/INA/ConstructionEuropeenne_EMISSIONPHONO.csv"

ina_sample_dir = os.path.dirname(metadata_filepath)
metadata_filepath, ina_sample_dir

('/home/piconti/impresso-text-acquisition/text_preparation/data/sample_data/INA/ConstructionEuropeenne_EMISSIONPHONO.csv',
 '/home/piconti/impresso-text-acquisition/text_preparation/data/sample_data/INA')

### 1. Metadata exploration and/or extraction

INA has provided us with a small excel document on their sub-corpus.
This document contains the link between metadata about the shows and the source files provided

In [30]:
df = pd.read_csv(metadata_filepath)
print(len(df))
df.head()

278


Unnamed: 0,Identifiant de la notice,Titre propre,Titre collection,Date d'enregistrement,Durée,Société de programmes,Canal de diffusion,Générique (Aff. Lig.),Résumé,Nom fichier (info)
0,PH811014938,,Conseil de l'Europe,01/01/1949,00:17:13,RDF,Chaine non determinée,,,DB09994_01_0_171306
1,PHD86046252,Conseil de l'Europe et Parlement européen,La ronde des nations,20/07/1949,00:17:16,RTF,Chaîne Nationale,"PAR Philip, André ; PAR Roche, Emile ; P...","A propos de la réunion prochaine, à Strasbourg...",DBA01061_01_000000_001716\n
2,PHD86026894,1ère session de l'assemblée européenne,Conseil de l'Europe,08/09/1949,00:05:00,RTF,Chaîne Nationale,"PAR Spaak, Paul Henri ;",,DB03300_01__\n
3,PHD85028461,L'UNESCO au travail,La ronde des nations,02/09/1949,,RTF,Chaîne Nationale,,,DB03070_01__\n
4,PH808003302,Accords économiques internationaux,La ronde des nations,12/01/1950,00:17:13,RTF,Chaîne Parisienne,"PAR Devinat, Paul ; PAR Gascuel, Jacques ;...","Attention, matériel sauvegardé, une restaurati...",DB09994_01_000000_001713\n


In [24]:
# first check if the titles are unique
print("Checking for duplicates in each column")
for col in df.columns:
    print(f" - {col}: {df[col].duplicated().any()}")

Checking for duplicates in each column
 - Identifiant de la notice: False
 - Titre propre: True
 - Titre collection: True
 - Date d'enregistrement: True
 - Durée: True
 - Société de programmes: True
 - Canal de diffusion: True
 - Générique (Aff. Lig.) : True
 - Résumé: True
 - Nom fichier (info): True


In [25]:
df_collections = df.groupby("Titre collection").agg(list)
df_collections

Unnamed: 0_level_0,Identifiant de la notice,Titre propre,Date d'enregistrement,Durée,Société de programmes,Canal de diffusion,Générique (Aff. Lig.),Résumé,Nom fichier (info)
Titre collection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
"C'est en France, c'est en Europe","[00165368, 00167288, 00169364, 00171017, 00172...","[Le dimanche des Européens, Enseignement, éduc...","[08/09/1996, 15/09/1996, 22/09/1996, 29/09/199...","[00:55:00 , 00:55:00 , 00:55:00 , 00:55:00 , 0...","[RF, RF, RF, RF, RF, RF, RF, RF, RF, RF, RF, R...","[France Inter, France Inter, France Inter, Fra...","[PRO Dhordain, Roland ; PRO Mercier, Sandri...",[Reportages au coeur de la ville de Düsseldorf...,"[96F04705SA0001_01__\n, 96F04705SA0002_01__\n,..."
Conseil de l'Europe,"[PH811014938, PHD86026894, PHD86026156, PHD860...","[nan, 1ère session de l'assemblée européenne, ...","[01/01/1949, 08/09/1949, 03/05/1950, 09/08/195...","[00:17:13, 00:05:00 , 00:04:04 , nan, 00:02:20...","[RDF, RTF, RTF, RTF, RTF, RTF, RTF, RTF, RTF, ...","[Chaine non determinée, Chaîne Nationale, Chai...","[nan, PAR Spaak, Paul Henri ; , nan, JOU We...","[nan, nan, L'importance de cette institution, ...","[DB09994_01_0_171306, DB03300_01__\n, DB16046_..."
L'Europe est pour demain,"[PHD88014525, PHD88014523, PHD88014524, PHD880...",[L'Europe est pour demain : enregistrement du ...,"[12/05/1955, 04/05/1955, 06/04/1955, 01/06/195...","[00:22:00 , 00:18:00 , nan, 00:15:00 , 00:17:0...","[RTF, RTF, RTF, RTF, RTF, RTF, RTF]","[Chaîne Nationale, Chaîne Parisienne, Chaîne N...","[PRO Andrieu, Jean José ; PAR Brousse, Pier...",[Monsieur Pierre BROUSSE : le problème des tra...,"[LXG00305_01__\n, LXF00333_01__\n, LXG00272_01..."
La ronde des nations,"[PHD86046252, PHD85028461, PH808003302, PHD860...","[Conseil de l'Europe et Parlement européen, L'...","[20/07/1949, 02/09/1949, 12/01/1950, 12/01/195...","[00:17:16 , nan, 00:17:13 , 00:17:00 , 00:21:3...","[RTF, RTF, RTF, RTF, RTF, RTF, RTF, RTF, RTF, ...","[Chaîne Nationale, Chaîne Nationale, Chaîne Pa...","[PAR Philip, André ; PAR Roche, Emile ; ...","[A propos de la réunion prochaine, à Strasbour...","[DBA01061_01_000000_001716\n, DB03070_01__\n, ..."
Mode d'emploi de l'Europe,"[PHD94031438, PHD94031439, PHD94031441, PHD940...","[Europe, République fédérale d'Allemagne, Holl...","[01/01/1964, 01/01/1964, 01/01/1964, 01/01/196...","[00:11:00 , 00:11:00 , 00:11:00 , 00:11:00 , 0...","[RTF, RTF, RTF, RTF, RTF, RTF, RTF, RTF, RTF, ...","[France Inter, France Inter, France Inter, Fra...","[PRO Comte, Pierre ; PRO Franceschini, Paul...",[Une émission de Pierre COMTE avec le concours...,[243P00001_01__\n243P00065_01__\n243P00067_01_...
Problèmes européens,"[PHD88018787, PHD88018788, PHD88018789, PHD880...",[Problèmes européens : émission du 21 mai 1956...,"[21/05/1956, 28/05/1956, 11/06/1956, 18/06/195...","[00:19:50 , 00:20:00 , 00:20:10 , 00:20:00 , n...","[RTF, RTF, RTF, RTF, RTF, RTF, RTF, RTF, RTF, ...","[Chaîne Nationale, Chaîne Nationale, Chaîne Na...","[PRO Balensi, Jean ; PRO Dournes, Pierre ;...",[Magazine hebdomadaire d'une vingtaine de minu...,"[LD06861_01__\n, LD07024_01__\n, LD07386_01__\..."


#### Creating aliases for each collection title - corresponding to media titles

In [None]:
# option 1, longer aliases, easier to identify
collection_aliases = {
    "C'est en France, c'est en Europe": "france_europe",
    "Conseil de l'Europe": "conseil_europe",
    "L'Europe est pour demain": "europe_demain",
    "La ronde des nations": "ronde_nations",
    "Mode d'emploi de l'Europe": "emploi_europe",
    "Problèmes européens": "problemes_europeens"
}
# option 2, shorter aliases, similar to existing ones
collection_aliases_2 = {
    "C'est en France, c'est en Europe": "CFCE",
    "Conseil de l'Europe": "CDE",
    "L'Europe est pour demain": "EPD",
    "La ronde des nations": "RDN",
    "Mode d'emploi de l'Europe": "MEE",
    "Problèmes européens": "PBE"
}

# check if any of these aliases are already used in our collections
first_valid = not any(a in ALL_MEDIA for a in collection_aliases.values())
second_valid = not any(a in ALL_MEDIA for a in collection_aliases_2.values())
print(f"First option: {first_valid}, decond option: {second_valid}")
if first_valid and second_valid:
    print("All the listed aliases could be used as they don't exist yet in Impresso")

First option: True, decond option: True
All the listed aliases could be used as they don't exist yet in Impresso


Create the final formating before creating a json from the csv.

We need to add the aliases for each collection, as well as creating 1 line per notice ID and file name pair - as they seem to be the issue folder/filenames

In [53]:
date_str = '20/07/1949'
date_obj = datetime.strptime(date_str, "%d/%m/%Y")

# both approaches work but the one "by hand" is faster
date_obj, date_obj.strftime('%Y-%m-%d'), '-'.join(date_str.split('/')[::-1])

(datetime.datetime(1949, 7, 20, 0, 0), '1949-07-20', '1949-07-20')

In [59]:
i=3
chr(i+ord('a'))

'd'

In [54]:
ord('a')

97

In [78]:
def issue_id_from_row(row, existing_issue_dates):
    # the date is in format DD/MM/YYYY and we want YYYY-MM-DD
    i_date = '-'.join(row["Date d'enregistrement"].split('/')[::-1])
    alias = row['Alias Collection']
    prev_ed = Counter(existing_issue_dates)[i_date]
    # automatically define the edition based on the existing number of shows on the given day
    edition = chr(prev_ed+ord('a'))
    if edition != 'a': 
        print(f"{alias}-{i_date}-{edition}: notice_id = {row['Identifiant de la notice']}, edition={edition}")
    existing_issue_dates.append(i_date)
    # construct the id
    return f"{alias}-{i_date}-{edition}"

In [120]:
final_df = df.copy()
final_df['Alias Collection'] = final_df["Titre collection"].apply(lambda x: collection_aliases_2[x])
final_df['Noms fichers'] = final_df['Nom fichier (info)'].apply(lambda x: [x.replace('\n', '')] if x.endswith("\n") else x.split('\n'))
final_df = final_df.explode('Noms fichers')
#final_df['Noms fichers'] = final_df['Noms fichers'].apply(lambda x: x.split('_')[0])

final_df['Nom Dossier'] = final_df[['Identifiant de la notice', 'Noms fichers']].apply(lambda x: '_'.join([x['Identifiant de la notice'], x['Noms fichers'].split('_')[0]]), axis=1)

# finally, create the canonical IDs (issues and audio records) for each program based on the existing info
existing_issue_dates = []
final_df['Issue ID'] = final_df.apply(lambda r: issue_id_from_row(r, existing_issue_dates), axis=1)
final_df['Audio Record ID'] = final_df['Issue ID'].apply(lambda x: f"{x}-r0001")

final_df

RDN-1950-01-12-b: notice_id = PHD86024916, edition=b
CDE-1950-08-01-b: notice_id = PHD86062503, edition=b
CDE-1950-08-01-c: notice_id = PHD86062506, edition=c
CDE-1950-08-01-d: notice_id = PHD86062501, edition=d
CDE-1950-08-01-e: notice_id = PHD86062502, edition=e
CDE-1950-08-01-f: notice_id = PHD86062504, edition=f
CDE-1950-08-01-g: notice_id = PHD86062500, edition=g
CDE-1950-08-01-h: notice_id = PHD86062505, edition=h
PBE-1957-03-09-b: notice_id = PHD98209737, edition=b
PBE-1957-03-09-c: notice_id = PHD98209739, edition=c
PBE-1957-03-09-d: notice_id = PHD98209740, edition=d
PBE-1957-03-09-e: notice_id = PHD88028574, edition=e
PBE-1957-03-30-b: notice_id = PHD98205959, edition=b
PBE-1957-04-13-b: notice_id = PHD88028580, edition=b
PBE-1957-04-15-b: notice_id = PHD88028583, edition=b
PBE-1957-04-27-b: notice_id = PHD98206181, edition=b
PBE-1957-05-18-b: notice_id = PHD88028586, edition=b
PBE-1957-06-01-b: notice_id = PHD98202120, edition=b
PBE-1957-06-01-c: notice_id = PHD88028588, edi

Unnamed: 0,Identifiant de la notice,Titre propre,Titre collection,Date d'enregistrement,Durée,Société de programmes,Canal de diffusion,Générique (Aff. Lig.),Résumé,Nom fichier (info),Alias Collection,Noms fichers,Nom Dossier,Issue ID,Audio Record ID
0,PH811014938,,Conseil de l'Europe,01/01/1949,00:17:13,RDF,Chaine non determinée,,,DB09994_01_0_171306,CDE,DB09994_01_0_171306,PH811014938_DB09994,CDE-1949-01-01-a,CDE-1949-01-01-a-r0001
1,PHD86046252,Conseil de l'Europe et Parlement européen,La ronde des nations,20/07/1949,00:17:16,RTF,Chaîne Nationale,"PAR Philip, André ; PAR Roche, Emile ; P...","A propos de la réunion prochaine, à Strasbourg...",DBA01061_01_000000_001716\n,RDN,DBA01061_01_000000_001716,PHD86046252_DBA01061,RDN-1949-07-20-a,RDN-1949-07-20-a-r0001
2,PHD86026894,1ère session de l'assemblée européenne,Conseil de l'Europe,08/09/1949,00:05:00,RTF,Chaîne Nationale,"PAR Spaak, Paul Henri ;",,DB03300_01__\n,CDE,DB03300_01__,PHD86026894_DB03300,CDE-1949-09-08-a,CDE-1949-09-08-a-r0001
3,PHD85028461,L'UNESCO au travail,La ronde des nations,02/09/1949,,RTF,Chaîne Nationale,,,DB03070_01__\n,RDN,DB03070_01__,PHD85028461_DB03070,RDN-1949-09-02-a,RDN-1949-09-02-a-r0001
4,PH808003302,Accords économiques internationaux,La ronde des nations,12/01/1950,00:17:13,RTF,Chaîne Parisienne,"PAR Devinat, Paul ; PAR Gascuel, Jacques ;...","Attention, matériel sauvegardé, une restaurati...",DB09994_01_000000_001713\n,RDN,DB09994_01_000000_001713,PH808003302_DB09994,RDN-1950-01-12-a,RDN-1950-01-12-a-r0001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
273,PHD86024396,L'Allemagne dans les organisations internation...,La ronde des nations,31/10/1949,00:20:23,RTF,Chaine non determinée,"PAR Rifflet, Raymond ;","Attention, matériel sauvegardé, une restaurati...",DB06032_01_000000_002023\n,RDN,DB06032_01_000000_002023,PHD86024396_DB06032,RDN-1949-10-31-a,RDN-1949-10-31-a-r0001
274,PHD86024472,L'Angleterre et l'Europe,La ronde des nations,17/11/1949,00:24:14,RTF,Chaîne Nationale,"PAR Servan Schreiber, Jean Jacques ; PAR Al...","Attention, matériel sauvegardé, une restaurati...",DB07004_01_000000_002414\n,RDN,DB07004_01_000000_002414,PHD86024472_DB07004,RDN-1949-11-17-a,RDN-1949-11-17-a-r0001
275,PHD86024499,Fédéralisme et régionalisme,La ronde des nations,21/11/1949,00:19:05,RTF,Chaîne Nationale,"PAR Plisnier, Charles ; PAR Pilet Golaz, Ma...","Attention, matériel sauvegardé, une restaurati...",DB07255_01_000004_001910\n,RDN,DB07255_01_000004_001910,PHD86024499_DB07255,RDN-1949-11-21-a,RDN-1949-11-21-a-r0001
276,PHD86024615,L'Organisation Européenne de coopération écono...,La ronde des nations,09/12/1949,00:19:04,RTF,Chaine non determinée,,"Attention, matériel sauvegardé, une restaurati...",DB08319_01_000000_001904\n,RDN,DB08319_01_000000_001904,PHD86024615_DB08319,RDN-1949-12-09-a,RDN-1949-12-09-a-r0001


In [121]:
# save this df in its current state to put upload back to the Drive
metadata_outpath = os.path.join(os.path.dirname(metadata_filepath), "construction_europeenne_metadata.csv")
#metadata_filepath
final_df.to_csv(metadata_outpath, index=False)

#### Now reformat this DF into a metadata json to be used during ingestion

In [122]:
# set the index to be the foldername, so that the rest of the info can be directly fetched from it.
metadata_json_outpath = os.path.join(os.path.dirname(metadata_filepath), "ina_metadata.json")

df_dict = final_df.replace(np.nan, None).to_dict(orient="records")

metadata_dict = {r['Nom Dossier']: r for r in df_dict}

with open(metadata_json_outpath, 'w', encoding='utf-8') as f:
    json.dump(metadata_dict, f)

### 2. Now the importer

#### a. issues

In [3]:

metadata_json_outpath = os.path.join(ina_sample_dir, "ina_metadata.json")

with open(metadata_json_outpath, 'r', encoding='utf-8') as f:
    read_meatadata_json = json.load(f)

read_meatadata_json['PH808003302_DB09994']

{'Identifiant de la notice': 'PH808003302',
 'Titre propre': 'Accords économiques internationaux',
 'Titre collection': 'La ronde des nations',
 "Date d'enregistrement": '12/01/1950',
 'Durée': '00:17:13 ',
 'Société de programmes': 'RTF',
 'Canal de diffusion': 'Chaîne Parisienne',
 'Générique (Aff. Lig.) ': 'PAR Devinat, Paul  ;   PAR Gascuel, Jacques  ;   PAR Servan Schreiber, Emile  ;   ',
 'Résumé': 'Attention, matériel sauvegardé, une restauration peut être nécessaire avant réutilisation.\nDocument non monté.\nDébat présenté par Fabrice LAMATHE (?) sur le charbon et l\'Europe avec Paul DEVINAT, député et ancien ministre, Jacques GASCUEL, directeur de "Perspectives" et Emile SERVAN-SCHREIBER directeur des "Echos" : la surproduction européenne de charbon selon l\'OECE, sa répartition et les inégalités de prix entre pays possible source de crises, la possible ouverture des frontières européennes, la montée d\'autres énergies comme l\'énergie atomique.',
 'Nom fichier (info)': 'DB099

In [4]:
ina_issuedirs = {}
for rb_issue_subdir in os.listdir(ina_sample_dir):
    issue_path = os.path.join(ina_sample_dir,rb_issue_subdir)
    if os.path.isdir(issue_path):
        print(rb_issue_subdir)
        if rb_issue_subdir in read_meatadata_json:
            issue_metadata = read_meatadata_json[rb_issue_subdir]
            issuedir = id_to_issuedir(issue_metadata['Issue ID'], issue_path)
            ina_issuedirs[rb_issue_subdir] = INAIssueDir(issuedir.alias, issuedir.date, issuedir.edition, issue_path, metadata_json_outpath)

ina_issuedirs

00165368_96F04705SA0001
00167288_96F04705SA0002
PH808003302_DB09994


{'00165368_96F04705SA0001': IssueDirectory(alias='CFCE', date=datetime.date(1996, 9, 8), edition='a', path='/home/piconti/impresso-text-acquisition/text_preparation/data/sample_data/INA/00165368_96F04705SA0001', metadata_file='/home/piconti/impresso-text-acquisition/text_preparation/data/sample_data/INA/ina_metadata.json'),
 '00167288_96F04705SA0002': IssueDirectory(alias='CFCE', date=datetime.date(1996, 9, 15), edition='a', path='/home/piconti/impresso-text-acquisition/text_preparation/data/sample_data/INA/00167288_96F04705SA0002', metadata_file='/home/piconti/impresso-text-acquisition/text_preparation/data/sample_data/INA/ina_metadata.json'),
 'PH808003302_DB09994': IssueDirectory(alias='RDN', date=datetime.date(1950, 1, 12), edition='a', path='/home/piconti/impresso-text-acquisition/text_preparation/data/sample_data/INA/PH808003302_DB09994', metadata_file='/home/piconti/impresso-text-acquisition/text_preparation/data/sample_data/INA/ina_metadata.json')}

In [23]:
for rb_issue_subdir, issuedir in ina_issuedirs.items():
    issue_metadata = read_meatadata_json[rb_issue_subdir]
    print(issue_metadata)
    print(os.path.basename(issuedir.path))
    print(sum(x.endswith('.xml') for x in os.listdir(issuedir.path)))

{'Identifiant de la notice': '00165368', 'Titre propre': 'Le dimanche des Européens', 'Titre collection': "C'est en France, c'est en Europe", "Date d'enregistrement": '08/09/1996', 'Durée': '00:55:00 ', 'Société de programmes': 'RF', 'Canal de diffusion': 'France Inter', 'Générique (Aff. Lig.) ': 'PRO Dhordain, Roland  ;   PRO Mercier, Sandrine  ;   REA Kouyoumdjian, Hélène  ;   PRE Dhordain, Roland  ;   PRE Mercier, Sandrine  ;   PRE Lamarque, José Manuel  ;   ', 'Résumé': "Reportages au coeur de la ville de Düsseldorf au moment où elle fête les 50 ans du land ; une allemande parle des règles strictes de fermeture des magasins le soir et le samedi et de la vie le dimanche (randonnée avec les amis, les sorties au théâtre ou au concert, la lecture...) ; des élèves allemand de 15 ans racontent comment ils passent leur dimanche. Comment les suédois passent leur dimanche. Le dimanche d'une famille anglaise. Un dimanche en Grèce (entr. téléphonique de Grèce). - les dimanches au pays Basque 

#### Get the languages

In [5]:
ex_audio_path = "/home/piconti/impresso-text-acquisition/text_preparation/data/sample_data/INA/00165368_96F04705SA0001/00165368_96F04705SA0001_01___EXPORT.xml"

with open(ex_audio_path, "r", encoding="utf-8") as f:
    raw_xml = f.read()

xml_doc = BeautifulSoup(raw_xml, "xml")

In [29]:
langs = Counter([s.get('lang') for s in xml_doc.find_all('Speaker')]+[ss.get('lang') for ss in xml_doc.find_all("SpeechSegment")] + ['en', 'en', 'de'])
langs_text = list(set([ss.get('lang') for ss in xml_doc.find_all("SpeechSegment")]))
langs, len(langs.keys()), len(langs), max(langs)

(Counter({'fre': 931, 'en': 2, 'de': 1}), 3, 3, 'fre')

In [6]:
{k: v.path for k,v in ina_issuedirs.items()}

{'00165368_96F04705SA0001': '/home/piconti/impresso-text-acquisition/text_preparation/data/sample_data/INA/00165368_96F04705SA0001',
 '00167288_96F04705SA0002': '/home/piconti/impresso-text-acquisition/text_preparation/data/sample_data/INA/00167288_96F04705SA0002',
 'PH808003302_DB09994': '/home/piconti/impresso-text-acquisition/text_preparation/data/sample_data/INA/PH808003302_DB09994'}

In [5]:
ina_issues = {k: INABroadcastIssue(v) for k,v in ina_issuedirs.items()}
ina_issues

RDN-1950-01-12-a - The issue's folder /home/piconti/impresso-text-acquisition/text_preparation/data/sample_data/INA/PH808003302_DB09994 does not contain the expected xml file PH808003302_DB09994_01_000000_001713. Contents of the folder are ['PH808003302_DB09994_01_0_171306_EXPORT.MP3', 'PH808003302_DB09994_01_0_171306_EXPORT.json', 'PH808003302_DB09994_01_0_171306_EXPORT.xml'] will be used.


RDN-1950-01-12-a - The issue's folder /home/piconti/impresso-text-acquisition/text_preparation/data/sample_data/INA/PH808003302_DB09994 does not contain the expected xml file PH808003302_DB09994_01_000000_001713. Contents of the folder are ['PH808003302_DB09994_01_0_171306_EXPORT.MP3', 'PH808003302_DB09994_01_0_171306_EXPORT.json', 'PH808003302_DB09994_01_0_171306_EXPORT.xml'] will be used.


{'00165368_96F04705SA0001': <text_preparation.importers.ina.classes.INABroadcastIssue at 0x7f5ec142f8d0>,
 '00167288_96F04705SA0002': <text_preparation.importers.ina.classes.INABroadcastIssue at 0x7f5ec0833e90>,
 'PH808003302_DB09994': <text_preparation.importers.ina.classes.INABroadcastIssue at 0x7f5ebb463e10>}

In [12]:
ina_issues['00165368_96F04705SA0001'].content_items

[{'m': {'id': 'CFCE-1996-09-08-a-i0001',
   'lg': 'fre',
   'pp': [],
   'tp': 'radio_broadcast_episode',
   'ro': 1,
   't': 'Le dimanche des Européens'},
  'l': {'source': ['Identifiant de la notice (in metadata): 00165368',
    'Noms fichers (in metadata): 96F04705SA0001_01__',
    'Noms fichers (in practice): 00165368_96F04705SA0001_01___EXPORT']}}]

#### b. Audio Records

In [6]:
test_audio = ina_issues['00165368_96F04705SA0001'].audio_records[0]
test_audio.record_data

{'id': 'CFCE-1996-09-08-a-r0001',
 'cdt': '2025-05-16 17:59:43',
 'ts': '2025-05-16T15:59:43Z',
 's': [],
 'iiif_base_uri': 'https://impresso-project.ch/api/proxy/iiif/',
 'stt': '00:00:00',
 'dur': '',
 'st': 'radio_broadcast',
 'sm': 'audio',
 'cc': True}

In [8]:
test_audio.add_issue(ina_issues['00165368_96F04705SA0001'])
test_audio.parse()

test_audio.record_data

Saving utterance: {'tc': [0.0, 3.0], 'speaker': 'SPEAKER_21', 'ss': [{'tc': [0.0, 3.0], 't': [{'tc': [0.0, 0.333], 'tx': "C'est"}, {'tc': [0.333, 0.333], 'tx': 'en'}, {'tc': [0.666, 0.333], 'tx': 'France,'}, {'tc': [1.0, 0.333], 'tx': "c'est"}, {'tc': [1.333, 0.333], 'tx': 'en'}, {'tc': [1.666, 0.333], 'tx': 'Europe'}, {'tc': [2.0, 0.333], 'tx': 'avec'}, {'tc': [2.333, 0.333], 'tx': 'Air'}, {'tc': [2.666, 0.334], 'tx': 'France.'}]}]}
Saving utterance: {'tc': [30.0, 103.0], 'speaker': 'SPEAKER_09', 'ss': [{'tc': [30.0, 23.0], 't': [{'tc': [30.0, 1.916], 'tx': "C'est"}, {'tc': [31.916, 1.916], 'tx': 'en'}, {'tc': [33.833, 1.916], 'tx': 'France,'}, {'tc': [35.75, 1.916], 'tx': "c'est"}, {'tc': [37.666, 1.916], 'tx': 'en'}, {'tc': [39.583, 1.916], 'tx': 'Europe,'}, {'tc': [41.5, 1.916], 'tx': "c'est"}, {'tc': [43.416, 1.916], 'tx': 'une'}, {'tc': [45.333, 1.916], 'tx': 'nouvelle'}, {'tc': [47.25, 1.916], 'tx': 'émission,'}, {'tc': [49.166, 1.916], 'tx': 'Le'}, {'tc': [51.083, 1.917], 'tx':

{'id': 'CFCE-1996-09-08-a-r0001',
 'cdt': '2025-05-16 17:59:43',
 'ts': '2025-05-16T15:59:43Z',
 's': [{'tc': [0.0, 3283.04],
   'u': [{'tc': [0.0, 3.0],
     'speaker': 'SPEAKER_21',
     'ss': [{'tc': [0.0, 3.0],
       't': [{'tc': [0.0, 0.333], 'tx': "C'est"},
        {'tc': [0.333, 0.333], 'tx': 'en'},
        {'tc': [0.666, 0.333], 'tx': 'France,'},
        {'tc': [1.0, 0.333], 'tx': "c'est"},
        {'tc': [1.333, 0.333], 'tx': 'en'},
        {'tc': [1.666, 0.333], 'tx': 'Europe'},
        {'tc': [2.0, 0.333], 'tx': 'avec'},
        {'tc': [2.333, 0.333], 'tx': 'Air'},
        {'tc': [2.666, 0.334], 'tx': 'France.'}]}]},
    {'tc': [30.0, 103.0],
     'speaker': 'SPEAKER_09',
     'ss': [{'tc': [30.0, 23.0],
       't': [{'tc': [30.0, 1.916], 'tx': "C'est"},
        {'tc': [31.916, 1.916], 'tx': 'en'},
        {'tc': [33.833, 1.916], 'tx': 'France,'},
        {'tc': [35.75, 1.916], 'tx': "c'est"},
        {'tc': [37.666, 1.916], 'tx': 'en'},
        {'tc': [39.583, 1.916], 'tx'

In [32]:
test_audio.mp3_filepath = test_audio.xml_filepath.replace(".xml", ".MP3")
audio = MP3(test_audio.mp3_filepath)
test_audio.record_data['dur'] = time.strftime("%H:%M:%S", time.gmtime(audio.info.length))
test_audio.record_data

{'id': 'CFCE-1996-09-08-a-r0001',
 'cdt': '2025-05-16 16:16:02',
 'ts': '2025-05-16T14:16:02Z',
 'r': [],
 'iiif_base_uri': 'https://impresso-project.ch/api/proxy/iiif/',
 'st': 'radio_broadcast',
 'sm': 'audio',
 'cc': True,
 'dur': '00:54:44'}

In [16]:
xml_doc = test_audio.xml

audio_etimes_secs = [float(ss.get('etime')) for ss in xml_doc.findAll('SpeechSegment')]
audio_len_secs = max(audio_etimes_secs)
audio_len_secs

3283.04

In [None]:
def parse(self) -> None:

    self.record_data["dur"] = self._get_duration()
    xml_doc = self.xml

    
    speech_segs = get_utterances(xml_doc)
    

In [57]:
def extract_time_coords_from_elem(elem):
    if elem.name == 'SpeechSegment':
        return [float(elem.get('stime')), float(elem.get('etime'))-float(elem.get('stime'))]
    elif elem.name == 'Word':
        return [float(elem.get('stime')), float(elem.get('dur'))]

In [61]:
def get_utterances(xml_doc):
    xml_speech_segs = xml_doc.findAll('SpeechSegment')
    utterances = []

    same_speaker_speech_segs = []
    last_speaker = None
    last_utt_stime = 0
    last_utt_etime = 0
    for xml_ss in xml_speech_segs:
        print(f"New speech segment")

        tokens = [
            {
                "tc": extract_time_coords_from_elem(word),
                "tx": word.get_text()
            } for word in xml_ss.findAll('Word')
        ]
        
        if xml_ss.get('spkid') == last_speaker:
            # case 1, same speaker as last speech segment,
            same_speaker_speech_segs.append(
                {
                    "tc": extract_time_coords_from_elem(xml_ss),
                    "t": tokens
                }
            )
            # update the last end time for the current utterance
            last_utt_etime = float(xml_ss.get('etime')) 
        else:
            # case 2: new speaker, save the last utterance if possible and start a new one
            if last_speaker is not None:
                utterances.append(
                    {
                        "tc": [last_utt_stime, last_utt_etime-last_utt_stime],
                        "speaker": last_speaker,
                        "ss": same_speaker_speech_segs
                    }
                )
                print(f"Saving utterance: {utterances[-1]}")
            
            # start the new utterance
            last_utt_stime = float(xml_ss.get('stime')) 
            last_utt_etime = float(xml_ss.get('etime'))
            last_speaker = xml_ss.get('spkid')
            same_speaker_speech_segs = [
                {
                    "tc": extract_time_coords_from_elem(xml_ss),
                    "t": tokens
                }
            ]

    return utterances


In [62]:
utts = get_utterances(xml_doc)

New speech segment
New speech segment
Saving utterance: {'tc': [0.0, 3.0], 'speaker': 'SPEAKER_21', 'ss': [{'tc': [0.0, 3.0], 't': [{'tc': [0.0, 0.333], 'tx': "C'est"}, {'tc': [0.333, 0.333], 'tx': 'en'}, {'tc': [0.666, 0.333], 'tx': 'France,'}, {'tc': [1.0, 0.333], 'tx': "c'est"}, {'tc': [1.333, 0.333], 'tx': 'en'}, {'tc': [1.666, 0.333], 'tx': 'Europe'}, {'tc': [2.0, 0.333], 'tx': 'avec'}, {'tc': [2.333, 0.333], 'tx': 'Air'}, {'tc': [2.666, 0.334], 'tx': 'France.'}]}]}
New speech segment
New speech segment
New speech segment
New speech segment
New speech segment
New speech segment
New speech segment
New speech segment
New speech segment
New speech segment
New speech segment
New speech segment
New speech segment
New speech segment
New speech segment
New speech segment
New speech segment
New speech segment
Saving utterance: {'tc': [30.0, 103.0], 'speaker': 'SPEAKER_09', 'ss': [{'tc': [30.0, 23.0], 't': [{'tc': [30.0, 1.916], 'tx': "C'est"}, {'tc': [31.916, 1.916], 'tx': 'en'}, {'tc': [