# Explore Radio Broadcasts and Radio Bulletins Schemas

We are now integrating radio data into Impresso.
Radio data is more complex and diverse than the newspaper data we have handled so far, so we are exploring how to define schemas for the various radio sources that remain compatible with our internal formats and pipelines.

### Imports

In [1]:
from tqdm import tqdm
from pathlib import Path
import os
import json
import datetime
from bs4 import BeautifulSoup
from impresso_essentials.utils import IssueDir, SourceType, SourceMedium
from text_preparation.utils import coords_to_xywh
from text_preparation.importers.classes import CanonicalIssue, CanonicalPage
from text_preparation.importers.swissinfo.detect import SwissInfoIssueDir
from impresso_essentials.io.fs_utils import canonical_path
from text_preparation.importers.swissinfo.classes import SwissInfoRadioBulletinPage, SwissInfoRadioBulletinIssue

## 1. Small debugs and checks

In [None]:
SourceType.RB.value

In [None]:
SourceMedium.TPS.value

## 2. Issue implementation

In [2]:
base_swissinfo_path = "/mnt/project_impresso/original/"
eg_issue_path = "SWISSINFO/WW2-SOC-bulletins-json/SOC_CJ/1940/07/22/a"
og_data_base_dir = os.path.join(base_swissinfo_path, "SWISSINFO/WW2-SOC-bulletins/ww2-PDF")

debug_issueDir = SwissInfoIssueDir("SOC_CJ", datetime.date(1940, 7, 22), 'a', os.path.join(base_swissinfo_path,eg_issue_path))
debug_issueDir

IssueDirectory(alias='SOC_CJ', date=datetime.date(1940, 7, 22), edition='a', path='/mnt/project_impresso/original/SWISSINFO/WW2-SOC-bulletins-json/SOC_CJ/1940/07/22/a')

In [3]:
IssueDir("SOC_CJ", datetime.date(1940, 7, 22), 'a', eg_issue_path)

IssueDir(alias='SOC_CJ', date=datetime.date(1940, 7, 22), edition='a', path='SWISSINFO/WW2-SOC-bulletins-json/SOC_CJ/1940/07/22/a')

In [4]:
test_issue = SwissInfoRadioBulletinIssue(debug_issueDir)
test_issue

<text_preparation.importers.swissinfo.classes.SwissInfoRadioBulletinIssue at 0x7ff904880c10>

In [65]:
test_issue.issue_data

{'id': 'SOC_CJ-1940-07-22-a',
 'cdt': '2025-04-25 17:00:46',
 'st': 'radio_broadcast',
 'sm': 'typescript',
 'i': [{'m': {'id': 'SOC_CJ-1940-07-22-a-i0001',
    'lg': 'de',
    'pp': [1, 2],
    'tp': 'chronicle',
    'ro': 1,
    't': 'Tageschronik 22.07.1940'},
   'l': {'source': 'WW2-SOC-bulletins/ww2-PDF/SRI_XY_CJ_19400722_DE.pdf'}}],
 'pp': ['SOC_CJ-1940-07-22-a-p0001', 'SOC_CJ-1940-07-22-a-p0002'],
 'rc': 'SOC (KWD)',
 'rp': 'Tageschronik',
 'n': ['Page 1: page size within OCR before coord rescaling: [595.9199829101562, 837.1199951171875]',
  'Page 2: page size within OCR before coord rescaling: [607.6799926757812, 848.6400146484375]']}

In [23]:
Path('WW2-SOC-bulletins/ww2-PDF/SRI_XY_CJ_19400722_DE.pdf').stem

'SRI_XY_CJ_19400722_DE'

In [18]:
test_issue.path.split("/")[:-5]

['',
 'mnt',
 'project_impresso',
 'original',
 'SWISSINFO',
 'WW2-SOC-bulletins-json']

In [None]:
test_issue.pages[1].page_data

In [None]:
# Debug walk-through of page loading: read the bulletin JSON of the test issue
# and create one page object per OCR page.
with open(test_issue.json_file, encoding="utf-8") as f:
    bulletin_json = json.load(f)

test_issue.bulletin_lang = bulletin_json['lang']

# NOTE(review): page_jsons is reset to an empty list and stays empty in this cell,
# because the only append (further down) is commented out.
test_issue.page_jsons = []

for page in bulletin_json["ocr_pages"]:
    
    # "page_num" is used as-is to look up the image path, and converted to int
    # (+1) to get the page number used in the page id.
    page_img_file = bulletin_json["jp2_full_paths"][page["page_num"]]
    page_no = int(page["page_num"])+1
    page_id = "{}-p{}".format(test_issue.id, str(page_no).zfill(4))
    # file name without directory, cut at the first "."
    page_img_name = page_img_file.split("/")[-1].split(".")[0]
    # ensure the page numbering is correct: the image file must be named after the page id
    assert page_img_name == page_id, f"{test_issue.id} problem with page numbering/naming, page_img_name ({page_img_name}) != page_id ({page_id})"
    
    # format the page json for future use (currently disabled)
    #test_issue.page_jsons.append(test_issue.construct_page_json(page_id, page))

    # create page object and add it to the list of pages
    # NOTE(review): this rebinds the loop variable `page` (the OCR dict) to the page
    # object, so any later cell reading `page` gets the last page object.
    # NOTE(review): test_issue.pages presumably already holds the pages created by the
    # constructor, in which case this appends duplicates - TODO confirm.
    page = SwissInfoRadioBulletinPage(page_id, page_no)
    test_issue.pages.append(page)

    # TODO maybe - extract fonts

In [None]:
ocr_p1 = bulletin_json["ocr_pages"][0]
ocr_p1

In [None]:
round(ocr_p1['ocr_page_size'][0], 4)

In [None]:
ocr_p1['blocks_with_lines'][0]

In [None]:
# Unfinished draft of building one page JSON per OCR page.
# NOTE(review): nothing is ever appended to `json_pages` or `blocks`, so this cell
# currently produces no result.
json_pages = []
for page_orc in bulletin_json["ocr_pages"]:
    page_json = {
        # NOTE(review): `page_id` is not computed in this loop; it is whatever the
        # page-loading loop in an earlier cell left behind, so every page would get
        # the same id.
        "id": page_id,
        "fw": page_orc['jp2_img_size'][0],
        "fh": page_orc['jp2_img_size'][1],
    }
    blocks = []
    # NOTE(review): this iterates `page`, not `page_orc`; after the page-loading cell
    # `page` is a SwissInfoRadioBulletinPage object, not an OCR dict.
    for block in page['blocks_with_lines']:
        # NOTE(review): placeholder - `coords` is not defined anywhere visible, and
        # `{coords}` would build a set rather than a dict.
        block = {
            coords
        }

### 2.a Check and process swi.xml to fetch the channel and program name metadata
see
https://docs.google.com/presentation/d/1NrbgjwLVhUFr2NRSEun6e2NPHp_AAEnA7dKVMUMvSWc/edit#slide=id.g2d8c9a6b5e0_0_16

In [None]:
# Read the whole SWI metadata XML into memory as text.
swi_metadata_path = "/mnt/project_impresso/original/SWISSINFO/WW2-SOC-bulletins/swi.xml"

# Decode explicitly as UTF-8, like every other file read in this notebook, instead of
# relying on the platform default encoding.
with open(swi_metadata_path, "r", encoding="utf-8") as f:
    raw_xml = f.read()

# NOTE(review): not used in any visible cell; presumably a character offset into
# raw_xml kept from a debugging session - confirm or remove.
pb_idx = 134436754

In [None]:
swi_metadata = BeautifulSoup(raw_xml, "html")
swi_metadata

Loading this XML file takes a very long time, so it is best to preprocess it once into a JSON file containing all the required information, so that the metadata is readily available during ingestion.

In [84]:
def _text_or_none(node, tag_name):
    """Return the text of the first `tag_name` element under `node`, or None if it is missing."""
    match = node.find(tag_name)
    return None if match is None else match.text


def _parse_item(item):
    """Flatten one `item` element of a program into a dict of segment-level metadata."""
    seg_forms = [
        term.text
        for form in item.find_all("form")
        for term in form.find_all("formterm")
    ]
    originators = [
        {
            "last_name": _text_or_none(originator, "originatorlastname"),
            "first_name": _text_or_none(originator, "originatorfirstname"),
            "role": _text_or_none(originator, "originatorrole"),
        }
        for group in item.find_all("originators")
        for originator in group.find_all("originator")
    ]
    persons = [
        {
            "last_name": _text_or_none(person, "personlastname"),
            "first_name": _text_or_none(person, "personfirstname"),
            "language": _text_or_none(person, "sprache"),
            "is_portrait": _text_or_none(person, "portrait"),
        }
        for group in item.find_all("persons")
        for person in group.find_all("person")
    ]
    return {
        "archive_key": _text_or_none(item, "archivekey"),
        "segment_title": _text_or_none(item, "itembeitragstitel"),
        "abstract": _text_or_none(item, "itemabstract"),
        "additional_notes": [note.text for note in item.find_all("itembemerkung")],
        "segment_form": seg_forms,
        "producers": [producer.text for producer in item.find_all("producer")],
        "originators": originators,
        "persons": persons,
    }


def keep_only_radio_bulletins(full_metadata, out_path, base_dir = og_data_base_dir):
    """
    Process the contents of the SWI metadata XML file to keep only the radio bulletins we have at our disposal.

    A program is kept when at least one of its archive keys, suffixed with ".pdf",
    is the name of an entry in `base_dir`. Kept programs are flattened into plain
    dicts and written to `out_path` as JSON (every 50 kept programs, and once more
    at the end).

    Args:
        full_metadata: parsed metadata document exposing `find_all` (the
            BeautifulSoup object built from swi.xml in this notebook).
        out_path: path of the JSON file to write.
        base_dir: directory whose listing defines the available bulletins.

    Returns:
        list[dict]: one dict per kept program. When a program has exactly one
        item, the item's fields are merged into the program dict; otherwise the
        parsed items are stored under the "items" key.
    """
    # A set makes each "<archive key>.pdf" membership test O(1) instead of a list scan.
    existing_bulletins = set(os.listdir(base_dir))
    all_programs = full_metadata.find_all("program")
    total = len(all_programs)

    programs = []
    skipped = 0
    processed = 0
    # tqdm wraps the list itself (not the enumerate object) so it knows the total.
    for p_idx, program in enumerate(tqdm(all_programs)):

        archive_keys = [k.text for k in program.find_all("archivekey")]

        if not any(f"{ak}.pdf" in existing_bulletins for ak in archive_keys):
            skipped += 1
            if skipped % 500 == 0:
                print(f"Skipping program {p_idx+1}/{total} (no bulletin archives) - skipped {skipped} programs so far.")
            continue

        items = [_parse_item(item) for item in program.find_all("item")]

        program_record = {
            "program_title": _text_or_none(program, "sgef"),
            "program_subtitle": _text_or_none(program, "ptit"),
            "transmission": [
                {
                    "date": _text_or_none(t, "transmissiondatestart"),
                    "channel": _text_or_none(t, "transmissionchannel"),
                }
                for t in program.find_all("transmission")
            ],
        }
        if len(items) == 1:
            # the usual case: item-level fields live directly on the program
            program_record.update(items[0])
        else:
            print(f"Program {p_idx+1}/{total} - found {len(items)} items instead of exactly one.")
            program_record["items"] = items

        programs.append(program_record)
        processed += 1
        if processed % 500 == 0:
            print(f"Finished processing program {p_idx+1}/{total} - processed {processed} programs so far.")
        # save the current metadata to a file, so partial results are on disk
        # if the long-running loop is interrupted
        if processed % 50 == 0:
            with open(out_path, "w", encoding="utf-8") as f:
                json.dump(programs, f, ensure_ascii=False, indent=4)

    # save once more at the end
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(programs, f, ensure_ascii=False, indent=4)

    return programs

In [None]:
program_metadata = keep_only_radio_bulletins(swi_metadata, "/home/piconti/impresso-text-acquisition/text_preparation/data/sample_data/SWISSINFO/SOC_rb_metadata.json")

In [76]:
len(program_metadata)

6718

In [None]:
sample_prgrams = swi_metadata.find_all("program")[100:110]
sample_prgrams

In [60]:
def _text_or_none(node, tag_name):
    """Return the text of the first `tag_name` element under `node`, or None if it is missing."""
    match = node.find(tag_name)
    return None if match is None else match.text


# For each sampled program, print its archive keys, then for each of its items the
# segment forms, originators and persons.
for p in sample_prgrams:
    archive_keys = [k.text for k in p.find_all("archivekey")]
    print(archive_keys)
    for elem in p.find_all("item"):
        seg_forms = [
            term.text
            for form in elem.find_all("form")
            for term in form.find_all("formterm")
        ]
        # Missing sub-elements yield None instead of raising AttributeError,
        # the same way the person fields are handled.
        originators = [
            {
                "last_name": _text_or_none(originator, "originatorlastname"),
                "first_name": _text_or_none(originator, "originatorfirstname"),
                "role": _text_or_none(originator, "originatorrole"),
            }
            for group in elem.find_all("originators")
            for originator in group.find_all("originator")
        ]
        persons = [
            {
                "last_name": _text_or_none(person, "personlastname"),
                "first_name": _text_or_none(person, "personfirstname"),
                "language": _text_or_none(person, "sprache"),
                "is_portrait": _text_or_none(person, "portrait"),
            }
            for group in elem.find_all("persons")
            for person in group.find_all("person")
        ]

        print(seg_forms, originators, persons)

['SRI_KAS_BRP_199612_Track12']
['Bericht'] [{'last_name': 'LÃ¼thi', 'first_name': 'Andreas', 'role': 'AUT'}] [{'last_name': 'Conedera', 'first_name': 'Marco', 'language': 'dt', 'is_portrait': '0'}, {'last_name': 'LÃ¼thi', 'first_name': 'Andreas', 'language': 'dt', 'is_portrait': '0'}, {'last_name': 'Scheccia', 'first_name': 'Carlo', 'language': 'it', 'is_portrait': '0'}, {'last_name': 'Lendi', 'first_name': 'Peter', 'language': 'dt', 'is_portrait': '0'}]
['SRI_KAS_BRP_199609_Track03']
['Bericht', 'GerÃ¤usch'] [{'last_name': 'LÃ¼thi', 'first_name': 'Andreas', 'role': 'AUT'}] [{'last_name': 'LÃ¼thi', 'first_name': 'Andreas', 'language': 'dt', 'is_portrait': '0'}]
['SRI_KAS_BRP_199708_Track09']
['Bericht'] [{'last_name': 'LÃ¼thi', 'first_name': 'Andreas', 'role': 'AUT'}] [{'last_name': 'LÃ¼thi', 'first_name': 'Andreas', 'language': 'dt', 'is_portrait': '0'}]
['SRI_KAS_BRP_199308_Track05']
['Trailer'] [{'last_name': 'Zimmermann', 'first_name': 'StÃ©phanie', 'role': 'AUT'}] [{'last_name': '

In [31]:
programs_with_mult_archives = any([len(p.find_all("archivekey"))>1 for p in swi_metadata.find_all("program")])
programs_with_mult_archives

True

#### Checking out what possible values exist in the data

In [86]:
possible_program_types = set([t for p in program_metadata for t in p['segment_form']])
possible_program_types

{'Chronik'}

In [87]:
possible_producers = set([t for p in program_metadata for t in p['producers']])
possible_producers

{'KWD (Radio)'}

In [27]:
rb_metadata_path = os.path.join(base_swissinfo_path, "SWISSINFO/WW2-SOC-bulletins-json/SOC_rb_metadata.json")
with open(rb_metadata_path, encoding="utf-8") as f:
    program_metadata = json.load(f)

In [28]:
possible_channels = set([t['channel'] for p in program_metadata for t in p['transmission']])
possible_channels

{'KWD', 'SOC (KWD)'}

In [89]:
possible_titles = list(set([t['segment_title'] for t in program_metadata]))
len(possible_titles), possible_titles[:10]

(6051,
 ['Tageschronik 20.10.1942',
  'Chroniques du jour 30.05.1944',
  'Law and Freedom: The basis of the Swiss conception of state',
  'El problema de los abastecimientos suizos',
  'Tageschronik 28.11.1943',
  'Chroniques du jour 05.02.1945',
  'Tageschronik 28.10.1940',
  'Verfeinerte Rationierung',
  'Tageschronik 11.02.1944',
  'Today at Home and Abroad 12.09.1944'])

Only one type (chronique/chronicle) and effectively only one channel/producer, SOC (KWD), occur in the data.
These values will therefore be set as fixed constants in the importer.

## 3. Page implementation

In [5]:
test_pg1 = test_issue.pages[0]
test_pg1.add_issue(test_issue)
test_pg1.page_data

{'id': 'SOC_CJ-1940-07-22-a-p0001',
 'cdt': '2025-04-25 17:01:23',
 'r': [],
 'iiif_img_base_uri': 'https://impresso-project.ch/api/proxy/iiif/SOC_CJ-1940-07-22-a-p0001/info.json',
 'st': 'radio_broadcast',
 'sm': 'typescript',
 'cc': True}

In [7]:
test_pg1.issue.content_items[0]['m']['id']

'SOC_CJ-1940-07-22-a-i0001'

In [None]:
#def parse_line
ocr_json = test_pg1.issue.page_jsons[test_pg1.number - 1]
line = ocr_json["blocks_with_lines"][0]['lines'][0]
line_tokens = line["spans"]
line_coords = coords_to_xywh(line["rescaled_bbox"])
len(line_tokens),line_tokens

In [25]:
coords_to_xywh([
                       152.72815326551716,
                                2112.2831453229055,
                                1520.289302060428,
                                2157.3988528493646
                                    ])

[152, 2112, 1368, 45]

In [None]:
def parse_lines(blocks_with_lines):
    """
    Convert OCR blocks into line dicts and collect the blocks' bounding boxes.

    Args:
        blocks_with_lines: sequence of block dicts, each with a "rescaled_bbox"
            and a "lines" list. Each line has a "rescaled_bbox" and a "spans"
            list of tokens, each token having "text" and "rescaled_bbox".

    Returns:
        tuple: (all_line_xy_coords, all_lines) where
            - all_line_xy_coords is the list of each block's "rescaled_bbox",
              left unconverted;
            - all_lines is the list of line dicts {"c": <converted line box>,
              "t": <tokens>}. Each token is {"c", "tx", "gn"}; a token ending
              a line with a hyphen also gets "hy": True, and the first token of
              the following line gets "nf", the re-joined full word.
    """
    all_line_xy_coords = []
    all_lines = []
    # Token that ended the previous line with a hyphen, or None. Keeping the token
    # itself (rather than looking it up in all_lines) also works when the previous
    # line belongs to the block currently being built.
    pending_hyphen = None

    for block in blocks_with_lines:
        all_line_xy_coords.append(block['rescaled_bbox'])
        # there is usually only one line per block
        block_lines = []
        for line in block['lines']:
            tokens = []
            # tokens are in this "spans" object
            for token in line["spans"]:

                # Skip tokens which are only spaces
                if token["text"] == " ":
                    continue

                curr_token = {
                    "c": coords_to_xywh(token["rescaled_bbox"]),
                    "tx": token["text"],
                    "gn": False
                }

                # The first kept token of a line completes the word hyphenated at
                # the end of the previous line. `not tokens` (instead of the span
                # index) still matches when the line starts with a skipped space.
                if not tokens and pending_hyphen is not None:
                    # drop only the trailing hyphen, so that hyphens inside the
                    # first half of the word are preserved
                    curr_token['nf'] = pending_hyphen['tx'][:-1] + token["text"]

                tokens.append(curr_token)

            # handle hyphenation: remember the line-final token if it ends with a
            # hyphen; any other line (including one without tokens) clears the state
            if len(tokens) > 1 and tokens[-1]["tx"].endswith("-"):
                tokens[-1]['hy'] = True
                pending_hyphen = tokens[-1]
            else:
                pending_hyphen = None

            block_lines.append({
                "c": coords_to_xywh(line["rescaled_bbox"]),
                "t": tokens
            })

        if len(block_lines) > 1:
            # cases where there were more than one line seemed to be errors - to be checked.
            print("Warning! more than one line in this paragraph, adding them separately!!")
        all_lines.extend(block_lines)

    return all_line_xy_coords, all_lines

In [9]:
def compute_paragraph_coords(all_line_coords):
    """
    Merge several boxes into the single box enclosing them all.

    Each input box is read as [x1, y1, x2, y2]: the smallest x1/y1 and the
    largest x2/y2 are combined, and the resulting box is passed through
    `coords_to_xywh` before being returned.
    """
    left = min(box[0] for box in all_line_coords)
    top = min(box[1] for box in all_line_coords)
    right = max(box[2] for box in all_line_coords)
    bottom = max(box[3] for box in all_line_coords)

    enclosing_box = [left, top, right, bottom]
    return coords_to_xywh(enclosing_box)

In [59]:
def parse_page(pg):
    """
    Build the region dict of a page from the OCR JSON stored on its issue.

    The whole page becomes one region holding one paragraph. Both share the same
    coordinates, and the region is attached (via "pOf") to the id of the issue's
    first content item.
    """
    # pg.number is offset by one relative to the index in page_jsons
    page_ocr = pg.issue.page_jsons[pg.number - 1]

    # the block boxes come back unconverted (x1, y1, x2, y2), which makes them
    # easy to merge into one enclosing box
    block_boxes, page_lines = parse_lines(page_ocr["blocks_with_lines"])
    merged_box = compute_paragraph_coords(block_boxes)

    return {
        "c": merged_box,
        "p": [{"c": merged_box, "l": page_lines}],
        "pOf": pg.issue.content_items[0]['m']['id'],
    }

In [60]:
parsed_pg1_r = parse_page(test_pg1)
parsed_pg1_r



{'c': [132, 287, 1498, 1870],
 'p': [{'c': [132, 287, 1498, 1870],
   'l': [{'c': [384, 287, 872, 45],
     't': [{'c': [384, 287, 135, 45], 'tx': 'UuTnrrlnl', 'gn': False},
      {'c': [532, 287, 8, 45], 'tx': 'i', 'gn': False},
      {'c': [552, 287, 34, 45], 'tx': 'rlii', 'gn': False},
      {'c': [608, 287, 37, 45], 'tx': 'Uli', 'gn': False},
      {'c': [677, 287, 12, 45], 'tx': 'r', 'gn': False},
      {'c': [711, 287, 136, 45], 'tx': 'Montag,', 'gn': False},
      {'c': [877, 287, 59, 45], 'tx': 'den', 'gn': False},
      {'c': [958, 287, 51, 45], 'tx': '22.', 'gn': False},
      {'c': [1038, 287, 79, 45], 'tx': 'Juli', 'gn': False},
      {'c': [1141, 287, 93, 45], 'tx': '1940.', 'gn': False}]},
    {'c': [372, 441, 1133, 51],
     't': [{'c': [372, 447, 6, 45], 'tx': 'i', 'gn': False},
      {'c': [610, 447, 40, 45], 'tx': 'Im', 'gn': False},
      {'c': [668, 447, 224, 45], 'tx': 'Vordergrund', 'gn': False},
      {'c': [917, 447, 56, 45], 'tx': 'der', 'gn': False},
      {'c

In [6]:
test_pg1.parse()
test_pg1.page_data



{'id': 'SOC_CJ-1940-07-22-a-p0001',
 'cdt': '2025-04-25 17:01:23',
 'r': {'c': [132, 287, 1498, 1870],
  'p': [{'c': [132, 287, 1498, 1870],
    'l': [{'c': [384, 287, 872, 45],
      't': [{'c': [384, 287, 135, 45], 'tx': 'UuTnrrlnl', 'gn': False},
       {'c': [532, 287, 8, 45], 'tx': 'i', 'gn': False},
       {'c': [552, 287, 34, 45], 'tx': 'rlii', 'gn': False},
       {'c': [608, 287, 37, 45], 'tx': 'Uli', 'gn': False},
       {'c': [677, 287, 12, 45], 'tx': 'r', 'gn': False},
       {'c': [711, 287, 136, 45], 'tx': 'Montag,', 'gn': False},
       {'c': [877, 287, 59, 45], 'tx': 'den', 'gn': False},
       {'c': [958, 287, 51, 45], 'tx': '22.', 'gn': False},
       {'c': [1038, 287, 79, 45], 'tx': 'Juli', 'gn': False},
       {'c': [1141, 287, 93, 45], 'tx': '1940.', 'gn': False}]},
     {'c': [372, 441, 1133, 51],
      't': [{'c': [372, 447, 6, 45], 'tx': 'i', 'gn': False},
       {'c': [610, 447, 40, 45], 'tx': 'Im', 'gn': False},
       {'c': [668, 447, 224, 45], 'tx': 'Vorderg

## 4. Detect/Select functions

In [2]:
# Settings for the debug importer run, available both as a dict and as
# individual variables (later cells use both forms).
args = dict(
    access_rights="",
    chunk_size=5,
    config_file="/home/piconti/impresso-text-acquisition/text_preparation/config/importer_config/import_swissinfo_debug.json",
    input_dir="/mnt/project_impresso/original/SWISSINFO/",
    log_file="/home/piconti/impresso-text-acquisition/text_preparation/data/logs/test_logs/debug_swissinfo_importer.log",
    output_dir="/scratch/piconti/impresso/SWISSINFO",
    s3_bucket="10-canonical-sandbox",
    temp_dir="/scratch/piconti/impresso/temp_dump",
    git_repo="/home/piconti/impresso-text-acquisition",
)

# Same values as plain variables, taken from the dict so they cannot drift apart.
access_rights = args["access_rights"]
chunk_size = args["chunk_size"]
config_file = args["config_file"]
input_dir = args["input_dir"]
log_file = args["log_file"]
output_dir = args["output_dir"]
s3_bucket = args["s3_bucket"]
temp_dir = args["temp_dir"]
git_repo = args["git_repo"]


In [18]:
from text_preparation.importers.swissinfo.detect import detect_issues as SI_detect_issues
from text_preparation.importers.swissinfo.detect import select_issues as SI_select_issues
from text_preparation.importers import generic_importer as gi
from impresso_essentials.utils import init_logger
from collections import Counter
import logging

In [4]:
issue_class = SwissInfoRadioBulletinIssue
detect_func = SI_detect_issues
select_func = SI_select_issues

In [5]:
logger = logging.getLogger()
init_logger(logger, logging.DEBUG, args['log_file'])

<RootLogger root (DEBUG)>

In [None]:
client = gi.get_dask_client(None, args['log_file'], logging.DEBUG, 8)

2025-04-28 13:47:07,207 - tornado.application - ERROR - Uncaught exception GET /status/ws (128.179.181.56)
HTTPServerRequest(protocol='http', host='iccluster040.iccluster.epfl.ch:8787', method='GET', uri='/status/ws', version='HTTP/1.1', remote_ip='128.179.181.56')
Traceback (most recent call last):
  File "/scratch/piconti/.conda/envs/text_prep/lib/python3.11/site-packages/tornado/websocket.py", line 938, in _accept_connection
    open_result = handler.open(*handler.open_args, **handler.open_kwargs)
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/scratch/piconti/.conda/envs/text_prep/lib/python3.11/site-packages/tornado/web.py", line 3301, in wrapper
    return method(self, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/scratch/piconti/.conda/envs/text_prep/lib/python3.11/site-packages/bokeh/server/views/ws.py", line 149, in open
    raise ProtocolError("Token is expired.")
bokeh.protocol.exceptions.ProtocolError: Token is expired

In [7]:
client

0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: http://127.0.0.1:8787/status,

0,1
Dashboard: http://127.0.0.1:8787/status,Workers: 8
Total threads: 16,Total memory: 251.79 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:35475,Workers: 8
Dashboard: http://127.0.0.1:8787/status,Total threads: 16
Started: Just now,Total memory: 251.79 GiB

0,1
Comm: tcp://127.0.0.1:41575,Total threads: 2
Dashboard: http://127.0.0.1:44481/status,Memory: 31.47 GiB
Nanny: tcp://127.0.0.1:43413,
Local directory: /scratch/piconti/impresso/dask_tmp/dask-scratch-space/worker-zkh5nsli,Local directory: /scratch/piconti/impresso/dask_tmp/dask-scratch-space/worker-zkh5nsli

0,1
Comm: tcp://127.0.0.1:41639,Total threads: 2
Dashboard: http://127.0.0.1:41453/status,Memory: 31.47 GiB
Nanny: tcp://127.0.0.1:45673,
Local directory: /scratch/piconti/impresso/dask_tmp/dask-scratch-space/worker-cujos34r,Local directory: /scratch/piconti/impresso/dask_tmp/dask-scratch-space/worker-cujos34r

0,1
Comm: tcp://127.0.0.1:37345,Total threads: 2
Dashboard: http://127.0.0.1:41659/status,Memory: 31.47 GiB
Nanny: tcp://127.0.0.1:44149,
Local directory: /scratch/piconti/impresso/dask_tmp/dask-scratch-space/worker-a19pa7n9,Local directory: /scratch/piconti/impresso/dask_tmp/dask-scratch-space/worker-a19pa7n9

0,1
Comm: tcp://127.0.0.1:35293,Total threads: 2
Dashboard: http://127.0.0.1:33821/status,Memory: 31.47 GiB
Nanny: tcp://127.0.0.1:38183,
Local directory: /scratch/piconti/impresso/dask_tmp/dask-scratch-space/worker-8puox92q,Local directory: /scratch/piconti/impresso/dask_tmp/dask-scratch-space/worker-8puox92q

0,1
Comm: tcp://127.0.0.1:34671,Total threads: 2
Dashboard: http://127.0.0.1:42801/status,Memory: 31.47 GiB
Nanny: tcp://127.0.0.1:45051,
Local directory: /scratch/piconti/impresso/dask_tmp/dask-scratch-space/worker-nf4gzx7w,Local directory: /scratch/piconti/impresso/dask_tmp/dask-scratch-space/worker-nf4gzx7w

0,1
Comm: tcp://127.0.0.1:45651,Total threads: 2
Dashboard: http://127.0.0.1:46099/status,Memory: 31.47 GiB
Nanny: tcp://127.0.0.1:45679,
Local directory: /scratch/piconti/impresso/dask_tmp/dask-scratch-space/worker-7z1s9m70,Local directory: /scratch/piconti/impresso/dask_tmp/dask-scratch-space/worker-7z1s9m70

0,1
Comm: tcp://127.0.0.1:42857,Total threads: 2
Dashboard: http://127.0.0.1:37491/status,Memory: 31.47 GiB
Nanny: tcp://127.0.0.1:40931,
Local directory: /scratch/piconti/impresso/dask_tmp/dask-scratch-space/worker-yq0tk8ps,Local directory: /scratch/piconti/impresso/dask_tmp/dask-scratch-space/worker-yq0tk8ps

0,1
Comm: tcp://127.0.0.1:36275,Total threads: 2
Dashboard: http://127.0.0.1:38951/status,Memory: 31.47 GiB
Nanny: tcp://127.0.0.1:36779,
Local directory: /scratch/piconti/impresso/dask_tmp/dask-scratch-space/worker-le5mwlmg,Local directory: /scratch/piconti/impresso/dask_tmp/dask-scratch-space/worker-le5mwlmg


In [12]:
p = '/mnt/project_impresso/original/SWISSINFO/WW2-SOC-bulletins-json/SOC_CJ/1940/08/19/a'

p.split("/")[-5]

'SOC_CJ'

In [12]:
if config_file and os.path.isfile(config_file):
    logger.info("Found config file: %s", os.path.realpath(config_file))
    with open(config_file, "r", encoding="utf-8") as f:
        config = json.load(f)
    issues = gi.apply_select_func(
        issue_class,
        config,
        input_dir=input_dir,
        access_rights=access_rights,
        select_func=select_func,
        tmp_dir=temp_dir,
    )
    logger.info(
        "%s issues remained to import after applying filter: %s", len(issues), issues
    )
else:
    logger.info("No config file found.")
    issues = gi.apply_detect_func(
        issue_class,
        input_dir,
        access_rights,
        detect_func=detect_func,
        tmp_dir=temp_dir,
    )
    logger.info("%s  issues to import detected", len(issues))

In [13]:
len(issues), sorted(issues, key=lambda i: i.date)

(362,
 [IssueDirectory(alias='SOC_CJ', date=datetime.date(1940, 8, 2), edition='a', path='/mnt/project_impresso/original/SWISSINFO/WW2-SOC-bulletins-json/SOC_CJ/1940/08/02/a', metadata_file='/mnt/project_impresso/original/SWISSINFO/WW2-SOC-bulletins-json/SOC_rb_metadata.json'),
  IssueDirectory(alias='SOC_CJ', date=datetime.date(1940, 8, 3), edition='a', path='/mnt/project_impresso/original/SWISSINFO/WW2-SOC-bulletins-json/SOC_CJ/1940/08/03/a', metadata_file='/mnt/project_impresso/original/SWISSINFO/WW2-SOC-bulletins-json/SOC_rb_metadata.json'),
  IssueDirectory(alias='SOC_CJ', date=datetime.date(1940, 8, 5), edition='a', path='/mnt/project_impresso/original/SWISSINFO/WW2-SOC-bulletins-json/SOC_CJ/1940/08/05/a', metadata_file='/mnt/project_impresso/original/SWISSINFO/WW2-SOC-bulletins-json/SOC_rb_metadata.json'),
  IssueDirectory(alias='SOC_CJ', date=datetime.date(1940, 8, 6), edition='a', path='/mnt/project_impresso/original/SWISSINFO/WW2-SOC-bulletins-json/SOC_CJ/1940/08/06/a', metad

In [16]:
soc_cj_issues = list(filter(lambda i: i.alias=='SOC_CJ', issues))
soc_vs_issues = list(filter(lambda i: i.alias=='SOC_VS', issues))

len(soc_cj_issues), len(soc_vs_issues)

(30, 332)

Issue detection and filtering work!
Next, check what is detected for ingestion when no filter is applied.

In [19]:
all_issues = gi.apply_detect_func(
        issue_class,
        input_dir,
        access_rights,
        detect_func=detect_func,
        
        tmp_dir=temp_dir,
    )

len(all_issues), Counter([i.alias for i in all_issues])

(6666,
 Counter({'SOC_CJ': 3924,
          'SOC_CP': 1481,
          'SOC_SO': 746,
          'SOC_VS': 332,
          'SOC_TH': 183}))

In [23]:
# Group the detected issues by alias.
issues_per_alias = {}
for i in all_issues:
    issues_per_alias.setdefault(i.alias, []).append(i)

# For each alias, the earliest and latest issue date, as strings.
dates_per_alias = {
    alias: [issue.date for issue in grouped]
    for alias, grouped in issues_per_alias.items()
}
min_max_dates = {
    alias: (str(min(dates)), str(max(dates)))
    for alias, dates in dates_per_alias.items()
}
min_max_dates

{'SOC_TH': ('1941-03-30', '1945-03-29'),
 'SOC_CP': ('1939-03-18', '1945-08-29'),
 'SOC_VS': ('1943-01-31', '1944-12-30'),
 'SOC_CJ': ('1940-07-22', '1945-12-31'),
 'SOC_SO': ('1940-06-12', '1945-07-30')}

## 5. Final check/debug