In [1]:
from pathlib import Path
import os

# Directory holding the saved Riksdag API responses. It can be overridden
# with the RDAPI_OUTPUT_DIR environment variable so the notebook runs on
# machines other than the author's; the default is the original location.
API_PATH = Path(os.environ.get("RDAPI_OUTPUT_DIR", "/Users/joregan/Playing/rdapi/api_output"))

In [3]:
from sync_asr.riksdag.riksdag_api import RiksdagAPI

In [31]:
# One API response loaded up front; it is used further down to inspect the
# type returned by get_raw_seconds.
SAMPLE = RiksdagAPI(filename=str(API_PATH / "H2C120150122fs"))

In [32]:
# Per-year running totals, keyed by the year string returned by get_year.
# YEARS_RAW accumulates get_raw_seconds() values, YEARS_TRANSCRIBED
# accumulates get_transcribed_total() values; both are filled by the
# aggregation loop over API_PATH.
YEARS_RAW = {}
YEARS_TRANSCRIBED = {}

In [62]:
import re

def sanity_check(api_file, key):
    """Report whether ``key`` can be looked up in ``api_file``'s video data.

    Args:
        api_file: Object whose instance ``__dict__`` may hold a
            ``"videodata"`` entry (a container supporting ``in``).
        key: Name to look for inside ``videodata``.

    Returns:
        bool: ``True`` only if the object has a ``videodata`` attribute that
        is not ``None`` and contains ``key``; ``False`` otherwise.
    """
    # A missing attribute and an explicit None are treated alike: testing
    # `key in None` would raise TypeError instead of reporting "not usable".
    videodata = api_file.__dict__.get("videodata")
    if videodata is None:
        return False
    return key in videodata


def get_year(api_file):
    """Extract the four-digit year that ends the record's ``debatedate``.

    Args:
        api_file: Object accepted by ``sanity_check``; its ``videodata`` is
            read for the ``"debatedate"`` entry.

    Returns:
        The year as a string (four digits starting with 1 or 2, taken from
        the end of the stripped date), or ``None`` when the date is missing,
        ``None``, blank, or does not end in such a year.
    """
    if not sanity_check(api_file, "debatedate"):
        return None

    debate_date = api_file.__dict__["videodata"]["debatedate"]
    if debate_date is None:
        return None

    trimmed = debate_date.strip()
    if trimmed == "":
        return None

    # The pattern is anchored at the end, so only a trailing year matches.
    found = re.search("([12][0-9][0-9][0-9])$", trimmed)
    return found.group(0) if found else None

In [43]:
def get_raw_seconds(api_file):
    """Return the record's ``debateseconds`` value, or 0 if it is unavailable.

    Args:
        api_file: Object accepted by ``sanity_check``.

    Returns:
        The stored ``videodata["debateseconds"]`` value unchanged when
        ``sanity_check`` passes, otherwise the integer ``0``.
    """
    if sanity_check(api_file, "debateseconds"):
        return api_file.__dict__["videodata"]["debateseconds"]
    return 0


In [53]:
# Inspect the type get_raw_seconds returns for the sample record.
type(get_raw_seconds(SAMPLE))

int

In [58]:
def get_transcribed_total(rdapi):
    """Add up the ``duration`` of every speaker entry in the record.

    Args:
        rdapi: Object accepted by ``sanity_check``; its ``videodata`` is read
            for the ``"speakers"`` collection.

    Returns:
        The sum of all ``"duration"`` values found; ``0`` when the record has
        no usable ``speakers`` entry or no speaker carries a duration.
    """
    if not sanity_check(rdapi, "speakers"):
        return 0

    running = 0
    for entry in rdapi.__dict__["videodata"]["speakers"]:
        # Entries without a duration contribute nothing to the total.
        if "duration" not in entry:
            continue
        running += entry["duration"]
    return running

In [63]:
# Start from empty totals so that re-running this cell recomputes the figures
# instead of adding a second copy of every file onto the previous run.
YEARS_RAW.clear()
YEARS_TRANSCRIBED.clear()

# Path.glob() already yields paths that include API_PATH, so each one is
# passed on as-is; joining it onto API_PATH again would duplicate the
# directory whenever API_PATH is a relative path.
for filename in API_PATH.glob("*"):
    api_file = RiksdagAPI(filename=str(filename))
    year = get_year(api_file)
    if year is None:
        # Records without a recognisable year cannot be bucketed; skip them.
        continue
    raw = get_raw_seconds(api_file)
    ts = get_transcribed_total(api_file)
    YEARS_RAW[year] = YEARS_RAW.get(year, 0) + raw
    YEARS_TRANSCRIBED[year] = YEARS_TRANSCRIBED.get(year, 0) + ts

In [70]:
# Markdown table of the per-year totals: a fixed header followed by one row
# per year, in sorted year order.
MD_TABLE = (
    "\n"
    "| Year    | Raw audio (seconds) | Transcribed audio (seconds) |\n"
    "|---------|---------------------|-----------------------------|\n"
)
MD_TABLE += "".join(
    f"| {key} | {YEARS_RAW[key]} | {YEARS_TRANSCRIBED[key]} |\n"
    for key in sorted(YEARS_RAW)
)

In [72]:
# Render the assembled Markdown table as rich output in the notebook.
from IPython.display import display, Markdown
display(Markdown(MD_TABLE))


| Year    | Raw audio (seconds) | Transcribed audio (seconds) |
|---------|---------------------|-----------------------------|
| 2012 | 19 | 0 |
| 2013 | 2466902 | 1912591 |
| 2014 | 3062128 | 2374238 |
| 2015 | 2846652 | 2224617 |
| 2016 | 2807182 | 2190416 |
| 2017 | 2601171 | 2078119 |
| 2018 | 2182782 | 1650943 |
| 2019 | 2334274 | 1786766 |
| 2020 | 2060902 | 1719997 |
| 2021 | 3086041 | 2625609 |
| 2022 | 118687 | 114522 |


In [73]:
# Emit the table's raw Markdown source as plain text (presumably so it can be
# copied into another document — confirm intent).
print(MD_TABLE)


| Year    | Raw audio (seconds) | Transcribed audio (seconds) |
|---------|---------------------|-----------------------------|
| 2012 | 19 | 0 |
| 2013 | 2466902 | 1912591 |
| 2014 | 3062128 | 2374238 |
| 2015 | 2846652 | 2224617 |
| 2016 | 2807182 | 2190416 |
| 2017 | 2601171 | 2078119 |
| 2018 | 2182782 | 1650943 |
| 2019 | 2334274 | 1786766 |
| 2020 | 2060902 | 1719997 |
| 2021 | 3086041 | 2625609 |
| 2022 | 118687 | 114522 |

