In [4]:
import json
from pathlib import Path
from whoosh import index

In [5]:
from bs4 import BeautifulSoup

In [2]:
# Absolute, machine-specific path to one API output file.
# NOTE(review): not referenced by any other cell visible in this part of the notebook.
sample = "/sbtal/riksdag-video/api_output/H9C120211014fs"

In [33]:
from whoosh.fields import Schema, TEXT, KEYWORD, ID, STORED, NUMERIC, BOOLEAN

# Index schema: one document per speech paragraph. Every field is stored so
# that it can be read back from search hits. Fields are grouped by type
# instead of being listed one keyword argument at a time.
schema = Schema(
    # Debate-level and speaker-level string fields.
    **{
        field_name: ID(stored=True)
        for field_name in (
            "committee", "type", "debatepreamble", "debatetexthtml",
            "livestreamurl", "activelivespeaker", "id", "dokid", "title",
            "debatename", "debatedate", "debatetype", "debateurl",
            "thumbnailurl", "streamurl", "speaker", "party", "subid",
        )
    },
    # Numeric fields (status code, timings, ordinal positions).
    **{
        field_name: NUMERIC(stored=True)
        for field_name in (
            "videostatus", "debateseconds", "start", "duration",
            "number", "paragraph",
        )
    },
    # Flags.
    **{
        field_name: BOOLEAN(stored=True)
        for field_name in ("fromchamber", "active")
    },
    # The paragraph text itself is the only TEXT field.
    text=TEXT(stored=True),
)

In [34]:
# Debate-level keys copied verbatim from the first 'videodata' entry.
BASE_KEYS = [
    'videostatus', 'committee', 'type', 'debatepreamble', 'debatetexthtml',
    'livestreamurl', 'activelivespeaker', 'id', 'dokid', 'title',
    'debatename', 'debatedate', 'debatetype', 'debateurl', 'fromchamber',
    'thumbnailurl', 'debateseconds',
]


def read_api_json(filename):
    """Read one API output JSON file and flatten it into index-ready records.

    Only the first entry of ``data["videodata"]`` is used; a notice is printed
    when there is more than one.

    Args:
        filename: Path of the JSON file (``str`` or path-like).

    Returns:
        A ``(base, speakers)`` tuple.

        ``base`` is a dict with the ``BASE_KEYS`` values of the video entry
        plus ``"streamurl"`` (the ``url`` of the first stream file).

        ``speakers`` is a list with one *independent* dict per ``<p>`` element
        of each speaker's ``anftext``. Each dict holds ``start``, ``duration``,
        ``party``, ``subid``, ``active``, ``number``, ``speaker`` (the speaker's
        ``text`` with a trailing " (party)" removed), ``text`` (the paragraph
        text) and ``paragraph`` (1-based position within the speech).

        ``(None, None)`` is returned, after printing the reason, when the
        video entry has no usable ``videodata``, ``streams``, ``files`` or
        ``speakers``.

    Raises:
        AssertionError: If the file has no ``"videodata"`` key or the first
            stream file has no ``"url"`` key.
        KeyError: If the video entry lacks one of ``BASE_KEYS`` or a speaker
            lacks one of the expected keys.
    """
    infile = str(filename)
    # JSON is exchanged as UTF-8; do not depend on the locale's default encoding.
    with open(infile, encoding="utf-8") as json_file:
        data = json.load(json_file)
    assert "videodata" in data
    print(f"Reading {filename}")

    if not data["videodata"]:
        print(f"Empty 'videodata' in {filename}")
        return None, None
    if len(data["videodata"]) > 1:
        print(f"More than one 'videodata' in {infile}")
    video = data["videodata"][0]

    base = {key: video[key] for key in BASE_KEYS}

    streams = video.get("streams")
    if streams is None:
        print(f"No 'streams' key found in {filename}")
        return None, None

    files = streams.get("files")
    if files is None:
        # Previously this only printed and then crashed on the next statement;
        # skip the file instead, like the other "missing section" cases.
        print(f"No 'files' key found in {filename}")
        return None, None
    if not files:
        print(f"Empty 'files' list in {filename}")
        return None, None
    if len(files) > 1:
        print(f"More than one stream: {infile}")
    assert "url" in files[0]
    base["streamurl"] = files[0]["url"]

    speaker_entries = video.get("speakers")
    if speaker_entries is None:
        print(f"No 'speakers' key found in {filename}")
        return None, None

    speakers = []
    for speaker in speaker_entries:
        cur = {}
        for key in ["start", "duration", "party", "subid", "active", "number"]:
            cur[key] = speaker[key]
        cur["speaker"] = speaker["text"]
        # The speaker label ends with " (<party>)"; strip it, since the party
        # is kept in its own field.
        ending = f" ({cur['party']})"
        if cur["speaker"].endswith(ending):
            cur["speaker"] = cur["speaker"][:-len(ending)]

        soup = BeautifulSoup(speaker["anftext"], 'html.parser')
        for count, para in enumerate(soup.find_all("p"), start=1):
            # Copy the per-speaker fields for every paragraph. Appending `cur`
            # itself would make all paragraphs share one dict, so each would
            # end up with the text and number of the last paragraph.
            pg = dict(cur)
            pg["text"] = para.text
            pg["paragraph"] = count
            speakers.append(pg)
    return base, speakers

In [78]:
# Spot-check on a file whose video entry has no 'streams': the function logs
# the problem and returns (None, None), as shown in this cell's output.
read_api_json("/sbtal/riksdag-video/api_output/GPC320160906CK1")

No 'streams' key found in /sbtal/riksdag-video/api_output/GPC320160906CK1


(None, None)

In [35]:
import os, os.path

# Create the index directory if needed. `exist_ok=True` replaces the separate
# exists()/mkdir() pair, so there is no window between the check and the creation.
os.makedirs("indexdir", exist_ok=True)

# Create a new index for `schema` in that directory.
ix = index.create_in("indexdir", schema)

In [36]:
API_OUTPUT = Path("/sbtal/riksdag-video/api_output/")

# Index one document per speech paragraph.
# The writer is used as a context manager so that it is committed when the
# loop finishes and cancelled (releasing the index write lock) if an
# unexpected exception escapes, e.g. from a malformed input file.
with ix.writer() as writer:
    for file in API_OUTPUT.glob("*"):
        doc, speakers = read_api_json(file)
        if doc is None or speakers is None:
            # read_api_json already printed why this file is skipped.
            continue
        for speaker in speakers:
            try:
                # `doc` holds the debate-level fields and `speaker` the
                # paragraph-level ones; their keys are disjoint and together
                # match the schema field names.
                writer.add_document(**doc, **speaker)
            except ValueError as ve:
                # Report the offending record and keep indexing the rest.
                print(ve, doc, speaker)

Reading /sbtal/riksdag-video/api_output/GPC320160906CK1
No 'streams' key found in /sbtal/riksdag-video/api_output/GPC320160906CK1
Reading /sbtal/riksdag-video/api_output/H001AU10
Reading /sbtal/riksdag-video/api_output/H001AU11
Reading /sbtal/riksdag-video/api_output/H001AU12
Reading /sbtal/riksdag-video/api_output/H001AU7
Reading /sbtal/riksdag-video/api_output/H001AU8
Reading /sbtal/riksdag-video/api_output/H001AU9
Reading /sbtal/riksdag-video/api_output/H001CU10
Reading /sbtal/riksdag-video/api_output/H001CU11
Reading /sbtal/riksdag-video/api_output/H001CU12
Reading /sbtal/riksdag-video/api_output/H001CU13
Reading /sbtal/riksdag-video/api_output/H001CU15
Reading /sbtal/riksdag-video/api_output/H001CU17
Reading /sbtal/riksdag-video/api_output/H001CU18
No 'speakers' key found in /sbtal/riksdag-video/api_output/H001CU18
Reading /sbtal/riksdag-video/api_output/H001CU20
Reading /sbtal/riksdag-video/api_output/H001CU21
Reading /sbtal/riksdag-video/api_output/H001CU22
No 'speakers' key fou