In [5]:
from pathlib import Path

In [6]:
passes = Path("/home/joregan/passes")

In [7]:
noisy = passes / "sixth-pass" / "noisy"

In [8]:
def process_file(filename):
    if type(filename) is not Path:
        filename = Path(filename)
    ops = {}
    subs = []
    start = 0.0
    cur_start = 0.0
    cur_dur = 0.0
    video = ""
    with open(filename) as inf:
        for line in inf.readlines():
            line = line.strip()
            parts = line.split(" ")
            category = parts[-1]
            if not category in ops:
                ops[category] = 0
            ops[category] += 1
            if category == "sub":
                subs.append(f"{parts[4]}|{parts[6]}")
            if start == 0.0:
                start = float(parts[2])
            if video == "":
                video = parts[0]
            cur_start = float(parts[2])
            cur_dur = float(parts[3])

    return {
        "start": start,
        "end": cur_start + cur_dur,
        "video": video,
        "filename": filename.name,
        "substitutions": subs,
        "ops": ops
    }

In [9]:
process_file(noisy / "H9C120220120fs_0041_0001_0001_0001_0001_0001")

FileNotFoundError: [Errno 2] No such file or directory: '/home/joregan/passes/sixth-pass/noisy/H9C120220120fs_0041_0001_0001_0001_0001_0001'

```
match|mismahtch       # quicksort index to split on mathces and mismatches (allowing for a maste index)
startdate             # If available, presumed start date of the video (or better). Year is ok.
enddate               # If available, presumed en date of video. Year is ok, higher granularity is better.
voffset               # time offset from the start of video to the segment (we wont know this now)
starttime             # anchored at start of videofile
endtime (or duration) # anchored at start of videofile (or at starttime) but be consistent
filename              # of the file with the token data
vfilename             # name of the corresponding videofile
vfileorigin           # youtube... (no need to add entire URI, just an identofier that shows how the file was acquired)
refsrc                # Type of transcription used as reference (i.e. "official transcript")
compsrc               # Type of transcription being compared (e.g. wav2vec, whicper);
                      # use an ID that allows us to know which settings were used
#matching             # Count of comp tokens matching ref
#subs                 # Count of 1-on-1 substitutions between ref and comp (0 when match)
#additions            # Count of additions in comp (0 when match)
#substitutions        # Count of deletions in comp (0 when match)
#reftokens            # Total number of tokens in ref (presumably #matching+#substitutions+#deletions)
#comptokens           # Total number of tokens in comp (presumably #matching+#substitutions+#additions)
```

In [10]:
import pandas as pd

pd.Timedelta(seconds=3161.38).isoformat()

'P0DT0H52M41.38S'

In [11]:
def seconds_to_iso8601(seconds):
    total_seconds = int(seconds)  # Integer part of second
    fractional_seconds = seconds - total_seconds  # Fractional part
    
    minutes = total_seconds // 60  # Calculate minutes
    remaining_seconds = total_seconds % 60  # Remaining seconds
    
    # Format the result, ensuring fractional seconds are included
    if fractional_seconds > 0:
        return f"PT{minutes}M{remaining_seconds + fractional_seconds:.3f}S"
    else:
        return f"PT{minutes}M{remaining_seconds}S"

# Example usage
seconds = 3164.8199999999997
iso_duration = seconds_to_iso8601(seconds)
print(iso_duration)

PT52M44.820S


In [46]:
import datetime
import re

MONTHS = {
    "januari": 1,
    "februari": 2,
    "mars": 3,
    "april": 4,
    "maj": 5,
    "juni": 6,
    "juli": 7,
    "augusti": 8,
    "september": 9,
    "oktober": 10,
    "november": 11,
    "december": 12,
}

def parse_rd_date(datestring):
    if datestring is None:
        return ""
    if datestring == "":
        return None
    keys = "(" + "|".join(MONTHS.keys()) + ")"
    m = re.match(f"^([12]?[0-9]) {keys} ([12][0-9]*)", datestring)
    if m:
        day = int(m.group(1))
        month = MONTHS[m.group(2)]
        year = int(m.group(3))
        t = datetime.datetime(year=year, month=month, day=day)
        return t.strftime('%Y-%m-%d')
    else:
        return ""


In [47]:
parse_rd_date("13 juni 2014")

'2014-06-13'

In [48]:
def write_jensfully(data):
    match = ""
    # startdate = enddate = data.get("videodate", "")
    startdate = parse_rd_date(data.get("videodate", ""))

    voffset = ""
    starttime = seconds_to_iso8601(data["start"])
    endtime = seconds_to_iso8601(data["end"])
    filename = data["filename"]
    vfilename = data["video"]
    # vfileorigin = "Riksdag API"
    # refsrc = "official transcript"
    # compsrc = "VoxRex wav2vec"
    vfileorigin = "API"
    compsrc = "VoxRex"

    matching = data["ops"].get("cor", 0)
    subs = data["ops"].get("sub", 0)
    additions = data["ops"].get("ins", 0)
    deletions = data["ops"].get("del", 0)
    reftokens = matching + subs + deletions
    comptokens = matching + subs + additions

    return "\t".join([
        match,
        startdate,
        # enddate,
        voffset,
        starttime,
        endtime,
        filename,
        vfilename,
        vfileorigin,
        # refsrc,
        compsrc,
        str(matching),
        str(subs),
        str(additions),
        str(deletions),
        str(reftokens),
        str(comptokens)
    ])

In [49]:
import json

doc_dates = {}

def get_doc_date(filename):
    if isinstance(filename, Path):
        filename = filename.name
    else:
        if isinstance(filename, str) and "/" in filename:
            filename = filename.split("/")[-1]
    parts = filename.split("_")
    if parts[0] in doc_dates:
        return doc_dates[parts[0]]
    with open(f"/sbtal/riksdag-video/api_output/{parts[0]}") as jsonf:
        data = json.load(jsonf)
    if "videodata" in data:
        vd = data["videodata"]
        if "debatedate" in vd[0]:
            date = vd[0].get("debatedate", "")
            doc_dates[parts[0]] = date
            return date
    return ""

In [25]:
get_doc_date("H9C120220120fs_0041_0001_0001_0001_0001_0001")

'20 januari 2022'

In [50]:
for missfile in noisy.glob("H9C120220120fs*"):
    proc = process_file(missfile)
    proc["videodate"] = get_doc_date(missfile)
    print(write_jensfully(proc))

	2022-01-20		PT16M54.140S	PT17M12.279S	H9C120220120fs_0011_0001_0001_0001_0001_0001	2442204240010077721	API	VoxRex	26	2	5	0	28	33
	2022-01-20		PT58M3.060S	PT63M3.280S	H9C120220120fs_0047_0001_0001_0001_0001_0001	2442204240010077721	API	VoxRex	440	86	147	20	546	673
	2022-01-20		PT19M12.300S	PT20M14.759S	H9C120220120fs_0015_0001_0001_0001_0001_0001	2442204240010077721	API	VoxRex	97	18	30	7	122	145
	2022-01-20		PT56M22.260S	PT56M36.699S	H9C120220120fs_0045_0001_0001_0001_0001_0001	2442204240010077721	API	VoxRex	26	2	3	1	29	31
	2022-01-20		PT25M46.120S	PT28M21.759S	H9C120220120fs_0023_0001_0001_0001_0001_0001	2442204240010077721	API	VoxRex	246	36	52	9	291	334
	2022-01-20		PT42M33.400S	PT42M33.479S	H9C120220120fs_0034_0001_0001_0001_0002_0001	2442204240010077721	API	VoxRex	0	0	0	0	0	0
	2022-01-20		PT18M27.960S	PT19M9.299S	H9C120220120fs_0014_0001_0001_0001_0001_0001	2442204240010077721	API	VoxRex	51	9	36	9	69	96
	2022-01-20		PT68M29.979S	PT68M35.159S	H9C120220120fs_0052_0001_0001_0001_0001_

In [None]:
def extract_start(filename, nlines, sub_to_alt=True):
    if type(filename) is not Path:
        filename = Path(filename)
    with open(filename) as inf:
        lines = [x.strip() for x in inf.readlines()]
    

In [2]:
test = '{"debatedate": null}'
json.loads(test)

{'debatedate': None}