In [2]:
import json

in_path = "/Users/joregan/Playing/nst_swedish_tts/whisper-v3-aligned-to-sw_all.json"
out_path = "/Users/joregan/Playing/nst_swedish_tts/whisper-v3-aligned-to-sw_all_ordered.json"

with open(in_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# Map int → (original_key, value)
items = [(int(k), v) for k, v in data.items()]
items.sort(key=lambda x: x[0])

# Reconstruct with the original (possibly zero-padded) key strings
ordered = {str(k).zfill(len(items[0][0].__str__())): v for k, v in items}

with open(out_path, "w", encoding="utf-8") as f:
    json.dump(ordered, f, ensure_ascii=False, indent=2)

print(f"✅ Ordered JSON written to {out_path}")

✅ Ordered JSON written to /Users/joregan/Playing/nst_swedish_tts/whisper-v3-aligned-to-sw_all_ordered.json


In [None]:
!pip install num2words

In [86]:
from num2words import num2words

def approx_num(raw_number, text):
    try:
        number = int(raw_number)
    except ValueError:
        return False
    forms = set()
    if len(raw_number) == 4:
        a = num2words(int(number[0:2]), lang='sv')
        b = num2words(int(number[2]), lang='sv')
        forms.add(f"{a} hundra {b}")
        forms.add(f"{a}hundra {b}")
        forms.add(f"{a}hundra{b}")
    card = num2words(number, lang='sv', to='cardinal')
    ord = num2words(number, lang='sv', to='ordinal')
    forms.add(card)
    forms.add(ord)
    for inner in ["tusen", "hundra"]:
        if inner in card:
            forms.add(card.replace(inner, inner + " "))
            forms.add(card.replace(inner, " " + inner))
            forms.add(card.replace(inner, " " + inner + " "))
        if inner in ord:
            forms.add(ord.replace(inner, inner + " "))
            forms.add(ord.replace(inner, " " + inner))
            forms.add(ord.replace(inner, " " + inner + " "))
    for form in forms:
        if form in text:
            return True
    return False
        

In [12]:
OFFSETS = {
    4155: 1,
    5144: 2,
}

rejigged = {}

offset = 0
for k_str, v in ordered.items():
    k = int(k_str)
    if k > 5277:
        break
    if k in OFFSETS:
        offset = OFFSETS[k]
    new_k = k + offset
    rejigged[k_str.zfill(4)] = {
        "res": v["res"],
        "orig": ordered[str(new_k)]["orig"]
    }

In [15]:
for item in rejigged:
    res = rejigged[item]["res"]
    orig = rejigged[item]["orig"]
    if res == orig:
        rejigged[item]["norm"] = res
    else:
        norm_simple = res.replace(".", "").replace(",", "").replace("?", "").replace("!", "").lower()
        if orig == norm_simple:
            rejigged[item]["norm"] = res

In [93]:
def simple_norm(res):
    return res.replace(".", "").replace(",", "").replace("?", "").replace("!", "").lower()

In [94]:
import re

def check_numbers_match(res, orig):
    res_norm = simple_norm(res)
    res_words = res_norm.split()

    for word in res_words:
        if re.fullmatch(r'^\d+$', word):
            if approx_num(word, orig):
                return True
    return False

In [87]:
def capitalize_and_punctuate(text, punct="."):
    if not text:
        return text
    text = text[0].upper() + text[1:]
    text += punct
    return text

In [None]:
OK_WHISPER = [
    "0004", "0111", "3956", "4811", "4699", "4676", "4665", "4651",
    "0172", "0348", "0442", "0738", "0799", "0880", "0999", "1031", "1075", "1098", "1177", "1279", "1320", "1441", "1493", "1501", "1508", "1511", "1513", "1514", "1516", "1518", "1519", "1520", "1524"
]
UC_WHISPER = ["0079", "2537"]
PUNCT_WHISPER = ["0085", "4838"]
ORIG_UC_FS = [
    "0001", "0011", "0045", "0052", "0064", "0065", "0063", "0070", "0080", "0084", "0113", "0123",
    "0125", "0104", "0127", "2543", "3748", "0143", "0145", "5030", "5060", "5080", "5117", "5123",
    "4858", "4717", "4707", "4703", "4695", "4576", "4484", "0158", "0129", "0151", "0165", "0171"
]
FULL_NORM = {
    "0112": "Jag vill ta en titt i ert gevärsskåp. Det är allt.",
    "5202": "Per Henricson.",
    "5203": "Westergren presstödsnämnden.",
    "5208": "Actinvest är ett seriöst bolag, säger en analytiker i Milano.",
    "5185": "Microsoft-mus ingår.",
    "5147": "Skalbarhet under Windows NT har förut varit en black om foten.",
    "5138": "Palmquist som är vd på UB Networks i Sverige.",
    "5143": "Henrik Lind har anställts som utvecklingschef på Living Questions.",
    "3881": "I det nya EU är detta en omöjlighet.",
    "4869": "ModelQuest Expert och kan hantera upp till 60 olika indata.",
    "4728": "Norden Rolf Hallencreutz.",
    "4726": "Lars Öquist, vd, TCM.",
    "4706": "Lärarförbundets ordförande Christer Romilson.",
    "4581": "Med vänliga hälsningar två Vilhelminabor.",
    "5275": "Det gällde både konst, arkitektur, litteratur, musik, filosofi och vetenskap.",
}

In [100]:
a = """0172
0348
0442
0738
0799
0880
0999
1031
1075
1098
1177
1279
1320
1441
1493
1501
1508
1511
1513
1514
1516
1518
1519
1520
1524""".split("\n")
print(", ".join(['"' + x + '"' for x in a]))

"0172", "0348", "0442", "0738", "0799", "0880", "0999", "1031", "1075", "1098", "1177", "1279", "1320", "1441", "1493", "1501", "1508", "1511", "1513", "1514", "1516", "1518", "1519", "1520", "1524"


In [99]:
for item in rejigged:
    if "norm" not in rejigged[item]:
        if item in OK_WHISPER:
            rejigged[item]["norm"] = rejigged[item]["res"]
        elif item in UC_WHISPER:
            rejigged[item]["norm"] = capitalize_and_punctuate(rejigged[item]["res"], "")
        elif item in PUNCT_WHISPER:
            rejigged[item]["norm"] = capitalize_and_punctuate(rejigged[item]["res"])
        elif item in ORIG_UC_FS:
            rejigged[item]["norm"] = capitalize_and_punctuate(rejigged[item]["orig"])
        elif item in FULL_NORM:
            rejigged[item]["norm"] = FULL_NORM[item]
        else:
            # print(f"⚠️  Mismatch at item {item}:")
            # print(f"    res:  '{rejigged[item]['res']}'")
            # print(f"    orig: '{rejigged[item]['orig']}'")
            # print(f".   \"{item}\": \"{rejigged[item]['orig']}\",")
            if check_numbers_match(rejigged[item]['res'], rejigged[item]['orig']):
                print(item)
                print(f"    res:  '{rejigged[item]['res']}'")
                print(f"    orig: '{rejigged[item]['orig']}'")
                print("    ✅ Numbers match")


0172
    res:  'En liten ö med knappt 300 personer kvar.'
    orig: 'en liten ö med knappt tre hundra personer kvar'
    ✅ Numbers match
0348
    res:  'Det var på eftermiddagen den 12 oktober'
    orig: 'det var på eftermiddagen den tolfte oktober'
    ✅ Numbers match
0442
    res:  'Det fattas ännu 180 pesos till en ny.'
    orig: 'det fattas ännu etthundra åttio pesos till en ny'
    ✅ Numbers match
0672
    res:  '25 år gammal var han visst då'
    orig: 'tjugofem år gammal var han visst då'
    ✅ Numbers match
0738
    res:  'Bortåt 200 privata importörer har tagit två tredjedelar av marknaden.'
    orig: 'bortåt två hundra privata importörer har tagit två tredjedelar av marknaden'
    ✅ Numbers match
0799
    res:  'Det handlar om löner för ytterligare ungefär 20 personer under ett halvårstid.'
    orig: 'det handlar om löner för ytterligare ungefär tjugo personer under ett halvårs tid'
    ✅ Numbers match
0880
    res:  'Ett fat råolja kostade omkring 3 dollar.'
    orig: 'ett f

TypeError: 'int' object is not subscriptable