# Trying to use pocketsphinx to word align

> "Because timing accuracy in ASR is getting progressively worse, look backwards"

- branch: master
- comments: false
- categories: [pocketsphinx, polish, alignment]

In [2]:
from pydub import AudioSegment

In [7]:
%%capture
%pip install pocketsphinx

In [8]:
%%capture
%pip install lupa

In [3]:
PL_IPA = r"""
-- Barely modified version of
-- https://en.wiktionary.org/wiki/Module:pl-IPA
-- (so, CC-BY-SA)
-- This was deleted in 2024: https://en.wiktionary.org/w/index.php?title=Special:Log&type=delete&page=Module:pl-IPA
local export = {}

local letters2phones = {
	["a"] = {
		["u"  ] = { "a", "w" },
		[false] = "a",
	},
	["ą"] = {
		["ł"  ] = { "ɔ", "w" },
		[false] = "ɔ̃",
	},
	["b"] = {
		["i"  ] = {
			["a"  ] = { "bʲ", "a" },
			["ą"  ] = { "bʲ", "ɔ̃" },
			["e"  ] = { "bʲ", "ɛ" },
			["ę"  ] = { "bʲ", "ɛ̃" },
			["i"  ] = { "bʲ", "j", "i" },
			["o"  ] = { "bʲ", "ɔ" },
			["ó"  ] = { "bʲ", "u" },
			["u"  ] = { "bʲ", "u" },
			[false] = { "bʲ", "i" }
			},
		[false] = "b"
	},
	["c"] = {
		["i"  ] = {
			["ą"  ] = { "t͡ɕ", "ɔ̃" },
			["a"  ] = { "t͡ɕ", "a" },
			["e"  ] = { "t͡ɕ", "ɛ" },
			["ę"  ] = { "t͡ɕ", "ɛ̃" },
			["o"  ] = { "t͡ɕ", "ɔ" },
			["ó"  ] = { "t͡ɕ", "u" },
			["u"  ] = { "t͡ɕ", "u" },
			["y"  ] = { "t͡ɕ", "ɨ" },
			[false] = { "t͡ɕ", "i" }
		},
		["h"  ] = {
		["i" ] = {
			["a"  ] = { "xʲ", "j", "a" },
			["ą"  ] = { "xʲ", "j", "ɔ̃" },
			["e"  ] = { "xʲ", "j", "ɛ" },
			["ę"  ] = { "xʲ", "j", "ɛ̃" },
			["i"  ] = { "xʲ", "j", "i" },
			["o"  ] = { "xʲ", "j", "ɔ" },
			["ó"  ] = { "xʲ", "j", "u" },
			["u"  ] = { "xʲ", "j", "u" },
			[false] = { "xʲ", "i" }
			},
		[false] = "x"
                },
		["z"  ] = "t͡ʂ",
		[false] = "t͡s"
	},
	["ć"] = "t͡ɕ",
	["d"] = {
		["z"  ] = {
			["i"  ] = {
				["ą"  ] = { "d͡ʑ", "ɔ̃" },
				["a"  ] = { "d͡ʑ", "a" },
				["e"  ] = { "d͡ʑ", "ɛ" },
				["ę"  ] = { "d͡ʑ", "ɛ̃" },
				["o"  ] = { "d͡ʑ", "ɔ" },
				["ó"  ] = { "d͡ʑ", "u" },
				["u"  ] = { "d͡ʑ", "u" },
				["y"  ] = { "d͡ʑ", "ɨ" },
				[false] = { "d͡ʑ", "i" }
			},
			[false] = "d͡z"
		},
		["ż"  ] = "d͡ʐ",
		["ź"  ] = "d͡ʑ",
		[false] = "d"
	},
	["e"] = {
		["u"  ] = { "ɛ", "w" },
		["e"  ] = { "ɛ", "ʔ", "ɛ" }, -- reedukacja, reewaluacja, etc.
		[false] = "ɛ",
	},
	["ę"] = {
		["l"  ] = { "ɛ", "l" },
		["ł"  ] = { "ɛ", "w" },
		[false] = "ɛ̃",
	},
	["f"] = {
		["i"  ] = {
			["a"  ] = { "fʲ", "a" },
			["ą"  ] = { "fʲ", "ɔ̃" },
			["e"  ] = { "fʲ", "ɛ" },
			["ę"  ] = { "fʲ", "ɛ̃" },
			["i"  ] = { "fʲ", "j", "i" },
			["o"  ] = { "fʲ", "ɔ" },
			["ó"  ] = { "fʲ", "u" },
			["u"  ] = { "fʲ", "u" },
			[false] = { "fʲ", "i" }
			},
		[false] = "f"
	},
	["g"] = {
		["i" ] = {
			["a"  ] = { "ɡʲ", "j", "a" },
			["ą"  ] = { "ɡʲ", "ɔ̃" }, -- only forms of "giąć"
			["e"  ] = { "ɡʲ", "ɛ" },
			["ę"  ] = { "ɡʲ", "ɛ̃" }, -- only forms of "giąć" and "giętki"
			["i"  ] = { "ɡʲ", "j", "i" },
			["o"  ] = { "ɡʲ", "j", "ɔ" },
			["ó"  ] = { "ɡʲ", "j", "u" },
			["u"  ] = { "ɡʲ", "j", "u" },
			[false] = { "ɡʲ", "i" }
			},
		[false] = "ɡ"
	},
	["h"] = {
		["i" ] = {
			["a"  ] = { "xʲ", "j", "a" },
			["ą"  ] = { "xʲ", "j", "ɔ̃" },
			["e"  ] = { "xʲ", "j", "ɛ" },
			["ę"  ] = { "xʲ", "j", "ɛ̃" },
			["i"  ] = { "xʲ", "j", "i" },
			["o"  ] = { "xʲ", "j", "ɔ" },
			["ó"  ] = { "xʲ", "j", "u" },
			["u"  ] = { "xʲ", "j", "u" },
			[false] = { "xʲ", "i" }
			},
		[false] = "x"
        },
	["i"] = "i",
	["j"] = "j",
	["k"] = {
		["i" ] = {
			["a"  ] = { "kʲ", "j", "a" },
			["ą"  ] = { "kʲ", "j", "ɔ̃" },
			["e"  ] = { "kʲ", "ɛ" },
			["ę"  ] = { "kʲ", "j", "ɛ̃" },
			["i"  ] = { "kʲ", "j", "i" },
			["o"  ] = { "kʲ", "j", "ɔ" },
			["ó"  ] = { "kʲ", "j", "u" },
			["u"  ] = { "kʲ", "j", "u" },
			[false] = { "kʲ", "i" }
			},
		[false] = "k"
	},
	["l"] = {
		["i" ] = {
			["a"  ] = { "lʲ", "a" },
			["ą"  ] = { "lʲ", "ɔ̃" },
			["e"  ] = { "lʲ", "ɛ" },
			["ę"  ] = { "lʲ", "ɛ̃" },
			["i"  ] = { "lʲ", "j", "i" },
			["o"  ] = { "lʲ", "ɔ" },
			["ó"  ] = { "lʲ", "u" },
			["u"  ] = { "lʲ", "u" },
			[false] = { "lʲ", "i" }
			},
		[false] = "l"
        },
	["ł"] = "w",
	["m"] = {
		["i"  ] = {
			["a"  ] = { "mʲ", "a" },
			["ą"  ] = { "mʲ", "ɔ̃" },
			["e"  ] = { "mʲ", "ɛ" },
			["ę"  ] = { "mʲ", "ɛ̃" },
			["i"  ] = { "mʲ", "j", "i" },
			["o"  ] = { "mʲ", "ɔ" },
			["ó"  ] = { "mʲ", "u" },
			["u"  ] = { "mʲ", "u" },
			[false] = { "mʲ", "i" }
			},
		[false] = "m"
	},
	["n"] = {
		["i"  ] = {
			["ą"  ] = { "ɲ", "ɔ̃" },
			["a"  ] = { "ɲ", "a" },
			["e"  ] = { "ɲ", "ɛ" },
			["ę"  ] = { "ɲ", "ɛ̃" },
			["i"  ] = { "ɲ", "j", "i" },
			["o"  ] = { "ɲ", "ɔ" },
			["ó"  ] = { "ɲ", "u" },
			["u"  ] = { "ɲ", "u" },
			[false] = { "ɲ", "i" }
		},

		-- "bank", "bankowy", "bankowość" is [baŋk], [baŋˈkɔ.vɨ], [baŋˈko.voɕt͡ɕ]
		-- but "wybranka", "łapanka" and "zapinka" would be rather [vɨˈbran.ka], [waˈpan.ka] and [zaˈpin.ka].
		-- looks like "bank" and related should be manually transcribed.
		-- although [bank], etc. is not incorrect, even if somewhat posh. (In the regions where [nk] and [ŋk] can be distinguished, it's actually [baŋk] that is posh).

		-- ["g"  ] = { "ŋ", "ɡ" },
		-- ["k"  ] = { "ŋ", "k" },
		[false] = "n"
	},
	["ń"] = "ɲ",
	["o"] = {
		["o"  ] = { "ɔ", "ʔ", "ɔ" }, -- żaroodporny, ognioodporny, etc.
		[false] = "ɔ" ,
	},
	["ó"] = "u",
	["p"] = {
		["i"  ] = {
			-- piątek, piasek, etc.
			["a"  ] = { "pʲ", "a" },
			["ą"  ] = { "pʲ", "ɔ̃" },
			["e"  ] = { "pʲ", "ɛ" },
			["ę"  ] = { "pʲ", "ɛ̃" },
			["i"  ] = { "pʲ", "j", "i" },
			["o"  ] = { "pʲ", "ɔ" },
			["ó"  ] = { "pʲ", "u" },
			["u"  ] = { "pʲ", "u" },
			[false] = { "pʲ", "i" }
			},
		[false] = "p"
	},
	["r"] = {
		["i" ] = {
			["a"  ] = { "rʲ", "j", "a" },
			["ą"  ] = { "rʲ", "j", "ɔ̃" },
			["e"  ] = { "rʲ", "j", "ɛ" },
			["ę"  ] = { "rʲ", "j", "ɛ̃" },
			["i"  ] = { "rʲ", "j", "i" },
			["o"  ] = { "rʲ", "j", "ɔ" },
			["ó"  ] = { "rʲ", "j", "u" },
			["u"  ] = { "rʲ", "j", "u" },
			[false] = { "rʲ", "i" }
			},
		["z"  ] = "ʐ",
		[false] = "r"
	},
	["q"] = {
		["u"  ] = { "k", "v" },
		[false] = false
	},
	["s"] = {
		["i"  ] = {
			["ą"  ] = { "ɕ", "ɔ̃" },
			["a"  ] = { "ɕ", "a" },
			["e"  ] = { "ɕ", "ɛ" },
			["ę"  ] = { "ɕ", "ɛ̃" },
			["o"  ] = { "ɕ", "ɔ" },
			["ó"  ] = { "ɕ", "u" },
			["u"  ] = { "ɕ", "u" },
			["y"  ] = { "ɕ", "ɨ" },
			[false] = { "ɕ", "i" }
		},
		["z"  ] = "ʂ",
		[false] = "s",
	},
	["ś"] = "ɕ",
	["t"] = "t",
	["u"] = "u",
	["v"] = {
		["i"  ] = {
			["a"  ] = { "vʲ", "a" },
			["ą"  ] = { "vʲ", "ɔ̃" },
			["e"  ] = { "vʲ", "ɛ" },
			["ę"  ] = { "vʲ", "ɛ̃" },
			["i"  ] = { "vʲ", "j", "i" },
			["o"  ] = { "vʲ", "ɔ" },
			["ó"  ] = { "vʲ", "u" },
			["u"  ] = { "vʲ", "u" },
			[false] = { "vʲ", "i" }
			},
		[false] = "v"
	},
	["w"] = {
		["i"  ] = {
			["a"  ] = { "vʲ", "a" },
			["ą"  ] = { "vʲ", "ɔ̃" },
			["e"  ] = { "vʲ", "ɛ" },
			["ę"  ] = { "vʲ", "ɛ̃" },
			["i"  ] = { "vʲ", "j", "i" },
			["o"  ] = { "vʲ", "ɔ" },
			["ó"  ] = { "vʲ", "u" },
			["u"  ] = { "vʲ", "u" },
			[false] = { "vʲ", "i" }
			},
		["j" ] = { "vʲ", "j" }, -- e.g. wjazd,
		[false] = "v"
	},
	["x"] = { "k", "s" },
	["y"] = "ɨ",
	["z"] = {
		["i"  ] = {
			["ą"  ] = { "ʑ", "ɔ̃" },
			["a"  ] = { "ʑ", "a" },
			["e"  ] = { "ʑ", "ɛ" },
			["ę"  ] = { "ʑ", "ɛ̃" },
			["o"  ] = { "ʑ", "ɔ" },
			["ó"  ] = { "ʑ", "u" },
			["u"  ] = { "ʑ", "u" },
			[false] = { "ʑ", "i" }
		},
		[false] = "z"
	},
	["ź"] = "ʑ",
	["ż"] = "ʐ",
	["-"] = {},
}

local valid_phone = {
	["a" ] = true, ["b" ] = true, ["bʲ"] = true, ["d" ] = true, ["d͡z"] = true, ["d͡ʑ"] = true,
	["d͡ʐ"] = true, ["ɛ" ] = true, ["ɛ̃" ] = true, ["f" ] = true, ["fʲ"] = true, ["ɡ" ] = true,
	["ɡʲ"] = true, ["i" ] = true, ["ɨ" ] = true, ["j" ] = true, ["k" ] = true, ["kʲ"] = true,
	["l" ] = true, ["lʲ"] =true, ["m" ] = true, ["mʲ"] = true, ["n" ] = true, ["ŋ" ] = true,
 ["ɲ" ] = true, ["ɔ" ] = true, ["ɔ̃" ] = true, ["p" ] = true, ["pʲ"] = true, ["r" ] = true, ["rʲ"] = true,
["s" ] = true, ["ɕ" ] = true, ["ʂ" ] = true, ["t" ] = true, ["t͡s"] = true, ["t͡ɕ"] = true, ["t͡ʂ"] = true,
	["u" ] = true, ["v" ] = true, ["vʲ"] = true, ["w" ] = true, ["x" ] = true, ["xʲ"] = true, ["z" ] = true,
	["ʑ" ] = true, ["ʐ" ] = true, ["ʔ" ] = true
}

local sylmarks = {
	["."] = ".", ["'"] = "ˈ", [","] = "ˌ"
}

local vowel = {
	[ "a"] = true, [ "ɛ"] = true, [ "ɛ̃"] = true,
	[ "i"] = true, [ "ɨ"] = true, [ "ɔ"] = true,
	[ "ɔ̃"] = true, [ "u"] = true
}

local devoice = {
	["b" ] = "p" , ["d" ] = "t" , ["d͡z"] = "t͡s", ["d͡ʑ"] = "t͡ɕ",
	["d͡ʐ"] = "t͡ʂ", ["ɡ" ] = "k" , ["v" ] = "f" , ["vʲ"] = "fʲ",
	["z" ] = "s" , ["ʑ" ] = "ɕ" , ["ʐ" ] = "ʂ" ,

	-- non-devoicable
	["bʲ"] = "bʲ", ["ɡʲ"] = "ɡʲ", ["m" ] = "m" , ["mʲ"] = "mʲ",
	["n" ] = "n" , ["ɲ" ] = "ɲ" , ["ŋ" ] = "ŋ" , ["w" ] = "w" ,
	["l" ] = "l" , ["lʲ"] = "lʲ" , ["j" ] = "j" , ["r" ] = "r" , ["rʲ"] = "rʲ" ,
}

local denasalized = {
	[ "ɛ̃"] =  "ɛ",
	[ "ɔ̃"] =  "ɔ",
}

local nasal_map = {
	["p" ] = "m", ["pʲ"] = "m", ["b" ] = "m", ["bʲ"] = "m", -- zębu, klępa
	["k" ] = "ŋ", ["kʲ"] = "ŋ", ["ɡ" ] = "ŋ", ["ɡʲ"] = "ŋ", -- pąk, łęgowy
	["t" ] = "n", ["d" ] = "n", -- wątek, piątek, mądrość

	["t͡ɕ"] = "ɲ", ["d͡ʑ"] = "ɲ", -- pięć, pędziwiatr, łabędź
	-- gęsi, więzi
	["t͡ʂ"] = "n", ["d͡ʐ"] = "n", -- pączek, ?
	-- węszyć, mężny
	["t͡s"] = "n", ["d͡z"] = "n", -- wiedząc, pieniędzy
}

function export.convert_to_IPA(word)
	if type(word) == "table" then
		word = word.args[1]
	end

	-- convert letters to phones
	local phones = {}
	local l2ptab = letters2phones
	for ch in word:gmatch("[%z\1-\127\194-\244][\128-\191]*") do
		local value = l2ptab[ch]

		if value == nil then
			value = l2ptab[false]
			if value == false then
				return nil
			elseif type(value) == "table" then
				for _, phone in ipairs(value) do
					table.insert(phones, phone)
				end
			else
				table.insert(phones, value)
			end
			l2ptab = letters2phones
			value = l2ptab[ch]
		end

		if type(value) == "table" then
			if value[false] == nil then
				for _, phone in ipairs(value) do
					table.insert(phones, phone)
				end
				l2ptab = letters2phones
			else
				l2ptab = value
			end
		elseif type(value) == "string" then
			table.insert(phones, value)
			l2ptab = letters2phones
		else
			table.insert(phones, ch)
		end
	end

	if l2ptab ~= letters2phones then
		table.insert(phones, l2ptab[false])
	end

	-- simplify nasals
	local new_phones = {}
	for i, phone in ipairs(phones) do
		local pnext = phones[i + 1]
		if denasalized[phone] then
			if phone == "ɛ̃" and (not pnext or not valid_phone[pnext]) then
				-- denasalize word-final ę
				table.insert(new_phones, denasalized[phone])
			elseif nasal_map[pnext] then
				table.insert(new_phones, denasalized[phone])
				table.insert(new_phones, nasal_map[pnext])
			else
				table.insert(new_phones, phone)
			end
		else
			table.insert(new_phones, phone)
		end
	end
	phones = new_phones

	-- devoice
	for i = #phones, 1, -1 do
		local pprev, pcurr, pnext = phones[i - 1], phones[i]
		local j = i
		repeat
			j = j + 1
			pnext = phones[j]
		until not pnext or not sylmarks[pnext]
		if devoice[pcurr] and not devoice[pnext] and not vowel[pnext] and not denasalized[pnext] then
			phones[i] = devoice[pcurr]
		end
		-- prz, trz, krz, tw, kw(i)
		if ((pcurr == "v") or (pcurr == "vʲ") or (pcurr == "ʐ")) and valid_phone[pprev] and not devoice[pprev] and not vowel[pprev] and not denasalized[pprev] then
			phones[i] = devoice[pcurr]
		end
	end

	-- collect syllables
	local words, curword, sylmarked, sylbuf = {}, nil, false
	for i, pcurr in ipairs(phones) do
		local ppprev, pprev, pnext = phones[i - 2], phones[i - 1], phones[i + 1]

		if valid_phone[pcurr] then
			if not curword then
				curword, sylbuf, had_vowl, sylmarked = {}, '', false, false
				table.insert(words, curword)
			end

			local same_syl = true

			if vowel[pcurr] then
				if had_vowl then
					same_syl = false
				end
				had_vowl = true
			elseif had_vowl then
				if vowel[pnext] then
					same_syl = false
				elseif not vowel[pprev] and not vowel[pnext] then
					same_syl = false
				elseif ((pcurr == "s") and ((pnext == "t") or (pnext == "p") or (pnext == "k")))
				or (pnext == "r") or (pnext == "f") or (pnext == "w")
				or ((pcurr == "ɡ") and (pnext == "ʐ"))
				or ((pcurr == "d") and ((pnext == "l") or (pnext == "w") or (pnext == "ɲ")))
				then
					-- these should belong to a common syllable
					same_syl = false
				end
			end

			if same_syl then
				sylbuf = sylbuf .. pcurr
			else
				table.insert(curword, sylbuf)
				sylbuf, had_vowl = pcurr, vowel[pcurr]
			end
		elseif (curword or valid_phone[pnext]) and sylmarks[pcurr] then
			if not curword then
				curword, sylbuf, had_vowl = {}, '', false
				table.insert(words, curword)
			end
			sylmarked = true
			if sylbuf then
				table.insert(curword, sylbuf)
				sylbuf = ''
			end
			table.insert(curword, sylmarks[pcurr])
		else
			if sylbuf then
				if #curword > 0 and not had_vowl then
					curword[#curword] = curword[#curword] .. sylbuf
				else
					table.insert(curword, sylbuf)
				end
				if sylmarked then
					words[#words] = table.concat(curword)
				end
			end
			curword, sylbuf = nil, nil
			table.insert(words, pcurr)
		end
	end
	if sylbuf then
		if #curword > 0 and not had_vowl then
			curword[#curword] = curword[#curword] .. sylbuf
		else
			table.insert(curword, sylbuf)
		end
		if sylmarked then
			words[#words] = table.concat(curword)
		end
	end

	-- mark syllable breaks and stress
	for i, word in ipairs(words) do
		if type(word) == "table" then
			-- unless already marked
			if not ((word[2] == ".") or (word[2] == "ˈ") or (word[2] == "ˌ")) then
				for j, syl in ipairs(word) do
					if j == (#word - 1) then
						word[j] = "ˈ" .. syl
					elseif j ~= 1 then
						word[j] = "." .. syl
					end
				end
			end
			words[i] = table.concat(word)
		end
	end

	return table.concat(words)
end

return export
"""

In [4]:
from lupa import LuaRuntime

lua = LuaRuntime(unpack_returned_tuples=True, encoding="utf-8")

pl_ipa = lua.execute(PL_IPA)

convert_to_ipa = pl_ipa.convert_to_IPA

In [5]:
convert_to_ipa("zęby")

'ˈzɛm.bɨ'

In [6]:
MAPPING = {
    "a": "A",
    "b": "B",
    "t͡ʂ": "CZ",
    "d": "D",
    "d͡ʐ": "DRZ",
    "d͡z": "DZ",
    "d͡ʑ": "DZI",
    "ɛ": "E",
    "ɛ̃": "EN",
    "f": "F",
    "ɡ": "G",
    "i": "I",
    "j": "J",
    "ʲ": "J",
    "k": "K",
    "l": "L",
    "m": "M",
    "n": "N",
    "ɲ": "NI",
    "ɔ": "O",
    "ɔ̃": "ON",
    "p": "P",
    "r": "R",
    "ʐ": "RZ",
    "s": "S",
    "ɕ": "SI",
    "ʂ": "SZ",
    "t": "T",
    "t͡s": "TS",
    "t͡ɕ": "TSI",
    "u": "U",
    "v": "V",
    "w": "W",
    "x": "X",
    "ɨ": "Y",
    "z": "Z",
    "ʑ": "ZI",
    "ˈ": "",
    ".": "",
    "ʔ": ""
}

In [7]:
ipa_phones = list(MAPPING.keys())
ipa_phones.sort(key=len, reverse=True)
IPA_KEY_REGEX = fr"({'|'.join(ipa_phones)})"

In [8]:
import re

def convert_to_sphinx(text):
    tokens = []
    text = text.strip()
    while text:
        match = re.match(IPA_KEY_REGEX, text)
        if not match:
            raise ValueError(f"Could not match token in text: {text}")
        token = match.group(0)
        mapped = MAPPING[token]
        if mapped != "":
            tokens.append(mapped)
        text = text[len(token):].strip()
    return tokens

In [9]:
def sphinx_pronunciation(text):
    ipa = convert_to_ipa(text)
    return " ".join(convert_to_sphinx(ipa))

In [10]:
sphinx_pronunciation("sinus"), sphinx_pronunciation("s-inus")

('SI I N U S', 'S I N U S')

In [11]:
def clean_word(text):
    return text.strip(",;:!?—…„”\"“.«»*()[]‘/\\").lower()

In [12]:
from dataclasses import dataclass

@dataclass
class TextWord:
    raw: str
    word: str
    def __init__(self, raw):
        self.raw = raw
        self.word = clean_word(raw)

In [13]:
RAW_NORMS = "https://raw.githubusercontent.com/jimregan/wolnelektury-speech-corpus/refs/heads/main/specific-norms.tsv"
PRON_AS = "https://raw.githubusercontent.com/jimregan/wolnelektury-speech-corpus/refs/heads/main/pron-data/pronounce-as.tsv"

In [16]:
!pip install requests

Collecting requests
  Downloading requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting charset_normalizer<4,>=2 (from requests)
  Downloading charset_normalizer-3.4.3-cp311-cp311-macosx_10_9_universal2.whl.metadata (36 kB)
Collecting idna<4,>=2.5 (from requests)
  Using cached idna-3.10-py3-none-any.whl.metadata (10 kB)
Collecting urllib3<3,>=1.21.1 (from requests)
  Downloading urllib3-2.5.0-py3-none-any.whl.metadata (6.5 kB)
Collecting certifi>=2017.4.17 (from requests)
  Downloading certifi-2025.8.3-py3-none-any.whl.metadata (2.4 kB)
Downloading requests-2.32.5-py3-none-any.whl (64 kB)
Downloading charset_normalizer-3.4.3-cp311-cp311-macosx_10_9_universal2.whl (204 kB)
Using cached idna-3.10-py3-none-any.whl (70 kB)
Downloading urllib3-2.5.0-py3-none-any.whl (129 kB)
Downloading certifi-2025.8.3-py3-none-any.whl (161 kB)
Installing collected packages: urllib3, idna, charset_normalizer, certifi, requests
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5/5[0m [

In [17]:
import requests

def download_get_lines(url):
    lines = []
    req = requests.get(url)
    if req.status_code != 200:
        return []
    for line in req.text.split("\n"):
        if line != "":
            lines.append(line)
    return lines

In [18]:
def pronounce_as_dict(url):
    lines = download_get_lines(url)
    data = {}
    for line in lines:
        parts = line.split("\t")
        if not parts[0] in data:
            data[parts[0]] = set()
        data[parts[0]].add(parts[1])
    return data

In [19]:
pronounce_as = pronounce_as_dict(PRON_AS)

In [None]:
espeak_to_cmudict = {}
for line in MAPPING.split("\n"):
    if line == "":
        continue
    line = line.strip()
    parts = line.split(" ")

    if len(parts) != 2:
        print(line)
        continue
    k, v = line.split(" ")
    if not k in espeak_to_cmudict:
        espeak_to_cmudict[k] = v


In [None]:
def normword(text):
    text = text.strip(",.;:!?")
    return text.lower()

def normphon(phon):
    phon = phon.strip(",.;:!?")
    return phon

def make_lexicon(text, phon):
    if phon.startswith("/") and phon.endswith("/"):
        phon = phon[1:-1]
    words = [normword(x) for x in text.split(" ")]
    phonwords = [cmudictify(normphon(x)) for x in phon.split(" ")]
    assert len(words) == len(phonwords)
    output = list(set(zip(words, phonwords)))
    return output


In [None]:
lex = make_lexicon(EGTEXT, EGPHON)

In [None]:
audio = AudioSegment.from_file(EGFILE)

In [None]:
audio = audio.set_frame_rate(16000)

In [None]:
seg = audio[int(EGSTART * 1000):int(EGEND * 1000)]

In [None]:
def make_ps_dict(entries):
    counts = {}
    output = []
    lex = sorted(entries)
    for entry in lex:
        count = 1
        if not entry[0] in counts:
            counts[entry[0]] = 1
        else:
            counts[entry[0]] += 1
            count = counts[entry[0]]
        if count != 1:
            subscript = f"({count})"
        else:
            subscript = ""
        output.append(f"{entry[0]}{subscript} {entry[1]}")
    return output

In [None]:
def make_fsg_transitions_from_text(text):
    words = [normword(x) for x in text.split(" ")]
    enum = [x for x in enumerate(words)]
    trans = [(x[0], x[0] + 1, 1.0, x[1]) for x in enum]
    return trans

In [None]:
fsgt = make_fsg_transitions_from_text(EGTEXT)

In [None]:
start_state = fsgt[0][0]
end_state = fsgt[-1][1]

In [None]:
import pocketsphinx

This was the first attempt. Adding `None` for the dictionary (as the docs suggested) didn't help: can't add words to a dictionary that doesn't exist.

This may yet be a thing, because I can't be sure that it really failed to align using the grammar: audio handling sucks, and I should maybe have passed ffmpeg parameters before writing the audio.

In [None]:
import tempfile
entries = make_ps_dict(lex)

with (
    tempfile.NamedTemporaryFile(suffix=".dict") as dictf,
    tempfile.NamedTemporaryFile(suffix=".wav") as wavf,
):
    with open(dictf.name, "w") as dictout:
        for entry in entries:
            dictout.write(entry + "\n")

    seg.export(wavf.name, format="wav")

    decoder = pocketsphinx.Decoder(lm=None, dict=dictf.name)
    fsg = decoder.create_fsg("dummy", start_state, end_state, fsgt)
    decoder.add_fsg("dummy", fsg)
    decoder.activate_search("dummy")
    decoder.start_utt()
    # decoder.process_raw(seg.get_array_of_samples('B'))
    decoder.process_raw(wavf.read(), full_utt=True)
    decoder.end_utt()

In [None]:
decoder.seg()

ERROR: "fsg_search.c", line 944: Final result does not match the grammar in frame 1082


In [None]:
fsg.writefile("/tmp/fsm")

In [None]:
!cat /tmp/fsm

FSG_BEGIN dummy
NUM_STATES 29
START_STATE 0
FINAL_STATE 28
TRANSITION 0 0 0.000000 [NOISE]
TRANSITION 0 0 0.005001 <sil>
TRANSITION 0 1 1.000000 yeah
TRANSITION 1 1 0.000000 [NOISE]
TRANSITION 1 1 0.005001 <sil>
TRANSITION 1 2 1.000000 that's
TRANSITION 2 2 0.000000 [NOISE]
TRANSITION 2 2 0.005001 <sil>
TRANSITION 2 3 1.000000 true
TRANSITION 3 3 0.000000 [NOISE]
TRANSITION 3 3 0.005001 <sil>
TRANSITION 3 4 1.000000 i(2)
TRANSITION 3 4 1.000000 i
TRANSITION 4 4 0.000000 [NOISE]
TRANSITION 4 4 0.005001 <sil>
TRANSITION 4 5 1.000000 mean
TRANSITION 5 5 0.000000 [NOISE]
TRANSITION 5 5 0.005001 <sil>
TRANSITION 5 6 1.000000 they
TRANSITION 6 6 0.000000 [NOISE]
TRANSITION 6 6 0.005001 <sil>
TRANSITION 6 7 1.000000 are(2)
TRANSITION 6 7 1.000000 are
TRANSITION 7 7 0.000000 [NOISE]
TRANSITION 7 7 0.005001 <sil>
TRANSITION 7 8 1.000000 the
TRANSITION 8 8 0.000000 [NOISE]
TRANSITION 8 8 0.005001 <sil>
TRANSITION 8 9 1.000000 same
TRANSITION 9 9 0.000000 [NOISE]
TRANSITION 9 9 0.005001 <sil>
TRA

In [None]:
with open("/tmp/mytmp.dict", "w") as dictout:
    for entry in entries:
        dictout.write(entry + "\n")


In [None]:
seg.export("/tmp/clip.wav", format="wav")

<_io.BufferedRandom name='/tmp/clip.wav'>

In [None]:
!sox /tmp/clip.wav $(pocketsphinx soxflags) > /tmp/ps.raw
psjson=!pocketsphinx align /tmp/ps.raw "yeah that's true i mean they are the same size and they are a little bit but i think i i should go more for something that style"

In [None]:
psjson

['{"b":0.000,"d":5.410,"p":1.000,"t":"yeah that\'s true i mean they are the same size and they are a little bit but i think i i should go more for something that style","w":[{"b":0.000,"d":0.250,"p":0.964,"t":"yeah"},{"b":0.250,"d":0.150,"p":0.937,"t":"that\'s"},{"b":0.400,"d":0.200,"p":0.978,"t":"true"},{"b":0.600,"d":0.060,"p":0.974,"t":"i"},{"b":0.660,"d":0.180,"p":0.980,"t":"mean"},{"b":0.840,"d":0.140,"p":0.979,"t":"they"},{"b":0.980,"d":0.070,"p":0.983,"t":"are(2)"},{"b":1.050,"d":0.110,"p":0.987,"t":"the"},{"b":1.160,"d":0.260,"p":0.974,"t":"same"},{"b":1.420,"d":0.380,"p":0.956,"t":"size"},{"b":1.800,"d":0.120,"p":0.975,"t":"and"},{"b":1.920,"d":0.110,"p":0.986,"t":"they"},{"b":2.030,"d":0.140,"p":0.983,"t":"are(2)"},{"b":2.170,"d":0.030,"p":0.990,"t":"a"},{"b":2.200,"d":0.180,"p":0.914,"t":"little"},{"b":2.380,"d":0.140,"p":0.980,"t":"bit"},{"b":2.520,"d":0.220,"p":0.935,"t":"but"},{"b":2.740,"d":0.110,"p":0.983,"t":"i"},{"b":2.850,"d":0.260,"p":0.968,"t":"think"},{"b":3.110,"

In [None]:
import json
data = json.loads(str(psjson[0]))

In [None]:
data

{'b': 0.0,
 'd': 5.41,
 'p': 1.0,
 't': "yeah that's true i mean they are the same size and they are a little bit but i think i i should go more for something that style",
 'w': [{'b': 0.0, 'd': 0.25, 'p': 0.964, 't': 'yeah'},
  {'b': 0.25, 'd': 0.15, 'p': 0.937, 't': "that's"},
  {'b': 0.4, 'd': 0.2, 'p': 0.978, 't': 'true'},
  {'b': 0.6, 'd': 0.06, 'p': 0.974, 't': 'i'},
  {'b': 0.66, 'd': 0.18, 'p': 0.98, 't': 'mean'},
  {'b': 0.84, 'd': 0.14, 'p': 0.979, 't': 'they'},
  {'b': 0.98, 'd': 0.07, 'p': 0.983, 't': 'are(2)'},
  {'b': 1.05, 'd': 0.11, 'p': 0.987, 't': 'the'},
  {'b': 1.16, 'd': 0.26, 'p': 0.974, 't': 'same'},
  {'b': 1.42, 'd': 0.38, 'p': 0.956, 't': 'size'},
  {'b': 1.8, 'd': 0.12, 'p': 0.975, 't': 'and'},
  {'b': 1.92, 'd': 0.11, 'p': 0.986, 't': 'they'},
  {'b': 2.03, 'd': 0.14, 'p': 0.983, 't': 'are(2)'},
  {'b': 2.17, 'd': 0.03, 'p': 0.99, 't': 'a'},
  {'b': 2.2, 'd': 0.18, 'p': 0.914, 't': 'little'},
  {'b': 2.38, 'd': 0.14, 'p': 0.98, 't': 'bit'},
  {'b': 2.52, 'd'

In [None]:
with open("/tmp/audacity.tsv", "w") as tsvf:
    for word in data["w"]:
        tsvf.write(f"{word['b']}\t{word['b']+word['d']:.2}\t{word['t']}\n")