In [31]:
import csv
import os
import librosa
import numpy as np
import unicodedata
from collections import defaultdict

In [32]:
INP_CSV = '../Data/NeuOl.cs.m/all.phn.csv'  # path to the input CSV file

# Symbol inventories used to bucket every character of the scanned text.
# The strings are consumed one code point at a time, so a combining
# diacritic is counted as a symbol of its own, separate from its base letter.
punctuation = ';:,.!?¡¿—…"«»“” '
letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
# Generic IPA inventory, kept for reference only:
# ipa_phones = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"
# CZ: inventory actually used. Compared with the generic line above, two
# modifier letters are swapped for combining diacritics (they look like
# U+031D up tack and U+030A ring above -- TODO confirm the code points).
# NOTE(review): the apostrophe occurs twice in this string.
ipa_phones = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼ̝ʰʱʲʷˠ̊˞↓↑→↗↘'̩'ᵻ"
# NOTE(review): not referenced by any visible cell; looks like a combining
# tilde (U+0303) that is absent from the inventory above -- confirm intent.
missing = '̃'

In [33]:
# punctuation =  [
#       ';', ':', ',', '.', '!', '?', '¡', '¿', '—', '…', '"', '«', '»', '“', '”', ' ',
# ]
# letters = [
#       'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q',
#       'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
#       'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
# ]
# ipa_phones = [
#       'ɑ', 'ɐ', 'ɒ', 'æ', 'ɓ', 'ʙ', 'β', 'ɔ', 'ɕ', 'ç', 'ɗ', 'ɖ', 'ð', 'ʤ', 'ə', 'ɘ', 'ɚ', 
#       'ɛ', 'ɜ', 'ɝ', 'ɞ', 'ɟ', 'ʄ', 'ɡ', 'ɠ', 'ɢ', 'ʛ', 'ɦ', 'ɧ', 'ħ', 'ɥ', 'ʜ', 'ɨ', 'ɪ', 
#       'ʝ', 'ɭ', 'ɬ', 'ɫ', 'ɮ', 'ʟ', 'ɱ', 'ɯ', 'ɰ', 'ŋ', 'ɳ', 'ɲ', 'ɴ', 'ø', 'ɵ', 'ɸ', 'θ', 
#       'œ', 'ɶ', 'ʘ', 'ɹ', 'ɺ', 'ɾ', 'ɻ', 'ʀ', 'ʁ', 'ɽ', 'ʂ', 'ʃ', 'ʈ', 'ʧ', 'ʉ', 'ʊ', 'ʋ', 
#       'ⱱ', 'ʌ', 'ɣ', 'ɤ', 'ʍ', 'χ', 'ʎ', 'ʏ', 'ʑ', 'ʐ', 'ʒ', 'ʔ', 'ʡ', 'ʕ', 'ʢ', 'ǀ', 'ǁ', 
#       'ǂ', 'ǃ', 'ˈ', 'ˌ', 'ː', 'ˑ', 'ʼ', 'ʴ', 'ʰ', 'ʱ', 'ʲ', 'ʷ', 'ˠ', 'ˤ', '˞', '↓', '↑', 
#       '→', '↗', '↘', "'", '̩', "'", 'ᵻ'
# ]

In [34]:
def init_freq_dict(symbols):
    """Build a zeroed frequency table for the given symbols.

    Args:
        symbols: Iterable of hashable symbols (e.g. a string of characters).

    Returns:
        A dict mapping every distinct symbol to 0, with keys kept in
        first-occurrence order; repeated symbols collapse into one key.
    """
    return dict.fromkeys(symbols, 0)

In [35]:
# Fresh, zero-initialised counters for every known symbol class, plus
# open-ended bookkeeping for characters that belong to none of them.
punctuation_freq = init_freq_dict(punctuation)
letters_freq = init_freq_dict(letters)
ipa_phones_freq = init_freq_dict(ipa_phones)
unsupported = defaultdict(int)
unsupported_examples = {}

# Lookup order matters: a character is credited to the first table holding it.
known_tables = (punctuation_freq, letters_freq, ipa_phones_freq)

# Scan the pipe-delimited CSV file row by row
with open(INP_CSV, newline='', encoding='utf-8') as ifile:
    reader = csv.reader(ifile, delimiter='|')

    for idx, row in enumerate(reader):
        assert len(row) in (2, 3), f"Row {idx} must contain 2 or 3 items!"
        # Three-column rows carry the text in the middle column,
        # two-column rows in the first one.
        if len(row) == 3:
            text = row[1]
        else:
            text = row[0]
        text = text.strip()

        # Tally each character of the row's text
        for char in text:
            for table in known_tables:
                if char in table:
                    table[char] += 1
                    break
            else:
                # Not in any inventory: count it and remember the most
                # recent text in which it was seen.
                unsupported[char] += 1
                unsupported_examples[char] = text

# Report: one titled section per known table, each followed by a blank line
report_sections = (
    ('Punctuation:', punctuation_freq),
    ('Letters:', letters_freq),
    ('IPA phones:', ipa_phones_freq),
)
for title, table in report_sections:
    print(title)
    for char, count in table.items():
        print(f'{char}\t{count}')
    print()

# Unsupported characters are listed with their count and an example text
print('Unsupported:')
for char, count in unsupported.items():
    print(f'\n{char}\t{count}\n{unsupported_examples[char]}')

Punctuation:
;	0
:	43
,	7848
.	8962
!	46
?	1173
¡	0
¿	0
—	0
…	6
"	0
«	0
»	0
“	0
”	0
 	99398

Letters:
A	0
B	0
C	0
D	0
E	0
F	0
G	0
H	0
I	0
J	0
K	0
L	0
M	0
N	0
O	0
P	0
Q	0
R	0
S	0
T	0
U	0
V	0
W	0
X	0
Y	0
Z	0
a	55573
b	9357
c	5236
d	19847
e	68398
f	3587
g	0
h	7868
i	63675
j	15680
k	23072
l	26639
m	19263
n	26944
o	47461
p	20042
q	0
r	30766
s	39948
t	48259
u	18383
v	25314
w	0
x	5089
y	0
z	10575

IPA phones:
ɑ	0
ɐ	0
ɒ	0
æ	0
ɓ	0
ʙ	0
β	0
ɔ	0
ɕ	0
ç	0
ɗ	0
ɖ	0
ð	0
ʤ	0
ə	0
ɘ	0
ɚ	0
ɛ	0
ɜ	0
ɝ	0
ɞ	0
ɟ	3220
ʄ	0
ɡ	1735
ɠ	0
ɢ	0
ʛ	0
ɦ	0
ɧ	0
ħ	0
ɥ	0
ʜ	0
ɨ	0
ɪ	1396
ʝ	0
ɭ	0
ɬ	0
ɫ	0
ɮ	0
ʟ	0
ɱ	0
ɯ	0
ɰ	0
ŋ	726
ɳ	0
ɲ	12896
ɴ	0
ø	0
ɵ	0
ɸ	0
θ	0
œ	0
ɶ	0
ʘ	0
ɹ	0
ɺ	0
ɾ	0
ɻ	0
ʀ	0
ʁ	0
ɽ	0
ʂ	0
ʃ	12777
ʈ	0
ʧ	0
ʉ	0
ʊ	4726
ʋ	0
ⱱ	0
ʌ	0
ɣ	430
ɤ	0
ʍ	0
χ	0
ʎ	0
ʏ	0
ʑ	178
ʐ	0
ʒ	5409
ʔ	0
ʡ	0
ʕ	0
ʢ	0
ǀ	0
ǁ	0
ǂ	0
ǃ	0
ˈ	91406
ˌ	22572
ː	51804
ˑ	0
ʼ	0
̝	8085
ʰ	0
ʱ	0
ʲ	0
ʷ	0
ˠ	0
̊	4482
˞	0
↓	0
↑	0
→	0
↗	0
↘	0
'	0
̩	2752
ᵻ	0

Unsupported:
