In [None]:
import csv
import os
import librosa
import numpy as np
import unicodedata
from collections import defaultdict

In [None]:
INP_CSV = '../Data/Karolina.pl.f/all.phn.csv'  # path to the input CSV file

# Disabled alternative: the symbol inventory written as plain strings and
# flattened into a single `symbols` list. Kept for reference only; nothing
# below is executed.
# phone_string = "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"
# letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
# punctuation = ';:,.!?¡¿—…"«»“” '

# symbols = list(phone_string) + list(letters) + list(punctuation)

In [None]:
# Symbol inventory, split into three groups so that each group gets its own
# frequency table.

# Punctuation marks; the last entry is the plain space character.
punctuation =  [
      ';', ':', ',', '.', '!', '?', '¡', '¿', '—', '…', '"', '«', '»', '“', '”', ' ',
]
# Basic Latin letters: upper case A-Z followed by lower case a-z.
letters = [
      'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q',
      'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
      'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
]
# Phonetic symbols, modifier letters, stress/length marks and arrows.
# NOTE(review): the apostrophe "'" is listed twice in the last row; the
# duplicate is harmless when the list is turned into dict keys but makes
# len(ipa_phones) one larger than the number of distinct symbols.
# NOTE(review): '̩' appears to be a bare combining mark (U+0329) -- confirm
# that a standalone entry for it is intended.
ipa_phones = [
      'ɑ', 'ɐ', 'ɒ', 'æ', 'ɓ', 'ʙ', 'β', 'ɔ', 'ɕ', 'ç', 'ɗ', 'ɖ', 'ð', 'ʤ', 'ə', 'ɘ', 'ɚ', 
      'ɛ', 'ɜ', 'ɝ', 'ɞ', 'ɟ', 'ʄ', 'ɡ', 'ɠ', 'ɢ', 'ʛ', 'ɦ', 'ɧ', 'ħ', 'ɥ', 'ʜ', 'ɨ', 'ɪ', 
      'ʝ', 'ɭ', 'ɬ', 'ɫ', 'ɮ', 'ʟ', 'ɱ', 'ɯ', 'ɰ', 'ŋ', 'ɳ', 'ɲ', 'ɴ', 'ø', 'ɵ', 'ɸ', 'θ', 
      'œ', 'ɶ', 'ʘ', 'ɹ', 'ɺ', 'ɾ', 'ɻ', 'ʀ', 'ʁ', 'ɽ', 'ʂ', 'ʃ', 'ʈ', 'ʧ', 'ʉ', 'ʊ', 'ʋ', 
      'ⱱ', 'ʌ', 'ɣ', 'ɤ', 'ʍ', 'χ', 'ʎ', 'ʏ', 'ʑ', 'ʐ', 'ʒ', 'ʔ', 'ʡ', 'ʕ', 'ʢ', 'ǀ', 'ǁ', 
      'ǂ', 'ǃ', 'ˈ', 'ˌ', 'ː', 'ˑ', 'ʼ', 'ʴ', 'ʰ', 'ʱ', 'ʲ', 'ʷ', 'ˠ', 'ˤ', '˞', '↓', '↑', 
      '→', '↗', '↘', "'", '̩', "'", 'ᵻ'
]

In [None]:
def init_freq_dict(symbols):
    """Build a zeroed frequency table for the given symbols.

    Args:
        symbols: Iterable of hashable symbols. Repeated symbols collapse
            into a single key.

    Returns:
        A new dict mapping every distinct symbol to the count 0, with keys
        in order of first appearance.
    """
    return dict.fromkeys(symbols, 0)

In [None]:
def _split_into_symbols(text):
    """Split ``text`` into symbols, attaching combining marks to their base.

    Each non-combining character starts a new symbol; every combining
    character (``unicodedata.combining(char) != 0``) is appended to the
    symbol before it.

    Args:
        text: The string to split.

    Returns:
        List of symbols (strings of one or more code points) in text order.
    """
    symbols = []
    for char in text:
        # A combining mark with nothing before it (text starts with the mark)
        # is kept as a symbol of its own; indexing symbols[-1] on the empty
        # list would raise IndexError.
        if unicodedata.combining(char) and symbols:
            symbols[-1] += char
        else:
            symbols.append(char)
    return symbols


def _print_freq_table(title, freq):
    """Print ``title``, one ``symbol<TAB>count`` line per entry, then a blank line."""
    print(title)
    for symbol, count in freq.items():
        print(f'{symbol}\t{count}')
    print()


# One zeroed counter per known symbol group; anything else lands in
# `unsupported`, together with one example text in which it occurred.
punctuation_freq = init_freq_dict(punctuation)
letters_freq = init_freq_dict(letters)
ipa_phones_freq = init_freq_dict(ipa_phones)
unsupported = defaultdict(int)
unsupported_examples = {}

# Go through the CSV file ('|'-delimited, UTF-8)
with open(INP_CSV, newline='', encoding='utf-8') as ifile:
    reader = csv.reader(ifile, delimiter='|')

    for idx, row in enumerate(reader):
        assert len(row) in (2, 3), f"Row {idx} must contain 2 or 3 items!"
        # 3-column rows carry the text in the second column, 2-column rows
        # in the first one.
        text = row[1] if len(row) == 3 else row[0]
        text = text.strip()

        # Convert the string to a list of symbols (base character plus any
        # combining marks that follow it)
        chars = _split_into_symbols(text)

        # Count each symbol in the first group that knows it
        for char in chars:
            if char in punctuation_freq:
                punctuation_freq[char] += 1
            elif char in letters_freq:
                letters_freq[char] += 1
            elif char in ipa_phones_freq:
                ipa_phones_freq[char] += 1
            else:
                unsupported[char] += 1
                # Overwritten on every hit, so the last text containing the
                # symbol is the one that is reported.
                unsupported_examples[char] = text

_print_freq_table('Punctuation:', punctuation_freq)
_print_freq_table('Letters:', letters_freq)
_print_freq_table('IPA phones:', ipa_phones_freq)
print('Unsupported:')
for char, count in unsupported.items():
    print(f'\n{char}\t{count}\n{unsupported_examples[char]}')