## Phoneme Accuracy

In [15]:
import json 
def load_json(path):
    """
    Load a json file and return the content as a dictionary.
    """
    import json

    with open(path, "r", encoding= 'utf-8') as f:
        data = json.load(f)
    return data

def get_unique_tokens(data):
    """
    Get unique tokens from the dataset.
    """
    unique_tokens = set()
    for item in data:
        transcription = item["text"]
        for token in transcription.split():
            unique_tokens.add(token)
    return list(unique_tokens)

testset_path= "../dataset/LSVSC_test_word.json"
vocab_train_path = "../dataset/word_vocab_lsvsc.json"

vocab_train = load_json(vocab_train_path)
unique_token_test = get_unique_tokens(load_json(testset_path))
oov_word = []
for token in unique_token_test:
    if token not in vocab_train:
        oov_word.append(token)

print(f"Number of OOV words: {len(oov_word)}, Percentage: {len(oov_word)/len(unique_token_test)*100:.2f}%")

Number of OOV words: 116, Percentage: 3.78%


In [45]:


# num <unk> in test result
test_path_result = "./result/result-tasa-w2i-lsvsc.json"
test_result = load_json(test_path_result)
num_unk = 0
idx_list_data = []

for idx, item in enumerate(test_result):
    try:
        transcription = item["gold"]
        for idx_word, token in enumerate(transcription.split()):
            if token == "<unk>":
                num_unk += 1
                idx_list_data.append((idx, idx_word))
    except: 
        continue


# num correct fill by phoneme based model 

test_result_phoneme = load_json("./result/result-tasa-p2i_lsvsc.json")
phoneme_test_path = "../dataset/LSVSC_test_phoneme.json"
phoneme_test_data = load_json(phoneme_test_path)
num_correct_fill = 0

correct_filled = []
cannot_be_filled = []
tried_to_filled_but_wrong = []
total_num = 0
for idx, idx_word in idx_list_data:
    if idx_word < len(test_result_phoneme[idx]["predicted"].split(' ')):
        gold = test_result_phoneme[idx]["gold"].split(' ')[idx_word]
        pred = test_result_phoneme[idx]["predicted"].split(' ')[idx_word]
        # print(f"Gold: {gold}, Pred: {pred}")
        if gold == pred:
            correct_filled.append(gold)
            num_correct_fill += 1
        else:
            cannot_be_filled.append(gold)
            tried_to_filled_but_wrong.append(pred)
        total_num += 1

print(f"Number of OOV words: {len(oov_word)}, Percentage: {len(oov_word)/len(unique_token_test)*100:.2f}%")
print(f"Number of <unk> in transcript of test dataset: {total_num}")
print(f"Number of correct fill by phoneme based model: {num_correct_fill}")
print(f"Percentage of correct fill by phoneme based model: {num_correct_fill/total_num*100:.2f}%")
print(f"Correctly filled words: {set(correct_filled)}")
print(f"Cannot be filled words: {set(cannot_be_filled)}")
print(f"Tried to fill but wrong predictions: {set(tried_to_filled_but_wrong)}")

Number of OOV words: 116, Percentage: 3.78%
Number of <unk> in transcript of test dataset: 131
Number of correct fill by phoneme based model: 34
Percentage of correct fill by phoneme based model: 25.95%
Correctly filled words: {'hwɛ˨˩', 'tʰuŋ˧˥', 'ɲɯ˧˩', 'son˧˥', 'ki˧ˀ˩', 'χɯən~˧˥', 'lan~˧ˀ˩', 'kwiən~˧˥', 'bɔ˧ˀ˩', 'ŋun~˧˥', 'muəŋ˧˥', 'tʰwi˧˩', 'mi˧ˀ˩', 't͡ɕwɛ˧˥', 'ɲaːm~˧˥', 'ʂɔn˨˩', 'maːn~˧ˀ˩', 'vaːi˧˥', 'tʰɯən˨˩', 'vaːŋ̟˧ˀ˩', 'vɔi-', 'zɔi˨˩', 'nəːm-', 'nəŋ˧˥', 'haːŋ˧˥', 'biu˧˥', 'ɣen~˧˥', 'ŋɔn˨˩', 'ʈ͡ʂɛu˧˩', 'tʰɔm~˧˥', 'həːn˧˥', 't͡ɕwaːŋ˧˩', 'ŋaːi˧˥'}
Cannot be filled words: {'ʈ͡ʂɯəŋ˧ˀ˩', 'sum-', 'ɣi˨˩', 'hut-', 'tʰiə-', 'kəŋ~˧˥', 'ŋwɛu˧ˀ˩', 'ʝiu˨˩', 'veu-', 'tʰwəː˧˩', 'mɛn~˧ˀ˩', 'ɣəːm˨˩', 'kɯəŋ~˧ˀ˩', 'rəːm˨˩', 'nullɯŋ˧ˀ˥', 'ʂɔːŋ~˧˥', 'kiəŋ˨˩', 'raːn˧˥', 'ʝik̟-', 'kiəŋ˧ˀ˥', 'maːn~˧ˀ˩', 'laːn˧ˀ˩', 'sun-', 'vɔi˧˥', 'vɔi-', 'hon˧˩', 'ʂiu˧˩', 't͡ɕaːn~˧ˀ˩', 'muəi˨˩', 'ruəŋ˨˩', 'ɣaːp-', 'faːm˧˩', 'tɯn~˧˥', 'bɯəŋ-', 'huə˧˩', 'ʂaːt-', 'kɔːŋ-', 'tʰum-', 'fi˧ˀ˩', 'ʂəi˧˩', 'χəːi˧˥', 'ʈ͡ʂɔn~˧˥', 

In [46]:
def save_json(path, data):
    """
    Save a dictionary to a json file.
    """
    import json

    with open(path, "w", encoding= 'utf-8') as f:
        json.dump(data, f, indent=4, ensure_ascii=False)

save_json("./result/correctly_filled_words.json", list(set(correct_filled)))
save_json("./result/cannot_be_filled_words.json", list(set(cannot_be_filled)))
save_json("./result/tried_to_filled_but_wrong_predictions.json", list(set(tried_to_filled_but_wrong)))

## Pearson correlation