-
Notifications
You must be signed in to change notification settings - Fork 0
/
build_json_test.py
51 lines (40 loc) · 1.31 KB
/
build_json_test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import json
import os
import pandas as pd
import sys
# Command-line arguments (positional):
#   1. path to the output CommonVoice-DEMAND trainset
#   2. path to the original CommonVoice corpus
#   3. path to the original VoiceBank-DEMAND corpus

WAV_SUFFIX = ".wav"
# Character that splits a noisy file name into two parts; the part after it
# is looked up (as .mp3) in the CommonVoice transcript table.
NAME_SEPARATOR = "@"


def _strip_wav_suffix(name):
    """Return ``name`` without one trailing ".wav"; other names pass through.

    ``str.strip(".wav")`` is deliberately avoided: it treats its argument as
    a set of characters and trims them from BOTH ends, so an id such as
    "vaw_001.wav" would be mangled into "_001".
    """
    if name.endswith(WAV_SUFFIX):
        return name[: -len(WAV_SUFFIX)]
    return name


def _load_transcripts(tsv_path):
    """Read a tab-separated table and map its "path" column to "sentence".

    Building the mapping once replaces a full-table scan per utterance.
    """
    table = pd.read_csv(tsv_path, sep="\t")
    # keep="first" matches the earlier `.values[0]` pick when a path repeats.
    table = table.drop_duplicates(subset="path", keep="first")
    return dict(zip(table["path"], table["sentence"]))


def build_test_manifest(test_ids, noisy_files, transcripts):
    """Build the test-set manifest for the noisy/clean file pairs.

    Args:
        test_ids: iterable of utterance ids (the entries of the VoiceBank
            ``test.json``); an optional trailing ".wav" is dropped when the
            id is used as an output key.
        noisy_files: file names found in the noisy test directory. The first
            name that contains an id as a substring is used for that id.
        transcripts: mapping from clip file name (".mp3") to its sentence.

    Returns:
        dict mapping each id to ``{"clean_wav", "noisy_wav", "words"}``.
        Ids with no matching noisy file are reported on stdout and skipped.

    Raises:
        ValueError: a matched file name has no "@" separator.
        KeyError: the clip named after the separator has no transcript.
    """
    manifest = {}
    for utt in test_ids:
        print(utt)
        matches = [name for name in noisy_files if utt in name]
        if not matches:
            print("%s not found in new list" % utt)
            continue
        print(matches)
        noisy_name = matches[0]
        print(noisy_name)

        head, sep, clip_wav = noisy_name.partition(NAME_SEPARATOR)
        if not sep:
            raise ValueError(
                "noisy file name %r has no %r separator"
                % (noisy_name, NAME_SEPARATOR)
            )
        print(head, clip_wav)

        # "{data_root}" is written out literally; presumably a placeholder
        # that the consumer of the JSON substitutes - TODO confirm.
        clean_path = os.path.join("{data_root}/test_clean_16k/", clip_wav)
        noisy_path = os.path.join("{data_root}/test_16k/", noisy_name)

        clip_mp3 = clip_wav.replace(".wav", ".mp3")
        try:
            sentence = transcripts[clip_mp3]
        except KeyError:
            raise KeyError(
                "no transcript found for %s (needed by %s)"
                % (clip_mp3, noisy_name)
            ) from None
        words = sentence.upper()

        print(clean_path)
        print(noisy_path)
        print(words)
        manifest[_strip_wav_suffix(utt)] = {
            "clean_wav": clean_path,
            "noisy_wav": noisy_path,
            "words": words,
        }
    return manifest


def main(argv):
    """Write ``<OUT_ROOT>/test.json`` from the three corpus paths in ``argv``.

    ``argv`` follows ``sys.argv`` layout: program name, OUT_ROOT, CV_ROOT,
    VB_ROOT. Extra arguments are ignored, as before.
    """
    if len(argv) < 4:
        prog = argv[0] if argv else "build_json_test.py"
        sys.exit("usage: %s OUT_ROOT CV_ROOT VB_ROOT" % prog)
    out_root, cv_root, vb_root = argv[1:4]

    out_file = os.path.join(out_root, "test.json")
    noisy_root = os.path.join(out_root, "test")

    transcripts = _load_transcripts(os.path.join(cv_root, "validated.tsv"))
    with open(os.path.join(vb_root, "test.json"), encoding="utf-8") as src:
        test_ids = json.load(src)

    noisy_files = os.listdir(noisy_root)
    print(noisy_files)

    manifest = build_test_manifest(test_ids, noisy_files, transcripts)
    with open(out_file, "w", encoding="utf-8") as dst:
        json.dump(manifest, dst, indent=4)


if __name__ == "__main__":
    main(sys.argv)