In [1]:
from src.requirements import *
from src.ssl_model import *
from src.asr_model import *
from src.inference_model import *
from src.tokenizer import Tokenizer
import jiwer

In [2]:
# Version numbers interpolated into the checkpoint file names below
# (presumably training-step counts -- confirm against the training scripts).
ssl_ver, asr_ver = 100_000, 110_000

# Run on the GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Relative locations of the tokenizer definition and the two checkpoints.
tokenizer_path = os.path.join("data", "tokenizer.json")
ssl_path = os.path.join("models", "ssl_model", f"ssl_model_prototype_{ssl_ver}.pth")
asr_path = os.path.join("models", "asr_model", f"asr_model_prototype_{asr_ver}.pth")

# Tab-separated manifest; later cells read its 'path' and 'transcript' columns.
metadata_path = os.path.join('data', 'metadata.tsv')
metadata = pd.read_csv(metadata_path, sep='\t')

In [3]:
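# Disabled: the next cell passes `tokenizer_path` and the `Tokenizer` class straight to
# InferenceModel, which presumably loads the tokenizer itself (confirm in src/inference_model).
# tokenizer = Tokenizer.load(tokenizer_path)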

In [4]:
# Build the end-to-end inference wrapper from the two checkpoint paths, the
# tokenizer file, the Tokenizer class and the target device. Arguments are
# passed positionally because the parameter names are not visible in this notebook.
infer_model = InferenceModel(
    ssl_path,
    asr_path,
    tokenizer_path,
    Tokenizer,
    device,
)

Final Vocabulary Size after filtering: 494
Blank ID: 0


In [5]:
# Draw the evaluation sample from a dedicated, seeded generator so that
# Restart & Run All scores the same utterances (and reports the same CER)
# every time, without touching torch's global RNG state.
SAMPLE_SEED = 0
NUM_SAMPLES = 10
sample_rng = torch.Generator().manual_seed(SAMPLE_SEED)

# 1-D tensor of row positions into `metadata`. torch.randint samples with
# replacement, so the same row may appear more than once.
num = torch.randint(
    low=0,
    high=metadata.shape[0],
    size=(NUM_SAMPLES,),
    generator=sample_rng,
)

# Accumulators filled by the inference loop in the next cell.
references = []
hypotheses = []

In [6]:
# Start from empty accumulators so that re-running this cell does not append
# a second copy of every result (which would skew the CER computed later).
references = []
hypotheses = []

# Convert the two columns once instead of rebuilding both lists on every iteration.
transcripts = metadata['transcript'].tolist()
audio_paths = metadata['path'].tolist()

for idx in num:
    # `idx` may be a single-element tensor or a plain int; normalise it to a Python int.
    row = int(idx)

    # `sf` comes from a star import (presumably the soundfile package -- confirm);
    # always_2d=True asks for a 2-D sample array even for mono audio.
    waveform, sr = sf.read(audio_paths[row], always_2d=True)

    # The model output is stored as the hypothesis text for this utterance.
    text = infer_model(waveform, sr)

    references.append(transcripts[row])
    hypotheses.append(text)

In [7]:
def calc_cer(references, hypotheses):
    """Return the character error rate of ``hypotheses`` against ``references``.

    Thin wrapper around ``jiwer.cer``: both arguments are forwarded
    positionally and the library's result is returned unchanged.
    """
    return jiwer.cer(references, hypotheses)


# Report the rate as a percentage with four decimal places.
print(f"Character Error Rate: {calc_cer(references, hypotheses)*100:0.4f}%")

Character Error Rate: 85.8779%


In [8]:
# Show the transcripts collected from the metadata 'transcript' column for the sampled rows.
references

['शाहीको सम्मानमा नेपाल',
 'उपत्यकाका विभिन्न सांस्कृतिक पर्वहरूमा नेवारहरूको समूह आफ्नो सांस्कृतिक पहिरनमा ककुवय् बजाउँदै हिँडेको देखिन्छ।',
 'झ्वाट्ट भनिहाल्नका लागि',
 'सामेल छ',
 'प्रोटोकल वमाोजिम',
 'बोगी ८ बनाए',
 'यसलाई बनाइन्छ',
 'सापिङ गाविस पर्दछन्',
 'स्वास्थ्यकर्मीहरूका हकहितको',
 'ब्लेन र कर्टको']

In [9]:
# Show the texts returned by infer_model for the same rows, in the same order as `references`.
hypotheses

['सकसप',
 'पपपतकबससततत स ससतप पदज पदथछ',
 'सचपपनप',
 'समन छ',
 'सतन',
 'त प',
 'क भछ',
 'सगसक',
 'सपपपनकपतक',
 'य क']