In [3]:
import torch
import numpy as np
import pandas as pd

from tqdm.notebook import tqdm
import IPython.display as ipd
import matplotlib.pyplot as plt
import plotly.express as px

from texts import test_text, poem_texts
from utils import inference

# Sample rate of the synthesized audio, used for playback and for converting
# sample counts into seconds.
SAMPLE_RATE = 22_050
# Device string for the second CUDA GPU.
device = 'cuda:1'

# Generation of test samples

## Generating hard long samples

In [None]:
# Synthesize every long poem text at normal speed and embed a player for each.
for text in poem_texts:
    audio = inference(text, speed=1.0)
    # `rate` belongs to ipd.Audio (it is required when the data is a raw
    # array of samples); ipd.display does not interpret it as a sample rate.
    ipd.display(ipd.Audio(audio[0, 0, 0], rate=SAMPLE_RATE))

## Generating samples at different speeds

In [None]:
# Synthesize the same test text at slow, normal and fast speed for comparison.
for speed in [0.7, 1.0, 1.3]:
    audio = inference(test_text, speed=speed)
    # `rate` belongs to ipd.Audio (it is required when the data is a raw
    # array of samples); ipd.display does not interpret it as a sample rate.
    ipd.display(ipd.Audio(audio[0, 0, 0], rate=SAMPLE_RATE))

# Quality estimation

## Loading Mean Opinion Score (MOS) estimator
MOS is a metric for audio quality estimation. Scores range from 1 (bad quality) to 5 (perfect quality).

The estimator was trained on English speech, but I have found that its scores also correlate with quality for other languages.
P.S. Training it on Georgian would be expensive; from my research, a score above 3.5 from this model is already acceptable for Georgian.

In [4]:
# The MOS estimator runs on the CPU.
device = 'cpu'
# Fetch the pretrained estimator from torch.hub, move it to `device`
# and switch it to evaluation mode in one chain.
predictor = (
    torch.hub
    .load("tarepan/SpeechMOS:v1.2.0", "utmos22_strong", trust_repo=True)
    .to(device)
    .eval()
)
print("MOS estimator Loaded!")

Using cache found in /home/icegas/.cache/torch/hub/tarepan_SpeechMOS_v1.2.0





Loading test text phrases

In [5]:
# Load the evaluation phrases, one per line.
# Explicit UTF-8 so non-ASCII text decodes the same way on every platform;
# surrounding whitespace (including the trailing newline) is stripped and
# blank lines are dropped so they are not sent to the synthesizer.
with open('test_texts.txt', 'r', encoding='utf-8') as f:
    texts = [line.strip() for line in f if line.strip()]

1. Estimating the MOS score for each sample
2. Plotting a histogram of the scores

In [None]:
# Estimate one MOS value per test phrase, synthesized at normal speed.
moses = []
# Feed the predictor on whatever device its weights currently live on.
mos_device = next(predictor.parameters()).device
for t in texts:
    phrase = t.strip()
    if not phrase:
        # Blank lines carry nothing to synthesize.
        continue
    audio = inference(phrase, speed=1.0)
    # Same waveform slice that the playback cells use; assumed to be a 1-D
    # numpy array of samples -- TODO confirm against `inference`.
    wave = torch.from_numpy(audio[0][0, 0]).float().unsqueeze(0)
    # The SpeechMOS predictor is called as predictor(wave, sr) with a
    # (batch, time) tensor; previously the raw waveform itself was stored
    # in `moses` and no score was ever computed.
    with torch.no_grad():
        score = predictor(wave.to(mos_device), SAMPLE_RATE)
    # Store a plain float so the list can be plotted directly.
    moses.append(score.item())

In [None]:
# Histogram of the values collected in `moses`, with a title and a readable
# axis label so the figure stands on its own.
fig = px.histogram(
    x=moses,
    labels={'x': 'MOS'},
    title='Distribution of estimated MOS',
)
fig.show()

# Performance analysis

1. Creating a dataframe for inputs of 32, 64, 256 and 512 characters
2. Adding results for CPU and GPU (i9 and RTX 4090 (24GB))
3. Plotting the results

In [None]:
# Timing uses perf_counter, a monotonic high-resolution clock intended for
# measuring intervals; `time` stays imported under its original name.
from time import perf_counter, time

# Column-wise measurements, converted to a DataFrame once after the loops.
timing_records = {
    'device' : [],
    'number of characters' : [],
    'inference time' : [],
    'duration in seconds' : []
}
# Number of timed runs for each number of characters
run_samples = 100
character_sizes = [32, 64, 256, 512]
text = poem_texts[0]

# NOTE(review): `device` only labels the rows here -- it is never passed to
# `inference`, so unless `inference` selects its device some other way, both
# passes time the same hardware. TODO confirm how `inference` picks a device.
for device in ['cpu', 'cuda:1']:
    for ch_size in tqdm(character_sizes):
        input_text = text[:ch_size]

        # Untimed warm-up call so one-off setup cost does not skew the
        # first timed sample.
        inference(input_text, speed=1.0)

        for _ in range(run_samples):
            start = perf_counter()
            audio = inference(input_text, speed=1.0)
            dt = perf_counter() - start

            timing_records['device'].append(device)
            # Slicing yields fewer than ch_size characters when the text is
            # shorter, so record the length that was actually synthesized.
            timing_records['number of characters'].append(len(input_text))
            timing_records['inference time'].append(dt)
            # Number of samples along the last axis divided by the sample
            # rate gives the clip length in seconds.
            timing_records['duration in seconds'].append(
                audio.shape[-1] / SAMPLE_RATE
            )
df = pd.DataFrame(timing_records)
df.head(2)

In [None]:
# Box plots of inference time per device: first grouped by input length,
# then by the duration of the generated audio.
for x_column in ('number of characters', 'duration in seconds'):
    fig = px.box(df, x=x_column, y='inference time', color='device')
    fig.show()