In [2]:
import torch
import numpy as np
import pandas as pd

from tqdm.notebook import tqdm
import IPython.display as ipd
import matplotlib.pyplot as plt
import plotly.express as px

from texts import test_text, poem_texts
from utils import inference

# Prefer the second GPU (the device the recorded outputs were produced on),
# but fall back to the first GPU and then to the CPU so that the notebook
# still runs on machines with fewer CUDA devices.
if torch.cuda.device_count() > 1:
    device = 'cuda:1'
elif torch.cuda.is_available():
    device = 'cuda:0'
else:
    device = 'cpu'

# Sample rate (Hz) used for audio playback, MOS scoring and duration computation.
SAMPLE_RATE = 22050

# Samples Generation

## Generating long samples

In [3]:
# Synthesize every poem at speed 1.0 and embed an audio player for each one.
for poem in poem_texts:
    audio = inference(poem, speed=1.0)
    player = ipd.Audio(data=audio, rate=SAMPLE_RATE)
    ipd.display(player)

## Generating one sample with different speed

In [4]:
# Render the same test phrase with three speed settings to compare them by ear.
for speed_setting in (0.7, 1.0, 1.3):
    audio = inference(test_text, speed=speed_setting)
    player = ipd.Audio(data=audio, rate=SAMPLE_RATE)
    ipd.display(player)

# Quality estimation

## Loading Mean Opinion Score (MOS) estimator
MOS is a metric for estimating audio quality. Its range is [0, 5], where 0 means bad quality and 5 means perfect quality.

The estimator was trained on English speech, but I have found that its scores also correlate with quality for other languages.
P.S. Training such an estimator on Georgian would be expensive. From my experiments, a score above 3.5 from this model already indicates acceptable quality for Georgian.

In [5]:
# Fetch the version-pinned UTMOS22 "strong" MOS predictor through torch.hub
# and move it to the configured device in the same expression.
predictor = torch.hub.load(
    repo_or_dir="tarepan/SpeechMOS:v1.2.0",
    model="utmos22_strong",
    trust_repo=True,
).to(device)
# The model is only used for scoring in this notebook, so put it in eval mode.
predictor.eval()
print("MOS estimator Loaded!")

Using cache found in /home/icegas/.cache/torch/hub/tarepan_SpeechMOS_v1.2.0


MOS estimator Loaded!


Loading test text phrases

In [6]:
# Read one test phrase per line.
# - The encoding is explicit so non-ASCII text is decoded identically on every
#   platform (assumes the file is UTF-8 — TODO confirm).
# - Surrounding whitespace, including the trailing newline that `readlines()`
#   would keep, is stripped so it is not passed on to the synthesizer.
# - Blank lines are skipped so they do not become empty phrases.
with open('test_texts.txt', 'r', encoding='utf-8') as f:
    texts = [line.strip() for line in f if line.strip()]

1. Estimating the MOS score for each sample
2. Plotting a histogram of the scores

In [16]:
# Score every test phrase with the MOS predictor; one score per phrase is
# collected in `moses`.
moses = []
for t in tqdm(texts):
    # Estimation with normal speed.
    audio = inference(t, speed=1.0)

    # Add a leading batch axis of size 1 before calling the predictor.
    # NOTE(review): `torch.from_numpy` implies `inference` returns a 1-D NumPy
    # array — confirm against `utils.inference`.
    wave = torch.from_numpy(audio).to(device)[None, :]

    # Scoring never needs gradients; disabling autograd avoids building a
    # graph for every forward pass.
    with torch.no_grad():
        score = predictor(wave, SAMPLE_RATE)

    # Keep the single score of this one-element batch as a NumPy scalar.
    moses.append(score.cpu().numpy()[0])

  0%|          | 0/1000 [00:00<?, ?it/s]

In [24]:
# Histogram of the per-phrase MOS estimates collected in `moses`.
fig = px.histogram(x=moses)
fig.update_layout(
    # The phrase count is taken from the data so the title cannot go stale
    # when the set of test phrases changes.
    title={'text' : f"MOS distribution of {len(moses)} phrases", 'x' : 0.5, 'y' : 0.95},
    xaxis={'title' : 'MOS'},
    yaxis={'title' : 'number of samples'},
    font={'size' : 22},
)
fig.show()

# Performance analysis

1. Creating a dataframe of timings for inputs of 32, 64, 256 and 512 characters
2. Adding results for CPU and GPU (i9 and RTX 4090 (24GB))
3. Plotting the results

In [7]:
# `perf_counter` is the clock intended for measuring short intervals; `time`
# stays imported in case other cells of the notebook rely on it.
from time import perf_counter, time

# Number of timed runs for each input length.
run_samples = 100
character_sizes = [32, 64, 256, 512]
text = poem_texts[0]

# One record per timed run; turned into a DataFrame in a single step below
# instead of rebinding a dict of lists to a DataFrame under the same name.
records = []

# NOTE(review): the device name is only a label for the rows — `inference` is
# not given a device here, so it runs wherever `utils.inference` places the
# model. The loop variable is deliberately not called `device`, so that the
# notebook-wide `device` setting is not overwritten by this cell.
for device_name in ['cuda:1']:
    for ch_size in tqdm(character_sizes):
        # NOTE(review): if `text` is shorter than `ch_size` the slice is
        # silently shorter than the label says — confirm the poem is long enough.
        input_text = text[:ch_size]

        # Untimed warm-up call so that one-off start-up costs of the first
        # call do not end up in the statistics.
        inference(input_text, speed=1.0)

        for _ in range(run_samples):
            start = perf_counter()
            audio = inference(input_text, speed=1.0)
            dt = perf_counter() - start

            records.append({
                'device': device_name,
                'number of characters': ch_size,
                'inference time': dt,
                # Number of samples along the last axis divided by the sample
                # rate gives the length of the generated audio in seconds.
                'duration in seconds': audio.shape[-1] / SAMPLE_RATE,
            })

# Passing `columns` keeps the schema stable even if no record was collected.
df = pd.DataFrame(
    records,
    columns=['device', 'number of characters', 'inference time', 'duration in seconds'],
)
df.head(2)

  0%|          | 0/4 [00:00<?, ?it/s]

Unnamed: 0,device,number of characters,inference time,duration in seconds
0,cuda:1,32,0.034765,3.784853
1,cuda:1,32,0.01667,3.970612


In [8]:
def quantile(n):
    """Build an aggregation callable that returns the ``n``-quantile.

    Args:
        n: Quantile level forwarded unchanged to ``.quantile()``.

    Returns:
        A one-argument function that calls ``.quantile(n)`` on its argument.
        Its ``__name__`` is set to ``quantile_<n>`` with ``n`` formatted to
        two decimals, which gives each aggregate a distinct, readable name.
    """
    def nth_quantile(values):
        return values.quantile(n)

    nth_quantile.__name__ = f'quantile_{n:0.2f}'
    return nth_quantile
# Summary per (device, input length): the mean plus the 1st, 25th, 75th and
# 99th percentiles of every measured column.
summary_stats = ['mean'] + [quantile(level) for level in (0.01, 0.25, 0.75, 0.99)]
out_df = (
    df
    .groupby(['device', 'number of characters'])
    .agg(summary_stats)
    .rename(columns={'duration in seconds' : 'audio duration in seconds'})
)

# The first index level is the device label, so `.loc` selects one device.
# CPU rows would be selected the same way (`out_df.loc['cpu']`) once a CPU
# run has been recorded.
gpu_df = out_df.loc['cuda:1']
display(gpu_df)

Unnamed: 0_level_0,inference time,inference time,inference time,inference time,inference time,audio duration in seconds,audio duration in seconds,audio duration in seconds,audio duration in seconds,audio duration in seconds
Unnamed: 0_level_1,mean,quantile_0.01,quantile_0.25,quantile_0.75,quantile_0.99,mean,quantile_0.01,quantile_0.25,quantile_0.75,quantile_0.99
number of characters,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
32,0.01905,0.016079,0.016348,0.017678,0.033628,3.967129,3.792167,3.889342,4.01415,4.241009
64,0.028719,0.027108,0.027718,0.028945,0.033125,6.405224,6.117297,6.278095,6.519002,6.626162
256,0.076064,0.072685,0.074295,0.077626,0.079519,20.796952,20.192653,20.506122,21.138866,21.442815
512,0.134205,0.120676,0.122291,0.141444,0.148867,38.701859,38.140865,38.475465,38.870204,39.362351
