In [1]:
import torch
import numpy as np
import pandas as pd

from tqdm.notebook import tqdm
import IPython.display as ipd
import matplotlib.pyplot as plt
import plotly.express as px

from texts import test_text, poem_texts
from utils import inference

# Device used by the models loaded in this notebook (e.g. the MOS estimator).
# The original setup targets the second GPU; fall back to the first GPU or to
# the CPU so the notebook still runs on machines with a different topology.
if torch.cuda.is_available():
    device = 'cuda:1' if torch.cuda.device_count() > 1 else 'cuda:0'
else:
    device = 'cpu'

# Sampling rate (Hz) used for audio playback, for MOS scoring and for
# converting sample counts into seconds.
SAMPLE_RATE = 22050

# Samples Generation

## Generating long samples

In [8]:
# Synthesize every poem at the default speed and embed an audio player for each.
for poem in poem_texts:
    waveform = inference(poem, speed=1.0)
    player = ipd.Audio(waveform, rate=SAMPLE_RATE)
    ipd.display(player)

## Generating one sample with different speed

In [100]:
# Render the same test sentence at three speed settings and embed a player for each.
speed_factors = (0.7, 1.0, 1.3)
for speed in speed_factors:
    waveform = inference(test_text, speed=speed)
    player = ipd.Audio(waveform, rate=SAMPLE_RATE)
    ipd.display(player)

# Quality estimation

## Loading Mean Opinion Score (MOS) estimator
MOS is a metric for estimating audio quality. Its range is [0, 5]: 0 means bad quality and 5 means perfect quality.

The estimator was trained on English speech, but I have found that its scores also correlate with quality for other languages.
P.S. Training such an estimator on Georgian would be expensive; from my research, a score above 3.5 from this model already indicates acceptable quality for Georgian.

In [9]:
# Fetch the pinned `utmos22_strong` entry point through torch.hub, then move the
# model to the target device and switch it to evaluation mode in one chain.
predictor = (
    torch.hub
    .load("tarepan/SpeechMOS:v1.2.0", "utmos22_strong", trust_repo=True)
    .to(device)
    .eval()
)
print("MOS estimator Loaded!")

Using cache found in /home/icegas/.cache/torch/hub/tarepan_SpeechMOS_v1.2.0


MOS estimator Loaded!


Loading test text phrases

In [10]:
# Read the evaluation phrases, one per line. The encoding is explicit so the
# (presumably Georgian, non-ASCII) text decodes the same way regardless of the
# OS locale.
with open('test_texts.txt', 'r', encoding='utf-8') as f:
    # Strip the trailing newline/whitespace and skip blank lines so they are
    # not handed to the synthesizer as (part of) a phrase.
    texts = [line.strip() for line in f if line.strip()]

1. Estimating the MOS score for each sample
2. Plotting histogram for scores

In [16]:
# Predicted MOS for every test phrase, in the same order as `texts`.
moses = []
for phrase in tqdm(texts):
    # Estimation with normal speed.
    audio = inference(phrase, speed=1.0)

    # Add a leading axis (presumably the batch dimension the predictor expects)
    # and move the waveform to the predictor's device.
    wave = torch.from_numpy(audio).to(device)[None, :]

    # Scoring only: disable autograd so no graph is built for each phrase.
    with torch.no_grad():
        score = predictor(wave, SAMPLE_RATE)

    # Keep the first element of the prediction as a NumPy scalar.
    moses.append(score.detach().cpu().numpy()[0])

  0%|          | 0/1000 [00:00<?, ?it/s]

In [24]:
# Histogram of the predicted MOS values collected in `moses`.
fig = px.histogram(x=moses)
fig.update_layout(
    # Derive the phrase count from the data so the title cannot go stale when
    # the number of evaluated phrases changes.
    title={'text': f"MOS distribution of {len(moses)} phrases", 'x': 0.5, 'y': 0.95},
    xaxis={'title': 'MOS'},
    yaxis={'title': 'number of samples'},
    font={'size': 22},
)
fig.show()

# Performance analysis

1. Creating dataframe for 32, 64, 256, 512 characters
2. Adding results for cpu and gpu (i9 and RTX 4090 (24GB))
3. Plotting results

In [26]:
from time import perf_counter, time  # `time` stays importable for any cell that still uses it

# Number of timed runs for each number of characters.
run_samples = 100
character_sizes = [32, 64, 256, 512]
text = poem_texts[0]

# One record per timed run; converted to a DataFrame once, after the loops.
records = []

# NOTE(review): the device name is only recorded as a label here -- `inference`
# is called without a device argument, so confirm that `utils.inference`
# actually runs on the matching device during each pass.
# The loop variable is deliberately not named `device`, so the notebook-level
# `device` used by the MOS estimator is not overwritten.
for bench_device in ['cpu', 'cuda:1']:
    for ch_size in tqdm(character_sizes):
        # Benchmark input: the first `ch_size` characters of the first poem.
        input_text = text[:ch_size]

        for _ in range(run_samples):
            # perf_counter() is a monotonic, high-resolution clock meant for
            # measuring short durations; time() is wall-clock and can jump.
            start = perf_counter()
            audio = inference(input_text, speed=1.0)
            dt = perf_counter() - start

            records.append({
                'device': bench_device,
                'number of characters': ch_size,
                'inference time': dt,
                # Audio length in seconds: samples on the last axis / sampling rate.
                'duration in seconds': audio.shape[-1] / SAMPLE_RATE,
            })

# Explicit column list keeps the column order stable even if no run was recorded.
df = pd.DataFrame(
    records,
    columns=['device', 'number of characters', 'inference time', 'duration in seconds'],
)
df.head(2)

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

Unnamed: 0,device,number of characters,inference time,duration in seconds
0,cpu,32,0.037227,4.017052
1,cpu,32,0.016821,3.924172


In [69]:
# Inspect the raw benchmark columns. `out_df` is only created by a later cell,
# so referencing it here fails when the notebook is run top to bottom.
df.columns

Index(['device', 'number of characters', 'inference time',
       'duration in seconds'],
      dtype='object')

0       4.017052
1       3.924172
2       4.051882
3       3.993832
4       4.075102
         ...    
795    38.452245
796    38.556735
797    39.102404
798    38.707664
799    39.299773
Name: duration in seconds, Length: 800, dtype: float64

In [77]:
def quantile(n):
    """Build an aggregation callable that returns the ``n``-quantile of its input.

    The returned function calls ``.quantile(n)`` on whatever it receives and is
    renamed to ``quantile_<n>`` (``n`` formatted with two decimals), which is the
    label that appears for it when it is passed to ``agg``.
    """
    label = f'quantile_{n:0.2f}'

    def quantile_(values):
        return values.quantile(n)

    quantile_.__name__ = label
    return quantile_
# Per (device, number of characters) group: mean plus the 1%, 25%, 75% and 99%
# quantiles of every measured column, with the duration column relabelled.
summary_stats = ['mean'] + [quantile(q) for q in (0.01, 0.25, 0.75, 0.99)]
out_df = (
    df
    .groupby(['device', 'number of characters'])
    .agg(summary_stats)
    .rename(columns={'duration in seconds' : 'audio duration in seconds'})
)

# Split the summary by the first index level (device) and show each table.
cpu_df = out_df.loc['cpu']
gpu_df = out_df.loc['cuda:1']

display(cpu_df)
display(gpu_df)

Unnamed: 0_level_0,inference time,inference time,inference time,inference time,inference time,audio duration in seconds,audio duration in seconds,audio duration in seconds,audio duration in seconds,audio duration in seconds
Unnamed: 0_level_1,mean,quantile_0.01,quantile_0.25,quantile_0.75,quantile_0.99,mean,quantile_0.01,quantile_0.25,quantile_0.75,quantile_0.99
number of characters,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
32,0.320311,0.146636,0.157055,0.168328,4.10373,3.929861,3.657143,3.85161,3.982222,4.458928
64,0.675573,0.199406,0.210842,0.223745,5.381177,6.401858,6.141678,6.292608,6.50449,6.711263
256,10.147135,0.590234,0.629524,14.319341,14.778163,20.833988,20.211926,20.60771,21.008254,21.584922
512,21.829358,1.131615,26.419551,27.857851,28.568445,38.878563,38.056345,38.553832,39.154649,39.882014


Unnamed: 0_level_0,inference time,inference time,inference time,inference time,inference time,audio duration in seconds,audio duration in seconds,audio duration in seconds,audio duration in seconds,audio duration in seconds
Unnamed: 0_level_1,mean,quantile_0.01,quantile_0.25,quantile_0.75,quantile_0.99,mean,quantile_0.01,quantile_0.25,quantile_0.75,quantile_0.99
number of characters,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
32,0.024355,0.014626,0.015937,0.016883,0.407303,3.962485,3.691973,3.866122,4.017052,4.435476
64,0.03214,0.020465,0.021048,0.021985,0.536277,6.417995,6.094658,6.269388,6.548027,6.898068
256,0.344305,0.058727,0.060569,0.064237,1.504808,20.83898,20.23352,20.64254,21.066304,21.397653
512,1.085138,0.115699,0.118506,2.785288,2.876193,38.943695,37.695971,38.542222,39.256236,39.99939


In [98]:
import soundfile as sf

# Export the first five test phrases as WAV files, numbered from 4 (i + 4).
# NOTE(review): the destination is an absolute, machine-specific path; consider
# a configurable output directory.
for i, t in enumerate(texts[:5]):
    audio = inference(t, speed=1.0)
    # Use the shared SAMPLE_RATE constant instead of a second hard-coded 22050
    # so the written rate always matches the one used elsewhere in the notebook.
    sf.write('/home/icegas/Downloads/audio_{}.wav'.format(i+4), audio, SAMPLE_RATE)