In [19]:
import pandas as pd
import os
import torch
from TTS.api import TTS
from tqdm import tqdm

### Preprocessing of the scene
- Take only first scene
- for each character give an audio file for voice conversion
- For each cheracter get the number of the iteration for the next stage

In [20]:
df = pd.read_csv('script/processed/POH_processedV2.csv')
df

Unnamed: 0,idx,type,character,sentence,j-hartmann/emotion-english-distilroberta-base,SamLowe/roberta-base-go_emotion,michellejieli/emotion_text_classifier
0,0,consigne,,ACT I SCENE I,,,
1,1,consigne,,Scene: A hotel lobby. There are large comforta...,,,
2,2,consigne,,Enter JESSICA shaking off her umbrella.,,,
3,3,speech,JESSICA,Goodness me.,joy,gratitude,joy
4,4,speech,JESSICA,What weather!,surprise,neutral,surprise
...,...,...,...,...,...,...,...
1850,1850,consigne,,ALAN walks back to the table and picks up the ...,,,
1851,1851,speech,ALAN,I'm trying to cut down.,anger,neutral,neutral
1852,1852,consigne,,ALAN walks over to the door and exits.,,,
1853,1853,consigne,,Curtain falls.,,,


In [22]:
# Take the first scene
index = df[df['sentence'] == 'END OF ACT I SCENE I'].index.tolist()[0]
scene1 = df.iloc[:index]
scene1

Unnamed: 0,idx,type,character,sentence,j-hartmann/emotion-english-distilroberta-base,SamLowe/roberta-base-go_emotion,michellejieli/emotion_text_classifier
0,0,consigne,,ACT I SCENE I,,,
1,1,consigne,,Scene: A hotel lobby. There are large comforta...,,,
2,2,consigne,,Enter JESSICA shaking off her umbrella.,,,
3,3,speech,JESSICA,Goodness me.,joy,gratitude,joy
4,4,speech,JESSICA,What weather!,surprise,neutral,surprise
...,...,...,...,...,...,...,...
351,351,speech,JESSICA,Yes.,neutral,approval,neutral
352,352,speech,JESSICA,Maybe.,neutral,confusion,neutral
353,353,consigne,,She stands up and moves slowly over to BARMAN....,,,
354,354,speech,JESSICA,Bring my bags up to my room.,neutral,neutral,neutral


In [23]:
# get a list of characters
list_character = df['character'].dropna().unique().tolist()
list_character

['JESSICA',
 'BARMAN',
 'TREVOR',
 'MARGARET',
 'MARGERET',
 'ALAN',
 'PRIEST',
 'MR CLEETHORPES']

In [24]:
#for each cheracter link to an audio file
dico = {
    'JESSICA' : 'f_1.wav',
    'BARMAN' : 'm_1.wav',
    'TREVOR' : 'm_2.wav',
    'MARGARET' : 'f_2.wav',
    'MARGERET' : 'f_3.wav',
    'ALAN' : 'm_3.wav',
    'PRIEST' : 'm_4.wav',
    'MR CLEETHORPES' : 'm_5.wav',
}
scene1['audio'] = scene1['character'].map(dico)
scene1

In [26]:
# Drop non audio lines
scene1 = scene1.dropna(subset=['audio'])
scene1

Unnamed: 0,idx,type,character,sentence,j-hartmann/emotion-english-distilroberta-base,SamLowe/roberta-base-go_emotion,michellejieli/emotion_text_classifier,audio
3,3,speech,JESSICA,Goodness me.,joy,gratitude,joy,f_1.wav
4,4,speech,JESSICA,What weather!,surprise,neutral,surprise,f_1.wav
5,5,speech,BARMAN,"Good evening madam, are you with the Cleethorp...",neutral,curiosity,neutral,m_1.wav
6,6,speech,JESSICA,Well yes I am.,neutral,approval,neutral,f_1.wav
7,7,speech,JESSICA,How could you possibly know that?\n,surprise,curiosity,surprise,f_1.wav
...,...,...,...,...,...,...,...,...
349,349,speech,JESSICA,"So very, very boring.\n",sadness,disappointment,sadness,f_1.wav
350,350,speech,BARMAN,Perhaps the company of old friends will have a...,neutral,curiosity,neutral,m_1.wav
351,351,speech,JESSICA,Yes.,neutral,approval,neutral,f_1.wav
352,352,speech,JESSICA,Maybe.,neutral,confusion,neutral,f_1.wav


In [27]:
# Create a new column 'count_column' using a loop
counts = {}
count_column = []

for value in scene1['character']:
    if value in counts:
        counts[value] += 1
    else:
        counts[value] = 1
    count_column.append(counts[value])

scene1['count_column'] = count_column

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  scene1['count_column'] = count_column


### Load model and make the TTS

In [29]:
# Create a folder for the scene audio
folder_path = 'audio/scene1'
if not os.path.exists(folder_path):
    os.makedirs(folder_path)

In [30]:
# Load the model
tts = TTS("tts_models/en/ljspeech/tacotron2-DDC").to("cuda")

 > tts_models/en/ljspeech/tacotron2-DDC is already downloaded.
 > vocoder_models/en/ljspeech/hifigan_v2 is already downloaded.
 > Using model: Tacotron2
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > resample:False
 | > num_mels:80
 | > log_func:np.log
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:False
 | > symmetric_norm:True
 | > mel_fmin:0
 | > mel_fmax:8000.0
 | > pitch_fmin:1.0
 | > pitch_fmax:640.0
 | > spec_gain:1.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > do_rms_norm:False
 | > db_level:None
 | > stats_path:None
 | > base:2.718281828459045
 | > hop_length:256
 | > win_length:1024
 > Model's reduction rate `r` is set to: 1
 > Vocoder Model: hifigan
 > Setting up Audio P

 > Generator Model: hifigan_generator
 > Discriminator Model: hifigan_discriminator




Removing weight norm...


In [32]:
# For each row, create the audio file and save it as character_count.wav in the folder of the scene
for index, row in tqdm(scene1.iterrows(), total=len(scene1), desc="Processing rows", unit="row"):
    audio_speaker = row['audio']
    sentence = row['sentence']
    audio_speaker_path = os.path.join('speakers', audio_speaker)
    character = row['character']
    if not os.path.exists(audio_speaker_path):
        print('Audio file not found: ', audio_speaker_path)
        continue
    audio_sentence_path = os.path.join(folder_path, character + '_' + str(row['count_column']) + '.wav')
    print('sentence is : ', sentence)
    tts.tts_with_vc_to_file(
        sentence,
        speaker_wav=audio_speaker_path,
        file_path= audio_sentence_path
    )
    



sentence is :  Goodness me.
 > Text splitted to sentences.
['Goodness me.']
 > Processing time: 0.0830831527709961
 > Real-time factor: 0.06329406849780486
 > voice_conversion_models/multilingual/vctk/freevc24 is already downloaded.
 > Using model: freevc
 > Loading pretrained speaker encoder model ...
Loaded the voice encoder model on cuda in 0.01 seconds.


Processing rows:   0%|          | 0/300 [00:03<?, ?row/s]
