In [None]:
"""
conda env remove -n spkF
conda create --name spkF python=3.9  ipywidgets ipykernel -y
conda activate spkF
pip install -U torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu116
pip install -U fairseq phonemizer sentencepiece g2p_en
pip install -U huggingface_hub
pip install -U pyttsx3
pip install -U bs4 nltk scipy
pip install -U langdetect playsound
"""
"""
sudo apt install espeak
"""

In [None]:

import string
import requests
import pyttsx3
import torch
import nltk

import wave  
import IPython.display as ipd


from langdetect       import detect
from bs4              import BeautifulSoup, __version__
from scipy.io.wavfile import write
from nltk.tokenize    import sent_tokenize
from playsound        import playsound

from fairseq.checkpoint_utils                    import load_model_ensemble_and_task_from_hf_hub
from fairseq.models.text_to_speech.hub_interface import TTSHubInterface


# Report the versions of the key libraries so a run can be reproduced.
print(f"requests     : {requests.__version__ }")
print(f"torch        : {    torch.__version__}")
print(f"BeautifulSoup: {          __version__}")


def _voice_lang_key(voice):
    """Return the language key of a pyttsx3 voice, or None if it reports none.

    Args:
        voice: an entry of ``engine.getProperty('voices')``.

    Returns:
        The first language code of the voice as ``str``, or ``None`` when the
        voice has an empty ``languages`` list.
    """
    if not voice.languages:
        return None
    code = voice.languages[0]
    if isinstance(code, bytes):
        # Decode and drop the first character, as the original lookup did.
        # NOTE(review): presumably a priority byte prepended by espeak - confirm.
        return code.decode()[1:]
    # NOTE(review): assumes drivers that return str codes do not carry the
    # extra leading character - confirm against the installed pyttsx3 driver.
    return code


# Speech engine used for every language the fairseq models do not cover.
engine = pyttsx3.init()

# Map language code -> voice id. Voices without a language are skipped instead
# of aborting the whole cell; a later voice with the same code wins.
langs = {}
for voice in engine.getProperty('voices'):
    voice_key = _voice_lang_key(voice)
    if voice_key is not None:
        langs[voice_key] = voice.id

engine.setProperty('rate', 200)  # speaking rate; pyttsx3 documents it as words per minute

# Tokenizer data for sent_tokenize. Recent NLTK releases load 'punkt_tab'
# instead of 'punkt', so both are fetched; nltk.download() returns False rather
# than raising if a resource cannot be fetched.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')

def _load_tts(hub_id):
    """Load one fairseq TTS checkpoint from the Hugging Face hub.

    Args:
        hub_id: hub repository name of the checkpoint.

    Returns:
        Tuple ``(models, cfg, task, model, generator)`` where ``model`` is the
        first entry of the loaded ensemble and ``generator`` is built by the
        task from that ensemble and the updated config.
    """
    overrides = {"vocoder": "hifigan", "fp16": False, "cpu":True}
    models, cfg, task = load_model_ensemble_and_task_from_hf_hub(
        hub_id,
        arg_overrides=overrides,
    )
    first_model = models[0]
    TTSHubInterface.update_cfg_with_data_cfg(cfg, task.data_cfg)
    generator = task.build_generator(models, cfg)
    return models, cfg, task, first_model, generator


# French voice.
models_fr, cfg_fr, task_fr, model_fr, generator_fr = _load_tts(
    "facebook/tts_transformer-fr-cv7_css10"
)

# English voice.
models_en, cfg_en, task_en, model_en, generator_en = _load_tts(
    "facebook/fastspeech2-en-ljspeech"
)

# Base name of the generated wav files.
out_file = "tts_out.wav"
# presumably the audio sample rate in Hz - TODO confirm
sr = 22050


In [None]:
def filter_sentences(text):
    """Split ``text`` into sentences with normalised whitespace.

    Tabs are turned into spaces before NLTK's ``sent_tokenize`` is applied.
    Inside every resulting sentence each run of whitespace is collapsed to a
    single space, which also removes leading and trailing whitespace.

    Args:
        text: the raw text to split.

    Returns:
        A list with one cleaned string per sentence.
    """
    detabbed = text.replace('\t', ' ')

    cleaned = []
    for sentence in sent_tokenize(detabbed):
        # split() with no argument drops all whitespace, so the joined result
        # never starts or ends with a blank.
        cleaned.append(" ".join(sentence.split()))

    return cleaned


In [None]:
# Demo input: a single hard-coded sentence run through filter_sentences().
# The resulting `sentenses` list is what the synthesis cell below speaks when
# both `url` and `txt_file` are left as None there.
raw_text   = "This is a test to check, if a simple text can be spoken"
sentenses  = filter_sentences(raw_text)
print(sentenses)

In [None]:
# Input selection. Set `url` to speak the paragraphs of a web page, or
# `txt_file` to speak a local text file. With both left as None the cell speaks
# whatever `sentenses` already holds from an earlier cell.
#url = "https://fr.wikipedia.org/wiki/Saison_cyclonique_2022-2023_dans_l%27oc%C3%A9an_Indien_sud-ouest"
#url = "https://en.wikipedia.org/wiki/Fab_lab"
#url = "https://artilect.fr"
url      = None
txt_file = None
#txt_file = "file.txt"

# Language used when detection fails for a sentence. It starts unset, may be
# seeded from the page's declared language, and afterwards tracks the most
# recently detected language.
lang = None

if url is not None:
    sentenses = []
    try:
        # A timeout keeps the cell from hanging forever on an unresponsive host.
        response = requests.get(url, timeout=30)
    except requests.RequestException as e:
        print(f"Exception raised: {e}")
    else:
        # Only reached when the request itself succeeded, so `response` is bound.
        if response.status_code == 200:
            print("Successfully opened the web page")

            soup = BeautifulSoup(response.text, 'html.parser')

            # The page language is an attribute of the <html> element; keep only
            # the primary subtag (e.g. "en-US" -> "en").
            html_tag = soup.find('html')
            declared = html_tag.get('lang') if html_tag is not None else None
            if declared:
                lang = declared.split('-')[0].lower()

            # Prefer the element with id "firstHeading", else the <title> tag.
            heading = soup.select("#firstHeading")
            if heading:
                title = heading[0].text
            else:
                title_tag = soup.find("title")
                title = title_tag.get_text() if title_tag is not None else ""

            raw_text  = "".join(p.text for p in soup.select("p"))
            sentenses = filter_sentences(raw_text)
        else:
            print(f"Error: {response.status_code}")

elif txt_file is not None:
    # Context manager so the file handle is closed even if reading fails.
    with open(txt_file) as fh:
        raw_text = fh.read()
    sentenses = filter_sentences(raw_text)

# Languages served by the fairseq models; every other language goes to pyttsx3.
neural_tts = {
    'en': (task_en, model_en, generator_en),
    'fr': (task_fr, model_fr, generator_fr),
}

if sentenses:

    print(sentenses)
    for n, s in enumerate(sentenses):
        wav_path = f"{n}_{out_file}"  # one wav file per sentence

        try:
            lang = detect(s)
        except Exception as e:
            # Best effort: keep the last known language, English if there is none.
            if lang is None:
                lang = 'en'
            print(f"WARNING! language detection failed ({e}), using {lang}")

        print(f"[{lang}]: {s}")
        if lang in neural_tts:
            task, model, generator = neural_tts[lang]
            sample    = TTSHubInterface.get_model_input(task, s)
            audio, sr = TTSHubInterface.get_prediction(task, model, generator, sample)
            write(wav_path, sr, audio.to('cpu').detach().numpy())
            playsound(wav_path)
        else:
            # Look the voice up under the plain code first, then under the
            # doubled form (e.g. 'pt' -> 'pt-pt'), and finally fall back to 'en'.
            voice_key = lang
            if voice_key not in langs:
                voice_key = lang + '-' + lang
            if voice_key not in langs:
                print(f"WARNING! language {lang} not supported defaulting to english")
                voice_key = 'en'

            voice_id = langs.get(voice_key)
            if voice_id is not None:
                engine.setProperty("voice", voice_id)
            else:
                # No 'en' entry either: speak with whatever voice is active
                # instead of failing with a KeyError.
                print("WARNING! no english pyttsx3 voice available, keeping the current voice")
            engine.say(s)
            engine.runAndWait()

else:
    print("Error: No text to say")
