Mac Mini with M4

to make this work for Mac Mini M4, you have to change few lines so it will work without Cuda. here's the updated infer.py file, which worked great for me:

```
import os
import re
import sys

import sentencepiece as spm
import torch
import torchaudio
from torch.nn.utils.rnn import pad_sequence
from omegaconf import OmegaConf

from indextts.BigVGAN.models import BigVGAN as Generator
from indextts.gpt.model import UnifiedVoice
from indextts.utils.checkpoint import load_checkpoint
from indextts.utils.feature_extractors import MelSpectrogramFeatures
from indextts.utils.common import tokenize_by_CJK_char
from indextts.vqvae.xtts_dvae import DiscreteVAE

from indextts.utils.front import TextNormalizer
class IndexTTS:
    def __init__(self, cfg_path='checkpoints/config.yaml', model_dir='checkpoints', is_fp16=False):
        self.cfg = OmegaConf.load(cfg_path)
        self.device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
        self.model_dir = model_dir
        self.is_fp16 = is_fp16
        self.stop_mel_token = self.cfg.gpt.stop_mel_token
        if self.is_fp16:
            self.dtype = torch.float16
        else:
            self.dtype = None
        self.dvae = DiscreteVAE(**self.cfg.vqvae)
        self.dvae_path = os.path.join(self.model_dir, self.cfg.dvae_checkpoint)
        load_checkpoint(self.dvae, self.dvae_path)
        self.dvae = self.dvae.to(self.device)
        if self.is_fp16:
            self.dvae.eval().half()
        else:
            self.dvae.eval()
        print(">> vqvae weights restored from:", self.dvae_path)

        self.gpt = UnifiedVoice(**self.cfg.gpt)
        self.gpt_path = os.path.join(self.model_dir, self.cfg.gpt_checkpoint)
        load_checkpoint(self.gpt, self.gpt_path)
        self.gpt = self.gpt.to(self.device)
        if self.is_fp16:
            self.gpt.eval().half()
        else:
            self.gpt.eval()
        print(">> GPT weights restored from:", self.gpt_path)
        if self.is_fp16:
            self.gpt.post_init_gpt2_config(use_deepspeed=True, kv_cache=True, half=True)
        else:
            self.gpt.post_init_gpt2_config(use_deepspeed=False, kv_cache=False, half=False)

        self.bigvgan = Generator(self.cfg.bigvgan)
        self.bigvgan_path = os.path.join(self.model_dir, self.cfg.bigvgan_checkpoint)
        vocoder_dict = torch.load(self.bigvgan_path, map_location='cpu')
        self.bigvgan.load_state_dict(vocoder_dict['generator'])
        self.bigvgan = self.bigvgan.to(self.device)
        self.bigvgan.eval()
        print(">> bigvgan weights restored from:", self.bigvgan_path)
        self.bpe_path = os.path.join(self.model_dir, self.cfg.dataset['bpe_model'])
        self.normalizer = TextNormalizer()
        self.normalizer.load()
        print(">> TextNormalizer loaded")

    def preprocess_text(self, text):
        # chinese_punctuation = "，。！？；：“”‘’（）【】《》"
        # english_punctuation = ",.!?;:\"\"''()[]<>"
        #
        # # 创建一个映射字典
        # punctuation_map = str.maketrans(chinese_punctuation, english_punctuation)

        # 使用translate方法替换标点符号
        # return text.translate(punctuation_map)
        return self.normalizer.infer(text)

    def remove_long_silence(self, codes, silent_token=52, max_consecutive=30):
        code_lens = []
        codes_list = []
        isfix = False
        for i in range(0, codes.shape[0]):
            code = codes[i]
            if self.cfg.gpt.stop_mel_token not in code:
                code_lens.append(len(code))
                len_ = len(code)
            else:
                # len_ = code.cpu().tolist().index(8193)+1
                len_ = (code == self.stop_mel_token).nonzero(as_tuple=False)[0] + 1
                len_ = len_ - 2

            count = torch.sum(code == silent_token).item()
            if count > max_consecutive:
                code = code.cpu().tolist()
                ncode = []
                n = 0
                for k in range(0, len_):
                    if code[k] != silent_token:
                        ncode.append(code[k])
                        n = 0
                    elif code[k] == silent_token and n < 10:
                        ncode.append(code[k])
                        n += 1
                    # if (k == 0 and code[k] == 52) or (code[k] == 52 and code[k-1] == 52):
                    #    n += 1
                len_ = len(ncode)
                ncode = torch.LongTensor(ncode)
                codes_list.append(ncode.cuda())
                isfix = True
                #codes[i] = self.stop_mel_token
                #codes[i, 0:len_] = ncode
            else:
                codes_list.append(codes[i])
            code_lens.append(len_)
        code_lens = torch.LongTensor(code_lens).to('mps')  # Use MPS for GPU acceleration on Apple Silicon

        if isfix:
            codes = pad_sequence(codes_list, batch_first=True)
        return codes, code_lens

    def infer(self, audio_prompt, text, output_path):
        print(f"origin text:{text}")
        text = self.preprocess_text(text)
        print(f"normalized text:{text}")


        audio, sr = torchaudio.load(audio_prompt)
        audio = torch.mean(audio, dim=0, keepdim=True)
        if audio.shape[0] > 1:
            audio = audio[0].unsqueeze(0)
        audio = torchaudio.transforms.Resample(sr, 24000)(audio)
        cond_mel = MelSpectrogramFeatures()(audio).to(self.device)
        print(f"cond_mel shape: {cond_mel.shape}")

        auto_conditioning = cond_mel

        tokenizer = spm.SentencePieceProcessor()
        tokenizer.load(self.bpe_path)

        punctuation = ["!", "?", ".", ";", "！", "？", "。", "；"]
        pattern = r"(?<=[{0}])\s*".format("".join(punctuation))
        sentences = [i for i in re.split(pattern, text) if i.strip() != ""]
        print(sentences)

        top_p = .8
        top_k = 30
        temperature = 1.0
        autoregressive_batch_size = 1
        length_penalty = 0.0
        num_beams = 3
        repetition_penalty = 10.0
        max_mel_tokens = 600
        sampling_rate = 24000
        lang = "EN"
        lang = "ZH"
        wavs = []
        wavs1 = []

        for sent in sentences:
            print(sent)
            # sent = " ".join([char for char in sent.upper()]) if lang == "ZH" else sent.upper()
            cleand_text = tokenize_by_CJK_char(sent)
            # cleand_text = "他 那 像 HONG3 小 孩 似 的 话 , 引 得 人 们 HONG1 堂 大 笑 , 大 家 听 了 一 HONG3 而 散 ."
            print(cleand_text)
            text_tokens = torch.IntTensor(tokenizer.encode(cleand_text)).unsqueeze(0).to(self.device)

            # text_tokens = F.pad(text_tokens, (0, 1))  # This may not be necessary.
            # text_tokens = F.pad(text_tokens, (1, 0), value=0)
            # text_tokens = F.pad(text_tokens, (0, 1), value=1)
            text_tokens = text_tokens.to(self.device)
            print(text_tokens)
            print(f"text_tokens shape: {text_tokens.shape}, text_tokens type: {text_tokens.dtype}")
            text_token_syms = [tokenizer.IdToPiece(idx) for idx in text_tokens[0].tolist()]
            print(text_token_syms)
            text_len = [text_tokens.size(1)]
            text_len = torch.IntTensor(text_len).to(self.device)
            print(text_len)
            with torch.no_grad():
                if self.is_fp16:
                    with torch.no_grad():
                        codes = self.gpt.inference_speech(auto_conditioning, text_tokens,
                                                          cond_mel_lengths=torch.tensor([auto_conditioning.shape[-1]],
                                                                                        device=text_tokens.device),
                                                          # text_lengths=text_len,
                                                          do_sample=True,
                                                          top_p=top_p,
                                                          top_k=top_k,
                                                          temperature=temperature,
                                                          num_return_sequences=autoregressive_batch_size,
                                                          length_penalty=length_penalty,
                                                          num_beams=num_beams,
                                                          repetition_penalty=repetition_penalty,
                                                          max_generate_length=max_mel_tokens)
                else:
                    codes = self.gpt.inference_speech(auto_conditioning, text_tokens,
                                                      cond_mel_lengths=torch.tensor([auto_conditioning.shape[-1]],
                                                                                    device=text_tokens.device),
                                                      # text_lengths=text_len,
                                                      do_sample=True,
                                                      top_p=top_p,
                                                      top_k=top_k,
                                                      temperature=temperature,
                                                      num_return_sequences=autoregressive_batch_size,
                                                      length_penalty=length_penalty,
                                                      num_beams=num_beams,
                                                      repetition_penalty=repetition_penalty,
                                                      max_generate_length=max_mel_tokens)
                #codes = codes[:, :-2]
                code_lens = torch.tensor([codes.shape[-1]])
                print(codes, type(codes))
                print(f"codes shape: {codes.shape}, codes type: {codes.dtype}")
                print(f"code len: {code_lens}")
                # remove ultra-long silence if exits
                # temporarily fix the long silence bug.
                codes, code_lens = self.remove_long_silence(codes, silent_token=52, max_consecutive=30)
                print(codes, type(codes))
                print(f"codes shape: {codes.shape}, codes type: {codes.dtype}")
                print(f"code len: {code_lens}")

                # latent, text_lens_out, code_lens_out = \
                if self.is_fp16:
                    with torch.no_grad():
                        latent = \
                            self.gpt(auto_conditioning, text_tokens,
                                     torch.tensor([text_tokens.shape[-1]], device=text_tokens.device), codes,
                                     code_lens*self.gpt.mel_length_compression,
                                     cond_mel_lengths=torch.tensor([auto_conditioning.shape[-1]], device=text_tokens.device),
                                     return_latent=True, clip_inputs=False)
                        latent = latent.transpose(1, 2)
                        wav, _ = self.bigvgan(latent.transpose(1, 2), auto_conditioning.transpose(1, 2))
                        wav = wav.squeeze(1).cpu()

                else:
                    latent = \
                        self.gpt(auto_conditioning, text_tokens,
                                 torch.tensor([text_tokens.shape[-1]], device=text_tokens.device), codes,
                                 code_lens*self.gpt.mel_length_compression,
                                 cond_mel_lengths=torch.tensor([auto_conditioning.shape[-1]], device=text_tokens.device),
                                 return_latent=True, clip_inputs=False)
                    latent = latent.transpose(1, 2)
                    '''
                    latent_list = []
                    for lat, t_len in zip(latent, text_lens_out):
                        lat = lat[:, t_len:]
                        latent_list.append(lat)
                    latent = torch.stack(latent_list)
                    print(f"latent shape: {latent.shape}")
                    '''

                    wav, _ = self.bigvgan(latent.transpose(1, 2), auto_conditioning.transpose(1, 2))
                    wav = wav.squeeze(1).cpu()

                wav = 32767 * wav
                torch.clip(wav, -32767.0, 32767.0)
                print(f"wav shape: {wav.shape}")
                # wavs.append(wav[:, :-512])
                wavs.append(wav)

        wav = torch.cat(wavs, dim=1)
        torchaudio.save(output_path, wav.type(torch.int16), 24000)


if __name__ == "__main__":
    prompt_wav="test_data/input.wav"
    #text="晕 XUAN4 是 一 种 GAN3 觉"
    #text='大家好，我现在正在bilibili 体验 ai 科技，说实话，来之前我绝对想不到！AI技术已经发展到这样匪夷所思的地步了！'
    text="There is a vehicle arriving in dock number 7?"

    tts = IndexTTS(cfg_path="checkpoints/config.yaml", model_dir="checkpoints", is_fp16=False)
    tts.infer(audio_prompt=prompt_wav, text=text, output_path="gen.wav")

```

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Mac Mini with M4 #68

Metadata

Assignees

Labels

Projects

Milestone

Relationships

Development

Mac Mini with M4 #68

Description

Metadata

Metadata

Assignees

Labels

Projects

Milestone

Relationships

Development

Issue actions