In [1]:
import os
from dotenv import load_dotenv
from llama_cpp import Llama

# Pull the settings from the local .env file into the process environment.
load_dotenv()

# The model directory and the model file name are configured separately.
path = os.path.join(os.getenv("LLM_MODEL_PATH"), os.getenv("LLM_MODEL_FILE"))

# os.getenv returns a string (or None), and every non-empty string -- including
# "False" and "0" -- is truthy. Parse the flag explicitly so that only an
# affirmative value turns memory locking on; an unset variable means off.
mlock_enabled = (os.getenv("MODEL_PARAM_MLOCK") or "").strip().lower() in ("1", "true", "yes", "on")

# Load the llama.cpp model on the CPU only (no layers are offloaded to a GPU).
model = Llama(
  model_path = path,
  n_ctx = int(os.getenv("MODEL_PARAM_CONTEXT_LEN")),
  n_batch = int(os.getenv("MODEL_PARAM_BATCH_SIZE")),
  use_mlock = mlock_enabled,
  n_threads = int(os.getenv("MODEL_PARAM_THREADS")),
  n_gpu_layers = 0,
  f16_kv = True,
  verbose = False
)

llama_model_loader: loaded meta data with 20 key-value pairs and 291 tensors from ./models/mistral-7b-instruct-v0.1.Q4_K_M.gguf (version GGUF V2 (latest))
llama_model_loader: - tensor    0:                token_embd.weight q4_K     [  4096, 32000,     1,     1 ]
llama_model_loader: - tensor    1:              blk.0.attn_q.weight q4_K     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    2:              blk.0.attn_k.weight q4_K     [  4096,  1024,     1,     1 ]
llama_model_loader: - tensor    3:              blk.0.attn_v.weight q6_K     [  4096,  1024,     1,     1 ]
llama_model_loader: - tensor    4:         blk.0.attn_output.weight q4_K     [  4096,  4096,     1,     1 ]
llama_model_loader: - tensor    5:            blk.0.ffn_gate.weight q4_K     [  4096, 14336,     1,     1 ]
llama_model_loader: - tensor    6:              blk.0.ffn_up.weight q4_K     [  4096, 14336,     1,     1 ]
llama_model_loader: - tensor    7:            blk.0.ffn_down.weight q6_K     [ 14336,  40

In [2]:
from dataclasses import dataclass, field


def _env_param(name, cast):
  """Return environment variable `name` converted with `cast`, or None when it is unset."""
  raw = os.getenv(name)
  return None if raw is None else cast(raw)


@dataclass
class LLMRequest:
  """Prompt plus sampling parameters for a single completion request.

  Every sampling parameter defaults to the matching MODEL_PARAM_* environment
  variable, converted to the annotated numeric type when the request is
  created. A parameter whose variable is unset is None.
  """
  prompt: str = ""
  temperature: float = field(default_factory=lambda: _env_param("MODEL_PARAM_TEMPERATURE", float))
  max_tokens: int = field(default_factory=lambda: _env_param("MODEL_PARAM_MAX_TOKENS", int))
  top_p: float = field(default_factory=lambda: _env_param("MODEL_PARAM_TOP_P", float))
  # top_k is passed to the model as an int, so it is typed and parsed as one.
  top_k: int = field(default_factory=lambda: _env_param("MODEL_PARAM_TOP_K", int))
  repeat_penalty: float = field(default_factory=lambda: _env_param("MODEL_PARAM_REPEAT_PENALTY", float))
  n_tokens_size: int = field(default_factory=lambda: _env_param("MODEL_PARAM_N_TOKENS_SIZE", int))

request = LLMRequest()
request.prompt = "Who is Duke Ellington?"

In [3]:
# Collect the completion arguments from the request, coercing each sampling
# setting to the numeric type passed for it, then run one non-streaming call.
completion_args = dict(
  prompt = request.prompt,
  max_tokens = int(request.max_tokens),
  temperature = float(request.temperature),
  top_p = float(request.top_p),
  top_k = int(request.top_k),
  repeat_penalty = float(request.repeat_penalty),
  stream = False,
)
response = model(**completion_args)
# Show the raw completion payload as the cell output.
response

{'id': 'cmpl-8924e961-842e-48d0-bb2f-3d3658f6b286',
 'object': 'text_completion',
 'created': 1698229769,
 'model': './models/mistral-7b-instruct-v0.1.Q4_K_M.gguf',
 'choices': [{'text': "\nA: Duke Ellington was an American composer, pianist, and bandleader. He is widely regarded as one of the greatest jazz composers and pianists of all time. Born in Washington, D.C., he spent much of his career in New York City, where he led one of the most famous and influential jazz orchestras of the 20th century. Ellington's music has had a lasting impact on jazz and popular music, and he is often credited with elevating the status of jazz from dance music to an art form.",
   'index': 0,
   'logprobs': None,
   'finish_reason': 'stop'}],
 'usage': {'prompt_tokens': 7, 'completion_tokens': 116, 'total_tokens': 123}}

In [4]:
# Take the text of the first completion choice and trim surrounding whitespace.
raw_text = response["choices"][0]["text"].strip()
# The completion may start with an "A:" marker. Remove it only at the start:
# str.replace would also delete any "A:" inside the answer itself (for example
# in "USA:"), and would leave the space that follows the marker in place.
if raw_text.startswith("A:"):
  raw_text = raw_text[len("A:"):]
answer = raw_text.strip()
answer

" Duke Ellington was an American composer, pianist, and bandleader. He is widely regarded as one of the greatest jazz composers and pianists of all time. Born in Washington, D.C., he spent much of his career in New York City, where he led one of the most famous and influential jazz orchestras of the 20th century. Ellington's music has had a lasting impact on jazz and popular music, and he is often credited with elevating the status of jazz from dance music to an art form."

In [None]:
!pip install git+https://github.com/suno-ai/bark.git

In [11]:
import torch
from transformers import AutoProcessor, BarkModel

# Bark-related switches exported through the process environment.
# NOTE(review): presumably read by the `bark` package when it is imported -- confirm.
os.environ.update({"SUNO_OFFLOAD_CPU": "True", "SUNO_USE_SMALL_MODELS": "False"})

# Prefer the first CUDA device when one is available, otherwise stay on the CPU.
if torch.cuda.is_available():
  device = "cuda:0"
else:
  device = "cpu"

# Load the Bark processor and model, and place the model on the chosen device.
processor = AutoProcessor.from_pretrained("suno/bark")
voiceModel = BarkModel.from_pretrained("suno/bark").to(device)
preset = "v2/en_speaker_6"

# Encode the LLM answer with the voice preset, generate the speech, then bring
# the result back to the CPU as a NumPy array with size-1 dimensions removed.
inputs = processor(answer, voice_preset=preset)
audio = voiceModel.generate(**inputs.to(device)).cpu().numpy().squeeze()

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:10000 for open-end generation.


In [13]:
# Bark is optimized to generate speech for only 12-14 seconds
# So, only part of the generated text from the LLM will be converted to audio
from IPython.display import Audio

# Read the sample rate from the loaded Bark model. The model object is named
# `voiceModel`; the name `voice` is never defined, so using it raises a
# NameError on a fresh kernel.
sampleRate = voiceModel.generation_config.sample_rate
Audio(audio, rate=sampleRate)

In [18]:
# Use a tokenizer to split the generated text into sentences. Generate audio for each sentence.
# Then combine the pieces into one numpy array of audio for replay
import nltk
import numpy as np
from bark import SAMPLE_RATE, generate_audio

nltk.download("punkt")
sentences = nltk.sent_tokenize(answer)
# Inject a brief pause between each audio fragment (i.e. each sentence)
silence = np.zeros(int(0.25 * SAMPLE_RATE))
pieces = []
for sentence in sentences:
  # A separate name keeps the clip stored in `audio` by the earlier cell intact.
  sentence_audio = generate_audio(sentence, history_prompt="v2/en_speaker_6")
  pieces += [sentence_audio, silence.copy()]
# Build audio from the concatenated pieces.
# These clips come from bark's generate_audio, so play them at bark's own
# SAMPLE_RATE instead of relying on a `sampleRate` variable set by another cell.
Audio(np.concatenate(pieces), rate=SAMPLE_RATE)

[nltk_data] Downloading package punkt to /home/cdsw/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
No GPU being used. Careful, inference might be very slow!


Downloading text_2.pt:   0%|          | 0.00/5.35G [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading coarse_2.pt:   0%|          | 0.00/3.93G [00:00<?, ?B/s]

Downloading fine_2.pt:   0%|          | 0.00/3.74G [00:00<?, ?B/s]

Downloading: "https://dl.fbaipublicfiles.com/encodec/v0/encodec_24khz-d7cc33bc.th" to /home/cdsw/.cache/torch/hub/checkpoints/encodec_24khz-d7cc33bc.th
100%|███████████████████████████████████████| 88.9M/88.9M [00:00<00:00, 208MB/s]
100%|█████████████████████████████████████████| 373/373 [00:14<00:00, 26.00it/s]
100%|███████████████████████████████████████████| 19/19 [01:07<00:00,  3.55s/it]
100%|█████████████████████████████████████████| 448/448 [00:17<00:00, 25.84it/s]
100%|███████████████████████████████████████████| 23/23 [01:21<00:00,  3.55s/it]
100%|█████████████████████████████████████████| 645/645 [00:25<00:00, 25.09it/s]
100%|███████████████████████████████████████████| 33/33 [01:57<00:00,  3.55s/it]
100%|█████████████████████████████████████████| 739/739 [00:30<00:00, 24.17it/s]
100%|███████████████████████████████████████████| 37/37 [02:13<00:00,  3.61s/it]
