## **Load the LLM**

In [None]:
!pip install bitsandbytes accelerate



In [None]:
import os

from huggingface_hub import login

# Never paste an access token into the notebook: it would be stored in the
# saved file and in version control. Read it from the HF_TOKEN environment
# variable instead; when that is unset (or empty) `token=None` makes login()
# fall back to its interactive prompt.
login(token=os.environ.get("HF_TOKEN") or None)

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from transformers.utils import is_flash_attn_2_available

# 1. Pick a model we'd like to use (this will depend on how much GPU memory you have available)
#model_id = "google/gemma-7b-it"
model_id = "google/gemma-2b-it"
use_quantization_config = False

# This cell moves the model to "cuda" (or lets bitsandbytes place it on the GPU),
# so fail fast with a readable message before downloading any weights.
if not torch.cuda.is_available():
    raise RuntimeError("A CUDA-capable GPU is required to load the LLM in this notebook.")

# 4-bit quantization settings; only passed to the model when use_quantization_config is True
quantization_config = BitsAndBytesConfig(load_in_4bit=True,
                                         bnb_4bit_compute_dtype=torch.float16)

# 2. Choose the attention implementation: flash_attention_2 only when the package
#    is installed and the GPU reports compute capability >= 8, otherwise PyTorch SDPA
if (is_flash_attn_2_available()) and (torch.cuda.get_device_capability(0)[0] >= 8):
  attn_implementation = "flash_attention_2"
else:
  attn_implementation = "sdpa"
print(f"[INFO] Using attention implementation: {attn_implementation}")
print(f"[INFO] Using model_id: {model_id}")

# 3. Instantiate tokenizer (tokenizer turns text into numbers ready for the model)
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_id)

# 4. Instantiate the model
llm_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=model_id,
                                                 torch_dtype=torch.float16, # datatype to use, we want float16
                                                 quantization_config=quantization_config if use_quantization_config else None,
                                                 low_cpu_mem_usage=False, # use full memory
                                                 attn_implementation=attn_implementation) # which attention version to use

if not use_quantization_config: # quantization takes care of device setting automatically, so if it's not used, send model to GPU
    llm_model.to("cuda")

We've got an LLM!

Let's check it out.

In [None]:
# Display the loaded model (the bare last expression is rendered as the cell output).
llm_model

GemmaForCausalLM(
  (model): GemmaModel(
    (embed_tokens): Embedding(256000, 2048, padding_idx=0)
    (layers): ModuleList(
      (0-17): 18 x GemmaDecoderLayer(
        (self_attn): GemmaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): GemmaMLP(
          (gate_proj): Linear(in_features=2048, out_features=16384, bias=False)
          (up_proj): Linear(in_features=2048, out_features=16384, bias=False)
          (down_proj): Linear(in_features=16384, out_features=2048, bias=False)
          (act_fn): GELUActivation()
        )
        (input_layernorm): GemmaRMSNorm((2048,), eps=1e-06)
        (post_attention_layernorm): GemmaRMSNorm((2048,), eps=1e-06)
      )
    )
    (norm): GemmaRMSNorm((2048,), 

How about we get the number of parameters in our model?

In [None]:
def get_model_num_params(model: torch.nn.Module):
    """Return the total number of scalar elements over all parameters of ``model``.

    Every tensor yielded by ``model.parameters()`` contributes its ``numel()``
    to the total.
    """
    total = 0
    for weights in model.parameters():
        total += weights.numel()
    return total

get_model_num_params(llm_model)

2506172416

#### **Generating text with our LLM (Gemma)**

In [None]:
# Plain user question that will be wrapped in the model's chat format below.
input_text = "how can I improve speaking skills"
print(f"Input text:\n{input_text}")

# Single-turn conversation in the role/content message format that
# tokenizer.apply_chat_template() consumes
dialogue_template = [
    {"role": "user",
     "content": input_text}
]

# Render the conversation with the tokenizer's chat template.
# add_generation_prompt=True is requested so the text ends with the opening of
# the model's turn; the result is a string, not token ids.
prompt = tokenizer.apply_chat_template(conversation=dialogue_template,
                                       tokenize=False, # keep as raw text (not tokenized)
                                       add_generation_prompt=True)
print(f"\nPrompt (formatted):\n{prompt}")

Input text:
how can I improve speaking skills

Prompt (formatted):
<bos><start_of_turn>user
how can I improve speaking skills<end_of_turn>
<start_of_turn>model



In [None]:
%%time

# Tokenize the input text (turn it into numbers) and send it to GPU
input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")
print(f"Model input (tokenized):\n{input_ids}\n")

# Generate outputs passed on the tokenized input
outputs = llm_model.generate(**input_ids,
                             max_new_tokens=256) # define the maximum number of new tokens to create
print(f"Model output (tokens):\n{outputs[0]}\n")

Model input (tokenized):
{'input_ids': tensor([[    2,     2,   106,  1645,   108,  1139,   798,   590,  4771, 13041,
          7841,   107,   108,   106,  2516,   108]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}

Model output (tokens):
tensor([     2,      2,    106,   1645,    108,   1139,    798,    590,   4771,
         13041,   7841,    107,    108,    106,   2516,    108,    688, 235274,
        235265,  19670, 186522,  66058,    108, 235290, 100922,    575,  30893,
           675,  11634,  22660,    689,   6016,    675,    476,   5255,   9670,
        235265,    108, 235290,  20470,    476,   5255,  10036,   2778,    689,
          3650,  16875, 235265,    108, 235290,  15940,   5804,  13041,    578,
         10724,   1355,    577,  11441,   4516,    604,  13194, 235265,    109,
           688, 235284, 235265,  26349,    611,  58212,  42535,  66058,    108,
        235290,   8138,   6137,    577,    573,   97

In [None]:
# Decode the output tokens to text
outputs_decoded = tokenizer.decode(outputs[0])
print(f"Model output (decoded):\n{outputs_decoded}\n")

Model output (decoded):
<bos><bos><start_of_turn>user
how can I improve speaking skills<end_of_turn>
<start_of_turn>model
**1. Practice Regularly:**
- Engage in conversations with native speakers or practice with a language partner.
- Join a language exchange group or online forum.
- Record yourself speaking and listen back to identify areas for improvement.

**2. Focus on Pronunciation:**
- Pay attention to the sounds of the language, including intonation, rhythm, and stress.
- Use pronunciation tools and recordings to learn correct pronunciation.
- Practice speaking words and phrases out loud, focusing on different accents.

**3. Expand Your Vocabulary:**
- Read extensively in the target language, both fiction and non-fiction.
- Listen to podcasts, audiobooks, and music in the language.
- Use flashcards and spaced repetition techniques to learn new words.

**4. Read Fluently:**
- Start with children's books or simple texts and gradually progress to more complex materials.
- Pay atten

In [None]:
# Show only the answer: remove the prompt text and the literal "<bos>"/"<eos>"
# markers from the decoded output before printing it.
print(f"Input text: {input_text}\n")
print(f"Output text:\n{outputs_decoded.replace(prompt, '').replace('<bos>', '').replace('<eos>', '')}")

Input text: how can I improve speaking skills

Output text:
**1. Practice Regularly:**
- Engage in conversations with native speakers or practice with a language partner.
- Join a language exchange group or online forum.
- Record yourself speaking and listen back to identify areas for improvement.

**2. Focus on Pronunciation:**
- Pay attention to the sounds of the language, including intonation, rhythm, and stress.
- Use pronunciation tools and recordings to learn correct pronunciation.
- Practice speaking words and phrases out loud, focusing on different accents.

**3. Expand Your Vocabulary:**
- Read extensively in the target language, both fiction and non-fiction.
- Listen to podcasts, audiobooks, and music in the language.
- Use flashcards and spaced repetition techniques to learn new words.

**4. Read Fluently:**
- Start with children's books or simple texts and gradually progress to more complex materials.
- Pay attention to grammar, punctuation, and sentence structure.
- Use a 

Augmentation.

In [None]:
# Question bank, part 1: IELTS-style questions that were generated with GPT-4
gpt4_questions = [
    "What are the best strategies to improve your IELTS speaking score?",
    "How can you effectively manage time during the IELTS reading test?",
    "Describe techniques to write a high-scoring IELTS essay.",
    "What role does vocabulary play in the IELTS listening section?",
    "Explain the importance of practicing mock tests for the IELTS exam.",
    "How can you build fluency for the IELTS speaking test?",
]

# Question bank, part 2: questions written by hand
manual_questions = [
    "What are common mistakes to avoid in the IELTS writing task?",
    "How should you prepare for the IELTS listening section?",
    "What is the ideal structure for an IELTS Task 2 essay?",
    "What are the key differences between IELTS Academic and General Training?",
    "How can you improve your band score in the IELTS reading test?",
]

# Full pool of test queries: GPT-4 questions first, then the manual ones
query_list = [*gpt4_questions, *manual_questions]

And now let's check if our `retrieve_relevant_resources()` function works with our list of queries.

In [None]:
import random
# Pick one query at random from the question bank (no seed is set, so the
# choice differs between runs).
query = random.choice(query_list)

print(f"Query: {query}")

# Get just the scores and indices of top related results.
# NOTE(review): retrieve_relevant_resources() and `embeddings` are not defined in
# this part of the notebook; they must exist from earlier cells.
scores, indices = retrieve_relevant_resources(query=query,
                                              embeddings=embeddings)
# Bare last expression: display the (scores, indices) pair as the cell output.
scores, indices

Query: What are common mistakes to avoid in the IELTS writing task?
[INFO] Time taken to get scores on 30 embeddings: 0.00009 seconds.


(tensor([0.4694, 0.4677, 0.4606, 0.4605, 0.4586], device='cuda:0'),
 tensor([12, 23,  4, 17,  0], device='cuda:0'))

#### **Augmenting our prompt with context items**

In [None]:
# Instruction template for the RAG prompt. It lives at module level and is written
# flush-left so that no source-code indentation leaks into the text sent to the
# model. `{context}` and `{query}` are the only placeholders filled in by
# prompt_formatter().
_RAG_PROMPT_TEMPLATE = """Based on the following context items, provide the most helpful and detailed answer to the query below.
If you cannot find relevant information in the provided context, use your general knowledge and logical reasoning to generate a well-informed, accurate, and practical answer.
Ensure that your response remains factual, logical, and does not speculate beyond reasonable assumptions.

Before generating the final answer, show your thought process step by step. These steps should include:
1. Identifying relevant information from the provided context (if available).
2. Explaining how the context or your reasoning is applied to answer the query.
3. Highlighting any assumptions made if the context is insufficient.

Finally, provide your answer in a clear and concise manner. Use the following examples as a reference for the ideal answer style. Your answer should not include the examples themselves, only follow their structure and tone.

Example 1:
Query: What are the best strategies to improve your IELTS speaking score?
Answer: To improve your IELTS speaking score, focus on fluency and coherence by practicing speaking with friends or recording yourself and listening for areas of improvement. Expand your vocabulary by learning phrases and idioms relevant to common IELTS topics, such as education, environment, and technology. Additionally, practice answering past IELTS speaking questions under timed conditions to simulate the test environment.

Example 2:
Query: How can you effectively manage time during the IELTS reading test?
Answer: To manage time effectively during the IELTS reading test, start by quickly skimming the passage to get a general idea of its content. Then, read the questions and underline key information. Divide your time equally across the three sections, spending no more than 20 minutes per section. If you encounter difficult questions, move on and return to them later if time permits.

Example 3:
Query: What is the ideal structure for an IELTS Writing Task 2 essay?
Answer: A high-scoring IELTS Writing Task 2 essay should include an introduction that clearly states your position, two or three body paragraphs with arguments supported by examples, and a conclusion that summarizes your key points. Ensure coherence and cohesion by using linking words such as "however," "therefore," and "in addition." Also, proofread your essay to avoid grammatical mistakes and spelling errors.

Context:
{context}

Query: {query}

Explain your thought process step by step:
1. ...
2. ...
3. ...

Final Answer:"""


def prompt_formatter(query: str, context_items: list[dict]) -> str:
    """
    Augments query with text-based context from context_items.

    Args:
        query: The user question to answer.
        context_items: Retrieved records; each must provide a "sentence_chunk"
            entry holding the text to show the model.

    Returns:
        The instruction prompt (with context and query filled in) rendered
        through the tokenizer's chat template as a single user turn, as a string.

    Raises:
        KeyError: If a context item has no "sentence_chunk" entry.
    """
    # Separate the retrieved chunks with a blank line
    context = "\n\n".join(item["sentence_chunk"] for item in context_items)

    # Only the template is parsed for placeholders, so braces that happen to
    # appear inside the context or the query are inserted verbatim.
    base_prompt = _RAG_PROMPT_TEMPLATE.format(context=context, query=query)

    # The whole instruction is sent as a single user message
    dialogue_template = [
        {"role": "user", "content": base_prompt}
    ]

    # Render with the chat template; keep it as text so callers can print it
    # and strip it from the decoded model output later.
    prompt = tokenizer.apply_chat_template(conversation=dialogue_template,
                                          tokenize=False,
                                          add_generation_prompt=True)
    return prompt

Let's try our function out.

In [None]:
# Draw a random query from the question bank
query = random.choice(query_list)
print(f"Query: {query}")

# Get relevant resources.
# NOTE(review): retrieve_relevant_resources(), `embeddings` and `pages_and_chunks`
# are not defined in this part of the notebook; they must exist from earlier cells.
scores, indices = retrieve_relevant_resources(query=query,
                                              embeddings=embeddings)

# Create a list of context items by looking up each returned index in pages_and_chunks
context_items = [pages_and_chunks[i] for i in indices]

# Format prompt with context items
prompt = prompt_formatter(query=query,
                          context_items=context_items)
print(prompt)

Query: What are the key differences between IELTS Academic and General Training?
[INFO] Time taken to get scores on 30 embeddings: 0.00008 seconds.
<bos><start_of_turn>user
Based on the following context items, provide the most helpful and detailed answer to the query below.
    If you cannot find relevant information in the provided context, use your general knowledge and logical reasoning to generate a well-informed, accurate, and practical answer.
    Ensure that your response remains factual, logical, and does not speculate beyond reasonable assumptions.

    Before generating the final answer, show your thought process step by step. These steps should include:
    1. Identifying relevant information from the provided context (if available).
    2. Explaining how the context or your reasoning is applied to answer the query.
    3. Highlighting any assumptions made if the context is insufficient.

    Finally, provide your answer in a clear and concise manner. Use the following exam

We can tokenize this and pass it straight to our LLM.

In [None]:
%%time

input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")

# Generate an output of tokens
outputs = llm_model.generate(**input_ids,
                             temperature=0.7, # lower temperature = more deterministic outputs, higher temperature = more creative outputs
                             do_sample=True,
                             max_new_tokens=256) # how many new tokens to generate from prompt

# Turn the output tokens into text
output_text = tokenizer.decode(outputs[0])

print(f"Query: {query}")
print(f"RAG answer:\n{output_text.replace(prompt, '')}")

Query: What are the key differences between IELTS Academic and General Training?
RAG answer:
<bos>**Thought Process:**

**1. Identifying Relevant Information**

* The passage does not provide any directly relevant information about IELTS Academic and General Training, so I cannot identify any key differences between the two programs from the context.

**2. Explaining Reasoning**

I am unable to generate a response because the context does not provide any information about the key differences between IELTS Academic and General Training.

**3. Assumptions Made**

The context does not provide any assumptions, so I cannot generate a response.<eos>
CPU times: user 3.31 s, sys: 5.68 ms, total: 3.31 s
Wall time: 3.31 s


How about we functionize the generation step to make it easier to use?

In [None]:
def ask(query,
        temperature=0.7,
        max_new_tokens=512,
        format_answer_text=True,
        return_answer_only=True):
    """
    Takes a query, finds relevant resources/context and generates an answer to the query based on the relevant resources.

    Args:
        query: The user question.
        temperature: Sampling temperature. Values > 0 enable sampling; 0 (or less)
            switches to greedy decoding instead of failing inside generate().
        max_new_tokens: Maximum number of tokens to generate after the prompt.
        format_answer_text: If True, strip the prompt, the "<bos>"/"<eos>" markers
            and a known boilerplate opening line from the decoded output.
        return_answer_only: If True return just the answer text, otherwise return
            a tuple ``(answer_text, context_items)``.

    Returns:
        The answer string, or ``(answer, context_items)`` where each context item
        is a copy of the retrieved record with an added "score" entry.
    """

    # Get just the scores and indices of top related results
    scores, indices = retrieve_relevant_resources(query=query,
                                                  embeddings=embeddings)

    # Build the context items as shallow copies with the score (moved back to
    # CPU) attached, so the shared records in pages_and_chunks are not mutated
    # by every call.
    context_items = [
        {**pages_and_chunks[i], "score": scores[rank].cpu()}
        for rank, i in enumerate(indices)
    ]

    # Format the prompt with context items
    prompt = prompt_formatter(query=query,
                              context_items=context_items)

    # Tokenize the prompt. The chat template already inserted "<bos>", so no
    # special tokens are added here; the tensors follow the model's device.
    input_ids = tokenizer(prompt,
                          return_tensors="pt",
                          add_special_tokens=False).to(llm_model.device)

    # Sample only for a positive temperature; otherwise decode greedily
    generation_kwargs = {"max_new_tokens": max_new_tokens}
    if temperature > 0:
        generation_kwargs.update(do_sample=True, temperature=temperature)
    else:
        generation_kwargs["do_sample"] = False

    # Generate an output of tokens
    outputs = llm_model.generate(**input_ids, **generation_kwargs)

    # Turn the output tokens into text
    output_text = tokenizer.decode(outputs[0])

    if format_answer_text:
        # Replace special tokens and unnecessary help message
        output_text = output_text.replace(prompt, "").replace("<bos>", "").replace("<eos>", "").replace("Sure, here is the answer to the user query:\n\n", "")

    # Only return the answer without the context items
    if return_answer_only:
        return output_text

    return output_text, context_items

Let's try it out.

In [None]:
# random.choice(query_list)

In [None]:
# Draw a random query from the question bank
query = random.choice(query_list)
print(f"Query: {query}")

# Answer query with context and return context.
# `answer` is reused later in the notebook as the text for speech synthesis.
answer, context_items = ask(query=query,
                            temperature=0.7,
                            max_new_tokens=512,
                            return_answer_only=False)

print(f"Answer:\n")
# NOTE(review): print_wrapped() is not defined in this part of the notebook;
# it must exist from an earlier cell.
print_wrapped(answer)
# Optional: uncomment to inspect the retrieved context items
# print(f"Context items:")
#context_items

Query: What are the best strategies to improve your IELTS speaking score?
[INFO] Time taken to get scores on 30 embeddings: 0.00007 seconds.
Answer:

## Thought process:  **Step 1: Identifying relevant information**  * The context
mentions that improving your IELTS speaking score requires practicing speaking
with friends or recording yourself and listening for areas of improvement. * It
also suggests learning phrases and idioms relevant to common IELTS topics. *
These suggest that practicing speaking in a social setting, learning vocabulary,
and being familiar with idiomatic expressions are key strategies for improving
speaking skills.  **Step 2: Applying the context**  The context advises
practicing speaking with friends, recording yourself, and listening for areas of
improvement. It also suggests learning vocabulary and idioms relevant to common
IELTS topics.  **Step 3: Assumptions**  * The context does not provide any
specific information or guidelines for practicing speaking in a s

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Using the gemma-2b-it model
model_id = "google/gemma-2b-it"

# Reload the Gemma tokenizer. This rebinds the global `tokenizer` used by
# prompt_formatter() and ask(), e.g. after another cell reassigned that name.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Keep the chat template that ships with the checkpoint. Overriding it with a
# ChatML-style template ("<|im_start|>...") produces turn markers this
# instruction-tuned model was not prompted with earlier in the notebook
# ("<start_of_turn>...") and silently ignores add_generation_prompt=True.
if not tokenizer.chat_template:
    raise RuntimeError(f"The tokenizer for {model_id} does not provide a chat template.")

# Loading the model
llm_model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16)

# Move the model to the GPU (a CUDA device is required here)
llm_model.to("cuda")

print("The model and tokenizer were successfully loaded, and the chat_template was set!")


#### **Text-to-speech model (facebook/mms-tts-eng)**

In [None]:
!pip install transformers==4.51.3 accelerate==1.6.0 --no-warn-script-location --quiet

In [None]:
#!pip install --upgrade transformers accelerate

In [None]:
from transformers import VitsModel, AutoTokenizer
import torch

# Text-to-speech model and its tokenizer. The tokenizer gets its own name:
# rebinding the global `tokenizer` here would replace the LLM tokenizer that
# prompt_formatter() and ask() read, breaking every later RAG query.
model = VitsModel.from_pretrained("facebook/mms-tts-eng")
tts_tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-eng")

# Speak the most recent RAG answer
text = answer
inputs = tts_tokenizer(text, return_tensors="pt")

# Inference only: no gradients are needed to synthesize the waveform
with torch.no_grad():
    output = model(**inputs).waveform


In [None]:
from IPython.display import Audio

# Render an inline audio player for the synthesized waveform, using the
# sampling rate stored in the TTS model's config.
Audio(output.numpy(), rate=model.config.sampling_rate)

In [None]:
from scipy.io.wavfile import write
from google.colab import files
import numpy as np

# `output` is the waveform tensor produced by the TTS cell; the sampling rate
# comes from the TTS model's config.
sampling_rate = model.config.sampling_rate
output_filename = "generated_audio.wav"

# 1. Tensor -> numpy array
audio_data = output.numpy()

# 2. Rescale by the peak magnitude, but only when some sample lies outside [-1.0, 1.0]
if np.abs(audio_data).max() > 1.0:
    audio_data = audio_data / np.abs(audio_data).max()

# 3. Drop singleton dimensions so a one-dimensional (mono) array is written
if audio_data.ndim > 1:
    audio_data = np.squeeze(audio_data)

# 4. Scale [-1.0, 1.0] floats to the int16 sample range
audio_data = np.multiply(audio_data, 32767).astype(np.int16)

# 5. Write the .wav file
write(output_filename, sampling_rate, audio_data)

# 6. Trigger a browser download of the file (Colab only)
files.download(output_filename)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Debug check of which object the global name `tokenizer` currently refers to.
# NOTE(review): hasattr() only shows that the attribute exists; it does not show
# that a usable chat template is set - confirm by inspecting tokenizer.chat_template.
print(tokenizer)  # Check if the current tokenizer is correct
print(hasattr(tokenizer, "chat_template"))  # Check if chat_template is set

VitsTokenizer(name_or_path='facebook/mms-tts-eng', vocab_size=38, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '<unk>', 'pad_token': 'k'}, clean_up_tokenization_spaces=True, added_tokens_decoder={
	0: AddedToken("k", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	38: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)
True


#### **SoVITS model**

In [None]:
%%writefile /content/setup.sh
# Setup script for GPT-SoVITS: fresh clone, conda environment, project installer.
# Abort on the first failing command.
set -e
cd /content
# Start from a clean checkout every time
rm -rf GPT-SoVITS
git clone https://github.com/RVC-Boss/GPT-SoVITS.git
cd GPT-SoVITS

# Create the GPTSoVITS conda environment only if no environment with exactly
# that name is listed yet.
if conda env list | awk '{print $1}' | grep -Fxq "GPTSoVITS"; then
    :
else
    conda create -n GPTSoVITS python=3.10 -y
fi

source activate GPTSoVITS

# Run the repository's installer.
# NOTE(review): the flags presumably select Hugging Face as the download source
# and also fetch the UVR5 models - confirm against install.sh.
bash install.sh --source HF --download-uvr5

In [None]:
# Install conda inside Colab via condacolab, using the Anaconda installer at the
# given URL, then run the setup script written by the previous cell.
# NOTE(review): condacolab's install step is expected to restart the kernel; if
# so, the setup.sh line below may not run in the same execution - confirm, and
# consider moving it to its own cell.
%pip install -q condacolab
import condacolab
condacolab.install_from_url("https://repo.anaconda.com/archive/Anaconda3-2024.10-1-Linux-x86_64.sh")
!cd /content && bash setup.sh

In [None]:
# Launch the GPT-SoVITS web UI from inside the GPTSoVITS conda environment.
# NOTE(review): is_share=True presumably makes webui.py expose a public share
# link - confirm against the project's documentation.
!cd /content/GPT-SoVITS && source activate GPTSoVITS && export is_share=True && python webui.py