In [None]:
!pip install --quiet git+https://github.com/huggingface/transformers sentencepiece

In [None]:
# Core dependencies: the general-purpose SeamlessM4T model class and PyTorch
from transformers import SeamlessM4TModel
import torch

# Pick the compute device up front: the first CUDA GPU when PyTorch can see one, the CPU otherwise
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Fetch the medium SeamlessM4T checkpoint and place it on the chosen device in one step
model = SeamlessM4TModel.from_pretrained("facebook/hf-seamless-m4t-medium").to(device)

In [None]:
# AutoProcessor is the entry point used to build the processor for this checkpoint
from transformers import AutoProcessor

# Keep the sampling rate stored in the model config at hand; the cells below pass it
# to the processor and to the audio player
sample_rate = model.config.sampling_rate

# Build the processor from the same checkpoint id that the model was loaded from
processor = AutoProcessor.from_pretrained("facebook/hf-seamless-m4t-medium")

### Speech-to-Text Translation

In [None]:
# torchaudio is used both to read the file and to resample the waveform
import torchaudio

# Read the clip from disk: the call yields the waveform and the sampling rate it was stored at
audio_sample, audio_sampling_rate = torchaudio.load("/content/download.wav")

# Resample only when the clip's rate differs from the rate stored in the model config
if audio_sampling_rate != model.config.sampling_rate:
    audio_sample = torchaudio.functional.resample(
        audio_sample,
        orig_freq=audio_sampling_rate,
        new_freq=model.config.sampling_rate,
    )

# Turn the waveform into PyTorch model inputs, then move them to the device the model lives on
audio_inputs = processor(audios=audio_sample, sampling_rate=sample_rate, return_tensors="pt")
audio_inputs = audio_inputs.to(device)

# Ask for text only (generate_speech=False) with French ("fra") as the target language
output_tokens = model.generate(**audio_inputs, tgt_lang="fra", generate_speech=False)

# Pick the first sequence of the first output entry and decode it, dropping special tokens
first_sequence_ids = output_tokens[0].tolist()[0]
translated_text_from_audio = processor.decode(first_sequence_ids, skip_special_tokens=True)

# Show the translation
print(f"Translated Text: {translated_text_from_audio}")

### Text-to-Speech Translation

In [None]:
# Audio renders an inline player for a waveform in the notebook
from IPython.display import Audio

# Tokenize the English source sentence into PyTorch tensors and move them to the model's device
text_inputs = processor(text="Hello, How are you?", src_lang="eng", return_tensors="pt")
text_inputs = text_inputs.to(device)

# Generate speech (the default output of generate), take the first output entry,
# and convert it to a squeezed NumPy array on the CPU so it can be played back
audio_array_from_text = (
    model.generate(**text_inputs, tgt_lang="eng")[0]
    .cpu()
    .numpy()
    .squeeze()
)

# Last expression of the cell, so the player is rendered as the cell output
Audio(audio_array_from_text, rate=sample_rate)

### Text-to-Text Translation

In [None]:
# Tokenize the English source sentence and move the tensors to the model's device
text_inputs = processor(text="Hello, How are you?", src_lang="eng", return_tensors="pt")
text_inputs = text_inputs.to(device)

# Text-only generation (generate_speech=False) with "hin" as the target language
text_array = model.generate(**text_inputs, tgt_lang="hin", generate_speech=False)

# Decode the first sequence of the first output entry, dropping special tokens
first_sequence_ids = text_array[0].tolist()[0]
print(f"Translated Text:- {processor.decode(first_sequence_ids, skip_special_tokens=True)}")

### Speech-to-Speech Translation

In [None]:
import torchaudio

# Read the source clip: waveform plus the sampling rate it was stored at
audio_sample, audio_sampling_rate = torchaudio.load("/content/download.wav")

# Bring the clip to the sampling rate stored in the model config when the two differ
if audio_sampling_rate != model.config.sampling_rate:
    audio_sample = torchaudio.functional.resample(
        audio_sample,
        orig_freq=audio_sampling_rate,
        new_freq=model.config.sampling_rate,
    )

# Build PyTorch model inputs from the waveform and move them to the model's device
audio_inputs = processor(audios=audio_sample, sampling_rate=sample_rate, return_tensors="pt")
audio_inputs = audio_inputs.to(device)

# Generate speech with "rus" as the target language, take the first output entry,
# and convert it to a squeezed NumPy array on the CPU for playback
audio_array_from_audio = (
    model.generate(**audio_inputs, tgt_lang="rus")[0]
    .cpu()
    .numpy()
    .squeeze()
)

# Last expression of the cell, so the player is rendered as the cell output
Audio(audio_array_from_audio, rate=sample_rate)

## Tips and Tricks

### Use dedicated models

In [None]:
# Import the task-specific speech-to-speech class from the transformers library
from transformers import SeamlessM4TForSpeechToSpeech

# Load the dedicated model under its own name instead of rebinding `model`:
# the cells further down still feed text inputs to `model` and pass
# `generate_speech=False`, so they need the general-purpose SeamlessM4TModel
# that was loaded at the top of the notebook.
# The dedicated model is also moved to `device`, the same place every cell puts
# its inputs, so weights and inputs never end up on different devices.
# To try it, use `s2st_model` in place of `model` in a speech-to-speech cell.
s2st_model = SeamlessM4TForSpeechToSpeech.from_pretrained("facebook/hf-seamless-m4t-medium").to(device)

### Modify the speaker's identity

In [None]:
import torchaudio
from IPython.display import Audio

# Read the source clip: waveform plus the sampling rate it was stored at
audio_sample, audio_sampling_rate = torchaudio.load("/content/download.wav")

# Bring the clip to the sampling rate stored in the model config when the two differ
if audio_sampling_rate != model.config.sampling_rate:
    audio_sample = torchaudio.functional.resample(
        audio_sample,
        orig_freq=audio_sampling_rate,
        new_freq=model.config.sampling_rate,
    )

# Build PyTorch model inputs from the waveform and move them to the model's device
audio_inputs = processor(audios=audio_sample, sampling_rate=sample_rate, return_tensors="pt")
audio_inputs = audio_inputs.to(device)

# Same speech generation as before, but with an explicit spkr_id
# (presumably an index into the voices the checkpoint ships with; confirm the valid range)
audio_array_from_audio = (
    model.generate(**audio_inputs, tgt_lang="rus", spkr_id=7)[0]
    .cpu()
    .numpy()
    .squeeze()
)

# Last expression of the cell, so the player is rendered as the cell output
Audio(audio_array_from_audio, rate=sample_rate)

### Modify generation strategy

In [None]:
# Tokenize the English source sentence and move the tensors to the model's device
text_inputs = processor(text="Hello, How are you?", src_lang="eng", return_tensors="pt")
text_inputs = text_inputs.to(device)

# Text-only generation into "hin" with per-stage generation options:
# the text_ prefix addresses the text stage, the speech_ prefix the speech stage.
# NOTE(review): speech_do_sample presumably has no effect while generate_speech=False; confirm.
text_array = model.generate(
    **text_inputs,
    tgt_lang="hin",
    generate_speech=False,
    text_num_beams=4,
    speech_do_sample=True,
)

# Decode the first sequence of the first output entry, dropping special tokens
first_sequence_ids = text_array[0].tolist()[0]
print(f"Translated Text:- {processor.decode(first_sequence_ids, skip_special_tokens=True)}")

### Leveraging Batch Processing for Enhanced Efficiency

In [None]:
# Tokenize two English sentences in a single call and move the batch to the model's device
text_inputs = processor(
    text=["Hello, how are you?", "I am fine, thank you."],
    src_lang="eng",
    return_tensors="pt",
)
text_inputs = text_inputs.to(device)

# Text-only generation into "hin" for the whole batch.
# NOTE(review): speech_do_sample presumably has no effect while generate_speech=False; confirm.
text_array = model.generate(
    **text_inputs,
    tgt_lang="hin",
    generate_speech=False,
    text_num_beams=4,
    speech_do_sample=True,
)

# Convert the first output entry to nested lists once; rows follow the order of the input sentences
batch_token_ids = text_array[0].tolist()

print(f"Sentence 1:- {processor.decode(batch_token_ids[0], skip_special_tokens=True)}")
print(f"Sentence 2:- {processor.decode(batch_token_ids[1], skip_special_tokens=True)}")

### Generate both speech and text

In [None]:
# Audio builds the inline player; display renders it from the middle of a cell
from IPython.display import Audio, display

# Tokenize the English source sentence and move the tensors to the model's device
text_inputs = processor(text="Hello, How are you?", src_lang="eng", return_tensors="pt").to(device)

# Generate speech in "rus" and, via return_intermediate_token_ids=True, also get back
# the intermediate token ids, so one call yields both the audio and the text
audio_text_output = model.generate(**text_inputs,
                            tgt_lang="rus",
                            text_num_beams=4,
                            speech_do_sample=True,
                            return_intermediate_token_ids=True)

# Entry 0 of the output is used as the waveform: move it to the CPU and squeeze it for playback
audio_array = audio_text_output[0].cpu().numpy().squeeze()

# Jupyter only auto-renders the LAST expression of a cell. More statements follow here,
# so the player has to be shown explicitly or it would never appear.
display(Audio(audio_array, rate=sample_rate))

# Entry 2 of the output is used as the text token ids: decode the first sequence, dropping special tokens
text_array = audio_text_output[2]
translated_text_from_text = processor.decode(text_array.tolist()[0], skip_special_tokens=True)
print(f"Translated Text: {translated_text_from_text}")