In [None]:
!pip install openai-whisper

In [None]:
!pip install pytube

In [None]:
!pip install sentencepiece

In [1]:
import whisper
import pytube
import re
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

## **Transcribe**:

In [4]:
class TranscribeVideo:
    """Download the audio track of a YouTube video and transcribe it with Whisper.

    After a successful ``to_text`` call the instance exposes:

    - ``audio_path``: path of the downloaded audio file (``<video_id>.m4a``).
    - ``text_path``: path of the transcript file (``transcribe_<video_id>.txt``).
    """

    def __init__(self):
        # Load the "base" Whisper checkpoint once and reuse it for every call.
        self.model = whisper.load_model("base")

    def _get_audio_from_yt(self, video_url: str):
        """Download the first audio-only stream of ``video_url``.

        The file is saved in the current working directory and its path is
        stored in ``self.audio_path``.

        Raises:
            ValueError: if the video exposes no audio-only stream.
        """
        # Create a YouTube object
        yt = pytube.YouTube(video_url)

        # Pick the first audio-only stream; ``first()`` returns None when the
        # filtered query is empty, so fail early with a clear message.
        audio_stream = yt.streams.filter(only_audio=True).first()
        if audio_stream is None:
            raise ValueError(f"No audio-only stream available for: {video_url}")

        # Name the file after the video ID parsed by pytube instead of slicing
        # the URL by hand: URLs with extra query parameters (``&t=...``) or
        # short ``youtu.be/<id>`` links would otherwise leak ``/`` or ``&``
        # into the filename.
        self.audio_path = f"{yt.video_id}.m4a"
        audio_stream.download(filename=self.audio_path)

    def _write_text(self, text: str):
        """Write ``text`` to ``transcribe_<audio stem>.txt`` and remember the path."""
        # Strip only the final extension so a stem that itself contains dots
        # is preserved.
        stem = self.audio_path.rsplit(".", 1)[0]
        self.text_path = f"transcribe_{stem}.txt"
        # Explicit UTF-8 keeps non-ASCII transcripts portable across platforms.
        with open(self.text_path, "w", encoding="utf-8") as fp:
            fp.write(text)

    def get_audio(self):
        """Return the raw bytes of the downloaded audio file."""
        with open(self.audio_path, "rb") as f:
            return f.read()

    def get_text(self):
        """Return the transcript previously written by ``_write_text``."""
        with open(self.text_path, "r", encoding="utf-8") as f:
            return f.read()

    def to_text(self, video_url: str):
        """Download the audio of ``video_url``, transcribe it and save the text.

        Parameters:
        - video_url: URL of the YouTube video to transcribe.

        Returns:
        The transcribed text (also written to ``self.text_path``).
        """
        print(f"Downloading audio from: {video_url}")

        # Download audio stream from YouTube
        self._get_audio_from_yt(video_url)

        print(f"Audio saved to: {self.audio_path}")
        print("Transcription started, please wait...")

        # Transcribe audio. NOTE(review): fp16=False is forwarded to Whisper,
        # presumably to force FP32 decoding on CPU -- confirm.
        result = self.model.transcribe(self.audio_path, fp16=False)

        # Save text in a file
        self._write_text(result['text'])
        print(f"Text saved to: {self.text_path}")

        return result['text']



In [5]:
# Instantiate the transcriber (loads the Whisper model) ...
tv = TranscribeVideo()
# ... and choose the YouTube video to process.
video_url = "https://www.youtube.com/watch?v=TRjq7t2Ms5I"

In [None]:
# Download the audio, transcribe it, and display the resulting transcript.
tv.to_text(video_url=video_url)

## **Translate**:

In [34]:
class TranslateText:
    """Translate English text to German with the ``Helsinki-NLP/opus-mt-en-de`` model.

    Long texts are split into sentence-aligned chunks that are translated one
    at a time and joined back together.
    """

    def __init__(self):
        model_name = 'Helsinki-NLP/opus-mt-en-de'
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

    def split_text_into_sentences(self, text):
        """
        Split a text into sentences using regular expressions.

        A split happens at a whitespace character that follows ``.``, ``?`` or
        ``!``. The two negative lookbehinds skip positions right after patterns
        such as ``e.g.`` or ``Mr.``.

        Parameters:
        - text: The input text.

        Returns:
        A list of sentences.
        """
        sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=[.?!])\s', text)
        return sentences

    def split_text_into_chunks(self, text, chunk_size=512):
        """
        Split a large text into sentence-aligned chunks of bounded size.

        Whole sentences are packed greedily into a chunk while its length in
        characters stays within ``chunk_size``. A single sentence longer than
        ``chunk_size`` is not broken up; it becomes a chunk of its own.

        Parameters:
        - text: The input text to be split.
        - chunk_size: The maximum size of each chunk, in characters.

        Returns:
        A list of non-empty text chunks (empty for empty/whitespace-only text).
        """
        sentences = self.split_text_into_sentences(text)
        chunks = []
        current_chunk = ""

        for sentence in sentences:
            # Ignore empty fragments (e.g. from empty input).
            if not sentence.strip():
                continue

            # Flush only when there is something to flush; this avoids emitting
            # an empty chunk when the very first sentence exceeds chunk_size.
            if current_chunk and len(current_chunk) + len(sentence) > chunk_size:
                chunks.append(current_chunk.strip())
                current_chunk = ""

            current_chunk += sentence + " "

        # Add the last chunk if it's not empty
        if current_chunk.strip():
            chunks.append(current_chunk.strip())

        return chunks

    def translate_chunk(self, chunk_text: str):
        """
        Translate a single chunk of text.

        Parameters:
        - chunk_text: The text to translate.

        Returns:
        The decoded first generated sequence, with special tokens removed.
        """
        # NOTE(review): no truncation is applied here; presumably the caller's
        # chunk_size keeps inputs within the model's length limit -- confirm.
        input_ids = self.tokenizer.encode(chunk_text, return_tensors="pt")
        outputs = self.model.generate(input_ids)
        decoded = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

        return decoded

    def en2de(self, text: str, chunk_size: int = 512):
        """
        Translate ``text`` chunk by chunk.

        Parameters:
        - text: The input text to translate.
        - chunk_size: Maximum chunk size in characters (default 512, matching
          ``split_text_into_chunks``).

        Returns:
        The translated chunks joined with single spaces.
        """
        # Split text into chunks
        chunks = self.split_text_into_chunks(text, chunk_size)

        # Translate each chunk, showing progress with tqdm
        translated = [self.translate_chunk(chunk) for chunk in tqdm(chunks)]

        # return translated text
        return " ".join(translated)

In [35]:
# Build the translator, translate the saved transcript in 512-character chunks,
# and display the German result.
tt = TranslateText()
translated_text = tt.en2de(text=tv.get_text(), chunk_size=512)
translated_text

100%|██████████| 48/48 [04:27<00:00,  5.57s/it]


"Hey, jeder. Mein Name ist Jerry, Mitgründer und CEO von Womendex, und heute werden wir darüber reden, wie man Produktion-ready, Rag-Anwendungen bauen. Ich denke, es gibt noch Zeit für eine Verlosung für den Eimer Hut, so wenn Sie Jungs an unserem Stand stoppen, füllen Sie bitte das Google-Formular. Okay. Lassen Sie uns zurück zum Thema. Jeder weiß, dass es eine Tonne von erstaunlichen Anwendungsfällen in Genai in letzter Zeit. Wissenssuche in QA, Gesprächsagenten, Workflow-Automatisierung, Dokumentenverarbeitung. Dies sind alles Dinge, die Sie bauen können, vor allem mit Hilfe der Argumentationsfähigkeiten von LMs, über Ihre Daten. Also, wenn wir nur eine schnelle Auffrischung in Bezug auf wie Paradigmen, wie Sie tatsächlich Sprachmodelle, um Daten zu verstehen, die nicht über trainiert wurde, gibt es wirklich wie zwei Paradigmen. Eines ist das Abrufen Augmentation, wo Sie das Modell reparieren mögen und Sie im Grunde erstellen eine Datenpipeline, um Kontext in die Eingabeaufforderung

In [36]:
# Persist the German translation. The encoding is set explicitly to UTF-8 so
# umlauts and other non-ASCII characters are written the same way on every
# platform instead of depending on the locale's default encoding.
with open(f"de_{tv.text_path}", "w", encoding="utf-8") as fp:
    fp.write(translated_text)

## **Text to Speech**:

In [None]:
!pip install gtts

In [2]:
from gtts import gTTS
import os

In [42]:
def text_to_speech(text, language='en', output_file='output.mp3'):
    """
    Synthesize speech from text with gTTS (Google Text-to-Speech) and save it.

    Parameters:
    - text: The text to be spoken.
    - language: Language code passed to gTTS as ``lang`` (default 'en').
    - output_file: Path of the audio file to write (default 'output.mp3').
    """
    # Build the gTTS request at normal speed and write the audio straight to disk.
    gTTS(text=text, lang=language, slow=False).save(output_file)

In [41]:
# Synthesize the German translation to an MP3 file with gTTS.
text_to_speech(
    text=translated_text,
    language='de',
    output_file='output.mp3',
)

In [44]:
!pip install --upgrade transformers

Collecting transformers
  Downloading transformers-4.36.1-py3-none-any.whl (8.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.3/8.3 MB[0m [31m30.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.35.2
    Uninstalling transformers-4.35.2:
      Successfully uninstalled transformers-4.35.2
Successfully installed transformers-4.36.1


In [None]:
!pip install datasets

In [7]:
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset
import torch
import soundfile as sf

In [8]:
# Speaker x-vectors (validation split); one entry is later used as the voice
# embedding for synthesis.
tts_embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")

# SpeechT5 text-to-speech stack: vocoder, acoustic model and text processor.
tts_vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
tts_processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")

preprocessor_config.json:   0%|          | 0.00/433 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/232 [00:00<?, ?B/s]

spm_char.model:   0%|          | 0.00/238k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/40.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.06k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/585M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/50.7M [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/1.36k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/17.9M [00:00<?, ?B/s]

Generating validation split: 0 examples [00:00, ? examples/s]

In [9]:
def text_to_speech_hf(text, output_file='output.wav', speaker_index=7306, sample_rate=16000):
    """
    Convert text to speech using Hugging Face's SpeechT5 models.

    Relies on the module-level ``tts_processor``, ``tts_model``,
    ``tts_vocoder`` and ``tts_embeddings_dataset`` objects.

    Parameters:
    - text: The input text to be converted.
    - output_file: The name of the output audio file (default is 'output.wav').
    - speaker_index: Row of ``tts_embeddings_dataset`` whose ``"xvector"`` is
      used as the speaker embedding (default 7306).
    - sample_rate: Sample rate written to the audio file (default 16000).
      NOTE(review): presumably this must match the vocoder's output rate --
      confirm before changing it.
    """
    # NOTE(review): the whole text is tokenized in one go; SpeechT5 presumably
    # has a maximum input length and was likely trained on English -- confirm
    # before passing long or non-English text.
    inputs = tts_processor(text=text, return_tensors="pt")

    # Add a leading batch dimension to the selected speaker x-vector.
    xvector = tts_embeddings_dataset[speaker_index]["xvector"]
    speaker_embeddings = torch.tensor(xvector).unsqueeze(0)

    speech = tts_model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=tts_vocoder)

    sf.write(output_file, speech.numpy(), samplerate=sample_rate)

In [None]:
# Synthesize the translated text with SpeechT5 (writes 'output.wav' by default).
text_to_speech_hf(text=translated_text)