# Step1 : Install yt-dlp and import necessary libraries

In [None]:
!pip install yt-dlp google-cloud-aiplatform webvtt-py

import os
import yt_dlp

# Step 2: Define a function to download audio and transcript

In [14]:
def download_audio_and_transcript(youtube_link):
    opts = {
        'format': 'bestaudio/best',
        'outtmpl': '%(id)s.%(ext)s',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '128',
        }],
        'writesubtitles': True,
        'writeautomaticsub': True,
        'subtitlesformat': 'txt',
        'skip_download': False
    }

    with yt_dlp.YoutubeDL(opts) as ydl:
        info_dict = ydl.extract_info(youtube_link, download=True)
        audio_file = f"{info_dict['id']}.mp3"

        subtitles = info_dict.get('subtitles')
        transcript_file = None
        if subtitles:
            if 'en' in subtitles:
                transcript_file = f"{info_dict['id']}_en.txt"
            else:
                first_lang = list(subtitles.keys())[0]
                transcript_file = f"{info_dict['id']}_{first_lang}.txt"
        return audio_file, transcript_file


In [15]:
youtube_link = "https://www.youtube.com/watch?v=3A855rN_9pE&t=162s"

In [16]:
audio_file, transcript_file = download_audio_and_transcript(youtube_link)

if audio_file:
    print(f"Audio downloaded: {audio_file}")
else:
    print("Audio download failed")

if transcript_file:
    print(f"Transcript downloaded: {transcript_file}")
else:
    print("No transcript found for this video.")

[youtube] Extracting URL: https://www.youtube.com/watch?v=3A855rN_9pE&t=162s
[youtube] 3A855rN_9pE: Downloading webpage
[youtube] 3A855rN_9pE: Downloading ios player API JSON
[youtube] 3A855rN_9pE: Downloading web creator player API JSON
[youtube] 3A855rN_9pE: Downloading m3u8 information
[info] 3A855rN_9pE: Downloading subtitles: en




[info] 3A855rN_9pE: Downloading 1 format(s): 251
Deleting existing file 3A855rN_9pE.en.vtt
[info] Writing video subtitles to: 3A855rN_9pE.en.vtt
[download] Destination: 3A855rN_9pE.en.vtt
[download] 100% of  333.25KiB in 00:00:00 at 2.42MiB/s
[download] 3A855rN_9pE.webm has already been downloaded
[download] 100% of   36.70MiB
[ExtractAudio] Destination: 3A855rN_9pE.mp3
Deleting original file 3A855rN_9pE.webm (pass -k to keep)
Audio downloaded: 3A855rN_9pE.mp3
No transcript found for this video.


In [32]:
import os
import base64
import vertexai
from vertexai.generative_models import GenerativeModel, Part, SafetySetting
from webvtt import WebVTT

generation_config = {
    "max_output_tokens": 8192,
    "temperature": 1,
    "top_p": 0.95,
}

def generate_transcript(audio_file, vtt_file):
    vertexai.init(project="hyunuk-ai-dev", location="us-central1")
    SI = """## TASK: Generate Natural Transcript

        I have an audio file and its corresponding transcript in VTT format.
        I want you to generate a natural-souding transcript of the audio, improving upon
        the provided VTT captions, and then translate the improved transcript.

        ### Instructions:
        1. **Listen** to the audio and use the VTT transcript as a starting point.
        2. **Improve** the transcript by making it sound more natural and conversational.
           This might involve:
           - Correcting any errors in the VTT captions.
           - Removing unnecessary pauses or filler words.
           - Rephrasing sentences to improve clarity and flow.
        3. **Output** the final translation, sentence by sentence.
        """

    model = GenerativeModel(
        "gemini-pro-experimental",
        system_instruction=[SI],
        )

    audio = Part.from_uri(
        mime_type="audio/mpeg",
        uri="gs://visual_learner/3A855rN_9pE.mp3",
    )

    text = Part.from_text(
        text=vtt_file
    )

    safety_settings = [
        SafetySetting(
            category=SafetySetting.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
            threshold=SafetySetting.HarmBlockThreshold.BLOCK_ONLY_HIGH
        ),
        SafetySetting(
            category=SafetySetting.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
            threshold=SafetySetting.HarmBlockThreshold.BLOCK_ONLY_HIGH
        ),
        SafetySetting(
            category=SafetySetting.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
            threshold=SafetySetting.HarmBlockThreshold.BLOCK_ONLY_HIGH
        ),
        SafetySetting(
            category=SafetySetting.HarmCategory.HARM_CATEGORY_HARASSMENT,
            threshold=SafetySetting.HarmBlockThreshold.BLOCK_ONLY_HIGH
        ),
    ]

    responses = model.generate_content(
        [audio, text],
        generation_config=generation_config,
        safety_settings=safety_settings,
        stream=True,
    )
    for response in responses:
        print(response.text, end="")

audio_file = "3A855rN_9pE.mp3"
vtt_file = "3A855rN_9pE.en.vtt"
generate_transcript(audio_file, vtt_file)


```
00:00
Good morning, everyone. Thank you so much for coming to this breakout session on data governance in the age of AI.

We're very excited to have with us two customer speakers today. 
We have Cynthia Gums, who is the manager of global data insights and analytics at Ford, driving key initiatives around the new data factory at Ford. 
We also have Steve Jarrett, who is the chief AI officer at Orange, leading AI and data strategy for Orange across 26 countries. 
And my name is Lou Ann, I'm a product manager for Dataplex, here at Google Cloud. 

So, we're very excited to be sharing with you how we think about data governance in this age of AI and what do our journeys each look like.

So, here's our agenda for today. We're gonna start with an introduction and a product overview followed by case studies from Ford and Orange.
And then we'll talk about what's new, what's upcoming in Dataplex.

So, as all of us have experienced recently, generative AI is really this paradigm shift that is