# Import library
those are the requirement for the application

In [2]:
!pip install youtube-transcript-api google-generativeai googlesearch-python

Collecting googlesearch-python
  Downloading googlesearch_python-1.3.0-py3-none-any.whl.metadata (3.4 kB)
Downloading googlesearch_python-1.3.0-py3-none-any.whl (5.6 kB)
Installing collected packages: googlesearch-python
Successfully installed googlesearch-python-1.3.0



[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [19]:
import os
import re
import json
import time
import requests
import google.generativeai as genai
from dotenv import load_dotenv
from IPython.display import Markdown, display
from youtube_transcript_api import YouTubeTranscriptApi
from googlesearch import search
from bs4 import BeautifulSoup



# Google search

In [32]:
class YoutubeSearch:
    def __init__(self, keyword, num_results=10, lang="en"):
        self.site = "www.youtube.com"
        self.keywork = keyword
        self.num_results = num_results
        self.lang = lang
        self.results = self.get_link()
        self.titles = self.get_titles()
        

    def get_link(self):
        results = [ result for result in search(f"site:{self.site} {self.keywork}", 
                                               num_results=self.num_results, 
                                              lang=self.lang)][1:]
        return results
        
    def get_titles(self):
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}
        titles=[]
        for url in self.results:
            response = requests.get(url, headers=headers)
            soup = BeautifulSoup(response.content, 'html.parser')
            title = soup.title.string if soup.title else "No title found"
            titles.append((title,url))
        return titles
        

In [33]:
ytLinks = YoutubeSearch(keyword="AI")
ytLinks.titles

[('Artificial Intelligence Full Course 2025 | Artificial Intelligence Tutorial | AI Course |Simplilearn - YouTube',
  'https://www.youtube.com/watch?v=vScGLqbeWbs'),
 ('AI is changing war. Just not with killer robots - YouTube',
  'https://www.youtube.com/watch?v=geaXM1EwZlg'),
 ('DeepSeek’s Lessons for Chinese AI - YouTube',
  'https://www.youtube.com/watch?v=hFTqQ4boR-s'),
 ("Innovating medicine with AI: Here's what to know - YouTube",
  'https://www.youtube.com/watch?v=wAqg7FtySNk'),
 ('Neural Networks & AI Explained - YouTube',
  'https://www.youtube.com/watch?v=_ZE4B0sV7BM'),
 ('Impact of artificial intelligence: AI advancement changes businesses in unexpected ways - YouTube',
  'https://www.youtube.com/watch?v=qWxjoBgd84k'),
 ('How Infrastructure is Powering the Age of AI - YouTube',
  'https://www.youtube.com/watch?v=BCc29YKiOGk'),
 ('AI Trends for 2025 - YouTube',
  'https://www.youtube.com/watch?v=5zuF4Ys1eAw'),
 ('How China’s New AI Model DeepSeek Is Threatening U.S. Dominanc

# load the google API key 

In [34]:
# Load environment variables in a file called .env
load_dotenv()
os.environ['GOOGLE_API_KEY'] = os.getenv('GOOGLE_API_KEY', 'your-key-if-not-using-env')

In [35]:
# Start Google API

load_dotenv(override=True)
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
MODEL = "gemini-1.5-flash"
genai.configure(api_key=GOOGLE_API_KEY)

In [7]:
patterns = [
        r"(?:v=|/)([a-zA-Z0-9_-]{11})",  # Matches v=... or /...
        r"(?:embed/|v/|watch\?v=|youtu.be/|/v/)([a-zA-Z0-9_-]{11})", # Matches embed, v/, watch?v=, youtu.be, /v/
        r"youtu\.be/([a-zA-Z0-9_-]{11})", # Matches youtu.be short links
        r"(?:/shorts/|/live/)([a-zA-Z0-9_-]{11})", # Matches youtube shorts or live
    ]

In [36]:
# Define the youtube transcript class 
class YTVideoTranscripter:
    def __init__(self, url, languages=["en"]):
        self.url = url
        self.language = languages
        self.video_id = self.extract_video_id()
        self.transcription = self.get_transcript()
        self.systemPrompt = """
        You are a helpful assistant who provides concise and accurate summaries of text. Your task is to:
        
        - Capture the key points of the content.
        - Keep the summary brief and easy to understand.
        - Avoid summarizing overly lengthy texts or breaking them into excessively short summaries.
        - Use bullet points where appropriate to enhance clarity and structure.
        """
        self.translatePrompt = """
        You are a tranditional chinese translator. 
        You can translate the text to correctly an easily to read charactor in tradition chinese in a nice sentences.
        """
        
    def extract_video_id(self):
        """
        Extracts the YouTube video ID from a given URL.
        Supports both regular and shortened URLs.
        """
        # Regular expression to match YouTube video URL and extract the video ID
        regex = r"(?:https?:\/\/)?(?:www\.)?(?:youtube\.com\/(?:[^\/\n\s]+\/\S+\/|\S*\?v=)|(?:youtu\.be\/))([a-zA-Z0-9_-]{11})"
        match = re.match(regex, self.url)
        
        if match:
            self.match = match
            return match.group(1)
        else:
            raise ValueError("Invalid YouTube URL")

    def get_transcript(self):
        try:
            # Try to get the transcript in the desired language (Indonesian by default)
            transcript = YouTubeTranscriptApi.get_transcript(self.video_id, languages=[self.language])
            # Join all the 'text' fields into a single string
            return " ".join([item['text'] for item in transcript])
        except Exception as e:
            print(f"Error fetching transcript: {e}")
            return None
            
    def transcriptSummarizer(self):
        model = genai.GenerativeModel(model_name=MODEL,system_instruction=self.systemPrompt)
        response = model.generate_content(self.transcription)
        return response.text

    def chinese(self):
        model = genai.GenerativeModel(model_name=MODEL,system_instruction=self.translatePrompt)
        response = model.generate_content(self.transcription)
        return response.text
        
    def __str__(self):
        return f"Video ID: {self.video_id}"

In [11]:
# Example usage
video_url = "https://www.youtube.com/watch?v=LUttsadgSEY"

yt_video = YTVideoTranscripter(video_url, languages="en")
print(len(yt_video.transcription))

9243


In [26]:
yt_video.transcriptSummarizer()

"This video highlights five free online Harvard courses:\n\n*   **CS50: Introduction to Computer Science:** A highly popular course covering various programming languages (Scratch, C, Python, SQL, HTML, CSS, JavaScript) and fostering problem-solving skills applicable across fields.\n\n*   **R Basics:** Part of Harvard's data science program, this course teaches the R programming language for data analysis, visualization, and manipulation, preparing students for a high-demand field.\n\n*   **Digital Humanities:** This course explores the intersection of technology and humanities, using digital tools for research, data visualization, text analysis, and cultural preservation.\n\n*   **The Art of Persuasive Writing and Public Speaking:** Focusing on rhetoric, this course enhances communication skills crucial for various professions, emphasizing argument construction, storytelling, and body language.\n\n*   **Introduction to Artificial Intelligence with Python:**  This course introduces AI 

In [23]:
yt_video.transcription

"all right we this one keeps coming up certifications certifications are not certificant of completion cations or when you study something that a company has that they offer then you travel to a proed location take that exam you either pass it or fail it if you pass it you are certified if you fail it you aren't they are certifications now again here's an easy way to remember if the certifications count is the person offering the certification the creator of the product so python.org python.org they are in charge of python they have no certifications no does that mean if corsera offers a python certification is valid no it is worthless worth list I promise we will throw out your resume if you put a python certification on it because we know for sure you have no idea what you're talking about so let's go back to it again did the person create the product how about all the snowflake certification snowflake shts are they valid yes why because snowflake created the product how about Micros

In [27]:
yt_video.chinese()

'今日我們將深入探討一些可能讓你大開眼界的內容：想像一下，免費獲得哈佛大學的教育！不，這不是《駭客任務》裡的錯誤，你也不需要時光機或一大筆錢。我們要談的是五門令人難以置信的免費哈佛線上課程，它們能讓你事業騰飛，比你說出「Veritas」（真理）還快！無論你是科技愛好者、數據高手，還是未來作家，這裡總有一門課程適合你。但在開始之前，讓我們先解決一個問題：是的，這些課程確實來自哈佛；是的，它們完全免費；不，你不需要是天才才能修讀它們，你只需要好奇心和學習的意願。所以，繫好安全帶，拿出你最喜歡的筆記工具，讓我們開始這段常春藤聯盟的冒險吧！\n\n首先，我們要介紹哈佛線上課程的皇冠上的明珠：CS50 計算機科學導論。我知道有些人可能在想：「計算機科學不是只有科技迷才學嗎？」等等，別急著脫下你的帽衫，這門課將比你說出「Hello, world！」還要快地改變你的看法。CS50 不只是一門課程，它是一種現象，是哈佛校園裡最大的課程，也成為全球最受歡迎的大規模線上公開課程（MOOCs）之一。根據 Class Central 的數據，截至 2023 年，已有超過 370 萬人線上報名 CS50，這比一些國家的總人口還多！但是什麼讓 CS50 如此特別呢？它就像瑞士軍刀一樣，涵蓋了多種科技課程的內容。你將學習 Scratch（一種對初學者友好的程式語言）、C 語言（現代程式語言的鼻祖）、Python（編碼界的瑞士軍刀）、SQL（讓數據庫運作的語言），以及 HTML、CSS 和 JavaScript（網頁開發的聖三一）。而這僅僅只是冰山一角而已！別只聽我說，讓我們聽聽一位修過 CS50 的學員 Sarah 的感想：「在修讀 CS50 之前，我認為演算法只是天才電腦科學家才懂的東西。現在，我將演算法思維應用到日常生活中，從優化我的購物清單到管理工作中的項目，就好像我獲得了一雙新的眼睛去看待世界。」\n\n我知道你可能在想：聽起來很棒，但它真的適合初學者嗎？答案是肯定的！David J. Malan 教授，CS50 背後的魅力人物，擅長將複雜的概念化繁為簡。他曾用電話簿和鋸子來解釋記憶體配置，是的，你沒聽錯！但 CS50 不僅僅是學習編碼，更是培養解決問題的能力，這可以應用於任何領域。無論你是試圖優化廣告活動的行銷人員、開發課程的教師，還是正在打造下一個大項目的企業家，你從 CS50 中獲

In [None]:
test.add()