In [63]:
import os
import re
import requests
import pandas as pd
from datetime import datetime, timedelta
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api.formatters import TextFormatter

# GET YOUR OWN API KEY USING THE INSTRUCTIONS BELOW

In [66]:
# YouTube Data API key. Prefer setting the YOUTUBE_API_KEY environment
# variable so the real key never has to be saved inside the notebook;
# otherwise replace the "YOUR_API_KEY" placeholder with your own key.
API_KEY = os.environ.get("YOUTUBE_API_KEY") or "YOUR_API_KEY"

# COLLECT ALL DATA

In [None]:
def youtube_search(query, max_results=10, published_after=None):
    """Search YouTube for videos matching a query.

    Results are requested page by page (at most 50 per request, the
    documented ceiling of ``search.list``'s ``maxResults``) until
    ``max_results`` videos have been collected or no further page exists.

    Args:
        query (str): Free-text search term.
        max_results (int): Maximum number of videos to return.
        published_after (str | None): RFC 3339 timestamp; when given, it is
            passed as the ``publishedAfter`` filter. ``None`` disables the
            filter.

    Returns:
        list[tuple[str, str, str, str]]: One
        ``(video_id, title, watch_url, date_posted)`` tuple per video, where
        ``date_posted`` is the ``YYYY-MM-DD`` part of ``publishedAt``.

    Raises:
        googleapiclient.errors.HttpError: Propagated from the API call.
    """
    from html import unescape

    page_size_limit = 50  # largest maxResults accepted per search request
    youtube = build("youtube", "v3", developerKey=API_KEY)

    video_links = []
    page_token = None
    while len(video_links) < max_results:
        search_response = youtube.search().list(
            q=query,
            part="id,snippet",
            maxResults=min(max_results - len(video_links), page_size_limit),
            type="video",  # Only return video results
            publishedAfter=published_after,  # Filter by published date if provided
            pageToken=page_token,
        ).execute()

        items = search_response.get("items", [])
        for item in items:
            video_id = item.get("id", {}).get("videoId")
            if not video_id:
                # Defensive: skip any result that carries no video ID.
                continue
            snippet = item["snippet"]
            # Search results have been observed to return HTML-escaped titles
            # (e.g. "&amp;", "&#39;"); decode them to plain text.
            video_title = unescape(snippet["title"])
            video_link = f"https://www.youtube.com/watch?v={video_id}"
            date_posted = snippet["publishedAt"].split("T")[0]  # date only
            video_links.append((video_id, video_title, video_link, date_posted))

        page_token = search_response.get("nextPageToken")
        if not page_token or not items:
            break

    return video_links[:max_results]

def get_comments(video_id, max_comments=10):
    """Retrieve up to ``max_comments`` top-level comments for a video.

    Comment threads are requested with ``order='time'`` and fetched in pages
    of at most 100 until enough comments are collected or no page remains.

    Args:
        video_id (str): The YouTube video ID.
        max_comments (int): Maximum number of comments to return.

    Returns:
        list[str]: Comments formatted as ``"<n>. <text>"`` with 1-based
        numbering. If the API call fails, a single-element list holding a
        human-readable message is returned instead (any comments collected
        before the failure are discarded).
    """
    youtube = build("youtube", "v3", developerKey=API_KEY)
    comments = []
    page_token = None

    try:
        while len(comments) < max_comments:
            response = youtube.commentThreads().list(
                part="snippet",
                videoId=video_id,
                textFormat="plainText",
                maxResults=min(max_comments - len(comments), 100),  # never over-fetch
                order='time',
                pageToken=page_token
            ).execute()

            for item in response.get("items", []):
                comment = item["snippet"]["topLevelComment"]["snippet"]["textDisplay"]
                comments.append(f"{len(comments) + 1}. {comment}")  # running numbering

            page_token = response.get("nextPageToken")
            if not page_token:  # no more pages
                break

    except HttpError as e:
        if e.resp.status == 403:
            # A 403 is also used for other failures (e.g. quota problems), so
            # only report "disabled" when the error body names that reason.
            details = getattr(e, "content", b"") or b""
            if isinstance(details, bytes):
                details = details.decode("utf-8", "replace")
            if "commentsDisabled" in str(details):
                return ["Comments are disabled for this video."]
        return [f"Error fetching comments: {e}"]

    return comments[:max_comments]  # Return only up to max_comments

def download_transcript(video_id):
    """
    Download a video's transcript and return it as cleaned English text.

    Selection order: ``find_transcript(['en'])``, then
    ``find_generated_transcript(['en'])``, then the first transcript in any
    language that reports ``is_translatable``. A non-English transcript is
    translated to English before being fetched.

    Args:
        video_id (str): The YouTube video ID.
    Returns:
        str: The transcript text in English, or an empty string if no usable
        transcript exists or any error occurs (best-effort: errors are
        deliberately not raised to the caller).
    """
    try:
        # Retrieve available transcripts
        transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)

        # `except Exception` (not a bare `except`) so Ctrl-C / SystemExit
        # still interrupt a long batch run.
        try:
            transcript = transcript_list.find_transcript(['en'])
        except Exception:
            try:
                # NOTE(review): find_transcript may already cover generated
                # transcripts, making this fallback redundant - confirm
                # against the library documentation.
                transcript = transcript_list.find_generated_transcript(['en'])
            except Exception:
                # No English transcript: take the first translatable one.
                transcript = next(
                    (candidate for candidate in transcript_list
                     if candidate.is_translatable),
                    None,
                )

        if transcript is None:
            return ""

        # If transcript isn't in English, translate it to English
        if transcript.language_code != 'en':
            transcript = transcript.translate('en')

        # Fetch the transcript and render it as plain text
        transcript_text = TextFormatter().format_transcript(transcript.fetch())

        # Strip [h:mm:ss]-style markers and <tag>-style tokens, then collapse
        # all whitespace runs into single spaces.
        transcript_text = re.sub(r'\[\d+:\d+:\d+\]', '', transcript_text)
        transcript_text = re.sub(r'<\w+>', '', transcript_text)
        transcript_text = re.sub(r'\s+', ' ', transcript_text).strip()

        return transcript_text
    except Exception:
        # Best-effort: the caller substitutes a placeholder for "".
        return ""

def sanitize_filename(filename):
    """Reduce arbitrary text to a filesystem-safe name.

    Spaces become underscores; every other character is kept only if it is
    alphanumeric (per ``str.isalnum``), an underscore, or a hyphen.
    """
    underscored = filename.replace(' ', '_')
    kept = [ch for ch in underscored if ch.isalnum() or ch == '_' or ch == '-']
    return "".join(kept)

def _prompt_int(prompt, all_value=None, minimum=0):
    """Ask until the user enters an integer >= minimum (or 'all', if allowed)."""
    while True:
        raw = input(prompt).strip()
        if all_value is not None and raw.lower() == "all":
            return all_value
        try:
            value = int(raw)
        except ValueError:
            print("Please enter a whole number.")
            continue
        if value >= minimum:
            return value
        print(f"Please enter a number of at least {minimum}.")


def main():
    """Interactively collect videos, comments and transcripts into an Excel file.

    Prompts for a search term, the number of videos and comments ('all' maps
    to 50 videos / 100 comments), how many years back to search, and an
    output directory. One row per video is written to
    ``<sanitized query>_YouTube_Data.xlsx`` in that directory.
    """
    from datetime import timezone

    user_query = input("Enter a search term for YouTube: ")

    num_videos = _prompt_int(
        "Enter the number of videos to retrieve (or 'all' for maximum): ",
        all_value=50,
        minimum=1,
    )
    num_comments = _prompt_int(
        "Enter the number of comments to retrieve for each video (or 'all' for maximum): ",
        all_value=100,
    )
    years_back = _prompt_int("How many years back do you want to search for videos? ")

    # The trailing "Z" declares UTC, so the cutoff must be computed in UTC
    # rather than from the machine's local clock.
    cutoff = datetime.now(timezone.utc) - timedelta(days=years_back * 365)
    published_after_date = cutoff.strftime("%Y-%m-%dT%H:%M:%SZ")

    # Ask the user for the directory where they want to save the output file
    save_directory = input("Enter the directory path where you'd like to save the output file: ").strip()
    save_directory = os.path.expanduser(save_directory)
    if save_directory:
        # Create it up front so a bad path fails before any API work is done.
        os.makedirs(save_directory, exist_ok=True)

    # Search for videos
    video_links = youtube_search(user_query, num_videos, published_after=published_after_date)
    if not video_links:
        print("No videos found.")
        return

    # One row per video: metadata, newest comments, and transcript.
    data = []
    for serial, (video_id, title, link, date_posted) in enumerate(video_links, start=1):
        comments_text = '\n'.join(get_comments(video_id, num_comments))  # one comment per line
        transcript_text = download_transcript(video_id) or "Transcript not available."
        data.append({
            "Serial Number": serial,
            "Video Name": title,
            "Video Link": link,
            "Date Posted": date_posted,
            "Comments": comments_text,
            "Transcript": transcript_text,
        })

    df = pd.DataFrame(data)

    # Sanitize the filename and create the full output path
    sanitized_query = sanitize_filename(user_query)
    output_file = os.path.join(save_directory, f"{sanitized_query}_YouTube_Data.xlsx")

    # Save to Excel
    df.to_excel(output_file, index=False)

    print(f"Data saved to {output_file}")


if __name__ == "__main__":
    main()


WHEN 'all' IS ENTERED, THE MAXIMUM NUMBER OF COMMENTS RETRIEVED PER VIDEO IS 100
WHEN 'all' IS ENTERED, THE MAXIMUM NUMBER OF VIDEOS RETRIEVED IS 50

# COLLECT TRANSCRIPT ONLY USING VIDEO LINK

In [43]:
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api.formatters import TextFormatter
import requests
import re
import os

def get_video_id(youtube_url):
    """
    Extract the 11-character video ID from a YouTube URL.

    Recognised forms include ``watch?v=<id>``, ``youtu.be/<id>``,
    ``/v/<id>``, ``/e/<id>``, ``/embed/<id>``, ``/shorts/<id>`` and
    ``/live/<id>``. The scheme and ``www.`` prefix are optional.

    Args:
        youtube_url (str): The YouTube URL.
    Returns:
        str: The extracted video ID or None if not found (including when the
        input is empty or None).
    """
    if not youtube_url:
        return None

    pattern = (
        r'(?:https?:\/\/)?(?:www\.)?'
        r'(?:youtube\.com\/'
        r'(?:[^\/\n\s]+\/\S+\/|(?:v|e(?:mbed)?|shorts|live)\/|\S*?[?&]v=)'
        r'|youtu\.be\/)'
        r'([a-zA-Z0-9_-]{11})'
    )
    match = re.search(pattern, youtube_url)
    return match.group(1) if match else None

def get_video_title(video_id):
    """
    Get the title of a YouTube video by scraping its watch page.

    The first ``<title>`` element of the page is used; HTML entities are
    decoded and a trailing " - YouTube" suffix is removed.

    Args:
        video_id (str): The YouTube video ID.
    Returns:
        str: The title of the video, or "Unknown" if the request fails, no
        ``<title>`` element is found, or the resulting title is empty.
    """
    from html import unescape

    url = f"https://www.youtube.com/watch?v={video_id}"
    try:
        # A timeout keeps a stalled connection from hanging the script;
        # timeouts are reported through requests.RequestException as well.
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        page = response.text
    except requests.RequestException as e:
        print(f"Error fetching video title: {e}")
        return "Unknown"

    match = re.search(r'<title>(.*?)</title>', page)
    if not match:
        return "Unknown"

    title = unescape(match.group(1))
    suffix = " - YouTube"
    if title.endswith(suffix):
        # Strip only the trailing site suffix, not the same text mid-title.
        title = title[:-len(suffix)]
    return title or "Unknown"


def download_transcript(video_id):
    """
    Download a video's transcript and return it as cleaned English text.

    Selection order: ``find_transcript(['en'])``, then
    ``find_generated_transcript(['en'])``, then the first transcript in any
    language that reports ``is_translatable``. A non-English transcript is
    translated to English before being fetched.

    Args:
        video_id (str): The YouTube video ID.
    Returns:
        str: The transcript text in English, or an empty string if no usable
        transcript exists or any error occurs (the error is printed, not
        raised).
    """
    try:
        # Retrieve available transcripts
        transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)

        # `except Exception` (not a bare `except`) so Ctrl-C / SystemExit
        # are not swallowed by the fallback chain.
        try:
            transcript = transcript_list.find_transcript(['en'])
        except Exception:
            try:
                # NOTE(review): find_transcript may already cover generated
                # transcripts, making this fallback redundant - confirm
                # against the library documentation.
                transcript = transcript_list.find_generated_transcript(['en'])
            except Exception:
                # No English transcript: take the first translatable one.
                transcript = next(
                    (candidate for candidate in transcript_list
                     if candidate.is_translatable),
                    None,
                )

        if transcript is None:
            print("Error downloading transcript: No translatable transcript available")
            return ""

        # If transcript isn't in English, translate it to English
        if transcript.language_code != 'en':
            transcript = transcript.translate('en')

        # Fetch the transcript and render it as plain text
        transcript_text = TextFormatter().format_transcript(transcript.fetch())

        # Strip [h:mm:ss]-style markers and <tag>-style tokens, then collapse
        # all whitespace runs into single spaces.
        transcript_text = re.sub(r'\[\d+:\d+:\d+\]', '', transcript_text)
        transcript_text = re.sub(r'<\w+>', '', transcript_text)
        transcript_text = re.sub(r'\s+', ' ', transcript_text).strip()

        return transcript_text
    except Exception as e:
        print(f"Error downloading transcript: {e}")
        return ""


def main():
    """Prompt for a YouTube link and save that video's transcript to a text file.

    The file is written to the current working directory as
    ``<video_id>_<title>.txt`` after removing characters that are invalid in
    file names. A message is printed for an unrecognised URL or an
    unavailable transcript.
    """
    link = input("Enter the YouTube video link: ")

    video_id = get_video_id(link)
    if not video_id:
        print("Invalid YouTube URL.")
        return

    transcript_text = download_transcript(video_id)
    if not transcript_text:
        print("Unable to download transcript.")
        return

    # The title is only looked up once a transcript is actually available.
    video_title = get_video_title(video_id)
    raw_name = f"{video_id}_{video_title}.txt"
    file_name = re.sub(r'[\\/*?:"<>|]', '', raw_name)  # Remove invalid characters

    with open(file_name, 'w', encoding='utf-8') as out:
        out.write(transcript_text)

    print(f"Transcript saved to {file_name}")


if __name__ == "__main__":
    main()

Enter the YouTube video link:  https://www.youtube.com/watch?v=kSmcQiO7z4Y


Transcript saved to kSmcQiO7z4Y_Complete Power BI tutorial for Beginners 2024 🚀🚀(All material 🎁 included).txt
