In [None]:
pip install google-api-python-client youtube-transcript-api

In [2]:
import json

# Load API key from configuration file
# 'config.json' is opened relative to the current working directory and must
# contain a JSON object; the key is read from its 'YT_API_KEY' entry.
with open('config.json') as config_file:
    config = json.load(config_file)
    # .get() yields None (rather than raising) when 'YT_API_KEY' is absent.
    API_KEY = config.get('YT_API_KEY')

In [23]:
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError
from youtube_transcript_api import YouTubeTranscriptApi, NoTranscriptFound, TranscriptsDisabled
from collections import deque
import time
from youtube_transcript_api import YouTubeTranscriptApi, NoTranscriptFound, TranscriptsDisabled
from xml.etree.ElementTree import ParseError

# Initialize YouTube API client
# Shared module-level client; the search/detail helpers below use it directly.
youtube = build('youtube', 'v3', developerKey=API_KEY)

# Keywords to filter videos
# The same list is used twice by crawl_videos: each entry is sent as a search
# query, and each entry is also matched as a case-insensitive substring of the
# video title in filter_videos. Because "Resume" is itself an entry, any title
# containing "resume" passes the title filter.
keywords = [
    "Resume",
    "Resume Advice",
    "Resume Writing",
    "Resume Tips",
    "How to Write a Resume",
    "Best Resume Practices",
    "Resume Formatting",
    "Resume Examples",
    "Resume Templates",
    "Professional Resume",
    "Resume Design",
    "Resume Optimization",
    "Resume Building",
    "Resume Help",
    "Resume Guide",
    "Resume Writing Tips",
    "Crafting a Resume",
    "Resume Content",
    "Resume Strategy",
    "Resume Review",
    "Resume Critique",
    "Resume Updates",
    "Resume Editing",
    "Resume and Cover Letter",
    "Resume Writing Skills",
    "Effective Resume",
    "Resume for Job Applications",
    "Tailoring Your Resume",
    "Resume Keywords",
    "Resume Structure",
    "Resume Sections",
    "Resume Personalization",
    "Resume Objective",
    "Resume Summary",
    "Resume Experience",
    "Resume Accomplishments",
    "Resume Achievements",
    "Resume for Career Change",
    "Resume for Entry-Level Jobs",
    "Resume for Experienced Prof"  # NOTE(review): looks truncated (presumably "Professionals") - confirm
]

def search_videos_by_keywords(keywords, max_results=20):
    """Search YouTube once per keyword and pool the raw result items.

    Requests are issued through the module-level ``youtube`` client using
    ``search().list`` restricted to ``type='video'``.

    Args:
        keywords: Iterable of query strings; one request is made per entry.
        max_results: Value passed as ``maxResults`` on every request.

    Returns:
        A list with the ``items`` entries of every successful response, in
        request order. When a request fails with ``HttpError`` the error is
        printed and the items collected up to that point are returned (an
        empty list if the very first request failed).
    """
    search_results = []
    for keyword in keywords:
        request = youtube.search().list(
            part='snippet',
            q=keyword,
            type='video',
            maxResults=max_results
        )
        try:
            response = request.execute()
        except HttpError as e:
            print(f"An HTTP error {e.resp.status} occurred: {e.content}")
            # Keep what was already fetched instead of discarding it, and stop
            # issuing further requests after the first failure.
            break
        # A response without an 'items' key contributes nothing.
        search_results.extend(response.get('items', []))
    return search_results

def get_video_details(video_ids):
    """Fetch the ``snippet`` part for the given video IDs.

    Duplicate IDs are requested only once (first occurrence wins), and IDs are
    sent in comma-separated batches instead of one request per ID, which
    reduces the number of API calls made through the module-level ``youtube``
    client.

    Args:
        video_ids: Iterable of YouTube video ID strings.

    Returns:
        A list with the ``items`` entries of every successful response. When a
        request fails with ``HttpError`` the error is printed and the items
        collected up to that point are returned.
    """
    # NOTE(review): 50 is the commonly documented per-request ID limit for
    # videos.list - confirm against the current API reference.
    batch_size = 50

    # dict.fromkeys drops repeats while keeping first-seen order.
    unique_ids = list(dict.fromkeys(video_ids))

    video_details = []
    for start in range(0, len(unique_ids), batch_size):
        batch = unique_ids[start:start + batch_size]
        request = youtube.videos().list(
            part='snippet',
            id=','.join(batch)
        )
        try:
            response = request.execute()
        except HttpError as e:
            print(f"An HTTP error {e.resp.status} occurred: {e.content}")
            # Return the details already retrieved rather than nothing.
            break
        video_details.extend(response.get('items', []))
    return video_details

def get_transcript(video_id, retries=3, delay=5):
    """Return the transcript of a video as one space-joined string.

    Args:
        video_id: YouTube video ID to look up.
        retries: Maximum number of attempts; only ``ParseError`` triggers a
            further attempt. A value of 0 or less makes no attempt at all.
        delay: Seconds to wait between attempts after a ``ParseError``.

    Returns:
        The transcript text, or ``None`` when no transcript exists, transcripts
        are disabled, an unexpected error occurs, or every attempt failed with
        ``ParseError``. Each outcome is reported with ``print``.
    """
    # NOTE(review): newer releases of youtube-transcript-api appear to replace
    # the static get_transcript() call with an instance fetch() method -
    # confirm against the installed version.
    for attempt in range(1, retries + 1):
        try:
            transcript = YouTubeTranscriptApi.get_transcript(video_id)
            transcript_text = ' '.join([t['text'] for t in transcript])
            print(f"Transcript found for video ID: {video_id}")
            return transcript_text
        except NoTranscriptFound:
            print(f"No transcript found for video ID: {video_id}")
            return None
        except TranscriptsDisabled:
            print(f"Transcripts are disabled for video ID: {video_id}")
            return None
        except ParseError as e:
            print(f"ParseError occurred for video ID: {video_id}. Error: {str(e)}")
            # Only wait when another attempt will follow; sleeping after the
            # final failure just delays the caller.
            if attempt < retries:
                time.sleep(delay)
        except Exception as e:
            # Best-effort boundary: one bad video must not abort the crawl.
            print(f"An unexpected error occurred for video ID: {video_id}. Error: {str(e)}")
            return None
    print(f"Failed to retrieve transcript after {retries} attempts for video ID: {video_id}")
    return None

def filter_videos(videos, keywords):
    """Keep the videos whose title contains at least one keyword.

    Matching is a case-insensitive substring test of each keyword against
    ``video['snippet']['title']``.

    Args:
        videos: Iterable of dicts exposing ``['snippet']['title']`` and ``['id']``.
        keywords: Collection of keyword strings.

    Returns:
        A list of new dicts with keys ``'title'``, ``'videoId'`` and an empty
        ``'transcript'`` placeholder, in the order the videos were given.
    """
    def title_matches(entry):
        # True as soon as any keyword occurs in the title, ignoring case.
        return any(
            term.lower() in entry['snippet']['title'].lower()
            for term in keywords
        )

    return [
        {
            'title': entry['snippet']['title'],
            'videoId': entry['id'],
            'transcript': ''
        }
        for entry in videos
        if title_matches(entry)
    ]

def crawl_videos(start_video_id, keywords, max_depth=2, max_videos=50):
    """Collect keyword-matching videos together with their transcripts.

    The keyword search, detail lookup and title filter are run once, and a
    transcript is then fetched for each distinct matching video until
    ``max_videos`` entries have been collected.

    The previous breadth-first loop re-ran the identical keyword search for
    every dequeued video (the dequeued ID never entered the query), so deeper
    levels could only re-add the same videos while multiplying API requests.
    It also marked IDs as visited only on dequeue, which let duplicates into
    the result, and checked ``max_videos`` only between search rounds, which
    let the result overshoot the limit.

    Args:
        start_video_id: ID treated as already visited; it is never included in
            the result.
        keywords: Keywords used both as search queries and as title filters.
        max_depth: Kept for backward compatibility. A negative value yields an
            empty result, as before; other values do not change the outcome
            because the search does not depend on the current video.
        max_videos: Upper bound on the number of returned videos.

    Returns:
        A list of dicts with keys ``'title'``, ``'videoId'`` and
        ``'transcript'`` (left as ``''`` when no transcript was retrieved).
    """
    crawled_videos = []
    if max_depth < 0 or max_videos <= 0:
        return crawled_videos

    seen_ids = {start_video_id}

    search_results = search_videos_by_keywords(keywords)
    # Different keywords often return the same video; keep first occurrences.
    video_ids = list(dict.fromkeys(
        item['id']['videoId']
        for item in search_results
        if 'videoId' in item['id']
    ))
    video_details = get_video_details(video_ids)

    for video in filter_videos(video_details, keywords):
        if len(crawled_videos) >= max_videos:
            break
        video_id = video['videoId']
        if video_id in seen_ids:
            continue
        seen_ids.add(video_id)

        transcript = get_transcript(video_id)
        if transcript:
            video['transcript'] = transcript
        crawled_videos.append(video)
        time.sleep(1)  # To avoid hitting rate limits

    return crawled_videos

# Function to save transcripts to a file
def save_transcripts_to_file(videos, filename='corpus_from_web_crawler.txt'):
    """Write each video's title, ID and transcript to a UTF-8 text file.

    The file is overwritten. Every record is followed by a line of 80 dashes.

    Args:
        videos: Iterable of dicts with ``'title'`` and ``'videoId'`` keys and
            an optional ``'transcript'`` key.
        filename: Path of the output file.

    A missing, ``None`` or empty transcript is written as
    ``'No transcript available'``. The empty case matters because
    ``filter_videos`` pre-fills ``'transcript'`` with ``''``, which a plain
    ``.get(key, default)`` would pass through as a blank transcript.
    """
    separator = "\n" + "-" * 80 + "\n\n"
    with open(filename, 'w', encoding='utf-8') as file:
        for video in videos:
            transcript = video.get('transcript') or 'No transcript available'

            # Write video title and ID
            file.write(f"Title: {video['title']}\n")
            file.write(f"Video ID: {video['videoId']}\n")

            # Write transcript
            file.write(f"Transcript:\n{transcript}\n")

            # Add a separator between different video transcripts
            file.write(separator)

# Start Crawling
# These statements run as soon as the cell/module executes and perform
# network requests through the helpers above.
# The ID given here is treated as already visited by crawl_videos, so it does
# not appear in the results itself.
start_video_id = 'Tt08KmFfIYQ'  # Replace with the actual starting video ID
crawled_videos = crawl_videos(start_video_id, keywords)

# Save transcripts to file
# Uses the default output path 'corpus_from_web_crawler.txt' and overwrites it.
save_transcripts_to_file(crawled_videos)

An HTTP error 403 occurred: b'{\n  "error": {\n    "code": 403,\n    "message": "The request cannot be completed because you have exceeded your \\u003ca href=\\"/youtube/v3/getting-started#quota\\"\\u003equota\\u003c/a\\u003e.",\n    "errors": [\n      {\n        "message": "The request cannot be completed because you have exceeded your \\u003ca href=\\"/youtube/v3/getting-started#quota\\"\\u003equota\\u003c/a\\u003e.",\n        "domain": "youtube.quota",\n        "reason": "quotaExceeded"\n      }\n    ]\n  }\n}\n'
