In [3]:
import pandas as pd
from googleapiclient.discovery import build

# Load the video list from the CSV in the working directory into a DataFrame.
df = pd.read_csv('onefoodie_videos.csv')

In [6]:
import pandas as pd
from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api.formatters import TextFormatter
import re
from tqdm import tqdm
import logging

# Configure logging once for the session: INFO and above, each record
# prefixed with a second-resolution timestamp and its level name.
logging.basicConfig(
    datefmt="%Y-%m-%d %H:%M:%S",
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# Logger named after the current module, used by the rest of this script.
logger = logging.getLogger(__name__)

# URL shapes recognised by extract_video_id, tried in order. Compiled once
# so that applying the function over a whole column does not repeat the
# pattern lookup for every row.
_VIDEO_ID_PATTERNS = tuple(
    re.compile(pattern)
    for pattern in (
        # Standard watch URL (v= as first query parameter) or youtu.be link.
        r'(?:youtube\.com/watch\?v=|youtu\.be/)([\w-]+)',
        # Embed URL.
        r'youtube\.com/embed/([\w-]+)',
        # Watch URL where v= is not the first query parameter.
        r'youtube\.com/watch\?[^#\s]*?&v=([\w-]+)',
    )
)


def extract_video_id(url):
    """Extract the video ID from a YouTube URL.

    Args:
        url: The URL to inspect. Anything that is not a string (for example
            ``None`` or the NaN pandas produces for an empty CSV cell) is
            treated as "no URL" instead of raising ``TypeError``.

    Returns:
        The video ID as a string, or ``None`` when ``url`` is not a string
        or matches none of the recognised URL shapes.
    """
    if not isinstance(url, str):
        return None

    for pattern in _VIDEO_ID_PATTERNS:
        match = pattern.search(url)
        if match:
            return match.group(1)
    return None

def get_chinese_subtitles(video_id):
    """Fetch the Traditional Chinese transcript of a video as plain text.

    Args:
        video_id: YouTube video ID. A value that is not a non-empty string
            (e.g. ``None`` or NaN left behind by a failed ID extraction) is
            reported as a failure without calling the transcript API.

    Returns:
        The transcript formatted by ``TextFormatter`` on success. On any
        failure, a string starting with ``"Error: "`` followed by the
        reason; callers detect failures by that prefix, so exceptions from
        the transcript lookup are never propagated.
    """
    # Guard: avoid a pointless request (and a confusing library error) when
    # there is no usable ID for this row.
    if not isinstance(video_id, str) or not video_id.strip():
        reason = f"Invalid or missing video ID: {video_id!r}"
        logger.error(reason)
        return f"Error: {reason}"

    try:
        logger.info(f"Fetching subtitles for video ID: {video_id}")
        # Language codes are passed in the order 'zh-Hant', then 'zh-TW'.
        transcript = YouTubeTranscriptApi.get_transcript(
            video_id,
            languages=['zh-Hant', 'zh-TW']
        )

        formatter = TextFormatter()
        formatted_transcript = formatter.format_transcript(transcript)
        logger.info(f"Successfully retrieved subtitles for video ID: {video_id}")

        return formatted_transcript
    except Exception as e:
        # Deliberately broad: one failing video must not abort a batch run,
        # so every failure is logged and encoded in the return value.
        logger.error(f"Error fetching subtitles for video ID {video_id}: {e}")
        return f"Error: {e}"

# Load the list of videos; a 'url' column is read from it below.
df = pd.read_csv('onefoodie_videos.csv')
logger.info(f"Loaded {len(df)} videos from CSV")

# Derive a video ID for every row from its URL.
logger.info("Extracting video IDs...")
video_urls = df['url']
df['video_id'] = video_urls.apply(extract_video_id)

# Fetch subtitles one video at a time, showing a tqdm progress bar.
logger.info("Starting subtitle collection...")
tqdm.pandas(desc="Collecting subtitles")
video_ids = df['video_id']
df['chinese_subtitles'] = video_ids.progress_apply(get_chinese_subtitles)

# Tally outcomes. Failed fetches are stored as strings beginning with
# 'Error', so the mask is True for failures and False for successes.
is_failure = df['chinese_subtitles'].str.startswith('Error')
success_count = is_failure.value_counts()
retrieved = success_count.get(False, 0)
failed = success_count.get(True, 0)
logger.info("\nCollection complete. Results summary:")
logger.info(f"Successful retrievals: {retrieved}")
logger.info(f"Failed retrievals: {failed}")

# Persist the enriched table (without the index column).
logger.info("Saving results to CSV...")
df.to_csv('videos_with_subtitles.csv', index=False)
logger.info("Process complete! Results saved to 'videos_with_subtitles.csv'")

# Show a short preview of URLs alongside their subtitles.
print("\nFirst few results:")
preview = df[['url', 'chinese_subtitles']].head()
print(preview)

2025-01-24 16:00:14 - INFO - Loaded 501 videos from CSV
2025-01-24 16:00:14 - INFO - Extracting video IDs...
2025-01-24 16:00:14 - INFO - Starting subtitle collection...
Collecting subtitles:   0%|          | 0/501 [00:00<?, ?it/s]2025-01-24 16:00:14 - INFO - Fetching subtitles for video ID: f9IZo98Ev2o
2025-01-24 16:00:15 - ERROR - Error fetching subtitles for video ID f9IZo98Ev2o: 
Could not retrieve a transcript for the video https://www.youtube.com/watch?v=f9IZo98Ev2o! This is most likely caused by:

Subtitles are disabled for this video

If you are sure that the described cause is not responsible for this error and that a transcript should be retrievable, please create an issue at https://github.com/jdepoix/youtube-transcript-api/issues. Please add which version of youtube_transcript_api you are using and provide the information needed to replicate the error. Also make sure that there are no open issues which already describe your problem!
Collecting subtitles:   0%|          | 2/


First few results:
                                           url  \
0  https://www.youtube.com/watch?v=f9IZo98Ev2o   
1  https://www.youtube.com/watch?v=vxgJ4hFnZ2U   
2  https://www.youtube.com/watch?v=ixgiLajFIxs   
3  https://www.youtube.com/watch?v=9lh5htiVrtM   
4  https://www.youtube.com/watch?v=vAEaH8_h_x4   

                                   chinese_subtitles  
0  Error: \nCould not retrieve a transcript for t...  
1  大家好我是一画\n今天给大家分享一道非常好吃的\n葱香虾球\n你要是学会了\n身边的大厨吃了...  
2  大家好我是一花\n豆腐和鸡蛋不要总是炒着吃了\n今天给大家解锁一道新吃法\n出锅以后鲜香滑嫩...  
3  大家好我是一华\n今天用鸡胸肉\n来给大家分享一道非常好吃的下酒菜\n外酥里嫩嘎嘣脆\n首先...  
4  大家好我是一画\n面条不要总是煮着吃了\n天气那么炎热\n今天就教大家一个非常简单\n又好吃...  
