#### D'NOVA YouTube Shorts Comments & Transcript Crawling

##### Libraries

In [1]:
import time
import random
import pandas as pd
import pickle
import undetected_chromedriver as uc
import random
import re
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import os
from datetime import datetime, timedelta


##### Parameters

In [2]:
# Date stamp (YYYY-MM-DD) used when naming the workbook written by this run.
today = datetime.today().strftime('%Y-%m-%d')
# Folder that holds the dated "loaded_db_100_<date>.xlsx" snapshot workbooks.
db_path = 'DB'  # Replace with the correct path to the folder containing the files

# Collect every snapshot workbook present in the folder.
files = [f for f in os.listdir(db_path) if f.startswith('loaded_db_100_') and f.endswith('.xlsx')]
# Pull the date part out of each filename: "loaded_db_100_<date>.xlsx" -> "<date>".
dates = [f.split('_')[3].split('.')[0] for f in files]

# Fail with an actionable message instead of max()'s bare
# "arg is an empty sequence" ValueError when no snapshot exists yet.
if not dates:
    raise FileNotFoundError(
        f"No 'loaded_db_100_<date>.xlsx' file found in {db_path!r}"
    )

# The dates are compared as strings; this picks the newest file as long as
# they use the zero-padded YYYY-MM-DD form that `today` is formatted with.
latest_date = max(dates)

# Full path of the most recent snapshot workbook.
latest_file = f'loaded_db_100_{latest_date}.xlsx'
latest_file_path = os.path.join(db_path, latest_file)

##### Functions

In [3]:

# Randomize the waiting time to help avoid bot detection.
def random_sleep(min_seconds=1, max_seconds=3):
    """Block for a random duration drawn uniformly from the given range.

    Args:
        min_seconds: Lower bound of the delay, in seconds (default 1).
        max_seconds: Upper bound of the delay, in seconds (default 3).

    Returns:
        None.
    """
    time.sleep(random.uniform(min_seconds, max_seconds))

# Scroll down until the page stops growing so that all comments get loaded.
def scroll_to_bottom(driver, max_scrolls=None):
    """Press END repeatedly until the document height stops increasing.

    Args:
        driver: Selenium WebDriver showing the page to scroll.
        max_scrolls: Optional upper bound on the number of scroll attempts
            inside the loop. ``None`` (the default) means no limit, i.e. the
            loop only ends once the page height stays the same between two
            consecutive scrolls.

    Returns:
        None.
    """
    # Initial scroll to trigger loading of the first batch of comments.
    driver.find_element(By.TAG_NAME, "body").send_keys(Keys.END)
    last_height = driver.execute_script("return document.documentElement.scrollHeight")

    scrolls_done = 0
    while max_scrolls is None or scrolls_done < max_scrolls:
        # The body element is looked up again on every pass rather than reused.
        driver.find_element(By.TAG_NAME, "body").send_keys(Keys.END)
        random_sleep()
        scrolls_done += 1

        new_height = driver.execute_script("return document.documentElement.scrollHeight")
        if new_height == last_height:
            # Height unchanged after a scroll + wait: treat the page as fully loaded.
            break
        last_height = new_height

def get_youtube_comments(url):
    """Open ``url`` in a fresh browser and collect its comments with their dates.

    Args:
        url: Address of the YouTube page to open.

    Returns:
        A list of ``(comment_text, published_time_text)`` tuples. The two
        lists are paired with ``zip()``, so the result is as long as the
        shorter of the two.
    """
    # Start the browser through the undetected-chromedriver wrapper.
    driver = uc.Chrome()
    try:
        driver.get(url)
        random_sleep()  # Give the page some time to load.

        # Scroll to the bottom of the page.
        scroll_to_bottom(driver)

        # Collect the text of every comment.
        comments_elements = driver.find_elements(By.XPATH, '//*[@id="content-text"]/span')
        comments = [comment.text for comment in comments_elements]
        random_sleep()  # Random pause between collection steps as well.

        # Collect the published-time text of every comment.
        date_elements = driver.find_elements(By.XPATH, '//*[@id="published-time-text"]/a')
        dates = [date.text for date in date_elements]
        random_sleep()  # Random pause between collection steps as well.

        # Return the result as a list of tuples.
        return list(zip(comments, dates))
    finally:
        # Always shut the browser down, even if loading or scraping raised,
        # so no Chrome process is left behind.
        driver.quit()

# Extract the transcript and the attributed music of the currently loaded video.
def youtube_transcript_music(driver):
    """Return ``(transcript, music)`` for the video page loaded in ``driver``.

    Expands the description, reads the first music attribution (if any),
    opens the transcript panel and collects its text.

    Args:
        driver: Selenium WebDriver that has already navigated to the video.

    Returns:
        A 2-tuple ``(transcript, music)``:

        * ``transcript`` is the list of sentences produced by
          ``process_transcript`` on success, or the empty string ``''`` when
          any step of the extraction raised.
        * ``music`` is the text of the first music attribution element, or
          ``''`` when none could be read.
    """
    music_texts = ''
    try:
        # implicitly_wait() does not pause by itself; it sets how long the
        # following element lookups keep polling before giving up.
        driver.implicitly_wait(15)
        # Step 1: Click the 'more' button
        more_button = driver.find_element(By.XPATH, '//*[@id="expand"]')
        more_button.click()
        time.sleep(random.uniform(1, 2))  # Random delay to ensure smooth loading
        driver.implicitly_wait(10)  # Shorter lookup timeout for the remaining steps

        # Check for a music attribution.
        try:
            # Only the first matching element is used, so only its text is read.
            music_texts = driver.find_elements(
                By.XPATH, '//*[@id="items"]/yt-video-attribute-view-model/div/a'
            )[0].text
        except Exception:
            # IndexError when nothing matched, or an error raised by the
            # lookup itself. Unlike a bare ``except:``, this lets
            # KeyboardInterrupt / SystemExit propagate.
            print("No music")
            music_texts = ''

        # Step 2: Click the 'show transcript' button
        transcript_button = driver.find_element(By.XPATH, '//*[@id="primary-button"]/ytd-button-renderer/yt-button-shape/button/yt-touch-feedback-shape/div/div[2]')
        transcript_button.click()
        time.sleep(random.uniform(1, 2))  # Random delay for loading transcript

        # Step 3: Collect all transcript text
        transcript_elements = driver.find_elements(By.XPATH, '//*[@id="body"]/ytd-transcript-segment-list-renderer')
        transcript_texts = [element.text for element in transcript_elements]

        # Combine all transcript texts into a single string; the placeholder
        # is processed like any other text and comes back as a one-item list.
        transcript_data = " ".join(transcript_texts) if transcript_texts else "No transcript found"
        transcript_data = process_transcript(transcript_data)
        return transcript_data, music_texts

    except Exception as e:
        # Best effort: report the problem and keep whatever music text was
        # already read before the failure.
        print(f"Error occurred while extracting transcript or music: {e}")
        return '', music_texts
    

# Timestamp markers such as "0:00", "12:34" or "1:02:03", plus trailing whitespace.
_TIME_MARKER_RE = re.compile(r'\d+(?::\d+)+\s*')
# Whitespace that directly follows sentence-ending punctuation (., ? or !).
_SENTENCE_BREAK_RE = re.compile(r'(?<=[.?!])\s+')


def process_transcript(transcript_text):
    """Strip timestamp markers from a transcript and split it into sentences.

    Args:
        transcript_text: Raw transcript text as a single string.

    Returns:
        A list of non-empty strings. The text is split only at whitespace
        that follows ``.``, ``?`` or ``!``; newlines inside a sentence are
        kept as they are. An empty input yields an empty list.
    """
    # Remove time markers; the repeated ":digits" group also covers H:MM:SS,
    # which a plain "digits:digits" pattern would only strip partially.
    without_markers = _TIME_MARKER_RE.sub('', transcript_text)

    # Split after sentence-ending punctuation.
    sentences = _SENTENCE_BREAK_RE.split(without_markers.strip())

    # Drop empty entries (e.g. the single '' produced by an empty input).
    return [sentence for sentence in sentences if sentence]

# Persist whatever has been collected up to this point.
def save_progress(df, output_file):
    """Merge ``df`` into the Excel workbook at ``output_file`` and write it back.

    Rows already stored in the workbook are loaded first, the rows of ``df``
    are appended, and rows sharing the same ``channel_name``, ``shorts_url``
    and ``check_date`` are collapsed to the last occurrence.

    Any exception raised along the way is caught and reported with ``print``;
    nothing is re-raised.

    Args:
        df: DataFrame holding the rows to save.
        output_file: Path of the Excel workbook to update or create.

    Returns:
        None.
    """
    key_columns = ['channel_name', 'shorts_url', 'check_date']
    try:
        # Start from the rows already on disk, or from an empty frame.
        if os.path.exists(output_file):
            previous = pd.read_excel(output_file)
        else:
            previous = pd.DataFrame()

        # Stored rows first, new rows after them, so keep='last' prefers the new ones.
        merged = pd.concat([previous, df], ignore_index=True)
        merged = merged.drop_duplicates(subset=key_columns, keep='last')

        merged.to_excel(output_file, index=False)
        print(f"Progress saved to {output_file}")
    except Exception as e:
        print(f"Error saving progress: {e}")

# Visit every URL and collect its comments, transcript and music.
def process_youtube_comments_transcript_music(driver, top_100):
    """Crawl each row's video and store the collected data back into ``top_100``.

    For every row the video at ``shorts_url`` is opened, its transcript and
    music are read with ``youtube_transcript_music``, the page is scrolled to
    the bottom and the comment texts and their published-time texts are
    collected. The results are written to the ``shorts_comments_time``,
    ``transcript_data`` and ``music`` columns of that row.

    Args:
        driver: Selenium WebDriver used for all page visits.
        top_100: DataFrame with at least ``shorts_url`` and ``channel_name``
            columns. It is modified in place.

    Returns:
        The same ``top_100`` DataFrame after all rows were processed.

    Raises:
        Exception: Whatever a row's processing raised. Before re-raising,
            the current state is saved to ``latest_file_path``.
    """
    for index, row in top_100.iterrows():
        random_sleep()
        # Bound before the try so the error message below can always use it,
        # even when the row lookup itself fails, and never shows the title
        # of a previous row.
        title = None
        try:
            url = row['shorts_url']
            title = row['channel_name']
            driver.get(url)
            driver.implicitly_wait(10)

            # Transcript and music of the current video.
            transcript_data, music = youtube_transcript_music(driver)
            random_sleep()

            # Load all comments before reading them.
            scroll_to_bottom(driver)

            comments_elements = driver.find_elements(By.XPATH, '//*[@id="content-text"]/span')
            comments = [comment.text for comment in comments_elements]
            random_sleep()

            date_elements = driver.find_elements(By.XPATH, '//*[@id="published-time-text"]/a')
            dates = [date.text for date in date_elements]
            random_sleep()

            # A single (None, None) pair marks a video without comments.
            comments_data = list(zip(comments, dates)) if comments else [(None, None)]

            # Store the results in the row; the comment list is wrapped in an
            # outer list, as in the original code.
            top_100.at[index, 'shorts_comments_time'] = [comments_data]
            top_100.at[index, 'transcript_data'] = transcript_data
            top_100.at[index, 'music'] = music

        except Exception as err:
            print(f"Error processing {title or 'Unknown'} at row {index}: {err}")
            save_progress(top_100, latest_file_path)
            raise

    # The original path had no "." before "xlsx" and pointed outside the DB
    # folder; write a proper dated workbook next to the other snapshots.
    save_progress(top_100, os.path.join(db_path, f"loaded_db_100_{today}.xlsx"))
    return top_100


# Entry point: start a browser, run the crawl and always shut the browser down.
def main(df):
    """Run the comments/transcript/music crawl for every row of ``df``.

    Args:
        df: DataFrame passed on to
            ``process_youtube_comments_transcript_music``.

    Returns:
        The DataFrame returned by
        ``process_youtube_comments_transcript_music``.

    Raises:
        Exception: Whatever the crawl raised; the browser is closed first.
    """
    options = uc.ChromeOptions()
    options.headless = False
    # Sets the browser's accepted languages to English
    # (presumably so that page texts such as "2 weeks ago" come back in English).
    options.add_experimental_option('prefs', {'intl.accept_languages': 'en'})

    driver = uc.Chrome(options=options, use_subprocess=True)
    try:
        return process_youtube_comments_transcript_music(driver, df)
    finally:
        # The crawl re-raises on errors; quit here so the Chrome process is
        # closed on both the success and the failure path.
        driver.quit()

#### Load Data

In [4]:
# Load the most recent snapshot workbook (latest_file_path) into a DataFrame
top_100_df = pd.read_excel(latest_file_path)


In [9]:
# Show the loaded DataFrame as the cell output.
top_100_df

Unnamed: 0,channel_name,shorts_url,shorts_title,shorts_description,shorts_thumbnail,shorts_view,shorts_likes,shorts_comments_num,shorts_comments_time,shorts_published_date,check_date,transcript_data,music
0,Marc Brunet,https://www.youtube.com/watch?v=Qc6NO19u3MM,,👨‍🎨 Serious about your art? Check out my ART S...,https://support.google.com/youtube/answer/3037...,121372,17000,94,[[('The audacity of assuming I can draw those ...,,2024-10-24 21:44:30.453,"[""what most artists struggle with when\ndrawin...",
1,Jen_ny69,https://www.youtube.com/watch?v=ixq_pOK7jk0,,Watch full video on ‪@adayinthelifeofa69‬,https://support.google.com/youtube/answer/3037...,6270,328,13,[[('Sweet boy. he’s well-loved and it shows.'...,,2024-10-24 21:44:30.453,"[""what's up YouTube Welcome Back to the\nchann...",
2,DawateIslami,https://www.youtube.com/watch?v=JhocFEigq3k,,Kabhi Ye Bhi Try Karen | #dawateislami #whatsa...,https://support.google.com/youtube/answer/3037...,926,229,3,"[[('', '2 weeks ago'), ('Very very very very v...",,2024-10-24 21:44:30.453,['बैर मुल्क छुटियां गुजारना एक ऑसम\nएक्सपीरियं...,
3,Go Natural English with Gabby Wallace,https://www.youtube.com/watch?v=v9dcf7OF1Wg,,,https://support.google.com/youtube/answer/3037...,3897,72,3,"[[(' nice, in Colombia many many nice restaura...",,2024-10-24 21:44:30.453,"[""I lived abroad outside the US for many\nyear...",
4,ReligionForBreakfast,https://www.youtube.com/watch?v=WZn6dF9C1NQ,,,https://support.google.com/youtube/answer/3037...,82627,4700,206,"[[(""I'm pretty sure the Catholic Church has a ...",,2024-10-24 21:44:30.453,"[""this Mustachio fellow is Jesus mde a\npopula...",
5,Medicosis Perfectionalis,https://www.youtube.com/watch?v=wFVlzZUWFcw,,,https://support.google.com/youtube/answer/3037...,2977,284,5,[[('السلام عليكم ورحمة الله وبركاته \nجزاك الل...,,2024-10-24 21:44:30.453,"[""Intro\nCardiology in 1 minute with meosis\np...",
6,理科太太 Li Ke Tai Tai,https://www.youtube.com/watch?v=E1789fuBkkU,貓咪個性這麼差，原來都是基因決定的！,,https://rr3---sn-vgqsrnsr.googlevideo.com/vide...,15447,327,12,"[[('兔子非常嗜甜，個性也特好', '9 months ago'), ('這個循環很是巧妙...",2024-01-17 22,2024-10-24 21:44:30.453,,
7,Brian Tracy,https://www.youtube.com/watch?v=C6HaBSkA6xU,Shift Your Mindset,Imagine the possibilities if failure weren’t a...,https://i.ytimg.com/vi/C6HaBSkA6xU/hq2.jpg?sqp...,4477,562,8,"[[('It is purpose that created us, that binds ...",2024-10-23 09,2024-10-24 21:44:30.453,"[""what one great thing would you dare to\ndrea...","stellar (Slowed + Reverb)\n.diedlonely, énouem..."
8,Now You See It,https://www.youtube.com/watch?v=g6lcT4EhNzM,The Best Steven Spielberg Interview Question E...,Movies: The Fablemans (2022) and Close Encount...,https://rr1---sn-f58xn2xxq-aj5l.googlevideo.co...,47959,2400,32,[[('You know you’re a good interviewer when yo...,2023-01-07 09,2024-10-24 21:44:30.453,"[""your father was a computer scientist\nyour m...",


##### Execute

In [5]:
# Run the main function on the loaded DataFrame; `data` is the DataFrame that main() returns
data = main(top_100_df)

No music
No music
No music
No music
No music
No music
No music
Error occurred while extracting transcript or music: Message: no such element: Unable to locate element: {"method":"xpath","selector":"//*[@id="primary-button"]/ytd-button-renderer/yt-button-shape/button/yt-touch-feedback-shape/div/div[2]"}
  (Session info: chrome=131.0.6778.70); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
	GetHandleVerifier [0x00D433E3+25059]
	(No symbol) [0x00CCCDE4]
	(No symbol) [0x00BABEC3]
	(No symbol) [0x00BEFD86]
	(No symbol) [0x00BEFFCB]
	(No symbol) [0x00C2D952]
	(No symbol) [0x00C11F44]
	(No symbol) [0x00C2B51E]
	(No symbol) [0x00C11C96]
	(No symbol) [0x00BE3FAC]
	(No symbol) [0x00BE4F3D]
	GetHandleVerifier [0x01035543+3113795]
	GetHandleVerifier [0x0104A20A+3198986]
	GetHandleVerifier [0x010429E2+3168226]
	GetHandleVerifier [0x00DE3250+680016]
	(No symbol) [0x00CD572D]
	(No symbol) [0x

##### Save Data

In [None]:
# Write the crawl result to today's dated workbook inside the snapshot folder.
# The folder comes from db_path instead of a hard-coded "DB/" so this cell
# stays in sync with the folder the input was loaded from.
data.to_excel(os.path.join(db_path, f"loaded_db_100_{today}.xlsx"), index=False)