In [None]:
# from selenium import webdriver
# from selenium.webdriver.common.by import By
# from selenium.webdriver.chrome.service import Service
# from selenium.webdriver.chrome.options import Options
# from webdriver_manager.chrome import ChromeDriverManager
# from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.support import expected_conditions as EC

# # Set up Chrome options (you can add more options as needed)
# chrome_options = Options()
# chrome_options.add_argument("--headless")  # Run in headless mode

# # Initialize WebDriver (this automatically handles the WebDriver installation)
# driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)

# # URL of the blog
# url = "https://vitalik.eth.limo/general/2024/09/28/alignment.html"

# # Open the webpage
# driver.get(url)

# # Wait for the page to load and for the blog content to be visible
# WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, "div#doc.container-fluid.markdown-body")))

# # Extract the blog title
# title = driver.find_element(By.CSS_SELECTOR, "h1").text  # Assuming the title is in an <h1> tag

# # Extract the blog content
# blog_content = driver.find_element(By.CSS_SELECTOR, "div#doc.container-fluid.markdown-body").text

# # Remove redundant title, date, and other unnecessary parts
# cleaned_content = blog_content.replace(title, "").replace("Dark Mode Toggle", "").replace("See all posts", "").strip()

# # Highlight the title and combine it with the cleaned content
# final_content = f"??? - Blog Title: {title}\n\n{cleaned_content}"

# # Save the cleaned text to a .txt file
# with open("blog_content.txt", "w", encoding="utf-8") as file:
#     file.write(final_content)

# # Close the driver
# driver.quit()

# print("Blog content saved to 'blog_content.txt'")


In [None]:
import re
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

# Where the links are read from and where the scraped posts are collected.
input_file = "links.txt"  # Replace with your .txt file containing links
output_file = "aggregated_blog_content.txt"

# Run Chrome without a visible window.
chrome_options = Options()
chrome_options.add_argument("--headless")

# One shared browser session for every link; ChromeDriverManager().install()
# supplies the driver executable path handed to the Service.
driver = webdriver.Chrome(
    options=chrome_options,
    service=Service(ChromeDriverManager().install()),
)

# Function to clean up content and remove excess blank lines
def clean_content(content):
    """Return ``content`` with each run of 3+ newlines collapsed to two.

    Runs of one or two newlines are left untouched, so single line breaks
    and single blank lines survive unchanged.
    """
    excess_newlines = re.compile(r"\n{3,}")
    collapsed = excess_newlines.sub("\n\n", content)
    return collapsed

# Function to process a single link and return cleaned content
def process_link(link):
    """Scrape one blog post and return its formatted text.

    Loads ``link`` in the module-level ``driver``, waits up to 10 seconds for
    the post container (``div#doc.container-fluid.markdown-body``) to be
    present, and builds a text block made of a title line, the cleaned post
    body, and an 80-dash separator.

    Args:
        link: URL of the post. Surrounding whitespace (for example the
            trailing newline left by ``readlines()``) is ignored.

    Returns:
        The formatted post text, or ``None`` when the link is blank or the
        page could not be processed (the error is printed, not raised).
    """
    url = link.strip()
    if not url:
        # A blank line in the links file is not worth a browser round trip.
        return None

    content_locator = (By.CSS_SELECTOR, "div#doc.container-fluid.markdown-body")

    # Best-effort per link: any failure is reported and the caller moves on.
    try:
        driver.get(url)

        # until() returns the located element, so no second lookup is needed.
        container = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located(content_locator)
        )

        title = driver.find_element(By.CSS_SELECTOR, "h1").text
        blog_content = container.text

        # Drop only the first occurrence of the title (the heading itself) so
        # later mentions of it inside the article body are preserved.
        cleaned_content = (
            blog_content.replace(title, "", 1)
            .replace("Dark Mode Toggle", "")
            .replace("See all posts", "")
            .strip()
        )

        return f"??? Blog Title: {title}\n\n{clean_content(cleaned_content)}\n\n{'-'*80}\n\n"

    except Exception as e:
        # Report the stripped URL so a trailing newline does not split the message.
        print(f"Error processing link {url}: {e}")
        return None

# Scrape every link and append the results to the output file. The browser is
# shut down in `finally` so it is not leaked if reading the links file or
# writing the output file raises.
try:
    # Read all links up front so the input file is closed before scraping starts
    with open(input_file, "r", encoding="utf-8") as file:
        links = file.readlines()

    # Append mode keeps whatever earlier runs already wrote
    with open(output_file, "a", encoding="utf-8") as file:
        for link in links:
            if not link.strip():
                # Skip blank lines (e.g. a trailing newline at the end of the file)
                continue
            print(f"Processing link: {link.strip()}")
            content = process_link(link)
            if content:
                file.write(content)
finally:
    # Close the WebDriver
    driver.quit()

print(f"All content has been aggregated and saved to {output_file}")


In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

# Function to save transcript to a .txt file
def save_transcript_to_file(transcript_text, filename="transcript.txt"):
    """Write ``transcript_text`` to ``filename`` as UTF-8.

    The file is opened in write mode, so any existing content is replaced.
    """
    with open(filename, mode="w", encoding="utf-8") as out:
        out.write(transcript_text)

# Initialize WebDriver
driver_path = "chromedriver.exe"  # Update with the actual path to your ChromeDriver
service = Service(driver_path)
driver = webdriver.Chrome(service=service)

# Drive the YouTube page through: expand description -> show transcript ->
# toggle timestamps -> read the panel text -> save it to transcript.txt.
# Any failure is printed, and the browser is always closed in `finally`.
try:
    # Open the YouTube video URL
    video_url = "https://www.youtube.com/watch?v=pB-_-WgDgCU"  # Replace with your video ID
    driver.get(video_url)

    # One waiter reused for every step; each until() call gets a fresh 10 s budget.
    wait = WebDriverWait(driver, 10)

    # Step 1: Click the "More" button.
    # Wait until it is clickable (not merely present) so the click does not
    # hit an element that is still hidden or disabled; until() returns the
    # element, so no second lookup is needed.
    more_button = wait.until(
        EC.element_to_be_clickable((By.XPATH, '//tp-yt-paper-button[@id="expand"]'))
    )
    more_button.click()

    # Step 2: Click "Show Transcript"
    # NOTE(review): this XPath matches the first tp-yt-paper-button containing
    # a touch-feedback fill div; nothing in it is specific to the transcript
    # button -- verify it selects the intended control.
    show_transcript_button = wait.until(
        EC.element_to_be_clickable((By.XPATH, '//tp-yt-paper-button[.//div[contains(@class, "yt-spec-touch-feedback-shape__fill")]]'))
    )
    show_transcript_button.click()

    # Step 3: Click the three-dot menu (SVG element)
    # NOTE(review): this XPath takes the first 24x24 svg on the page -- confirm
    # that is the transcript panel's menu icon.
    three_dot_button = wait.until(
        EC.element_to_be_clickable((By.XPATH, '//*[name()="svg" and @height="24" and @width="24"]'))
    )
    three_dot_button.click()

    # Step 4: Click "Toggle timestamps" option
    toggle_timestamps = wait.until(
        EC.element_to_be_clickable((By.XPATH, '//tp-yt-paper-item/yt-formatted-string[contains(text(), "Toggle timestamps")]'))
    )
    toggle_timestamps.click()

    # Step 5: Get the transcript text
    # NOTE(review): the locator targets the engagement panel's div#header;
    # confirm its text really is the transcript body and not just the header.
    transcript_container = wait.until(
        EC.presence_of_element_located((By.XPATH, '//ytd-engagement-panel-section-list-renderer//div[@id="header"]'))
    )
    transcript_text = transcript_container.text

    # Save the transcript to a file
    save_transcript_to_file(transcript_text)
    print("Transcript saved to transcript.txt")

except Exception as e:
    # Top-level boundary of the script: report the failure instead of crashing.
    print(f"An error occurred: {e}")

finally:
    # Close the browser
    driver.quit()
