In [16]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from bs4 import BeautifulSoup
import time

def scrape_transcript_as_str(
    url: str,
    max_scrolls: int = 250,
    sleep_after_scroll: float = 1.2,
    no_growth_limit: int = 8,
    include_speaker: bool = True,
) -> str:
    """
    Scrape one RollCall transcript page and return the transcript as one string.

    A headless Chrome session opens ``url``, repeatedly scrolls to the bottom
    of the page, and after every scroll collects the utterance blocks that
    have not been seen yet. The browser is always closed before returning,
    including when an exception is raised.

    Args:
        url: Address of the transcript page to load.
        max_scrolls: Upper bound on scroll rounds. With 0 the page is read
            once, as initially loaded, without scrolling.
        sleep_after_scroll: Seconds to wait after each scroll before the page
            height is measured again.
        no_growth_limit: Stop once this many consecutive harvest passes have
            produced no new utterance.
        include_speaker: When True, lines that have a speaker are formatted
            as ``"<speaker>: <text>"``; otherwise only the text is kept.

    Returns:
        The collected lines joined with newlines and stripped; an empty
        string when no utterance block was found.

    Raises:
        Any exception raised by Selenium while starting Chrome, loading the
        page, or waiting (up to 15 s) for ``document.readyState`` to become
        ``"complete"`` propagates to the caller.
    """
    opt = webdriver.ChromeOptions()
    opt.add_argument("--headless=new")
    opt.add_argument("--no-sandbox")
    opt.add_argument("--disable-dev-shm-usage")
    opt.add_argument("--window-size=1920,1080")
    # Content setting 2 for images (block) - presumably to speed up page loads.
    opt.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=opt)

    seen = set()   # (speaker, text) keys already emitted
    lines = []     # output lines, in first-seen order

    try:
        driver.get(url)
        WebDriverWait(driver, 15).until(
            lambda d: d.execute_script("return document.readyState") == "complete"
        )
        # Fixed grace period after the document reports "complete".
        time.sleep(2)

        stalled_rounds = 0
        # True whenever the page may hold content that has not been parsed yet.
        unharvested = True
        last_height = driver.execute_script("return document.body.scrollHeight")

        for _ in range(max_scrolls):
            added = _collect_new_utterances(driver.page_source, seen, lines, include_speaker)
            unharvested = False

            # Count consecutive passes that yielded nothing new.
            stalled_rounds = 0 if added else stalled_rounds + 1
            if stalled_rounds >= no_growth_limit:
                break

            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            unharvested = True
            time.sleep(sleep_after_scroll)

            # An unchanged page height is taken to mean the bottom was reached.
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

        # The loop harvests *before* each scroll, so whatever the last scroll
        # revealed (or the initial page when max_scrolls == 0) is still
        # unread at this point. Pick it up so the tail is not dropped.
        if unharvested:
            _collect_new_utterances(driver.page_source, seen, lines, include_speaker)

    finally:
        driver.quit()

    # Return a str (empty string when nothing was collected).
    return "\n".join(lines).strip()


def _collect_new_utterances(html: str, seen: set, lines: list, include_speaker: bool) -> int:
    """Parse ``html``, append not-yet-seen utterances to ``lines``, return how many were added."""
    soup = BeautifulSoup(html, "html.parser")
    added = 0

    # One element per utterance component.
    for block in soup.select("div.flex.gap-4.py-2"):
        text_el = block.select_one("div.flex-auto.text-md.text-gray-600.leading-loose")
        if text_el is None:
            continue

        text = text_el.get_text(" ", strip=True)
        if not text:
            continue

        speaker_el = block.select_one("h2.text-md.inline")
        speaker = speaker_el.get_text(strip=True) if speaker_el else ""

        # De-duplication key: the page is re-parsed in full on every pass.
        # NOTE(review): this also collapses a genuine repeat of the exact same
        # line by the same speaker (e.g. a second "Thank you.").
        key = (speaker, text)
        if key in seen:
            continue
        seen.add(key)

        lines.append(f"{speaker}: {text}" if include_speaker and speaker else text)
        added += 1

    return added


In [17]:
import pandas as pd

# Load the table of transcript pages to scrape.
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
url_list = pd.read_csv("/home/hyuksu/teamproject2/data/total_url")

In [18]:
# Remove the 'Unnamed: 0' column (presumably an index that was saved into the CSV).
# Rebinding instead of inplace=True, together with errors='ignore', keeps this
# cell safe to re-run: a second run is a no-op rather than a KeyError.
url_list = url_list.drop(columns='Unnamed: 0', errors='ignore')

In [26]:
# Print the column names, dtypes, and non-null counts of the URL table.
url_list.info()

<class 'pandas.DataFrame'>
RangeIndex: 3481 entries, 0 to 3480
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   date    3481 non-null   str  
 1   url     3481 non-null   str  
 2   title   3481 non-null   str  
dtypes: str(3)
memory usage: 81.7 KB


In [19]:
# Trial batch: the first three page URLs.
# Selected by column label rather than position, so the result does not depend
# on whether the 'Unnamed: 0' column has already been dropped.
my_type = url_list['url'].iloc[:3]

In [20]:
!pip install tqdm



In [21]:
from tqdm.auto import tqdm

In [22]:
# Scrape every URL in the trial batch, showing a progress bar.
# `empty` accumulates one transcript string per URL, in input order.
empty=[]
for url in tqdm(my_type):
    script=scrape_transcript_as_str(url)
    empty.append(script)


# Cell output: number of transcripts collected.
len(empty)

  0%|          | 0/3 [00:00<?, ?it/s]

3

In [28]:
# Character count of the third scraped transcript (quick check that text came back).
len(empty[2])

4008