#### 토스 커뮤니티 페이지는 동적으로 생성되므로, selenium을 사용해 크롤링한다

In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd

In [2]:
import datetime
from dateutil.relativedelta import relativedelta
import time

In [4]:
import FinanceDataReader as fdr

In [5]:
# Look up a ticker code (not an industry code) by its listed stock name

def search_code_by_name(market, name):
    """Return the ticker code of the listing whose name equals *name*.

    Args:
        market: One of 'KOSPI', 'KOSDAQ' or 'ETF'.
        name: Exact listing name to search for (matched with ``==``).

    Returns:
        The code of the first listing whose 'Name' column equals *name*.

    Raises:
        ValueError: If *market* is not one of the supported markets.
        IndexError: If no listing in the market has that name.
    """
    if market not in ('KOSPI', 'KOSDAQ', 'ETF'):
        raise ValueError('시장입력오류')

    if market == 'ETF':
        # The ETF listing exposes its ticker under 'Symbol'; rename it so the
        # lookup below works for every market (same convention as
        # code_list_by_market).
        listing = fdr.StockListing('ETF/KR')[['Symbol', 'Name']]
        listing = listing.rename(columns={'Symbol': 'Code'})
    else:
        listing = fdr.StockListing(market)[['Code', 'Name']]

    matches = listing.loc[listing['Name'] == name, 'Code']
    if matches.empty:
        # Keep the exception type callers already see, but say what failed.
        raise IndexError(f"'{name}' 종목을 {market} 시장에서 찾을 수 없습니다.")

    return matches.values[0]
  

In [6]:
# 특정 시장의 종목 코드 리스트를 받는 함수

def code_list_by_market(market):
    """Return ticker codes for a slice of *market* ranked by market cap.

    For 'KOSPI' and 'KOSDAQ' the listing is sorted by 'Marcap' (descending)
    and only the first row is kept. For 'ETF' the 'ETF/KR' listing is sorted
    by 'MarCap' (descending), rows 100-399 of that ranking are kept, and the
    'Symbol' column is exposed as 'Code'.

    Args:
        market: One of 'KOSPI', 'KOSDAQ' or 'ETF'.

    Returns:
        A list with the 'Code' values of the selected rows.

    Raises:
        ValueError: If *market* is not one of the supported markets.
    """
    if market not in ('KOSPI', 'KOSDAQ', 'ETF'):
        raise ValueError('시장입력오류')

    if market == 'ETF':
        ranked = fdr.StockListing('ETF/KR').sort_values(by='MarCap', ascending=False)
        # The ETF listing names its ticker column 'Symbol'.
        selected = ranked.iloc[100:400].rename(columns={'Symbol': 'Code'})
    else:
        ranked = fdr.StockListing(market).sort_values(by='Marcap', ascending=False)
        selected = ranked.iloc[:1]

    return selected['Code'].tolist()

In [82]:
# Preview the five KOSPI listings with the largest 'Marcap' value.
fdr.StockListing('KOSPI').sort_values(by='Marcap', ascending=False).head(5)

Unnamed: 0,Code,ISU_CD,Name,Market,Dept,Close,ChangeCode,Changes,ChagesRatio,Open,High,Low,Volume,Amount,Marcap,Stocks,MarketId
0,5930,KR7005930003,삼성전자,KOSPI,,55700,2,-400,-0.71,56100,56500,55200,10502816,584917256700,332516888035000,5969782550,STK
1,660,KR7000660001,SK하이닉스,KOSPI,,207500,1,2500,1.22,208000,212500,206500,3742728,782465580500,151060490737500,728002365,STK
2,373220,KR7373220003,LG에너지솔루션,KOSPI,,350000,2,-8500,-2.37,356000,357500,348500,163515,57326309500,81900000000000,234000000,STK
3,207940,KR7207940008,삼성바이오로직스,KOSPI,,1011000,1,5000,0.5,1001000,1020000,999000,25028,25249684000,71956914000000,71174000,STK
4,5380,KR7005380001,현대차,KOSPI,,225000,1,12000,5.63,215000,227000,215000,1320309,294050720000,47118642975000,209416191,STK


In [7]:
# Capture the current local date once; today_str is its 'YYYY-MM-DD' form.
today = datetime.datetime.today()
today_str = today.date().isoformat()

In [8]:
# 토스 커뮤니티 스크래핑
# CSS selectors for the Toss community page. They mirror the page's DOM
# structure, so they have to be updated whenever the site layout changes.
_POST_ITEM_SELECTOR = '#stock-content > div > div > section > section > ul > div > div'
_SORT_BUTTON_SELECTOR = '#stock-content > div > div > section > section > section > button'


def _collect_posts(driver, start_date, end_date):
    """Scroll the loaded community page and gather posts dated within range.

    Args:
        driver: Selenium WebDriver already showing a community page.
        start_date: First ``datetime.date`` to keep (inclusive).
        end_date: Last ``datetime.date`` to keep (inclusive).

    Returns:
        A ``(nicknames, dates, contents)`` tuple of equally long lists. Dates
        are the raw ``datetime`` attribute strings of each post.
    """
    nicknames, dates, contents = [], [], []
    seen_indices = set()

    while True:
        items = driver.find_elements(By.CSS_SELECTOR, _POST_ITEM_SELECTOR)
        # Only list items that actually wrap an <article> are posts.
        posts = [item for item in items if item.find_elements(By.TAG_NAME, 'article')]

        keep_scrolling = False
        for post in posts:
            index = post.get_attribute('data-index')
            if index in seen_indices:
                continue
            # Remember every parsed post, in range or not, so it is not
            # parsed again on the next pass.
            seen_indices.add(index)

            content = post.find_element(By.CSS_SELECTOR, 'article > div > a > span:nth-of-type(2) > span ')
            time_element = post.find_element(By.CSS_SELECTOR, 'article > div > header > div > label > time')
            raw_date = time_element.get_attribute('datetime')
            nickname = post.find_element(By.CSS_SELECTOR, 'article > div > header > div > label > span:nth-child(1)')

            # Drop any timezone info and compare on the calendar date only.
            posted_on = pd.to_datetime(raw_date).tz_localize(None).date()

            if posted_on < start_date:
                continue
            # A post that is not older than start_date means older posts
            # further down may still be in range, even when this one is newer
            # than end_date. Keep scrolling.
            keep_scrolling = True
            if posted_on <= end_date:
                contents.append(content.text)
                nicknames.append(nickname.text)
                dates.append(raw_date)

        if not keep_scrolling:
            break

        driver.execute_script("window.scrollBy(0, 3000);")
        time.sleep(2)  # give the page time to load the next batch

        try:
            WebDriverWait(driver, 10).until(
                EC.presence_of_all_elements_located((By.CSS_SELECTOR, _POST_ITEM_SELECTOR))
            )
        except Exception as e:
            # Best effort: keep what was collected so far.
            print("로딩 대기 중 오류 발생:", e)
            break

    return nicknames, dates, contents


def crawling_community(market=None, start_date=None, end_date=None):
    """Scrape Toss community posts for the codes selected for *market*.

    For every code returned by ``code_list_by_market(market)`` a headless
    Chrome session opens the community page, switches the sort button to
    '최신순' when it shows another label, and collects posts whose date lies
    in ``[start_date, end_date]``. The combined result is written to
    ``Toss_community_{market}_{start_date}_{run date}.csv`` and returned.

    Args:
        market: Market name forwarded to ``code_list_by_market``.
        start_date: First date to keep (anything ``pd.to_datetime`` accepts).
            Required.
        end_date: Last date to keep. Defaults to the date on which the
            function is called.

    Returns:
        A DataFrame with columns '닉네임', '날짜', '내용' and '종목'. It is
        empty, and no CSV is written, when no page could be scraped.

    Raises:
        ValueError: If *start_date* is missing, or from
            ``code_list_by_market`` for an unsupported market.
    """
    if start_date is None:
        raise ValueError('시작일은 반드시 입력해야 합니다.')

    # Resolve "today" at call time, so a long-running session does not reuse
    # a stale date bound when the function was defined.
    run_date = pd.Timestamp.today().strftime('%Y-%m-%d')
    if end_date is None:
        end_date = run_date

    # Parse once up front; comparisons below are on calendar dates.
    start_date = pd.to_datetime(start_date).date()
    end_date = pd.to_datetime(end_date).date()

    code_list = code_list_by_market(market)
    all_post_data = []

    for code in code_list:
        print(f"Processing {code} crawling...")
        url = f'https://tossinvest.com/stocks/A{code}/community'
        print(url)

        options = Options()
        options.add_argument('--headless')
        driver = webdriver.Chrome(options=options)

        # Whatever happens below, the browser must be closed; otherwise each
        # failure leaves a Chrome process behind.
        try:
            try:
                driver.get(url)
                WebDriverWait(driver, 15).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, _POST_ITEM_SELECTOR))
                )
            except Exception as e:
                # Best effort: a page that fails to load only skips this code.
                print(f"URL 접근 중 오류 발생: {e}. 종목 코드 {code}를 건너뜁니다.")
                continue

            sort_button = driver.find_element(By.CSS_SELECTOR, _SORT_BUTTON_SELECTOR)
            # Click the button unless it already reads '최신순' (newest first).
            if sort_button.get_attribute('data-contents-label') != '최신순':
                sort_button.click()
                time.sleep(2)  # wait for the re-sorted list to load

            nicknames, dates, contents = _collect_posts(driver, start_date, end_date)
        finally:
            driver.quit()

        all_post_data.append(pd.DataFrame({
            '닉네임': nicknames,
            '날짜': dates,
            '내용': contents,
            '종목': code,
        }))

    if not all_post_data:
        # pd.concat raises on an empty list; return an empty, well-formed frame.
        print("수집된 게시글이 없습니다.")
        return pd.DataFrame(columns=['닉네임', '날짜', '내용', '종목'])

    final_df = pd.concat(all_post_data, ignore_index=True)
    final_df.to_csv(f'Toss_community_{market}_{start_date}_{run_date}.csv', index=False, encoding='utf-8-sig')
    return final_df

(시가총액 상위 5종목 기준) 소요 시간: 8분 30초 (약 1,700건)

In [9]:
# Crawl community posts for the KOSPI selection, starting from 2025-01-17
# and using the function's default end date.
result = crawling_community('KOSPI', start_date='2025-01-17')

Processing 005930 crawling...
https://tossinvest.com/stocks/A005930/community
