In [2]:
import pandas as pd
import time
import re
import selenium
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

In [3]:
# Chrome launch options: set the experimental 'detach' flag (per the original
# note, so the browser is not closed when the script ends) and start the
# window maximized.
options = Options()
options.add_experimental_option('detach', True)
options.add_argument("--start-maximized")

# Start Chrome: webdriver_manager's install() result is handed to Service,
# which is then passed to the Chrome driver together with the options above.
driver_path = ChromeDriverManager().install()
chrome_service = Service(driver_path)
driver = webdriver.Chrome(service=chrome_service, options=options)

# URL template for the monthly best-seller listing. The three placeholders are
# filled, in order, with the category number, the sex code and the age code.
url = 'https://www.yes24.com/Product/Category/MonthWeekBestSeller?categoryNumber={}&pageSize=50&type=month&sex={}&age={}'

# Sex codes substituted into the URL ('F' is labelled Female, anything else Male).
genders = ['F', 'M']

# Category number -> category name written to the 'Category' column.
# Only uncommented entries are crawled; uncomment a line to enable it.
# The trailing English gloss on each line is documentation only.
categories = {
    # "001001001": "가정 살림",  # Home & Living
    "001001002": "자연과학",  # Natural Science
    # "001001003": "IT 모바일",  # IT / Mobile
    # "001001005": "청소년",  # Teens
    # "001001009": "여행",  # Travel
    # "001001010": "역사",  # History
    # "001001011": "건강 취미",  # Health & Hobbies
    # "001001015": "수험서 자격증",  # Exam Prep & Certifications
    # "001001016": "어린이",  # Children
    # "001001019": "인문",  # Humanities
    # "001001022": "사회 정치",  # Society & Politics
    # "001001025": "경제 경영",  # Economics & Business
    # "001001026": "자기계발",  # Self-Improvement
    # "001001046": "소설/시/희곡"  # Fiction / Poetry / Drama
}

# Accumulates one DataFrame per successfully scraped page.
all_data = []

# Label written to the 'Age' column for each age code sent in the URL.
# Any code not listed here is labelled 'Senior'.
age_labels = {
    10: 'Teenager',
    20: 'Young Adult',
    30: 'Adult',
    40: 'Middle Age',
    50: 'Late Middle Age',
}


def _element_texts(css_selector):
    """Return the stripped text of every element on the current page matching css_selector."""
    return [element.text.strip() for element in driver.find_elements(By.CSS_SELECTOR, css_selector)]


# Visit one best-seller page per (category, gender, age) combination and append
# a cleaned DataFrame for each page to all_data.
# The outer try/finally guarantees the browser is shut down even when something
# outside the per-page handler (e.g. driver.get) raises.
try:
    for category, category_name in categories.items():
        for gender in genders:
            gender_label = 'Female' if gender == 'F' else 'Male'
            # Only age codes 10 and 20 are crawled here; the original note
            # "10, 61, 10" suggests range(10, 61, 10) for the full crawl.
            for age in range(10, 21, 10):
                driver.get(url.format(category, gender, age))
                try:
                    # Wait up to 10 seconds for the title links to be present.
                    WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'a.gd_name')))

                    # Scrape each field as an independent, page-wide list.
                    book_titles = _element_texts('a.gd_name')  # title
                    book_authors = _element_texts('span.authPub.info_auth')  # author
                    book_publisher = _element_texts('span.authPub.info_pub')  # publisher
                    book_prices = _element_texts('strong.txt_num')  # price
                    book_sales = _element_texts('span.saleNum')  # sales figure
                    book_ratings = _element_texts('span.rating_grade em.yes_b')  # rating

                    # Fixed one-second pause per page (presumably to throttle requests).
                    time.sleep(1)

                    # Trim every list to the shortest one so the DataFrame can be built.
                    # NOTE(review): the lists are matched purely by position, so if a
                    # book in the middle of the page lacks one field, the values after
                    # it are paired with the wrong book. Scraping per book container
                    # would avoid this; the original note says this trimming can be
                    # dropped once the data is confirmed to have no missing values.
                    min_length = min(
                        len(book_titles), len(book_authors), len(book_publisher),
                        len(book_prices), len(book_sales), len(book_ratings),
                    )
                    book_titles = book_titles[:min_length]
                    book_authors = book_authors[:min_length]
                    book_publisher = book_publisher[:min_length]
                    book_prices = book_prices[:min_length]
                    book_sales = book_sales[:min_length]
                    book_ratings = book_ratings[:min_length]

                    data = {
                        'Title': book_titles,
                        'Author': book_authors,
                        'Publisher': book_publisher,
                        'Price': book_prices,
                        'Sales': book_sales,
                        'Rating': book_ratings,
                    }

                    # Build the frame and normalise the scraped text columns.
                    df = pd.DataFrame(data)
                    df['Category'] = category_name
                    # Keep only the text before the ' 저' suffix.
                    df['Author'] = df['Author'].apply(lambda x: x.split(' 저')[0])
                    df['Gender'] = gender_label
                    # Take the second space-separated token and drop thousands separators.
                    df['Sales'] = df['Sales'].apply(lambda x: x.split(' ')[1].replace(',', ''))
                    # Strip commas and the '원' currency suffix, then convert to int.
                    df['Price'] = df['Price'].str.replace('[,원]', '', regex=True).astype(int)
                    df['Age'] = age_labels.get(age, 'Senior')

                    all_data.append(df)

                except Exception as e:
                    # Best-effort crawl: report the failure and move on to the next page.
                    print(f"Error : {e}")
finally:
    driver.quit()

# Combine the per-page frames into a single DataFrame and save it.
# pd.concat raises ValueError on an empty list, which happens when every page
# failed to scrape, so that case is handled explicitly instead of crashing.
if all_data:
    all_data_df = pd.concat(all_data, ignore_index=True)

    # Save as CSV. The 'utf-8-sig' codec prefixes the file with a UTF-8 BOM
    # (presumably so spreadsheet tools detect the encoding of the Korean text).
    all_data_df.to_csv('Yes24_books_data.csv', index=False, encoding='utf-8-sig')
else:
    # Still define all_data_df, with the columns the crawl loop produces, so
    # later code can rely on the name; skip writing so an existing CSV is not
    # overwritten with an empty file.
    all_data_df = pd.DataFrame(
        columns=[
            'Title', 'Author', 'Publisher', 'Price', 'Sales',
            'Rating', 'Category', 'Gender', 'Age',
        ]
    )
    print("No data was collected; 'Yes24_books_data.csv' was not written.")
