In [None]:
!pip install selenium
!pip install webdriver-manager
!pip install python-dateutil

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from datetime import datetime
from dateutil.relativedelta import relativedelta
import pandas as pd
import time

## 인터파크 도서 데이터 크롤링

In [None]:
# Crawl the Interpark best-seller list for every (month, age group) combination
# and collect the results into `df` (books) and `log_df` (one log line per request).

# Chrome options: start maximized; 'detach' keeps the browser window open
# unless driver.quit() is called explicitly.
options = Options()
options.add_argument("--start-maximized")
options.add_experimental_option('detach', True)

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# The three `{}` placeholders are filled, in order, with the row count (sc.row),
# the age-group code (sc.cltTp) and the period code (sc.cltWeek).
url_template = "https://book.interpark.com/display/collectlist.do?_method=Bestseller201312&sc.page=1&sc.row={}&sc.shopNo=0000400000&sc.dispNo=028&sc.highDispNo=028&sc.category=age&sc.cltTp={}&sc.addNewBook=N&sc.addRiseBook=N&sc.filterSaleSt=&sc.orderByTp=&sc.cltWeek={}&sc.cltDate=&sc.ebSaleTp=01&sc.bestFirst=&bookblockname=bestseller&booklinkname=%C1%D6%B0%A3%BA%A3%BD%BA%C6%AE&weekYear1=2024&weekMonth1=01&week1=4&weekYear=2024&weekMonth=01&bid1=Best_zone&bid2=028&bid3=age&bid4=Search"

# Accumulators: one dict per scraped book, plus three parallel lists that form the crawl log.
book_data = []
date_log = []
age_log = []
log_history = []

# Crawl window: 2017-01-01 up to the moment the cell runs, one step per month.
start_date = datetime.strptime('2017-01-01', '%Y-%m-%d')
end_date = datetime.now()
loop_date = start_date

page_size = 50  # value substituted into sc.row

try:
    while loop_date < end_date:
        strYear = loop_date.strftime('%Y')
        strMonth = loop_date.strftime('%m')

        # Period code is year + month + one digit, e.g. '2017014', '2017023':
        # the trailing digit is '3' for February and '4' for every other month.
        date_query = strYear + strMonth + ('3' if strMonth == '02' else '4')

        for age in range(1, 7):
            age_query = f'0{age}'
            url = url_template.format(page_size, age_query, date_query)

            try:
                # Navigation is inside the try so that one failed page load is
                # logged as a failure instead of aborting the whole crawl.
                driver.get(url)
                WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'p.bName')))

                # Extract one list per column from the result table.
                book_titles = [element.text.strip() for element in driver.find_elements(By.CSS_SELECTOR, 'p.bName')]  # title
                book_categories = [element.text.strip() for element in driver.find_elements(By.XPATH, '//*[@id="divRlist"]/table/tbody/tr/td[4]')]  # category
                book_publishers = [element.text.strip() for element in driver.find_elements(By.XPATH, '//*[@id="divRlist"]/table/tbody/tr/td[5]/p[1]')]  # publisher
                book_sales_volume = [element.text.strip() for element in driver.find_elements(By.CSS_SELECTOR, 'div.stepBlue_num')]  # sales volume

                # Build the rows for this page in a local list first, so a page that
                # turns out to be incomplete contributes nothing to book_data.
                page_rows = [
                    {
                        'Publisher': publisher,
                        'Category': category,
                        'Title': title,
                        'Sales': sales,
                        'Year': strYear,
                        'Month': strMonth,
                        'Age': age
                    }
                    for title, category, publisher, sales
                    in zip(book_titles, book_categories, book_publishers, book_sales_volume)
                ]

                # zip() stops at the shortest column; fewer rows than titles means
                # some column is missing entries, which is treated as a failed page.
                if len(page_rows) < len(book_titles):
                    raise ValueError('scraped columns are shorter than the title list')

            except Exception:
                log_message = "Crawling Failed. No Data."

            else:
                book_data.extend(page_rows)
                log_message = "Crawling Successed."

            # Exactly one log entry per (month, age group) request.
            date_log.append(f'{strYear}년 {int(strMonth)}월')
            age_log.append(f'{age}0대')
            log_history.append(log_message)

        loop_date += relativedelta(months=1)

finally:
    # Always close the browser, even if the crawl is interrupted.
    driver.quit()

# Assemble the results into DataFrames.
df = pd.DataFrame(book_data)

log_df = pd.DataFrame({
    'Date': date_log,
    'Age': age_log,
    'History': log_history
})


In [None]:
# Write the book table and the crawl log to CSV, without the index column.
for frame, csv_path in (
    (df, 'Interpark_books_dataset.csv'),
    (log_df, 'Interpark_crawling_history.csv'),
):
    frame.to_csv(csv_path, index=False)

## 예스24 도서 데이터 크롤링

In [None]:
# Crawl the Yes24 monthly best-seller list for every
# (category, year, month, gender, age) combination and collect the results into
# `all_data_df` (books) and `log_df` (one line per failed request).

# Selenium options: start maximized; 'detach' keeps the browser open after the
# script ends unless driver.quit() is called.
options = Options()
options.add_argument("--start-maximized")
options.add_experimental_option('detach', True)

# Start ChromeDriver.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

# Placeholders, in order: categoryNumber, saleYear, saleMonth, sex, age.
url = 'https://www.yes24.com/Product/Category/MonthWeekBestSeller?categoryNumber={}&pageNumber=1&pageSize=10&type=month&saleYear={}&saleMonth={}&sex={}&age={}'

# Category number -> label stored in the 'Category' column.
categories = {
    "001001001": "가정 살림",
    "001001002": "자연과학",
    "001001003": "IT 모바일",
    "001001010": "역사",
    "001001011": "건강 취미",
    "001001016": "어린이",
    "001001019": "인문",
    "001001025": "경제 경영",
    "001001026": "자기계발",
    "001001046": "소설/시/희곡"
}

genders = ['F','M']
all_data = []   # one DataFrame per successfully scraped page
log_data = []   # one message per failed request

try:
    for category in categories.keys():
        for year in range(2023, 2025):
            for month in range(1, 13):
                # Only June 2023 through June 2024 is crawled.
                if (year == 2023 and month < 6) or (year == 2024 and month > 6):
                    continue
                for gender in genders:
                    for age in range(10, 61, 10):  # age groups 10, 20, ..., 60
                        try:
                            # Navigation is inside the try so a failed page load is
                            # logged like any other failure instead of ending the crawl.
                            driver.get(url.format(category, year, month, gender, age))

                            # Wait until the title links are present.
                            WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'a.gd_name')))

                            # Extract one list per column.
                            book_titles = [element.text.strip() for element in driver.find_elements(By.CSS_SELECTOR, 'a.gd_name')]  # title
                            book_authors = [element.text.strip() for element in driver.find_elements(By.CSS_SELECTOR, 'span.authPub.info_auth')]  # author
                            book_publisher = [element.text.strip() for element in driver.find_elements(By.CSS_SELECTOR, 'span.authPub.info_pub')]  # publisher
                            book_prices = [element.text.strip() for element in driver.find_elements(By.CSS_SELECTOR, 'strong.txt_num')]  # price
                            book_date = [element.text.strip() for element in driver.find_elements(By.CSS_SELECTOR, 'span.authPub.info_date')]  # publication date
                            book_sales = [element.text.strip() for element in driver.find_elements(By.CSS_SELECTOR, 'span.saleNum')]  # sales index
                            book_ratings = [element.text.strip() for element in driver.find_elements(By.CSS_SELECTOR, 'span.rating_grade em.yes_b')]  # rating

                            # Fixed one-second pause after each successfully scraped page.
                            time.sleep(1)

                            data = {
                                'Title': book_titles,
                                'Author': book_authors,
                                'Publisher': book_publisher,
                                'Price': book_prices,
                                'Publish_Date' : book_date,
                                'Sales Index': book_sales,
                                'Rating': book_ratings
                            }

                            # Build the page frame; pandas raises if the column lists differ
                            # in length, which sends the page to the failure log below.
                            # Named `page_df` so it does not overwrite a notebook-level `df`.
                            page_df = pd.DataFrame(data)

                            page_df['Year'] = year
                            page_df['Month'] = month
                            page_df['Category'] = categories[category]
                            # Keep only the text before ' 저'.
                            page_df['Author'] = page_df['Author'].apply(lambda x:x.split(' 저')[0])
                            page_df['Gender'] = '1' if gender == 'F' else '0'
                            # Keep the second space-separated token, with thousands commas removed.
                            page_df['Sales Index'] = page_df['Sales Index'].apply(lambda x:x.split(' ')[1].replace(',',''))
                            page_df['Price'] = page_df['Price'].str.replace('[,원]', '', regex=True).astype(int)
                            page_df['Age'] = age

                            all_data.append(page_df)

                        except Exception:
                            err_log = f"Event. Category: {categories[category]}, Date: {year}.{month}, Age: {age}, Sex: {gender}"
                            log_data.append(err_log)
                            print(err_log)
finally:
    # Always close the browser, even if the crawl is interrupted.
    driver.quit()

# Combine the per-page frames; pd.concat raises on an empty list, so fall back
# to an empty DataFrame when no page was scraped successfully.
all_data_df = pd.concat(all_data, ignore_index=True) if all_data else pd.DataFrame()
log_df = pd.DataFrame(log_data, columns=["History"])

In [None]:
# Write the combined book table and the failure log to CSV, without the index column.
for frame, csv_path in (
    (all_data_df, 'Yes24_books_dataset.csv'),
    (log_df, 'Yes24_crawling_history.csv'),
):
    frame.to_csv(csv_path, index=False)