In [7]:
from bs4 import BeautifulSoup
import pandas as pd
import time
import re
import selenium
import numpy as np
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import NoSuchWindowException
import warnings

# Selenium browser options
options = Options()
options.add_argument("--start-maximized")
# Keep the browser window open even after the script has finished
options.add_experimental_option('detach', True)

# Start ChromeDriver (webdriver_manager installs a matching driver binary)
chrome_service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chrome_service, options=options)

# URL template for the Yes24 monthly best-seller listing; the placeholders are,
# in order: category number, sale year, sale month, sex, age group
url = (
    'https://www.yes24.com/Product/Category/MonthWeekBestSeller'
    '?categoryNumber={}&pageNumber=1&pageSize=10&type=month'
    '&saleYear={}&saleMonth={}&sex={}&age={}'
)

# Yes24 category number -> category label stored in the output data
categories = dict([
    ("001001001", "가정 살림"),
    ("001001002", "자연과학"),
    ("001001003", "IT 모바일"),
    ("001001010", "역사"),
    ("001001011", "건강 취미"),
    ("001001016", "어린이"),
    ("001001019", "인문"),
    ("001001025", "경제 경영"),
    ("001001026", "자기계발"),
    ("001001046", "소설/시/희곡"),
])

genders = list('FM')

# Accumulators filled by the scraping loop: one DataFrame per scraped page,
# and one message per page that could not be scraped
all_data, log_data = [], []

# Crawl the Yes24 monthly best-seller pages for every category, month
# (2023-06 through 2024-06), gender and age group.

# CSS selector for each scraped field, keyed by its output column name.
# Insertion order defines the column order of the per-page DataFrame.
FIELD_SELECTORS = {
    'Title': 'a.gd_name',                      # book title
    'Author': 'span.authPub.info_auth',        # author
    'Publisher': 'span.authPub.info_pub',      # publisher
    'Price': 'strong.txt_num',                 # price
    'Publish_Date': 'span.authPub.info_date',  # publication date
    'Sales Index': 'span.saleNum',             # sales index
    'Rating': 'span.rating_grade em.yes_b',    # rating
}


def _author_name(raw):
    """Return the text before ' 저' in an author cell; non-strings (NaN padding) pass through."""
    return raw.split(' 저')[0] if isinstance(raw, str) else raw


def _sales_index(raw):
    """Return the second space-separated token of a sales cell with commas removed, else NaN."""
    if not isinstance(raw, str):
        return np.nan
    tokens = raw.split(' ')
    return tokens[1].replace(',', '') if len(tokens) > 1 else np.nan


def _scrape_loaded_page(category_name, year, month, gender, age):
    """Turn the page currently loaded in the global `driver` into one DataFrame.

    Args:
        category_name: label stored in the 'Category' column.
        year, month: sale period stored in the 'Year' / 'Month' columns.
        gender: 'F' or 'M'; stored in 'Gender' as '1' for 'F' and '0' otherwise.
        age: age group stored in the 'Age' column.

    Returns:
        DataFrame with the scraped fields followed by Year, Month, Category,
        Gender and Age.

    Raises:
        Whatever Selenium or pandas raise, e.g. when no title element appears
        within the 10 second wait. The caller logs the failure and moves on.
    """
    # Wait until the book titles are present in the DOM.
    WebDriverWait(driver, 10).until(
        EC.presence_of_all_elements_located((By.CSS_SELECTOR, FIELD_SELECTORS['Title']))
    )

    # Collect the stripped text of every matching element, one list per field.
    columns = {
        name: [element.text.strip() for element in driver.find_elements(By.CSS_SELECTOR, selector)]
        for name, selector in FIELD_SELECTORS.items()
    }

    time.sleep(1)

    # Pad shorter lists with NaN so all columns share the longest length.
    # NOTE(review): padding goes at the tail, so when a book in the middle of the
    # page lacks a field the values after it shift up by one row. Scraping each
    # field from its own list item would avoid that — confirm the item selector.
    longest = max(len(values) for values in columns.values())
    for values in columns.values():
        values.extend([np.nan] * (longest - len(values)))

    df = pd.DataFrame(columns)
    df['Year'] = year
    df['Month'] = month
    df['Category'] = category_name
    # The cleaners below tolerate the NaN padding; previously a single padded
    # value raised inside .apply()/.astype(int) and the whole page was dropped.
    df['Author'] = df['Author'].map(_author_name)
    df['Gender'] = '1' if gender == 'F' else '0'
    df['Sales Index'] = df['Sales Index'].map(_sales_index)
    price = pd.to_numeric(
        df['Price'].astype(object).str.replace('[,원]', '', regex=True),
        errors='coerce',
    )
    # Keep the integer dtype when every price parsed; otherwise keep NaN as float.
    df['Price'] = price.astype(int) if price.notna().all() else price
    df['Age'] = age
    return df


# (year, month) pairs from 2023-06 to 2024-06 inclusive.
periods = [
    (year, month)
    for year in range(2023, 2025)
    for month in range(1, 13)
    if not ((year == 2023 and month < 6) or (year == 2024 and month > 6))
]

try:
    for category in categories.keys():
        for year, month in periods:
            for gender in genders:
                for age in range(10, 61, 10):  # age groups 10 through 60
                    driver.get(url.format(category, year, month, gender, age))
                    try:
                        all_data.append(
                            _scrape_loaded_page(categories[category], year, month, gender, age)
                        )
                    except Exception as e:
                        # Best effort: record the failed page together with the
                        # exception type so timeouts and parse errors can be told apart.
                        err_log = f"Event. Category: {categories[category]}, Date: {year}.{month}, Age: {age}, Sex: {gender}, Error: {type(e).__name__}"
                        log_data.append(err_log)
                        print(err_log)
finally:
    # Always close the browser, even if the crawl is interrupted or driver.get fails.
    driver.quit()

# Combine the per-page frames into one DataFrame.
# pd.concat raises ValueError on an empty list, which would also stop log_df from
# being built; when every page failed, fall back to an empty frame and say so,
# so the event log can still be inspected and saved.
if all_data:
    all_data_df = pd.concat(all_data, ignore_index=True)
else:
    print("No pages were scraped successfully; all_data_df is empty.")
    all_data_df = pd.DataFrame()
log_df = pd.DataFrame(log_data, columns=["History"])

Event. Category: 가정 살림, Date: 2023.6, Age: 20, Sex: F
Event. Category: 가정 살림, Date: 2023.6, Age: 30, Sex: M
Event. Category: 가정 살림, Date: 2023.7, Age: 20, Sex: F
Event. Category: 가정 살림, Date: 2023.7, Age: 30, Sex: M
Event. Category: 건강 취미, Date: 2023.6, Age: 10, Sex: F
Event. Category: 건강 취미, Date: 2023.6, Age: 20, Sex: F


In [8]:
# Save the scraped books and the scrape event log as CSV files
_csv_kwargs = {'index': False, 'encoding': 'utf-8-sig'}
all_data_df.to_csv('Yes24_books_data.csv', **_csv_kwargs)
log_df.to_csv('Yes24_EventLog.csv', **_csv_kwargs)

In [9]:
# Preprocessing step: continue on an independent copy of the scraped frame
data = all_data_df.copy(deep=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15540 entries, 0 to 15539
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Title         15540 non-null  object
 1   Author        15540 non-null  object
 2   Publisher     15540 non-null  object
 3   Price         15540 non-null  int32 
 4   Publish_Date  15540 non-null  object
 5   Sales Index   15540 non-null  object
 6   Rating        15468 non-null  object
 7   Year          15540 non-null  int64 
 8   Month         15540 non-null  int64 
 9   Category      15540 non-null  object
 10  Gender        15540 non-null  object
 11  Age           15540 non-null  int64 
dtypes: int32(1), int64(3), object(8)
memory usage: 1.4+ MB


In [11]:
# Preview the first rows of the working copy
data.head()

Unnamed: 0,Title,Author,Publisher,Price,Publish_Date,Sales Index,Rating,Year,Month,Category,Gender,Age
0,팬 하나로 충분한 두 사람 식탁,국가비,달,31500,2023년 06월,18882,9.4,2023,6,가정 살림,1,10
1,집에서 운영하는 작은 빵집 SOFT BREAD,호야,더테이블,29700,2023년 06월,20196,9.9,2023,6,가정 살림,1,10
2,베이킹은 과학이다,"나카야마 히로노리, 기무라 마키코",터닝포인트,20700,2017년 11월,11133,9.5,2023,6,가정 살림,1,10
3,맛있어서 지속 가능한 디디미니 다이어트 레시피,미니 박지우,빅피시,16920,2022년 05월,7704,9.9,2023,6,가정 살림,1,10
4,"사춘기 딸에게 힘이 되어주는, 부모의 말 공부",이현정,포레스트북스,15750,2023년 04월,13689,9.6,2023,6,가정 살림,1,10


In [12]:
# Count the missing values in each column
missing_values_count = data.isna().sum()
print(missing_values_count)

Title            0
Author           0
Publisher        0
Price            0
Publish_Date     0
Sales Index      0
Rating          72
Year             0
Month            0
Category         0
Gender           0
Age              0
dtype: int64


In [31]:
# Summary statistics for the numeric columns of `data`.
# NOTE(review): the recorded output treats Sales Index, Rating and Gender as
# numeric and counts 14930 rows, while the earlier info() output shows them as
# object columns with 15540 rows — cleaning cells not visible in this section
# must run before this one; confirm they are still part of the notebook.
data.describe()
# Commented-out frequency checks kept from exploration.
# NOTE(review): these reference `df`, which in the visible code is only assigned
# per scraped page in the scraping cell; presumably `data` was meant — confirm.
# df['Category'].value_counts()
# df['Age'].value_counts()
# df['Publisher'].value_counts()
# df['Gender'].value_counts()
# df['Price'].value_counts()

Unnamed: 0,Price,Sales Index,Rating,Year,Month,Gender,Age
count,14930.0,14930.0,14930.0,14930.0,14930.0,14930.0,14930.0
mean,17768.750837,129902.6,9.481835,2023.453449,6.498995,0.498995,35.358339
std,7330.053458,177484.3,0.426288,0.497845,3.345916,0.500016,16.815487
min,2700.0,174.0,3.5,2023.0,1.0,0.0,10.0
25%,15120.0,26592.0,9.3,2023.0,4.0,0.0,20.0
50%,16650.0,72531.0,9.6,2023.0,6.0,0.0,40.0
75%,19800.0,165576.0,9.8,2024.0,9.0,1.0,50.0
max,160650.0,1263009.0,10.0,2024.0,12.0,1.0,60.0


In [36]:
# Re-check column dtypes and non-null counts of `data`
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14930 entries, 0 to 14929
Data columns (total 15 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   Title          14930 non-null  object  
 1   Author         14930 non-null  object  
 2   Publisher      14930 non-null  object  
 3   Price          14930 non-null  int64   
 4   Date           14930 non-null  object  
 5   Sales Index    14930 non-null  int64   
 6   Rating         14930 non-null  float64 
 7   Year           14930 non-null  int64   
 8   Month          14930 non-null  int64   
 9   Category       14930 non-null  object  
 10  Gender         14930 non-null  int64   
 11  Age            14930 non-null  int64   
 12  Price_Binned   14930 non-null  category
 13  Rating_Binned  14930 non-null  category
 14  Age_labels     14930 non-null  category
dtypes: category(3), float64(1), int64(6), object(5)
memory usage: 1.4+ MB
