In [6]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time

# Chrome WebDriver 초기화
options = Options()
options.add_argument("--start-maximized")
options.add_experimental_option('detach', True)

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

age_data = []
log_data = []

# 연령대, 날짜 순회
for age in range(1, 7):
    age = f'0{age}'
    for year in range(2017, 2025):
        for month in range(1,13):
            if month < 10:
                month_str = f'0{month}'
            else:
                month_str = str(month)

            if month == 2:
                date = f"{year}02{3}"
            else:
                date = f"{year}{month_str}4"

            log = f"Age: {age}, Date: {date} complete.."
            log_data.append(log)
            print(log)
            
            if year == 2024 and month == 6:
                break  

            url = f"https://book.interpark.com/display/collectlist.do?_method=Bestseller201312&sc.page=1&sc.row=20&sc.shopNo=0000400000&sc.dispNo=028&sc.highDispNo=028&sc.category=age&sc.cltTp={age}&sc.addNewBook=N&sc.addRiseBook=N&sc.filterSaleSt=&sc.orderByTp=&sc.cltWeek={date}&sc.cltDate=&sc.ebSaleTp=01&sc.bestFirst=&bookblockname=bestseller&booklinkname=%C1%D6%B0%A3%BA%A3%BD%BA%C6%AE&weekYear1=2024&weekMonth1=01&week1=4&weekYear=2024&weekMonth=01&bid1=Best_zone&bid2=028&bid3=age&bid4=Search"
            
            driver.get(url)

            try:
                WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'p.bName')))
                
                # 책 정보 추출
                book_titles = [element.text.strip() for element in driver.find_elements(By.CSS_SELECTOR, 'p.bName')]  # 책제목
                book_type = [element.text.strip() for element in driver.find_elements(By.XPATH, '//*[@id="divRlist"]/table/tbody/tr/td[4]')]  # 분야
                book_publisher = [element.text.strip() for element in driver.find_elements(By.XPATH, '//*[@id="divRlist"]/table/tbody/tr/td[5]/p[1]')]  # 출판사
                book_counts = [element.text.strip() for element in driver.find_elements(By.CSS_SELECTOR, 'div.stepBlue_num')]  # 실판매량
                
                data = {
                    'Title': book_titles,
                    'Type': book_type,
                    'Publisher': book_publisher,
                    'Count': book_counts
                }
                
                # 데이터프레임 생성
                df = pd.DataFrame(data)
                df['Year'] = year
                df['Month'] = month
                df['Count'] = df['Count'].str.extract('(\d+)').astype(int)
                df['Age'] = int(age) * 10
                age_data.append(df)

            except Exception as e:
                err_log = f"Age: {age}, Date: {date} Not complete, No Data."
                log_data.append(err_log)
                print(err_log)
                continue

        if year == 2024 and month == 5:
            break           

driver.quit()

# DataFrame으로 합치기 / 인덱스 제거
all_data_df = pd.concat(age_data, ignore_index=True)
log_df = pd.DataFrame(log_data, columns=["History"])

# 결과 출력
print(all_data_df.head())


Age: 01, Date: 2017014 complete..
Age: 01, Date: 2017023 complete..
Age: 01, Date: 2017034 complete..
Age: 01, Date: 2017044 complete..
Age: 01, Date: 2017054 complete..
Age: 01, Date: 2017064 complete..
Age: 01, Date: 2017074 complete..
Age: 01, Date: 2017084 complete..
Age: 01, Date: 2017094 complete..
Age: 01, Date: 2017104 complete..
Age: 01, Date: 2017114 complete..
Age: 01, Date: 2017124 complete..
Age: 01, Date: 2018014 complete..
Age: 01, Date: 2018023 complete..
Age: 01, Date: 2018034 complete..
Age: 01, Date: 2018044 complete..
Age: 01, Date: 2018054 complete..
Age: 01, Date: 2018064 complete..
Age: 01, Date: 2018074 complete..
Age: 01, Date: 2018084 complete..
Age: 01, Date: 2018094 complete..
Age: 01, Date: 2018094 Not complete, No Data.
Age: 01, Date: 2018104 complete..
Age: 01, Date: 2018114 complete..
Age: 01, Date: 2018124 complete..
Age: 01, Date: 2019014 complete..
Age: 01, Date: 2019023 complete..
Age: 01, Date: 2019034 complete..
Age: 01, Date: 2019044 complete..
Ag

In [7]:
all_data_df.to_csv('Interpark_books_age.csv', index=False, encoding='utf-8-sig')
log_df.to_csv('Interpark_log_data.csv', index=False, encoding='utf-8-sig')

In [11]:
books_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8229 entries, 0 to 8228
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Title      8229 non-null   object
 1   Type       8229 non-null   object
 2   Publisher  8229 non-null   object
 3   Count      8229 non-null   int32 
 4   Year       8229 non-null   int64 
 5   Month      8229 non-null   int64 
 6   Age        8229 non-null   object
dtypes: int32(1), int64(2), object(4)
memory usage: 418.0+ KB


In [8]:
missing_values_count = all_data_df.isnull().sum()
print(missing_values_count)

Title        0
Type         0
Publisher    0
Count        0
Year         0
Month        0
Age          0
dtype: int64
