# In [None]:
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import time
import requests
from selenium.webdriver.common.by import By

# Table of per-book metadata; populated by the scraping loop further down
# and finally written to 'Bookdata_.csv'.
book_df = pd.DataFrame()
# NOTE(review): never referenced in the visible code — possibly a leftover
# from an earlier version; confirm before removing.
book_data_list = []


def _find_text(parent, tag, css_class=None, default=None):
    """Return the stripped text of the first matching descendant of *parent*.

    *default* is returned when *parent* is ``None`` or no matching tag exists,
    so a missing nested element never raises ``AttributeError``.
    """
    if parent is None:
        return default
    if css_class is None:
        node = parent.find(tag)
    else:
        node = parent.find(tag, class_=css_class)
    return node.text.strip() if node is not None else default


def get_book_info(url):
    """Scrape one book-detail page and return its metadata.

    The page is loaded in a fresh Chrome WebDriver session, given two seconds
    to render, and the resulting HTML is parsed with BeautifulSoup. The
    browser is always closed, including when scraping fails.

    Args:
        url: Address of the book-detail page to scrape.

    Returns:
        A ``(book_data, book_info)`` tuple. ``book_data`` is a one-element
        list holding a dict with the keys ``Book_Name``, ``Book_Author``,
        ``Book_Category``, ``Completion_Percent``, ``ReadTogether``,
        ``ReviewCount``, ``Keyword`` and ``AudioBook``; ``book_info`` is
        ``{'Book_Name': <name>}``. Fields whose element is absent are
        ``None``, except ``Book_Category`` (``'no'``), ``ReviewCount`` (``0``)
        and ``AudioBook`` (``"X"``).

        ``(None, None)`` is returned when the page has no ``meta-data``
        block or when any error occurs while loading or parsing it.
    """
    driver = None
    try:
        driver = webdriver.Chrome()
        driver.get(url)
        time.sleep(2)  # give the page time to render before reading it

        soup = BeautifulSoup(driver.page_source, "html.parser")

        # Without the meta-data block there is nothing to scrape; the caller
        # treats (None, None) as "skip this book number".
        target = soup.find('div', class_='meta-data')
        if target is None:
            return None, None

        b_name = _find_text(target, 'p', 'book-name')
        b_author = _find_text(soup.find('p', class_='author'), 'span')
        b_readtogether = _find_text(
            soup.find('div', class_='read-together'), 'strong', 'number')
        b_category = _find_text(
            soup.find('div', class_='book-info-detail slide-container'),
            'a', default='no')
        b_percent = _find_text(soup, 'strong', 'line-desc')
        b_reviewcount = _find_text(
            soup.find('div', class_='review'), 'strong', 'number', default=0)
        b_keyword = _find_text(
            soup.find('div', class_='keyword-divider'), 'a', 'keyword')

        # Audiobook flag: "O" when a book-type block is present, else "X".
        has_book_type = soup.find('div', class_='book-type') is not None
        b_audiobook = "O" if has_book_type else "X"

        book_data = [{
            'Book_Name': b_name,
            'Book_Author': b_author,
            'Book_Category': b_category,
            'Completion_Percent': b_percent,
            'ReadTogether': b_readtogether,
            'ReviewCount': b_reviewcount,
            'Keyword': b_keyword,
            'AudioBook': b_audiobook,
        }]

        return book_data, {'Book_Name': b_name}

    except Exception as e:
        # Best-effort scraping: report the problem and let the caller skip
        # this book instead of aborting the whole run.
        print(f"Error for URL {url}: {str(e)}")
        return None, None

    finally:
        # Always release the browser, even when parsing failed part-way.
        if driver is not None:
            driver.quit()
    


def get_reviews(url):
    """Collect every review shown on a book's review page.

    The page is opened in a fresh Chrome WebDriver session and scrolled to
    the bottom repeatedly until its height stops growing, then the rendered
    HTML is parsed with BeautifulSoup. The browser is always closed,
    including when scraping fails.

    Args:
        url: Address of the review page to scrape.

    Returns:
        A DataFrame with the columns ``Nickname`` and ``Review_Text``, one
        row per review. An empty DataFrame is returned when the page states
        that there are no reviews, when no ``review-list`` element is
        present, or when any error occurs (the error is printed).
    """
    driver = None
    try:
        driver = webdriver.Chrome()
        driver.get(url)  # Open the URL
        time.sleep(2)  # Allow the page to load (adjust as needed)

        # The site shows this message when a book has no reviews at all.
        if "리뷰가 없습니다." in driver.page_source:
            print("No reviews for this book. Skipping...")
            return pd.DataFrame()

        # Keep scrolling to the bottom until the page height stops changing.
        last_height = driver.execute_script("return document.body.scrollHeight")
        while True:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)  # Wait after each scroll before measuring again

            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height

        soup = BeautifulSoup(driver.page_source, "html.parser")

        target = soup.find('ul', class_='review-list')
        if target is None:
            print(f"No review list found for URL {url}. Skipping...")
            return pd.DataFrame()

        p_names = target.find_all('p', class_='nickname')
        p_texts = target.find_all('pre', class_='cont')

        reviews_data = []
        for p_name, p_text in zip(p_names, p_texts):
            anchor = p_name.find('a')
            # Fall back to the paragraph's own text so that one entry without
            # a link does not discard every review of the book.
            nickname = anchor.text if anchor is not None else p_name.text
            reviews_data.append({'Nickname': nickname, 'Review_Text': p_text.text})

        return pd.DataFrame(reviews_data)

    except Exception as e:
        print(f"Error for URL {url}: {str(e)}")
        return pd.DataFrame()

    finally:
        # Always release the browser, even when scraping failed part-way.
        if driver is not None:
            driver.quit()


# URL template of a book-detail page; the placeholder is the book number.
base_url = "https://www.millie.co.kr/v3/bookDetail/{}"

# Inclusive range of book numbers to scrape.
start_number = 179627456
end_number = 179627798

# Metadata rows are gathered in a plain list and turned into a DataFrame
# once at the end; calling pd.concat on every iteration re-copies the whole
# frame each time and leaves every row with the same index label.
book_rows = []

for book_number in range(start_number, end_number + 1):
    book_url = base_url.format(book_number)
    book_data, book_info = get_book_info(book_url)

    # For an unregistered number both values come back as None; skip it and
    # continue with the next number.
    if book_data is None or book_info is None:
        continue

    book_rows.extend(book_data)

    review_url = f"https://www.millie.co.kr/v3/bookDetail/more/review/{book_number}"
    reviews_df = get_reviews(review_url)

    # Only books that actually have reviews get a review file.
    if reviews_df.empty:
        print(f"No reviews for book number {book_number}. Skipping...")
        continue

    # Tag every review with the title of the book it belongs to.
    reviews_df['Book_Name'] = book_info['Book_Name']

    # Save this book's reviews to their own CSV file.
    reviews_df.to_csv(f'All_book_reviews_{book_number}.csv', index=False, encoding='utf-8')

    print(f"Data for book number {book_number} saved successfully.")

# Build the metadata table in one step and write it out.
book_df = pd.DataFrame(book_rows)
book_df.to_csv('Bookdata_.csv', index=False, encoding='utf-8')
