In [None]:
# import packages
import pandas as pd
import numpy as np
import re, requests, math
import os, dotenv, logging

from bs4 import BeautifulSoup

# root logger setup: INFO and above, rendered as "[LEVEL] timestamp — message"
LOG_FORMAT = '[%(levelname)s] %(asctime)s — %(message)s'
LOG_DATE_FORMAT = '%Y-%m-%d %H:%M:%S'
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT, datefmt=LOG_DATE_FORMAT)

In [None]:
# load user_id, user_agent and cookie from the douban env file
# (override=True lets values from the file replace variables already set)
env_file = "../personal_envs/personal-book-review-scraper_douban.env"
dotenv.load_dotenv(env_file, override=True)

user_id = os.getenv("user_id")
user_agent = os.getenv("user_agent")
cookie = os.getenv("cookie")

# build the request header from the loaded values
header = {}
header['User-Agent'] = user_agent
header['Cookie'] = cookie

In [None]:
# verify that the cookie still gives a logged-in session
test_url = 'https://www.douban.com/'
web_data = requests.get(test_url, headers = header)
soup = BeautifulSoup(web_data.text, features = "lxml")

# the account <span> in the nav bar is used as the logged-in marker
account_span = soup.select_one('li.nav-user-account span')
if account_span is None:
    raise ValueError("Login failed. Please check if the cookie is valid or has expired.")

# the span text carries a '的账号' suffix after the user name; drop it
username = account_span.text.replace('的账号', '').strip()
print(f"Login successful. Username: {username}")

In [None]:
# helpers + entry point for parsing every book listed on one collection page
def _leading_text(tag):
    """Return the text of ``tag`` that precedes its first child tag.

    Returns '' when ``tag`` is None, has no children, or starts with a child tag.
    """
    if tag is None or not tag.contents:
        return ''
    first_child = tag.contents[0]
    # text nodes are str subclasses, child tags are not
    return str(first_child) if isinstance(first_child, str) else ''


def _squash(text):
    """Remove every double-space pair and every newline from ``text``."""
    return text.replace('  ', '').replace('\n', '')


def get_books_info_from_page(url, header, timeout=30):
    """Fetch one collection page and parse each ``div.info`` entry into a row.

    Parameters
    ----------
    url : str
        Address of the page to download.
    header : dict
        HTTP headers passed to ``requests.get``.
    timeout : float or tuple, optional
        Timeout handed to ``requests.get`` so a stalled connection raises
        instead of blocking the notebook forever. Defaults to 30 seconds.

    Returns
    -------
    pandas.DataFrame
        Object-dtype frame with the columns ``Name``, ``Pub_info``, ``Date``,
        ``Star``, ``Tag``, ``Comment`` and ``Website``; one row per parsed
        entry, empty when the page lists nothing. ``Star`` is an int from 1 to
        5, or None when the entry has no rating span. Text fields whose
        element is missing are ''.

    Notes
    -----
    Entries whose first link reads '读书主页' are skipped, as are entries
    without any link text. An entry that still fails to parse (e.g. a link
    without ``href``) is reported on stdout and skipped.
    """
    columns = ['Name', 'Pub_info', 'Date', 'Star', 'Tag', 'Comment', 'Website']

    # get the web data from the url
    web_data = requests.get(url, headers=header, timeout=timeout)
    soup = BeautifulSoup(web_data.text, features="lxml")

    # collect plain dicts and build the frame once at the end
    records = []
    for div in soup.select('div.info'):
        link = div.find('a')
        if link is None or not link.contents:
            # no link text to identify the entry by
            continue

        raw_name = link.contents[0]
        if raw_name == '读书主页':
            continue

        # resolved before the try so the error message always names this entry
        book_name = _squash(str(raw_name))
        try:
            # the rating is encoded in the class name: span.rating1-t .. span.rating5-t
            stars = [i for i in range(1, 6) if div.select(f'span.rating{i}-t')]
            records.append({
                'Name': book_name,
                'Pub_info': _squash(_leading_text(div.select_one('div.pub'))),
                # only the first 10 characters of the date text are kept
                'Date': _leading_text(div.select_one('span.date'))[:10],
                'Star': stars[0] if stars else None,
                # the first 4 characters of the tag text are a label prefix
                'Tag': _leading_text(div.select_one('span.tags'))[4:],
                'Comment': _squash(_leading_text(div.find('p'))),
                'Website': link['href'],
            })
        except Exception as exc:
            # best effort: report the entry and keep parsing the rest of the page
            print(book_name, 'parse error', repr(exc))

    return pd.DataFrame(records, columns=columns, dtype=object)

In [None]:
# crawl the whole collection: read the total from the page title, then walk the pages
BOOKS_PER_PAGE = 15  # stride of the `start` query parameter between pages
COLUMNS = ['Name', 'Pub_info', 'Date', 'Star', 'Tag', 'Comment', 'Website']

# get the number of books from the title of the first page
url = f'https://book.douban.com/people/{user_id}/collect?start=0&sort=time&rating=all&filter=all&mode=grid'
web_data = requests.get(url, headers = header, timeout = 30)
soup = BeautifulSoup(web_data.text, features="lxml")

web_title = soup.title.get_text() if soup.title else ''
web_title = web_title.replace('  ','').replace('\n','')

# the count is the number written in parentheses inside the title
count_match = re.search(r"\(\s*\+?(-?\d+)\s*\)", web_title)
if count_match is None:
    raise ValueError(f"Could not find the book count in the page title: {web_title!r}")
num_books = int(count_match.group(1))
print('Total book #:', num_books)

# parse the book information by page; pages are start=0, 15, 30, ...
num_pages = math.ceil(num_books / BOOKS_PER_PAGE)
page_frames = []
for i in range(num_pages):
    first_book = i * BOOKS_PER_PAGE + 1
    # the last page may be only partly filled
    last_book = min(first_book + BOOKS_PER_PAGE - 1, num_books)
    print(f'Parsing books: {first_book} ~ {last_book}...')
    url = f'https://book.douban.com/people/{user_id}/collect?start={i*BOOKS_PER_PAGE}&sort=time&rating=all&filter=all&mode=grid'
    book_info = get_books_info_from_page(url, header)
    page_frames.append(book_info)

# combine all pages in one go; fall back to an empty frame when nothing was fetched
if page_frames:
    data = pd.concat(page_frames, ignore_index = True)
else:
    data = pd.DataFrame(columns = COLUMNS)

print('Finish parsing all books.')

In [None]:
# clean the final parsed data:
# convert the date strings to datetimes, order by date, renumber the index
data = (
    data
    .assign(Date = pd.to_datetime(data['Date']))
    .sort_values('Date')
    .reset_index(drop = True)
)
data.tail(3)

In [None]:
# write the data to an Excel file and log where it was saved
data_saved_path = "book_review_sample.xlsx"
data.to_excel(excel_writer = data_saved_path)
logging.info(f"XLSX generated — Path: {data_saved_path}")