In [1]:
# import packages
import pandas as pd
import numpy as np
import re, requests, math
import os, dotenv

from bs4 import BeautifulSoup

import warnings
warnings.filterwarnings("ignore")

In [2]:
# import the user_id, user_agent and cookie for douban.env
dotenv.load_dotenv("douban.env")
user_id, user_agent, cookie  = os.getenv("user_id"), os.getenv("user_agent"), os.getenv("cookie")

# generate the header
header = {'User-Agent': user_agent, 'Cookie': cookie}

In [3]:
# define the function of parse each book in one page
def get_books_info_from_page(url, header):
    
    # create the dataframe for all used columns
    page_data = pd.DataFrame(columns = ['Name', 'Pub_info', 'Date', 'Star', 'Tag', 'Comment', 'Website'])
    
    # get the web data from the url
    web_data = requests.get(url, headers = header)
    soup = BeautifulSoup(web_data.text, features = "lxml")

    # parse by each book
    for div in soup.select('div.info'):
        if div.find('a').contents[0] == '读书主页':
            pass
        else:
            try:
                # get the feature of each book
                book_name = str(div.find('a').contents[0]).replace('  ','').replace('\n','')
                book_pubinfo = str(div.select('div.pub')).split('>')[1].split('<')[0].replace('  ','').replace('\n','')
                book_date = str(div.select('span.date')).split('>')[1].split('<')[0][:10]
                book_comment = str(div.find('p')).split('>')[1].split('<')[0].replace('  ','').replace('\n','')
                book_tag = str(div.select('span.tags')).split('>')[1].split('<')[0][4:]
                book_star = [i for i in range(1, 6) if div.select(f'span.rating{i}-t')][0]
                book_website = re.search(r'href="([^"]+)"', str(div.select('a')[0])).group(1)
                
                # combine the features in one row
                book_parse = pd.DataFrame({'Name': book_name, 'Pub_info': book_pubinfo, 
                                           'Date': book_date, 'Star': book_star, 
                                           'Tag': book_tag, 'Comment': book_comment, 
                                           'Website': book_website}, 
                                           index=[0])
                
                # concat the book info together
                page_data = pd.concat([page_data, book_parse], ignore_index = True)
                
            except:
                
                # print 'error' if wrong
                print(book_name, 'parse error')
                
    return page_data

In [4]:
# create the data for all dataframe
data = pd.DataFrame(columns = ['Name', 'Pub_info', 'Date', 'Star', 'Tag', 'Comment', 'Website'])

# get the number of books
url = f'https://book.douban.com/people/{user_id}/collect?start=0&sort=time&rating=all&filter=all&mode=grid'
web_data = requests.get(url, headers = header)
soup = BeautifulSoup(web_data.text, features="lxml")

web_title = str(soup.title).split('>')[1].split('<')[0].replace('  ','').replace('\n','')
num_books = int(re.findall(r"\(\s*\+?(-?\d+)\s*\)", web_title)[0])
print('Total book #:', num_books)

# parse the book information by page
for i in range(0, math.ceil(num_books / 15)+1): # num_books ange(0, math.ceil(num_books / 15)+1)
    if i != math.ceil(num_books / 15):
        print(f'Parsing books: {i*15+1} ~ {i*15+15}...')
    else:
        print(f'Parsing books: {i*15+1} ~ {num_books}...')
    url = f'https://book.douban.com/people/{user_id}/collect?start={i*15}&sort=time&rating=all&filter=all&mode=grid'
    book_info = get_books_info_from_page(url, header)
    data = pd.concat([data, book_info], ignore_index = True)
    
print('Finish parsing all books.')

Total book #: 491
Parsing books: 1 ~ 15...
Parsing books: 16 ~ 30...
Parsing books: 31 ~ 45...
Parsing books: 46 ~ 60...
Parsing books: 61 ~ 75...
Parsing books: 76 ~ 90...
Parsing books: 91 ~ 105...
Parsing books: 106 ~ 120...
Parsing books: 121 ~ 135...
Parsing books: 136 ~ 150...
Parsing books: 151 ~ 165...
Parsing books: 166 ~ 180...
Parsing books: 181 ~ 195...
Parsing books: 196 ~ 210...
Parsing books: 211 ~ 225...
Parsing books: 226 ~ 240...
Parsing books: 241 ~ 255...
Parsing books: 256 ~ 270...
Parsing books: 271 ~ 285...
Parsing books: 286 ~ 300...
Parsing books: 301 ~ 315...
Parsing books: 316 ~ 330...
Parsing books: 331 ~ 345...
Parsing books: 346 ~ 360...
Parsing books: 361 ~ 375...
Parsing books: 376 ~ 390...
Parsing books: 391 ~ 405...
Parsing books: 406 ~ 420...
Parsing books: 421 ~ 435...
Parsing books: 436 ~ 450...
Parsing books: 451 ~ 465...
Parsing books: 466 ~ 480...
Parsing books: 481 ~ 495...
Parsing books: 496 ~ 491...
Finish parsing all books.


In [5]:
# clean the final parsed data
data['Date'] = pd.to_datetime(data['Date'])
data = data.sort_values('Date').reset_index(drop=True)
data.tail(3)

Unnamed: 0,Name,Pub_info,Date,Star,Tag,Comment,Website
488,透明的红萝卜,莫言 / 浙江文艺出版社 / 2020-7 / 49.00元,2025-04-15,4,汉语文学 现当代文学,《透明的红萝卜》是莫言的成名作，作为他作品中的第一个“特异儿童”（后续中有《生死疲劳》中一出...,https://book.douban.com/subject/35096959/
489,爆炸,莫言 / 浙江文艺出版社 / 2020-7 / 46.00元,2025-04-26,4,汉语文学 现当代文学,《爆炸》描写了一个丈夫在得知妻子怀孕后，在特殊年代不得不带着妻子去流产。传统观念和现代思想，...,https://book.douban.com/subject/35096965/
490,沉思,[奥地利] 弗朗茨·卡夫卡 / 彤雅立 / 北京燕山出版社 / 2021-1 / 45.00,2025-05-06,2,外国文学 德语文学,这本书是卡夫卡于29岁发表的第一本书。大部分都是灵光乍现的小短文，看上去似乎有些没头没尾的。...,https://book.douban.com/subject/35218473/


In [6]:
# export the data to a excel file
data.to_excel("book_review_sample_result.xlsx")