# Import thư viện

In [None]:
from bs4 import BeautifulSoup
import requests
import json
import csv
import os

# Util

### Write JSON to a .json file

In [None]:
def writeJsonToFile(html):
    """Dump ``html`` as JSON into ``out.json`` in the current directory.

    Despite its name, ``html`` may be any object ``json.dump`` can
    serialize. An existing ``out.json`` is overwritten.

    Returns whatever ``json.dump`` returns (``None``).
    """
    with open('out.json', 'w') as out_file:
        outcome = json.dump(html, out_file)
    return outcome

### Write records to .csv

In [None]:
def add_dict_to_csv(dict):
    """Write one row to ``./data_train.csv``.

    ``dict`` is any iterable of cell values (callers in this file pass
    ``some_dict.keys()`` or ``some_dict.values()``); it is handed to
    ``csv.writer.writerow`` unchanged. The file is created on first use
    and appended to afterwards.
    """
    csv_path = "./data_train.csv"
    # Start a new file if none exists yet, otherwise append to it.
    open_mode = "a" if os.path.isfile(csv_path) else "w"
    with open(csv_path, mode=open_mode, newline="", encoding="utf-8") as handle:
        csv.writer(handle).writerow(dict)

### Convert Timestamp to year

In [None]:
import datetime


def convert_timestamp_to_year(timestamp):
    """Return the calendar year of a Unix timestamp given in milliseconds.

    The timestamp is interpreted relative to 1970-01-01 00:00:00 (naive,
    no timezone applied). Negative values denote instants before the epoch.

    Args:
        timestamp: Milliseconds since the epoch (int or float).

    Returns:
        The year as a string (e.g. ``"2008"``), or ``None`` when the value
        cannot be turned into a date (not a number, NaN/inf, or outside the
        range ``datetime`` supports).
    """
    epoch = datetime.datetime(1970, 1, 1)
    try:
        # Let timedelta handle the sign and the sub-second part. Truncating
        # abs(timestamp) to whole seconds first rounds negative timestamps
        # toward the epoch and can report the following year.
        date = epoch + datetime.timedelta(milliseconds=timestamp)
    except (OverflowError, ValueError, TypeError):
        return None
    # str(year) rather than strftime("%Y"): the latter pads years below 1000
    # differently depending on the platform's C library.
    return str(date.year)

# I.    CRAWL DATA

In [None]:
# Template record for one crawled book: every field starts out as None.
# The key order is also the column order used when rows are written to CSV.
book_dict_origin = dict.fromkeys((
    'title',          # book title
    'author',         # author
    'language',       # language of the book
    'avg_ratings',    # average rating score
    'ratings_count',  # number of ratings
    'reviews_count',  # number of reviews
    'publisher',      # publisher (company or person)
    'publish_year',   # year of publication
    'num_pages',      # number of pages
    'page_format',    # binding / cover type
    'genres',         # genres
    'link',           # URL of the book page
))


In [None]:
def get_soup(bookURL, timeout=30, retry_delay=1.0, max_retries=None):
    """Download ``bookURL`` and return the page parsed with BeautifulSoup.

    The request is repeated until the server answers with HTTP 200.

    Args:
        bookURL: URL to fetch.
        timeout: Seconds to wait for the server before treating the attempt
            as failed. Without a timeout a stalled connection would block
            the crawl forever.
        retry_delay: Seconds to pause between attempts, so a failing server
            is not hit in a tight loop.
        max_retries: Maximum number of retries after the first attempt.
            ``None`` (the default) retries indefinitely.

    Returns:
        A ``BeautifulSoup`` tree built from the response body using the
        ``html.parser`` backend.

    Raises:
        RuntimeError: If ``max_retries`` is set and every attempt failed.
    """
    import time  # local import: only needed for the pause between retries

    headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.209.0 Safari/532.0'}
    attempts = 0
    while True:
        attempts += 1
        last_error = None
        try:
            response = requests.get(bookURL, headers=headers, timeout=timeout)
        except requests.RequestException as e:
            # Network-level failure (connection error, timeout, ...): report
            # it and fall through to the retry logic below.
            print(e)
            last_error = e
        else:
            if response.status_code == 200:
                return BeautifulSoup(response.content, 'html.parser')

        if max_retries is not None and attempts > max_retries:
            raise RuntimeError(
                f"could not fetch {bookURL} after {attempts} attempt(s)"
            ) from last_error
        time.sleep(retry_delay)



In [None]:
def get_book_details_dict(soup):
    """Extract the fields of one book from its parsed book page.

    Visible fields (title, author, rating figures) are read from the HTML;
    the remaining ones come from the JSON state embedded in the page's first
    ``<script type="application/json">`` tag.

    Args:
        soup: ``BeautifulSoup`` tree of a book page.

    Returns:
        A new dict with the same keys as ``book_dict_origin``. ``link`` is
        left as ``None`` for the caller to fill in; ``genres`` stays ``None``
        when the page lists no genres; ``publish_year`` is ``None`` when the
        page carries no publication time.

    Raises:
        AttributeError: If an expected HTML element is missing.
        StopIteration: If the embedded state has no ``Book:`` entry with
            details.
        KeyError: If an expected key is absent from the embedded JSON.
    """
    # Work on a copy. Assigning the template itself would make every call
    # mutate and return the same shared dict, so values from a previous book
    # (e.g. its genres) would leak into the next one.
    book_dict = dict(book_dict_origin)

    # Title
    book_dict['title'] = soup.find('h1', {'class': 'Text__title1'}).text

    # Author
    book_dict['author'] = soup.find('span', {'class': 'ContributorLink__name'}).text

    # Ratings count: first whitespace-separated token, commas removed
    ratings_text = soup.find('span', {'data-testid': 'ratingsCount'}).text
    book_dict['ratings_count'] = ratings_text.split()[0].replace(',', '')

    # Average rating
    book_dict['avg_ratings'] = soup.find('div', {'class': 'RatingStatistics__rating'}).text

    # Reviews count: first whitespace-separated token, commas removed
    reviews_text = soup.find('span', {'data-testid': 'reviewsCount'}).text
    book_dict['reviews_count'] = reviews_text.split()[0].replace(',', '')

    # Raw details from the JSON state embedded in the page
    raw_state = soup.find('script', {'type': 'application/json'}).get_text()
    apollo_state = json.loads(raw_state)['props']['pageProps']['apolloState']
    # First entry whose key mentions "Book:" and which carries a "details" field.
    book_details_wrap = next(
        val for key, val in apollo_state.items()
        if "Book:" in key and "details" in val
    )
    book_details = book_details_wrap['details']

    # Publisher
    book_dict['publisher'] = book_details['publisher']

    # Language
    book_dict['language'] = book_details['language']['name']

    # Publication year; the source value is a timestamp in milliseconds
    timestamp = book_details['publicationTime']
    book_dict['publish_year'] = (
        convert_timestamp_to_year(timestamp) if timestamp else None
    )

    # Page count
    book_dict['num_pages'] = book_details['numPages']

    # Page format (e.g. 'Hardcover')
    book_dict['page_format'] = book_details['format']

    # Genres: list of genre names, left as None when there are none
    genres = book_details_wrap['bookGenres']
    if genres:
        book_dict['genres'] = [item['genre']['name'] for item in genres]
    return book_dict

In [40]:
# Crawl pages 1-100 of the "Best Books Ever" list and append one CSV row per
# book to the file written by add_dict_to_csv.
try:
    rootURL = 'https://www.goodreads.com'

    # Write the CSV header only when starting a fresh file. Appending it on
    # every run would put a header row in the middle of existing data.
    # NOTE: this path must match the one used inside add_dict_to_csv.
    csvPath = "./data_train.csv"
    if not os.path.isfile(csvPath) or os.path.getsize(csvPath) == 0:
        add_dict_to_csv(book_dict_origin.keys())

    # Crawl every book and save it to the CSV
    count = 0
    for i in range(1, 101):
        # URL of page i of the list
        ListBookURL = f'{rootURL}/list/show/1.Best_Books_Ever?page={i}'

        # Refetch until the response actually contains book links.
        listBook_soup = None
        while True:
            listBook_soup = get_soup(ListBookURL)
            if books := listBook_soup.find_all('a', {'class': 'bookTitle'}):
                break

        for book in books:
            # Absolute URL of the book page
            bookURL = rootURL + book['href']

            print("fetching data...", bookURL)

            # Refetch until the response contains the title element that
            # get_book_details_dict needs.
            bookSoup = None
            while True:
                bookSoup = get_soup(bookURL)
                if bookSoup.find('h1', {'class': 'Text__title1'}):
                    break

            # Extract the details and record where they came from
            bookDetails = get_book_details_dict(bookSoup)
            bookDetails['link'] = bookURL
            print(bookDetails)

            # Save to CSV
            add_dict_to_csv(bookDetails.values())
            count = count + 1

            print("crawled and saved")
            print()
except Exception as e:
    print("Error:", e)
    # Bare raise re-raises the active exception with its original traceback.
    raise

fetching data... https://www.goodreads.com/book/show/2767052-the-hunger-games
{'title': 'The Hunger Games', 'author': 'Suzanne Collins', 'language': 'English', 'avg_ratings': '4.33', 'ratings_count': '7985663', 'reviews_count': '197597', 'publisher': 'Scholastic Press', 'publish_year': '2008', 'num_pages': 374, 'page_format': 'Hardcover', 'genres': ['Young Adult', 'Fiction', 'Dystopia', 'Fantasy', 'Science Fiction', 'Romance', 'Adventure', 'Teen', 'Post Apocalyptic', 'Audiobook'], 'link': 'https://www.goodreads.com/book/show/2767052-the-hunger-games'}
crawled and saved

fetching data... https://www.goodreads.com/book/show/2.Harry_Potter_and_the_Order_of_the_Phoenix


KeyboardInterrupt: 

### Get details for 10,000 books (100 books per page)

### test crawl a book

In [None]:
# try:
#     bookURL = 'https://www.goodreads.com/book/show/77203.The_Kite_Runner'
#     # bookURL = 'https://www.goodreads.com/book/show/18626461-messages-from-heaven'
#     soup = get_soup(bookURL)
#     bookDetails = get_book_details_dict(soup)
#     bookDetails['link'] = bookURL
#     add_dict_to_csv(bookDetails.values())
#     print(bookDetails)
# except Exception as e:
#     print("Error:", e)
#     winsound.Beep(440, 2000)  # play a beep if an error occurs
#     raise e
