reference: https://medium.com/bitgrit-data-science-publication/scraping-100-free-data-science-books-with-python-5b5c515033a7

target website: https://www.learndatasci.com/

what we get from inspection:

- `<div class="star-ratings">` — Goodreads rating and number of ratings
- `<div class="book-cats">` — Book category
- `<h2>` — Book title
- `<div class="meta-auth">` — author name, year
- `<p>` — book description
- `<a class="btn" ...>` — book link and Amazon review link


In [1]:
# web scraping libraries
from urllib.request import urlopen # open urls
from bs4 import BeautifulSoup # extract data from html files

# ds libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt

# Global plot styling: seaborn "whitegrid" theme and the default figure size
# used by matplotlib when a cell does not set its own.
sns.set_theme(style="whitegrid")
plt.rcParams["figure.figsize"] = (12, 8) # default plot size

# regex
import re

# word cloud
#from wordcloud import WordCloud, STOPWORDS

# interactive tables on google colab
#from google.colab import data_table

- urllib.request — used to open our website and return HTML data.
- bs4 — Beautiful soup library, the star of the show, helps us extract the right data from HTML.
- wordcloud — create word cloud plots for our text data analysis
- re — python regular expression library

In [1]:
url = "https://www.learndatasci.com/free-data-science-books/"

# get html of page
# The context manager closes the HTTP response as soon as the body has been
# read, instead of leaving the connection open for the rest of the session.
with urlopen(url) as response:
  html = response.read()

# create bs object
soup = BeautifulSoup(html, 'lxml') # using lxml parser
# you will need pip install lxml
# OR
#soup = BeautifulSoup(html,  "html.parser")


# get title
title = soup.title
print(title.get_text())

NameError: name 'urlopen' is not defined

In [None]:
# Collect the <section> elements whose class attribute is empty; the original
# author uses this filter to leave the ad section out of the result.
books = soup.find_all('section', attrs={"class": ""}) # to prevent getting ad section

# Dump the HTML of the first match to see which tags hold which field.
book1 = books[0]
print(book1.prettify())

- soup.find() — first occurrence of class/tag
- soup.find_all() — all occurrences of class/tag
- soup.find().find() — searching within a class/tag
- .get_text() — returns the text of the HTML tag
- .prettify() — pretty output of HTML

In [None]:
# Walk through every field of the first book, one tag lookup at a time.

# rating and number of ratings sit in the same star-ratings block
star_block = book1.find(class_='star-ratings')
rating = star_block.find('b').get_text()
ratings_text = star_block.find('span').get_text()
total_ratings = re.search(r'\d+', ratings_text).group() # get numbers only

book_cat = book1.find(class_='book-cats').get_text()
title = book1.find('h2').get_text()

# for this book the bold meta-auth text splits into exactly author and year
author_year_text = book1.find(class_='meta-auth').find('b').get_text()
author, year = author_year_text.split(', ')

desc = book1.find('p').get_text()

# first anchor is used as the book link, second as the review link
links = book1.find_all('a')
book_link = links[0].get('href')
review_link = links[1].get('href')

print(f"title: {title}")
print(f"category: {book_cat}")
print(f"author: {author}")
print(f"year: {year}")
print(f"rating: {rating}")
print(f"total_ratings: {total_ratings}")
print(f"description: {desc}")
print(f"link: {book_link}")
print(f"review link: {review_link}")

books without year and multiple authors

In [None]:
book7 = books[7] # book without year
book35 = books[35] # book without year but multiple author
book17 = books[17] # book with multiple authors

# print the raw author/year text of each sample to compare the formats
for sample in (book1, book7, book35, book17):
  print(sample.find(class_='meta-auth').find('b').get_text())

In [None]:
# Swap the active line to try the parsing on another sample book:
# author = book1.find(class_='meta-auth').find('b').get_text()
# author = book7.find(class_='meta-auth').find('b').get_text()
author = book17.find(class_='meta-auth').find('b').get_text()
author = book35.find(class_='meta-auth').find('b').get_text()

# some books don't have year and has multiple authors:
# a year is assumed only when the text contains digits, and it is then the
# last comma-separated item; everything before it is the author name(s)
year = None
if re.search(r'\d+', author) is not None:
  *author_names, year = author.split(", ")
  author = ", ".join(author_names)

print(author)
print(year)

books without rating

In [None]:
book23 = books[23] # book without rating

# rated book first, then a blank line, then the unrated one
rated_markup = book1.find(class_='star-ratings').prettify()
print(rated_markup)
print()
unrated_markup = book23.find(class_='star-ratings').prettify()
print(unrated_markup)

In [None]:
# rating = book1.find(class_='star-ratings').find('b')
# total_ratings = book1.find(class_='star-ratings').find('span')
star_block = book23.find(class_='star-ratings')
rating = star_block.find('b')
total_ratings = star_block.find('span')

# some books don't have ratings: the values are only converted to text when
# both tags were found, otherwise they are left exactly as find() returned them
both_found = rating is not None and total_ratings is not None
if both_found:
  rating = rating.get_text()
  ratings_text = total_ratings.get_text()
  total_ratings = re.search(r'\d+', ratings_text).group()


print(rating)
print(total_ratings)

books without review link

In [None]:
book8 = books[8] # book without review link

# number of anchors in a book with a review link vs. one without
for sample in (book1, book8):
  print(len(sample.find_all('a')))

In [None]:
links = book8.find_all('a')
book_link = links[0].get('href')

# the review link is the second anchor, taken only when there are exactly two
review_link = links[1].get('href') if len(links) == 2 else None

print(book_link)
print(review_link)

books without description

In [None]:
book13 = books[13] # book without desc

# first <p> of a book with a description vs. one without
for sample in (book1, book13):
  print(sample.find('p'))

storing and building dataframe

In [None]:
# One independent accumulator list per DataFrame column; getInfo() appends
# one value to each of them for every book.
(
    title_list,
    book_cat_list,
    author_list,
    year_list,
    rating_list,
    total_ratings_list,
    description_list,
    book_link_list,
    review_link_list,
) = ([] for _ in range(9))

In [None]:
def _text_or_none(tag):
  """Return the text of a tag, or None when the lookup found nothing."""
  return tag.get_text() if tag is not None else None


def _split_author_year(text):
  """Split "Author A, Author B, 2015" into ("Author A, Author B", "2015").

  A year is assumed only when the text contains a digit; it is then the last
  comma-separated item. Without digits the whole text is the author and the
  year is None.
  """
  if re.search(r'\d', text) is None:
    return text, None
  *names, year = text.split(", ")
  return ", ".join(names), year


def _parse_ratings(star_block):
  """Return (rating, total_ratings) as strings, None for whatever is missing."""
  if star_block is None:
    return None, None

  rating = _text_or_none(star_block.find('b'))
  total_ratings = _text_or_none(star_block.find('span'))

  if total_ratings is not None:
    # keep the whole count even when it is written with thousands separators
    count = re.search(r'\d[\d,]*', total_ratings)
    total_ratings = count.group().replace(',', '') if count else None

  return rating, total_ratings


def getInfo(book):
  """Extract one book's fields and append them to the module-level lists.

  Parameters
  ----------
  book : the parsed HTML element of a single book (it must support
      ``find`` and ``find_all`` the way the elements in ``books`` do).

  Returns
  -------
  None. One value is appended to each of title_list, book_cat_list,
  author_list, year_list, rating_list, total_ratings_list, description_list,
  book_link_list and review_link_list. Fields that the book does not provide
  are stored as None.

  Raises
  ------
  AttributeError if the book has no <h2> title. Nothing is appended in that
  case, so the lists always stay the same length.
  """
  # --- parse everything first; nothing is stored until all of it succeeded ---

  # title is mandatory
  title = book.find('h2').get_text()

  book_cat = _text_or_none(book.find(class_='book-cats'))

  # some books don't have year and some books have multiple authors
  meta = book.find(class_='meta-auth')
  author_tag = meta.find('b') if meta is not None else None
  if author_tag is not None:
    author, year = _split_author_year(author_tag.get_text())
  else:
    author, year = None, None

  # some books don't have ratings
  rating, total_ratings = _parse_ratings(book.find(class_='star-ratings'))

  # books without description
  desc = _text_or_none(book.find('p'))

  # first anchor is the book link; some books don't have a review link, which
  # is the second anchor when there are exactly two
  links = book.find_all('a')
  book_link = links[0].get('href') if links else None
  review_link = links[1].get('href') if len(links) == 2 else None

  # --- store: one append per list, all together ---
  title_list.append(title)
  book_cat_list.append(book_cat)
  author_list.append(author)
  year_list.append(year)
  rating_list.append(rating)
  total_ratings_list.append(total_ratings)
  description_list.append(desc)
  book_link_list.append(book_link)
  review_link_list.append(review_link)

In [None]:
# Start from empty lists so that re-running this cell does not append every
# book a second time and duplicate the rows of the DataFrame built from them.
for column_values in (title_list, book_cat_list, author_list, year_list,
                      rating_list, total_ratings_list, description_list,
                      book_link_list, review_link_list):
  column_values.clear()

for book in books:
  getInfo(book)

In [None]:
# column name -> scraped values, in the order the columns should appear
scraped_columns = {
    "title": title_list,
    "book_cat": book_cat_list,
    "author": author_list,
    "year": year_list,
    "rating": rating_list,
    "total_ratings": total_ratings_list,
    "description": description_list,
    "book_link": book_link_list,
    "review_link": review_link_list,
}

df_books = pd.DataFrame(scraped_columns)
df_books.head()

In [None]:
# column dtypes and non-null counts, to spot the columns with missing values
df_books.info()

data cleaning

What can we do to replace these missing values? Here’s what I thought of:
- book_cat — check the book, and impute it ourselves manually since it’s only one book
- year — leave it empty for now
- rating — replace with 0.0
- total_ratings — replace with 0
- description & review_link — replace with "None"

In [None]:
# missing values per column
df_books.isna().sum()

In [None]:
# rows whose category is missing
missing_category = df_books['book_cat'].isna()
df_books.loc[missing_category]

In [None]:
# distinct category values currently present in the column
df_books['book_cat'].unique()

In [None]:
# replacement for missing values, per column (year is deliberately left out)
fill_values = {
    'rating': '0.0',
    'total_ratings': '0',
    'book_cat': 'Artificial Intelligence',
    'description': 'None',
    'review_link': 'None',
}

df_books.fillna(fill_values, inplace=True)
df_books.isnull().sum()

data transformation

- year -> nullable integer (Int64)
- rating -> float
- total_ratings -> integer

In [None]:
# data transformation
# first let pandas pick its nullable dtypes for every column ...
df_books = df_books.convert_dtypes()

# ... then cast the numeric columns explicitly: year and total_ratings to
# nullable integers, rating to float
numeric_dtypes = {
    'year': 'Int64',
    'rating': 'float64',
    'total_ratings': 'Int64',
}
df_books = df_books.astype(numeric_dtypes)

df_books.dtypes

<h2>Exploratory Data Analysis</h2>

In [None]:
from wordcloud import WordCloud, STOPWORDS

def plot_wordcloud(text, file_name, stopwords_list=[], max_words = 500):
  """Build a word cloud from `text`, save it as a PNG and display it.

  text           -- the string the word cloud is generated from
  file_name      -- output path without extension; ".png" is appended
  stopwords_list -- extra words to ignore, on top of wordcloud's STOPWORDS
  max_words      -- maximum number of words passed to WordCloud
  """
  # default stop words plus the caller's additions
  ignored_words = set(STOPWORDS).union(stopwords_list)

  # generate word cloud
  cloud_builder = WordCloud(
      width=1000,
      height=600,
      stopwords=ignored_words,
      max_words=max_words,
      background_color="white",
  )
  wordcloud = cloud_builder.generate(text)

  # save the image next to the notebook
  output_path = file_name + ".png"
  wordcloud.to_file(output_path)

  # display the generated image without axes
  plt.figure(figsize=(12,8))
  plt.imshow(wordcloud, interpolation='bilinear')
  plt.axis("off")

word cloud of book titles

In [None]:
# all titles as one space-separated string
text = " ".join(df_books.title)

# distinct space-separated tokens in that string
distinct_tokens = set(text.split(" "))
print("There are {} words in the combination of all titles.".format(len(distinct_tokens)))

In [None]:
# word cloud of the titles, saved as 100ds_titles.png
plot_wordcloud(text, file_name="100ds_titles")

In [None]:
# all descriptions as one space-separated string
text = " ".join(df_books.description)

# distinct space-separated tokens in that string
distinct_tokens = set(text.split(" "))
print("There are {} words in the combination of all description.".format(len(distinct_tokens)))

In [None]:
# add None to stopwords: it is the placeholder used for missing descriptions
plot_wordcloud(
    text,
    "100ds_book_descriptions",
    stopwords_list=['None'],
    max_words=1000,
)

In [None]:
# Book category histogram: number of books per category, categories on the y axis

sns.histplot(data=df_books, y='book_cat', discrete=True);

In [None]:
# Book year: number of books per year

sns.histplot(data=df_books, x='year', discrete=True);

In [None]:
# Book rating and total ratings: summary statistics of both numeric columns

df_books[['rating', 'total_ratings']].describe()

In [None]:
# two stacked panels: rating on top, number of ratings below
fig, axes = plt.subplots(2)
rating_ax, total_ratings_ax = axes

sns.histplot(data=df_books, x='rating', discrete=True, ax=rating_ax);
sns.histplot(data=df_books, x='total_ratings', ax=total_ratings_ax);

In [None]:
# two stacked panels: one box plot per numeric column
fig, axes = plt.subplots(2)

for panel, column in zip(axes, ('rating', 'total_ratings')):
  sns.boxplot(x=df_books[column], ax=panel);

In [None]:
# One point per book: rating on x against number of ratings on y.
# NOTE(review): recent seaborn releases warn when `palette` is passed without
# `hue` — confirm against the installed version.
sns.stripplot(x='rating', y = 'total_ratings', data=df_books, 
              linewidth=1, size = 15, alpha=.50, palette = "deep");

<h3>Which book to read?</h3>

In [None]:
# Shortlist: more than 1500 ratings and a rating above 4.0.
# .iloc[:, :6] keeps only the first six columns for display.
df_books[(df_books['total_ratings'] > 1500) & (df_books['rating'] > 4.0)].iloc[:, :6]

<h4>Top 10</h4>

In [None]:
# ten books with the most ratings, first six columns only
(
    df_books
    .sort_values(by=['total_ratings'], ascending=False)
    .head(10)
    .iloc[:, :6]
)

In [None]:
# ten best-rated books, ties broken by number of ratings; first six columns only
(
    df_books
    .sort_values(by=['rating', 'total_ratings'], ascending=False)
    .head(10)
    .iloc[:, :6]
)