In [1]:
import os
import re
from urllib.parse import urljoin

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Page Links
Since the search results span multiple pages, we first build the link to each results page.

In [2]:
# Browser-like request headers passed to requests.get when fetching pages
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko)Chrome/44.0.2403.157 Safari/537.36',
    'Accept-Language': 'en-US, en;q=0.5',
}

## Note
The URL of the search results page has changed, so I have rewritten the script to use the new one.

In [3]:
# Base search URL for the series; the results-page number is appended to it
url = 'https://mitpress.mit.edu/search-result-list/?keyword=essential+knowledge&series=mit-press-essential-knowledge-series&page_number='


def get_url(x):
    """Return the search-results URL for results page number `x`."""
    return f'{url}{x}'


# Highest page number to probe first.
# NOTE(review): assumes the series never has more than 12 result pages — confirm.
num = 12
page = requests.get(get_url(num), headers=headers, timeout=30)
soup = BeautifulSoup(page.content, 'html.parser')

# Step backwards until a page contains a 'supapress-page' element
# (presumably only real result pages have one — verify against the site).
# The `num > 1` bound stops the search at page 1, so a markup change or an
# outage can no longer cause an endless stream of requests for page 0, -1, ...
while not soup.find_all(class_='supapress-page') and num > 1:
    num -= 1

    page = requests.get(get_url(num), headers=headers, timeout=30)
    soup = BeautifulSoup(page.content, 'html.parser')

In [4]:
# Create a list of URLs, one per results page (1..num inclusive).
# A comprehension keeps its loop variable local, so the `page` response
# object bound by the earlier requests.get call is not overwritten by an int.
pages = [get_url(page_number) for page_number in range(1, num + 1)]

In [5]:
# Show the generated URLs, one per line. A single print call avoids building
# (and echoing as the cell result) a throwaway list of None values.
print(*pages, sep='\n')

https://mitpress.mit.edu/search-result-list/?keyword=essential+knowledge&series=mit-press-essential-knowledge-series&page_number=1
https://mitpress.mit.edu/search-result-list/?keyword=essential+knowledge&series=mit-press-essential-knowledge-series&page_number=2
https://mitpress.mit.edu/search-result-list/?keyword=essential+knowledge&series=mit-press-essential-knowledge-series&page_number=3
https://mitpress.mit.edu/search-result-list/?keyword=essential+knowledge&series=mit-press-essential-knowledge-series&page_number=4
https://mitpress.mit.edu/search-result-list/?keyword=essential+knowledge&series=mit-press-essential-knowledge-series&page_number=5
https://mitpress.mit.edu/search-result-list/?keyword=essential+knowledge&series=mit-press-essential-knowledge-series&page_number=6
https://mitpress.mit.edu/search-result-list/?keyword=essential+knowledge&series=mit-press-essential-knowledge-series&page_number=7


[None, None, None, None, None, None, None]

# Information to be Collected
We are going to collect the following data from each book:
- Title
- Author
- Number of pages
- Link to book
- Link to author's webpage(s)
- Publish Date
- Price
- Link to purchase the book

In [6]:
# Initialize lists to add data (index-aligned: titles[i] belongs to links[i])
titles  = []
links   = []

for page in pages:
    # Create soup. Send the same headers as the other requests in this
    # notebook, and bound the wait so a stalled connection cannot hang the cell.
    raw_page = requests.get(page, headers=headers, timeout=30)
    soup = BeautifulSoup(raw_page.content, 'html.parser')

    # Get relevant html: every element carrying the 'information-wrapper' class
    results = soup.find_all(class_="information-wrapper")

    for data in results:
        # Look the title element up once and reuse it for both text and URL
        title = data.find(class_='sp__the-title')
        anchor = title.a if title is not None else None
        url_ending = anchor.get('href') if anchor is not None else None

        if not url_ending:
            # Without a title link neither field can be recorded; skip the
            # entry so that both lists keep the same length.
            continue

        # Get the title
        full_title = title.string
        if full_title is None:
            # .string is None when the element holds more than one text node;
            # stitch the pieces of the link text together instead.
            full_title = ' '.join(anchor.stripped_strings)

        titles.append(full_title)

        # Get the URL; urljoin copes with both site-relative and absolute hrefs
        links.append(urljoin('https://mitpress.mit.edu', url_ending))

In [7]:
# Per-book fields. Each list receives exactly one entry per URL in `links`
# (None where a field is missing), so all lists stay index-aligned.
authors = []
authors_webpages = []
publish_dates = []
book_length = []
prices = []
purchase_links = []


def _field_text(section, css_class):
    """Return the stripped text of the first `css_class` tag in `section`, or None if absent."""
    tag = section.find(class_=css_class) if section is not None else None
    return tag.get_text(strip=True) if tag is not None else None


for link in links:
    # Create soup. Send the same headers as the other requests in this
    # notebook, and bound the wait so a stalled connection cannot hang the cell.
    raw_page = requests.get(link, headers=headers, timeout=30)
    soup = BeautifulSoup(raw_page.content, 'html.parser')

    # Details section of the book page; None if the expected markup is absent
    info = soup.find(class_="book-wrapper__top-section--details")

    # Get the author's name(s) and websites; multiple authors are joined by ' & '
    author_metadata = info.find_all(class_='sp__author-link') if info is not None else []
    authors.append(' & '.join(tag.get_text(strip=True) for tag in author_metadata))
    # A link without an href contributes an empty string instead of breaking join()
    authors_webpages.append(' & '.join(tag.get('href') or '' for tag in author_metadata))

    # Publish date, with the 'Published: ' label removed
    published = _field_text(info, 'sp__published')
    publish_dates.append(None if published is None else published.replace('Published: ', ''))

    # Number of pages: take the first run of digits rather than deleting a
    # ' pp.' suffix, so extra text or a thousands separator cannot break int()
    pages_text = _field_text(info, 'sp__the-pages')
    page_count = re.search(r'\d+', pages_text.replace(',', '')) if pages_text else None
    book_length.append(int(page_count.group()) if page_count else None)

    # Price (kept as the displayed text)
    prices.append(_field_text(info, 'sp__price'))

    # Purchase links
    retail = info.find(class_='retail-link') if info is not None else None
    purchase_links.append(retail.get('href') if retail is not None else None)

## Save data as a DataFrame

In [8]:
# Map each output column name to the list collected for it
metadata = dict(
    title=titles,
    link=links,
    authors=authors,
    authors_webpage=authors_webpages,
    publish_date=publish_dates,
    book_length=book_length,
    price=prices,
    purchase_links=purchase_links,
)

In [9]:
# Build the table from the column -> values mapping and preview the first rows
df = pd.DataFrame(metadata)
df.head()

Unnamed: 0,title,link,authors,authors_webpage,publish_date,book_length,price,purchase_links
0,The Future,https://mitpress.mit.edu/9780262534819/the-future,Nick Montfort,https://mitpress.mit.edu/author/nick-montfort-...,"December 8, 2017",192,$15.95,https://www.penguinrandomhouse.com/search/site...
1,Critical Thinking,https://mitpress.mit.edu/9780262538282/critica...,Jonathan Haber,https://mitpress.mit.edu/author/jonathan-haber...,"April 7, 2020",232,$15.95,https://www.penguinrandomhouse.com/search/site...
2,Hunting,https://mitpress.mit.edu/9780262543293/hunting,Jan E. Dizard & Mary Zeiss Stange,https://mitpress.mit.edu/author/jan-e-dizard-3...,"October 4, 2022",248,$16.95,https://www.penguinrandomhouse.com/search/site...
3,Content,https://mitpress.mit.edu/9780262543286/content,Kate Eichhorn,https://mitpress.mit.edu/author/kate-eichhorn-...,"May 10, 2022",192,$15.95,https://www.penguinrandomhouse.com/search/site...
4,"Machine Learning, revised and updated edition",https://mitpress.mit.edu/9780262542524/machine...,Ethem Alpaydın,https://mitpress.mit.edu/author/ethem-alpaydn-...,"August 17, 2021",280,$15.95,https://www.penguinrandomhouse.com/search/site...


# Save Data
Save the DataFrame as both a CSV file and an Excel file.

In [10]:
# Path to the output folder ('saved_data' resolved against the current working directory)
path = os.path.realpath('saved_data')
# Create the folder if it does not exist yet so the writes below cannot fail on it
os.makedirs(path, exist_ok=True)

# CSV file. os.path.join uses the host OS's separator; a hard-coded Windows
# backslash would become part of the file name on other systems.
df.to_csv(os.path.join(path, 'data.csv'), index=False)

# Excel file
df.to_excel(os.path.join(path, 'data.xlsx'), index=False)