In [4]:
import os
from urllib.parse import parse_qs, urlparse

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Pages
The series listing is split across several pages, so we first build the URL of every results page.

In [5]:
# Request headers: a browser-style User-Agent plus an Accept-Language preference.
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko)Chrome/44.0.2403.157 Safari/537.36',
    'Accept-Language': 'en-US, en;q=0.5',
}

In [6]:
# Landing page of the series; every results page is this URL plus a
# `?page=N` query parameter.
url = 'https://mitpress.mit.edu/books/series/mit-press-essential-knowledge-series'

# The timeout stops the cell from hanging on a stalled connection, and
# raise_for_status() reports an HTTP error here rather than as a confusing
# parsing failure further down.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

# Collect the page index advertised by each pager link.
page_num = []
for pager_item in soup.find_all(class_='pager__item'):
    pager_link = pager_item.a
    if pager_link is None or not pager_link.get('href'):
        continue  # a pager entry without a link carries no page number
    # Parse the query string properly instead of splitting on '=', so extra
    # query parameters in the href cannot be mistaken for the page index.
    page_values = parse_qs(urlparse(pager_link.get('href')).query).get('page', [])
    if page_values and page_values[0].isdigit():
        page_num.append(int(page_values[0]))

# Page indices start at 0; when the listing has no pager there is only page 0.
num_of_pages = max(page_num, default=0)

pages = [url + '?page=' + str(page_index) for page_index in range(num_of_pages + 1)]

In [7]:
# Display the list of results-page URLs built in the previous cell.
pages

['https://mitpress.mit.edu/books/series/mit-press-essential-knowledge-series?page=0',
 'https://mitpress.mit.edu/books/series/mit-press-essential-knowledge-series?page=1',
 'https://mitpress.mit.edu/books/series/mit-press-essential-knowledge-series?page=2']

## Information
- Title
- Author
- Link to book
- Publish Date
- Price

In [4]:
# Initialize lists to add data. Each list receives one entry per book, in page
# order, so that index k of every list describes the same book.
titles = []
authors = []
links = []
dates = []
prices = []

for page in pages:
    # Create soup. Send the same headers as the pager request, bound the wait
    # with a timeout, and fail loudly on an HTTP error status.
    raw_page = requests.get(page, headers=headers, timeout=30)
    raw_page.raise_for_status()
    soup = BeautifulSoup(raw_page.content, 'html.parser')

    # Get relevant html
    results = soup.find(class_="results__list")
    if results is None:
        raise ValueError("No 'results__list' element found on {}".format(page))

    # Book Title (get_text also copes with titles that contain nested tags,
    # where `.string` would be None)
    page_titles = [
        title_tag.get_text(' ', strip=True)
        for title_tag in results.find_all(class_='sm-teaser__title')
    ]

    # Author
    page_authors = []
    for meta_tag in results.find_all(class_="sm-teaser__meta--first"):
        name_tags = meta_tag.find_all(property='name')
        if len(name_tags) > 1:  # In case there are multiple authors
            page_authors.append(
                ', '.join(name_tag.get_text(' ', strip=True) for name_tag in name_tags)
            )
        elif meta_tag.a is not None:  # Single author, taken from the first link
            page_authors.append(meta_tag.a.get_text(' ', strip=True))
        else:  # No link at all: fall back to the element's own text
            page_authors.append(meta_tag.get_text(' ', strip=True))

    # Link (None keeps the row in place when a book entry has no anchor)
    page_links = [
        book_tag.a.get('href') if book_tag.a is not None else None
        for book_tag in results.find_all(typeof='Book')
    ]

    # Publish Date
    page_dates = [
        date_tag.get("datetime")
        for date_tag in results.find_all(property='publishDate')
    ]

    # Price
    page_prices = [
        price_tag.text.strip()
        for price_tag in results.find_all('span', property='price')
    ]

    # The fields are collected independently, so a book missing one of them
    # would silently shift every later row. Stop on the offending page instead.
    found = {
        'title': len(page_titles),
        'authors': len(page_authors),
        'link': len(page_links),
        'publish_date': len(page_dates),
        'price': len(page_prices),
    }
    if len(set(found.values())) != 1:
        raise ValueError(
            "Field counts differ on {}: {}".format(page, found)
        )

    titles.extend(page_titles)
    authors.extend(page_authors)
    links.extend(page_links)
    dates.extend(page_dates)
    prices.extend(page_prices)

## Save data as a DataFrame

In [5]:
# Column name -> list of scraped values, in the column order of the final table.
data = dict(
    title=titles,
    authors=authors,
    link=links,
    publish_date=dates,
    price=prices,
)

In [6]:
# Build the table from the column dict and preview its first five rows.
df = pd.DataFrame(data)
df.head(n=5)

Unnamed: 0,title,authors,link,publish_date,price
0,Placebos,Kathryn T Hall,https://mitpress.mit.edu/books/placebos,2022-09-20,$16.95
1,Happiness,Tim Lomas,https://mitpress.mit.edu/books/happiness-1,2022-09-20,$16.95
2,Hunting,"Jan Dizard, Mary Zeiss Stange",https://mitpress.mit.edu/books/hunting,2022-09-20,$16.95
3,Robot Ethics,Mark Coeckelbergh,https://mitpress.mit.edu/books/robot-ethics-1,2022-09-06,$16.95
4,Analog,Robert Hassan,https://mitpress.mit.edu/books/analog,2022-09-06,$16.95


# Save Data
Save the DataFrame as both a CSV file and an Excel file.

In [7]:
# Path to the folder (resolved against the current working directory).
# Create it up front so the writes below do not fail on a fresh checkout.
path = os.path.realpath('saved_data')
os.makedirs(path, exist_ok=True)

# CSV file. os.path.join picks the right separator for the platform; a
# hard-coded backslash is only a path separator on Windows.
df.to_csv(os.path.join(path, 'data.csv'), index=False)

# Excel file
df.to_excel(os.path.join(path, 'data.xlsx'), index=False)