In [2]:
import numpy as np
import requests
import bs4
import sys, os
import webbrowser
import json
import pandas as pd

## Downloading Files from the Web

> The requests module was written because Python’s urllib2 module is
too complicated to use. In fact, take a permanent marker and black out this
entire paragraph. Forget I ever mentioned urllib2. If you need to download
things from the web, just use the requests module.

In [4]:
rj_address = 'https://automatetheboringstuff.com/files/rj.txt'
rj_file = 'rj.txt'
# Download the file. The timeout (in seconds) makes an unresponsive server
# raise requests.exceptions.Timeout instead of blocking the cell indefinitely.
rj = requests.get(rj_address, timeout=10)
# Raise requests.HTTPError on a 4xx/5xx response so an error page is never saved
rj.raise_for_status()
# Save the raw response bytes to disk; binary mode writes them unchanged
with open(rj_file, 'wb') as f:
    f.write(rj.content)

In [5]:
# Preview the saved text: read its first five lines, then echo them as-is
with open(rj_file, 'r') as text_file:
    head_lines = [text_file.readline() for _ in range(5)]
# Each line already carries its own newline, so none is appended
print(''.join(head_lines), end='')

The Project Gutenberg EBook of Romeo and Juliet, by William Shakespeare

This eBook is for the use of anyone anywhere at no cost and with
almost no restrictions whatsoever.  You may copy it, give it away or
re-use it under the terms of the Project Gutenberg License included


## Parsing HTML with the bs4 Module

In [13]:
filename = 'example.html'
# Parse the local HTML file with Python's built-in 'html.parser' backend
with open(filename, 'r') as f:
    soup = bs4.BeautifulSoup(f, 'html.parser')

# Show only the first 100 characters of the pretty-printed markup
pretty_html = soup.prettify()
print(pretty_html[:100])

<!-- This is the example.html example file. -->
<html>
 <head>
  <title>
   The Website Title
  </ti


### Finding an Element with the `select()` Method

In [21]:
# Re-parse the example page for this cell
with open(filename, 'r') as f:
    soup = bs4.BeautifulSoup(f, 'html.parser')

# select() takes a CSS selector; '#author' matches on the id attribute
authors = soup.select('#author')

# Inspect what select() returned: container type, size, and the type of one item
print(f"Type of author: {type(authors)}")
print(f"Length of author: {len(authors)}")
print(f"Type of the first element: {type(authors[0])}\n")

# Take the first match and show three views of it:
# its markup, its text content, and its attribute dictionary
author = authors[0]
print(f"Author as a string: {str(author)}")
print(f"Author text: {author.getText()}")
print(f"Author attributes: {author.attrs}")

Type of author: <class 'bs4.element.ResultSet'>
Length of author: 1
Type of the first element: <class 'bs4.element.Tag'>

Author as a string: <span id="author">Al Sweigart</span>
Author text: Al Sweigart
Author attributes: {'id': 'author'}


In [23]:
# Re-parse the example page for this cell
with open(filename, 'r') as f:
    soup = bs4.BeautifulSoup(f, 'html.parser')

# Collect every <p> element in the document
paragraphs = soup.select('p')

# Print the text of every matched paragraph, one per line
for paragraph_text in (p.getText() for p in paragraphs):
    print(paragraph_text)

Download my Python book from my website.
Learn Python the easy way!
By Al Sweigart


### Getting Data from an Element’s Attributes

In [27]:
# Re-parse the example page for this cell
with open(filename, 'r') as f:
    soup = bs4.BeautifulSoup(f, 'html.parser')

# Gather all <span> elements and work with the first match
spans = soup.select('span')
span = spans[0]

# Show the tag's markup and its full attribute dictionary
print(f"Span element: {span}")
print(f"Span attributes: {span.attrs}")

# get() looks up a single attribute by name
span_id = span.get('id')
print(f"Span 'id' attribute: '{span_id}'")

Span element: <span id="author">Al Sweigart</span>
Span attributes: {'id': 'author'}
Span 'id' attribute: 'author'


## Project: Opening All Search Results

In [38]:
# Fetch the PyPI search results page for a specific Python package
url = "https://pypi.org/search/?q=numpy"
# The timeout (in seconds) keeps a stalled connection from hanging the cell
result = requests.get(url, timeout=10)
result.raise_for_status()

# Number of top search results to build links for
NUM_LINKS = 2

# Parse the results page and pick out the elements with class 'package-snippet'
soup = bs4.BeautifulSoup(result.text, 'html.parser')
links = soup.select('.package-snippet')
for link in links[:NUM_LINKS]:
    href = link.get('href')
    if href is None:
        # A matched element without an href cannot be turned into a URL
        continue
    # The href is appended to the site root to form the full link
    link_url = 'https://pypi.org' + href
    # Opening a browser tab is left disabled; uncomment to open each result
    # webbrowser.open(link_url)

## Project: Downloading All XKCD Comics

In [55]:
url = 'https://xkcd.com'
# Create a directory to store the comics
os.makedirs('xkcd', exist_ok=True)

# Start at the front page and follow the "prev" link, visiting up to
# NUM_COMICS pages in total
NUM_COMICS = 5
# Saving image files is opt-in; with False the loop only reports what it finds
SAVE_IMAGES = False
for i in range(NUM_COMICS):
    # Download the page (the timeout keeps a stalled server from hanging the loop)
    print(f'Downloading page {url}...')
    page = requests.get(url, timeout=10)
    page.raise_for_status()

    # Find the comic image: an <img> inside the element with id="comic"
    soup = bs4.BeautifulSoup(page.text, 'html.parser')
    images = soup.select('#comic img')
    if images:
        image = images[0]
        print(f"Image: {image}")
        print(f"Image attr: {image.attrs}")
        # The recorded src values are protocol-relative ('//imgs.xkcd.com/...'),
        # so the scheme is prepended to form a full URL
        comic_url = 'https:' + image.get('src')

        # Report the image URL; the file itself is fetched only when SAVE_IMAGES is True
        print(f'Downloading image {comic_url}...')
        if SAVE_IMAGES:
            image_response = requests.get(comic_url, timeout=10)
            image_response.raise_for_status()

            # Save the image to the directory ./xkcd under its original file name
            with open(os.path.join('xkcd', os.path.basename(comic_url)), 'wb') as f:
                f.write(image_response.content)
    else:
        # A page without a matching <img> is skipped instead of raising IndexError
        print('Could not find comic image.')

    # Get the Prev button's url
    prev_links = soup.select('a[rel="prev"]')
    prev_href = prev_links[0].get('href') if prev_links else None
    print()
    if prev_href is None or prev_href == '#':
        # No previous page to follow: a missing link or a bare '#' fragment
        break
    url = 'https://xkcd.com' + prev_href

print('Done.')

Downloading page https://xkcd.com...
Image: <img alt="Ferris Wheels" src="//imgs.xkcd.com/comics/ferris_wheels.png" srcset="//imgs.xkcd.com/comics/ferris_wheels_2x.png 2x" style="image-orientation:none" title="They left the belt drive in place but switched which wheel was powered, so people could choose between a regular ride, a long ride, and a REALLY long ride."/>
Image attr: {'src': '//imgs.xkcd.com/comics/ferris_wheels.png', 'title': 'They left the belt drive in place but switched which wheel was powered, so people could choose between a regular ride, a long ride, and a REALLY long ride.', 'alt': 'Ferris Wheels', 'srcset': '//imgs.xkcd.com/comics/ferris_wheels_2x.png 2x', 'style': 'image-orientation:none'}
Downloading image https://imgs.xkcd.com/comics/ferris_wheels.png...

Downloading page https://xkcd.com/2972/...
Image: <img alt="Helium Synthesis" src="//imgs.xkcd.com/comics/helium_synthesis.png" srcset="//imgs.xkcd.com/comics/helium_synthesis_2x.png 2x" style="image-orientation

## Python Books from No Starch Press

In [4]:
filename = 'nostarch_books_python_classified.json'
# Load the JSON file into a Pandas DataFrame.
# The encoding is passed explicitly because open() otherwise falls back to the
# platform's locale encoding, which would make decoding of the non-ASCII
# characters in the descriptions depend on the machine running the notebook.
# NOTE(review): assumes the file is stored as UTF-8 - confirm.
with open(filename, 'r', encoding='utf-8') as f:
    data = json.load(f)

# The file is fully parsed at this point, so the frame is built after it closes
df = pd.DataFrame(data)

In [6]:
# Preview the first three rows of the loaded frame
df.head(n=3)

Unnamed: 0,title,author,year,edition,full_description,level,score
0,Automate the Boring Stuff with Python,Al Sweigart,2019,2nd Edition,"Automate the Boring Stuff with Python, 3rd edi...",Beginner,0.64
1,Automate the Boring Stuff with Python,Al Sweigart,2025,3rd Edition,If you’ve ever spent hours renaming files or u...,Beginner,0.8
2,Beyond the Basic Stuff with Python,Al Sweigart,2020,,Look Inside!\n\n\n\nDownload Chapter 2: ENVIRO...,Intermediate,0.43


In [7]:
# Write the DataFrame out as CSV; index=False leaves the row index out of the file
csv_filename = 'nostarch_books_python_classified.csv'
df.to_csv(path_or_buf=csv_filename, index=False)

In [10]:
# Keep only the rows labelled 'Intermediate', then order them by score, highest first
is_intermediate = df['level'] == 'Intermediate'
intermediate_books = df[is_intermediate].sort_values(by='score', ascending=False)

In [12]:
# Display the filtered, score-sorted frame as the cell's output
intermediate_books

Unnamed: 0,title,author,year,edition,full_description,level,score
21,Object-Oriented Python,Irv Kalb,2021,,Look Inside!\n \n\nDownload Chapter 2: MODELIN...,Intermediate,0.71
3,Black Hat Python,Justin Seitz and Tim Arnold,2021,2nd Edition,Look Inside!\n\n \n\nDownload Chapter 3: Wri...,Intermediate,0.5
9,Gray Hat Python,Justin Seitz,2009,,"\nDownload Chapter 2: ""Debuggers and Debugger ...",Intermediate,0.49
29,Python Playground,Mahesh Venkitachalam,2023,2nd Edition,Download Chapter 1: THE KOCH SNOWFL AKE\nLook ...,Intermediate,0.46
28,Python One-Liners,Christian Mayer,2020,,Look Inside!\n\n\n\n\n\nDownload Chapter 2: PY...,Intermediate,0.45
8,Doing Math with Python,Amit Saha,2015,,“Saha does an excellent job providing a clear ...,Intermediate,0.44
26,Python for Excel Power Users,Tracy Stephens,2025,,"Excel is a mainstay of the modern workplace, b...",Intermediate,0.44
2,Beyond the Basic Stuff with Python,Al Sweigart,2020,,Look Inside!\n\n\n\nDownload Chapter 2: ENVIRO...,Intermediate,0.43
32,Serious Python,Julien Danjou,2018,,"""Serious Python contains a considerable amount...",Intermediate,0.42
34,The Art of Clean Code,Christian Mayer,2022,,Download Chapter 2: THE 80/20 PRINCIPLE\nLook ...,Intermediate,0.42
