#### Building a webscraper

Using the `requests` module to read the contents of the webpage

In [1]:
import requests

In [65]:
# Fetch the demo page. requests has no default timeout, so pass one explicitly
# to keep the cell from blocking forever if the server never responds.
res = requests.get(
    "https://codedamn-classrooms.github.io/webscraper-python-codedamn-classroom-website/",
    timeout=10,
)
text = res.text            # response body decoded to str
status = res.status_code   # HTTP status code of the response

print(text[:200], status) # limiting the amount of text printed

<!DOCTYPE html>
<html lang="en">
	<head>
		<!-- Anti-flicker snippet (recommended)  -->
		<style>
			.async-hide {
				opacity: 0 !important;
			}
		</style>
		<title>codedamn Web Scraper demo</title> 200


In [3]:
# One-time setup, only needed if bs4 is not installed yet (run in a terminal):
#   conda install -c anaconda beautifulsoup4

Importing the `BeautifulSoup` class from the `bs4` package

In [5]:
from bs4 import BeautifulSoup

Extracting the `title` from a webpage (here, the codedamn home page)

In [6]:
# Download the page; the explicit timeout prevents the request from blocking
# indefinitely when the server is unresponsive.
page = requests.get("https://codedamn.com", timeout=10)
# Parse the raw response bytes with Python's built-in HTML parser
soup = BeautifulSoup(page.content, 'html.parser')
title = soup.title.text # gets the text of the <title>(...)</title>

In [7]:
# Show the text of the page's <title> element
print(title)

codedamn - Learn coding like it's 2026


In [16]:
# Extracting the body of the page (the <body> element of the parsed document)
body = soup.body

# Extracting head of the page (the <head> element of the parsed document)
head = soup.head

# Printing both elements is disabled here; uncomment the next line to print them
# print(body, head)

Using `.select` to pick elements with CSS selectors.

In [21]:
# Extract the text of the first <h1>(...)</h1>.
# select_one returns only the first match (or None when the page has no <h1>),
# so there is no need to build the full list and index into it.
first_h1_tag = soup.select_one('h1')
first_h1 = first_h1_tag.text if first_h1_tag is not None else ""

In [22]:
# Show the text of the first <h1> element
print(first_h1)

Learn web development with mentorship, hands-on practice, and courses.


In [34]:
# Collect the text of every <h2> element on the page
all_h2_tags = [h2.text for h2 in soup.select('h2')]

# Text of the 7th <p> element (index 6, because indexing is zero-based)
paragraphs = soup.select('p')
seventh_p_text = paragraphs[6].text

print(all_h2_tags, "\n=============\n", seventh_p_text)

['The Perfect Practice Environment', ' Browse from a laptop/desktop to experience demo', '"Video courses" don\'t work alone.', 'Learning Path'] 
 Our frontend web developer learning path covers everything you need to know, including industry-specific questions/hands-on practice exercises along with HD video courses from top creators.This comes with mentorship support if you're signed up for 1337 membership.


Extracting specific inner texts from the page

In [37]:
import requests
from bs4 import BeautifulSoup

# make a request (with a timeout so an unresponsive server cannot hang the cell)
page = requests.get(
    "https://codedamn-classrooms.github.io/webscraper-python-codedamn-classroom-website/",
    timeout=10,
)
soup = BeautifulSoup(page.content, "html.parser")

# create top_items as empty list
top_items = []

# Extract and store in top_items: one dict per <div class="thumbnail"> element
products = soup.select('div.thumbnail')
for element in products:
    # Named product_title rather than `title` so the page title stored in
    # `title` by the earlier cell is not silently overwritten.
    product_title = element.select('h4 > a.title')[0].text
    review_label = element.select('div.ratings')[0].text
    info = {
        "title" : product_title.strip(),
        "review": review_label.strip()
    }
    top_items.append(info)

print(top_items)

[{'title': 'Asus AsusPro Adv...', 'review': '7 reviews'}, {'title': 'Asus ROG Strix G...', 'review': '4 reviews'}, {'title': 'Acer Aspire 3 A3...', 'review': '2 reviews'}]


#### Extracting Links

Getting image data from the webpage

In [38]:
import requests
from bs4 import BeautifulSoup

# make a request (with a timeout so an unresponsive server cannot hang the cell)
page = requests.get(
    "https://codedamn-classrooms.github.io/webscraper-python-codedamn-classroom-website/",
    timeout=10,
)
soup = BeautifulSoup(page.content, "html.parser")

# Extract and save images to image_data: one dict per <img> element.
# .get() returns None when the attribute is missing, so no lookup can raise.
images = soup.select('img')
image_data = [
    {"src": image.get('src'), "alt": image.get('alt')}
    for image in images
]

print(image_data)

[{'src': '/webscraper-python-codedamn-classroom-website/logo_white.svg', 'alt': 'Web Scraper'}, {'src': '/webscraper-python-codedamn-classroom-website/cart2.png', 'alt': 'item'}, {'src': '/webscraper-python-codedamn-classroom-website/cart2.png', 'alt': 'item'}, {'src': '/webscraper-python-codedamn-classroom-website/cart2.png', 'alt': 'item'}, {'src': '/webscraper-python-codedamn-classroom-website/fbicon.png', 'alt': 'Web Scraper on Facebook'}, {'src': '/webscraper-python-codedamn-classroom-website/twicon.png', 'alt': 'Web Scraper on Twitter'}]


In [44]:
import requests
from bs4 import BeautifulSoup

# make a request (with a timeout so an unresponsive server cannot hang the cell)
page = requests.get(
    "https://codedamn-classrooms.github.io/webscraper-python-codedamn-classroom-website/",
    timeout=10,
)
soup = BeautifulSoup(page.content, 'html.parser')

# Create all_links as empty list
all_links = []

# Extract and store the target and visible text of every <a> element
links = soup.select('a')
for ahref in links:
    # .text is always a str, so it can be stripped without a None check.
    # Named link_text so the `text` variable from the first cell is not overwritten.
    link_text = ahref.text.strip()

    # An <a> without an href attribute yields None; store "" for those
    href = ahref.get('href')
    href = href.strip() if href is not None else ""
    all_links.append({"href": href, "text": link_text})

print(all_links)

[{'href': '', 'text': 'Toggle navigation'}, {'href': '/webscraper-python-codedamn-classroom-website/', 'text': ''}, {'href': '#page-top', 'text': ''}, {'href': '/webscraper-python-codedamn-classroom-website/', 'text': 'Web Scraper'}, {'href': '/webscraper-python-codedamn-classroom-website/cloud-scraper', 'text': 'Cloud Scraper'}, {'href': '/webscraper-python-codedamn-classroom-website/pricing', 'text': 'Pricing'}, {'href': '#section3', 'text': 'Learn'}, {'href': '/webscraper-python-codedamn-classroom-website/documentation', 'text': 'Documentation'}, {'href': '/webscraper-python-codedamn-classroom-website/tutorials', 'text': 'Video Tutorials'}, {'href': '/webscraper-python-codedamn-classroom-website/how-to-videos', 'text': 'How to'}, {'href': '/webscraper-python-codedamn-classroom-website/test-sites', 'text': 'Test Sites'}, {'href': 'https://forum.webscraper.io/', 'text': 'Forum'}, {'href': 'https://chrome.google.com/webstore/detail/web-scraper/jnhgnonknehpejjnehehllkliplmbmhn?hl=en', '

#### Extracting data and creating a CSV file

In [48]:
import requests
from bs4 import BeautifulSoup
import csv

# Fetch and parse the demo shop page (the timeout keeps the cell from hanging
# indefinitely if the server does not respond)
page = requests.get(
    "https://codedamn-classrooms.github.io/webscraper-python-codedamn-classroom-website/",
    timeout=10,
)
soup = BeautifulSoup(page.content, "html.parser")

# CSV column order. Declared up front instead of being read from
# all_products[0], which raises IndexError when no product matches.
keys = ["name", "description", "price", "reviews", "image"]

# create empty list to hold items
all_products = []

# Extract desired items from the webpage: one dict per <div class="thumbnail">
products = soup.select('div.thumbnail')
for product in products:
    name = product.select('h4 > a')[0].text.strip()
    description = product.select('p.description')[0].text.strip()
    price = product.select('h4.price')[0].text.strip()
    reviews = product.select('div.ratings')[0].text.strip()
    image = product.select('img')[0].get('src')

    all_products.append({
        "name": name,
        "description": description,
        "price": price,
        "reviews": reviews,
        "image": image
    })

# newline="" lets the csv module control line endings; the explicit utf-8
# encoding keeps the file independent of the platform's default encoding.
with open('products.csv', 'w', newline="", encoding="utf-8") as output_file:
    dict_writer = csv.DictWriter(output_file, keys)
    dict_writer.writeheader()
    dict_writer.writerows(all_products)

Reading the newly created ```.csv``` file

In [60]:
import pandas as pd
# Load products.csv into a DataFrame and display its first rows
df = pd.read_csv('products.csv')
df.head()

Unnamed: 0,name,description,price,reviews,image
0,Asus AsusPro Adv...,Asus AsusPro Advanced BU401LA-FA271G Dark Grey...,$1139.54,7 reviews,/webscraper-python-codedamn-classroom-website/...
1,Asus ROG Strix G...,"Apple MacBook Air 13.3"", Core i5 1.8GHz, 8GB, ...",$1101.83,4 reviews,/webscraper-python-codedamn-classroom-website/...
2,Acer Aspire 3 A3...,"Acer Aspire 3 A315-51 Black, 15.6"" FHD, Core\n...",$494.71,2 reviews,/webscraper-python-codedamn-classroom-website/...
