In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import urllib3

# Exercise 1 : Parsing HTML With BeautifulSoup

In [2]:
# Reading the HTML content of the page.

url = 'https://octopus.developers.institute/courses/collection/93/course/416/section/1149/chapter/3432'
page = requests.get(url, verify=False)



In [3]:
# Creatiing a BeautifulSoup object to parse this HTML.

soup = BeautifulSoup(page.text, 'html')

# Finding the title of the webpage (the content inside the <title> tag).
soup.title.get_text()

'Developers Institute'

In [4]:
# Extracting all paragraphs (<p> tags) from the page.

soup.find_all('p')

[<p class="my-2">Login to access your courses</p>]

In [5]:
# Retrieving all links (URLs in <a href=""> tags) on the page.

soup.find_all('href')


[]

# Exercise 2 : Scraping Robots.Txt From Wikipedia

In [6]:
url2 = 'http://en.wikipedia.org/robot.txt'
re2 = requests.get(url2)
page2 = BeautifulSoup(re2.text, 'html')
print(page2)

<!DOCTYPE html>
<html dir="ltr" lang="en">
<head><meta charset="utf-8"/>
<title>Not Found</title>
<link href="/favicon.ico" rel="shortcut icon"/>
<style>
* { margin: 0; padding: 0; }
body { background: #fff; color: #202122; font: 0.938em/1.6 sans-serif; }
.content { margin: 7% auto 0; padding: 2em 1em 1em; max-width: 640px; }
img { float: left; margin: 0 2em 2em 0; }
a img { border: 0; }
h1 { margin-top: 1em; font-size: 1.2em; }
p { margin: 0.7em 0 1em 0; }
a { color: #36c; text-decoration: none; }
a:hover { text-decoration: underline; }
em { color: #72777d; font-style: normal; }
</style>
</head><body><div class="content" role="main">
<a href="https://www.wikimedia.org"><img alt="Wikimedia" height="135" src="https://www.wikimedia.org/static/images/wmf.png" srcset="https://www.wikimedia.org/static/images/wmf-2x.png 2x" width="135"/></a>
<h1>Page not found</h1>
<p><em>/robot.txt</em></p>
<p>We could not find the above page on our servers.</p>
<p><b>Did you mean: <a href="/wiki/robot.txt"

# Exercise 3 : Extracting Headers From Wikipedia’s Main Page

In [7]:
url3 = 'https://en.wikipedia.org/wiki/Main_Page'
re3 = requests.get(url3)
page3 = BeautifulSoup(re3.text, 'html')

In [8]:
page3.find('header')

<header class="vector-header mw-header">
<div class="vector-header-start">
<nav aria-label="Site" class="vector-main-menu-landmark" role="navigation">
<div class="vector-dropdown vector-main-menu-dropdown vector-button-flush-left vector-button-flush-right" id="vector-main-menu-dropdown">
<input aria-haspopup="true" aria-label="Main menu" class="vector-dropdown-checkbox" data-event-name="ui.dropdown-vector-main-menu-dropdown" id="vector-main-menu-dropdown-checkbox" role="button" type="checkbox"/>
<label aria-hidden="true" class="vector-dropdown-label cdx-button cdx-button--fake-button cdx-button--fake-button--enabled cdx-button--weight-quiet cdx-button--icon-only" for="vector-main-menu-dropdown-checkbox" id="vector-main-menu-dropdown-label"><span class="vector-icon mw-ui-icon-menu mw-ui-icon-wikimedia-menu"></span>
<span class="vector-dropdown-label-text">Main menu</span>
</label>
<div class="vector-dropdown-content">
<div class="vector-unpinned-container" id="vector-main-menu-unpinned-

# Exercise 4 : Checking For Page Title

In [9]:
# Writing a Python program to check whether a page contains a title or not.


def checking_title(url):
    re = requests.get(url)
    page = BeautifulSoup(re.text, 'html')
    if page.title != None:
        return f"The page {url} has a title: {page.title.text}"
    else:
        return f"The page {url} hasn't a title"

In [10]:
checking_title('https://www.w3schools.com/python/python_sets_join.asp')

'The page https://www.w3schools.com/python/python_sets_join.asp has a title: Python - Join Sets'

# Exercise 5 : Analyzing US-CERT Security Alerts

In [11]:
# url5 = 'https://www.cisa.gov/news-events/cybersecurity-advisories?f%5B0%5D=advisory_type%3A93&f%5B1%5D=release_date_year%3A2024&page=0'
# response5 = requests.get(url5)
# page5 = BeautifulSoup(response5.text, 'html')
# alerts = page5.find_all('span')
# print(alerts)



In [12]:
# alert5 = [k.text for k in alerts]
# print(alert5)

In [13]:
# url5 = 'https://www.cisa.gov/news-events/cybersecurity-advisories?f%5B0%5D=advisory_type%3A93&f%5B1%5D=release_date_year%3A2024&page='
# total_alerts = 0

# for page_num in range(13):  # Loop through 13 pages
#     response5 = requests.get(url5 + str(page_num))
#     soup = BeautifulSoup(response5.text, 'html')

#     # Find all span elements
#     alerts = soup.find_all('Alert')

#     # Count the number of alerts in each page
#     for alert in alerts:
#         total_alerts += 1

# print(f"Total number of security alerts: {total_alerts}")

# Exercise 6 : Scraping Movie Details

In [14]:
url6 = 'https://www.imdb.com/list/ls534888914/'
responce = requests.get(url6)

In [None]:
responce

In [None]:
page = BeautifulSoup(responce.text, 'html')

In [None]:
# lf film name and film year
list = page.find_all('h3')
list

In [None]:
list_names_years = [title.text.strip() for title in list]
print(list_names_years)

In [None]:
film_year = [year[-5:-1] for year in list_names_years[:20]]
print(film_year)

In [None]:
film_name = [name.split('.')[1].strip()[0:-7] for name in list_names_years[:20]]
print(film_name)

In [None]:
# lf for film summary
list_summarys = page.find_all('p')
film_summary = [summary.text.strip() for summary in list_summarys[2::4]]

In [None]:
df_films = pd.DataFrame(columns=['film_name', 'film_year', 'film_summary'])
df_films['film_name'] = film_name
df_films['film_year'] = film_year
df_films['film_summary'] = film_summary
print(df_films)

# 