In [11]:
import os

In [4]:
from bs4 import BeautifulSoup

In [46]:
from datetime import datetime

## Exercise 1 : Parsing HTML with BeautifulSoup
Instructions<br>
Objective:
- Use urlopen() to fetch the HTML content of a webpage and then parse it using BeautifulSoup.
- Read the HTML content of the page.
- Create a BeautifulSoup object to parse this HTML.
- Find the title of the webpage (the content inside the <title> tag).
- Extract all paragraphs (<p> tags) from the page.
- Retrieve all links (URLs in <a href=""> tags) on the page.

In [10]:
html_content = '''
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Sports World</title>
    <style>
        body { font-family: Arial, sans-serif; }
        header, nav, section, article, footer { margin: 20px; padding: 15px; }
        nav { background-color: #333; }
        nav a { color: white; padding: 14px 20px; text-decoration: none; display: inline-block; }
        nav a:hover { background-color: #ddd; color: black; }
        .video { text-align: center; margin: 20px 0; }
    </style>
</head>
<body>

    <header>
        <h1>Welcome to Sports World</h1>
        <p>Your one-stop destination for the latest sports news and videos.</p>
    </header>

    <nav>
        <a href="#football">Football</a>
        <a href="#basketball">Basketball</a>
        <a href="#tennis">Tennis</a>
    </nav>

    <section id="football">
        <h2>Football</h2>
        <article>
            <h3>Latest Football News</h3>
            <p>Read about the latest football matches and player news.</p>
            <div class="video">
                <iframe width="560" height="315" src="https://www.youtube.com/embed/football-video-id" frameborder="0" allowfullscreen>
                </iframe>
            </div>
        </article>
    </section>

    <section id="basketball">
        <h2>Basketball</h2>
        <article>
            <h3>NBA Highlights</h3>
            <p>Watch highlights from the latest NBA games.</p>
            <div class="video">
                <iframe width="560" height="315" src="https://www.youtube.com/embed/basketball-video-id" frameborder="0" allowfullscreen>
                </iframe>
            </div>
        </article>
    </section>

    <section id="tennis">
        <h2>Tennis</h2>
        <article>
            <h3>Grand Slam Updates</h3>
            <p>Get the latest updates from the world of Grand Slam tennis.</p>
            <div class="video">
                <iframe width="560" height="315" src="https://www.youtube.com/embed/tennis-video-id" frameborder="0" allowfullscreen></iframe>
            </div>
        </article>
    </section>

    <footer>
        <form action="mailto:contact@sportsworld.com" method="post" enctype="text/plain">
            <label for="name">Name:</label><br>
            <input type="text" id="name" name="name"><br>
            <label for="email">Email:</label><br>
            <input type="email" id="email" name="email"><br>
            <label for="message">Message:</label><br>
            <textarea id="message" name="message" rows="4" cols="50"></textarea><br><br>
            <input type="submit" value="Send">
        </form>
    </footer>

</body>
</html>
'''

with open("index.html", "w", encoding="utf-8") as file:
    file.write(html_content)

In [12]:
from urllib.request import urlopen
file_path = os.path.abspath("index.html")
file_url = "file:///" + file_path.replace("\\", "/")
response = urlopen(file_url)
html = response.read()
html
soup = BeautifulSoup(html, 'html.parser')
print(soup.prettify())

<!DOCTYPE html>
<html lang="en">
 <head>
  <meta charset="utf-8"/>
  <meta content="width=device-width, initial-scale=1.0" name="viewport"/>
  <title>
   Sports World
  </title>
  <style>
   body { font-family: Arial, sans-serif; }
        header, nav, section, article, footer { margin: 20px; padding: 15px; }
        nav { background-color: #333; }
        nav a { color: white; padding: 14px 20px; text-decoration: none; display: inline-block; }
        nav a:hover { background-color: #ddd; color: black; }
        .video { text-align: center; margin: 20px 0; }
  </style>
 </head>
 <body>
  <header>
   <h1>
    Welcome to Sports World
   </h1>
   <p>
    Your one-stop destination for the latest sports news and videos.
   </p>
  </header>
  <nav>
   <a href="#football">
    Football
   </a>
   <a href="#basketball">
    Basketball
   </a>
   <a href="#tennis">
    Tennis
   </a>
  </nav>
  <section id="football">
   <h2>
    Football
   </h2>
   <article>
    <h3>
     Latest Football New

In [13]:
# The title
print(soup.title.string)

Sports World


In [14]:
# All paragraphs
print(soup.find_all('p'))

[<p>Your one-stop destination for the latest sports news and videos.</p>, <p>Read about the latest football matches and player news.</p>, <p>Watch highlights from the latest NBA games.</p>, <p>Get the latest updates from the world of Grand Slam tennis.</p>]


In [15]:
# Retrieve all links
links = soup.find_all('a')
for link in links:
    href = link.get('href')
    print(href)

#football
#basketball
#tennis


## Exercise 2 : Scraping robots.txt from Wikipedia
Instructions<br>
- Write a Python program to download and display the content of robot.txt for wikipedia

In [21]:
import requests
headers = {
    'User-Agent': 'EducationalBot/1.0 (contact: your-email@example.com)'
}
response = requests.get("https://en.wikipedia.org/robots.txt", headers=headers)
print(response.text)

﻿# robots.txt for http://www.wikipedia.org/ and friends
#
# Please note: There are a lot of pages on this site, and there are
# some misbehaved spiders out there that go _way_ too fast. If you're
# irresponsible, your access to the site may be blocked.
#

# Observed spamming large amounts of https://en.wikipedia.org/?curid=NNNNNN
# and ignoring 429 ratelimit responses, claims to respect robots:
# http://mj12bot.com/
User-agent: MJ12bot
Disallow: /

# advertising-related bots:
User-agent: Mediapartners-Google*
Disallow: /

# Wikipedia work bots:
User-agent: IsraBot
Disallow:

User-agent: Orthogaffe
Disallow:

# Crawlers that are kind enough to obey, but which we'd rather not have
# unless they're feeding search engines.
User-agent: UbiCrawler
Disallow: /

User-agent: DOC
Disallow: /

User-agent: Zao
Disallow: /

# Some bots are known to be trouble, particularly those designed to copy
# entire sites. Please obey robots.txt.
User-agent: sitecheck.internetseer.com
Disallow: /

User-agent: 

## Exercise 3 : Extracting Headers from Wikipedia’s Main Page
Instructions<br>
- Write a Python program to extract and display all the header tags from wikipedia.

In [25]:
headers = {
    'User-Agent': 'EducationalBot/1.0 (contact: your-email@example.com)'
}
response = requests.get("https://en.wikipedia.org/wiki/Main_Page", headers=headers)

soup = BeautifulSoup(response.text, 'html.parser')

In [26]:
all_headers = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
for header in all_headers:
    print(header.name + ' ' + header.text.strip())

h1 Main Page
h1 Welcome to Wikipedia
h2 From today's featured article
h2 Did you know ...
h2 In the news
h2 On this day
h2 Today's featured video
h2 Other areas of Wikipedia
h2 Wikipedia's sister projects
h2 Wikipedia languages


## Exercise 4 : Checking for Page Title
Instructions<br>
- Write a Python program to check whether a page contains a title or not.

In [36]:
responce = requests.get("https://docs.langchain.com/")
soup = BeautifulSoup(responce.text, 'html.parser')

In [37]:
print(soup.title.string)

Home - Docs by LangChain


## Exercise 5 : Analyzing US-CERT Security Alerts
Instructions<br>
- Write a Python program to get the number of security alerts issued by US-CERT in the current year.

In [42]:
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
}
responce = requests.get("https://www.cisa.gov/news-events/cybersecurity-advisories?f%5B0%5D=advisory_type%3A93", headers = headers)
soup_ci = BeautifulSoup(responce.text, 'html.parser')

In [47]:
current_year = str(datetime.now().year)

In [49]:

advisories = soup_ci.find_all('div', class_='views-row')

count_this_year = 0

for advisory in advisories:
  date_tag = advisory.find('time')
  if date_tag and current_year in date_tag.text:
    count_this_year += 1

print(f'Year : {current_year}')
print(f'Number of advisories : {count_this_year}')

Year : 2025
Number of advisories : 0


## Exercise 6 : Scraping Movie Details
Instructions<br>
- Write a Python program to get movie name, year and a brief summary of the top 10 random movies from this IMBD website.

In [7]:
import requests
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.5"
}
responce = requests.get("https://www.imdb.com/list/ls091294718/", headers = headers)
soup_m = BeautifulSoup(responce.text, 'html.parser')

In [16]:
movies = (soup_m.find_all('li', class_="ipc-metadata-list-summary-item")[:10])

for index, movie in enumerate(movies, 1):
  name = movie.find('h3', class_ = "ipc-title__text").text

In [12]:
year_tag = movie.find("span", class_="sc-b189961a-8 kLaxwI cli-title-metadata-item")
year = year_tag.text if year_tag else "N/A"

In [14]:
summary_tag = movie.find("div", class_="ipc-html-content-inner-div")
summary = summary_tag.text if summary_tag else "No summary available on this page."

In [17]:
print(f"{index}. {name}")
print(f"Year: {year}")
print(f"Summary: {summary}")
print("-" * 30)

10. 10. Some Like It Hot
Year: N/A
Summary: After two male musicians witness a mob hit, they flee the state in an all-female band disguised as women, but further complications set in.
------------------------------
