# Web scraping Task
##### Student: Khalid Salim
##### Student Number: 2
##### Group: CAI3_AIS4_G1

## For the webpage: https://baraasalout.github.io/test.html, perform the following tasks

### Importing required libraries and modules

In [1]:
# Installing required libraries
#!pip install requests
#!pip install beautifulsoup4

In [2]:
# Importing required modules and functions
import requests
from bs4 import BeautifulSoup
import csv
import pandas as pd
import gc
import json



### Task 1 - Extract Text Data:
* Extract all headings (`<h1>`, `<h2>`).
* Extract all text content inside `<p>` and `<li>` tags.
* Save this data into a `Extract_Text_Data.CSV` file.

In [3]:
# Getting page HTML code
url = 'https://baraasalout.github.io/test.html'

page = requests.get(url)
#print(page.content)

In [4]:
# Parsing HTML content
soup = BeautifulSoup(page.content, 'html.parser')
soup

<!DOCTYPE html>

<html>
<head>
<title>Web Scraping Task with Form</title>
<style>
        body {
            font-family: Arial, sans-serif;
            margin: 20px;
            background-color: #f5f5f5;
        }

        h1 {
            color: darkred;
            text-align: center;
        }

        h2 {
            color: darkblue;
            font-style: italic;
        }

        p {
            color: #555;
            font-size: 14px;
        }

        img {
            width: 250px;
            height: auto;
            border-radius: 10px;
        }

        table {
            width: 100%;
            border-collapse: collapse;
            margin: 20px 0;
        }

        table, th, td {
            border: 1px solid #ccc;
        }

        th {
            background-color: #333;
            color: white;
            padding: 10px;
        }

        td {
            text-align: center;
            padding: 10px;
        }

        .btn {
            background-col

In [5]:
#print(soup.text)

#### Extract all headings (`<h1>`, `<h2>`).

In [6]:
h1_headings = soup.find_all('h1')
h1_headings

[<h1>Web Scraping Practice</h1>]

In [7]:
#print(h1_headings[0].get_text())

In [8]:
h2_headings = soup.find_all('h2')
h2_headings

[<h2>Available Products</h2>,
 <h2>Product Table</h2>,
 <h2>Watch This Video</h2>,
 <h2>Contact Us</h2>,
 <h2>Product Information</h2>,
 <h2>Featured Products</h2>]

In [9]:
#for heading in h2_headings:
#    print(heading.get_text())

#### Extract all text content inside `<p>` and `<li>` tags

In [10]:
paragraphs = soup.find_all('p')
paragraphs

[<p>Welcome to the web scraping task! Use your skills to extract the required data from this page.</p>,
 <p><strong>Sharp Objects</strong></p>,
 <p style="color: green;">£47.82</p>,
 <p style="color: green;">✔ In stock</p>,
 <p><strong>In a Dark, Dark Wood</strong></p>,
 <p style="color: green;">£19.63</p>,
 <p style="color: green;">✔ In stock</p>,
 <p><strong>The Past Never Ends</strong></p>,
 <p style="color: green;">£56.50</p>,
 <p style="color: green;">✔ In stock</p>,
 <p><strong>A Murder in Time</strong></p>,
 <p style="color: green;">£16.64</p>,
 <p style="color: green;"> Out stock</p>,
 <p class="name">Wireless Headphones</p>,
 <p class="price" style="display: none;">$49.99</p>,
 <p class="colors">Available colors: Black, White, Blue</p>,
 <p class="name">Smart Speaker</p>,
 <p class="price" style="display: none;">$89.99</p>,
 <p class="colors">Available colors: Grey, Black</p>,
 <p class="name">Smart Watch</p>,
 <p class="price" style="display: none;">$149.99</p>,
 <p class="co

In [11]:
listings = soup.find_all('li')
listings

[<li class="highlight">Laptop</li>,
 <li>Smartphone</li>,
 <li>Tablet</li>,
 <li>Smartwatch</li>]

#### Save this data into a `Extract_Text_Data.CSV` file.

In [12]:
# Arranging data
header = [['Type', 'Content']] # csv file header
h1_headings_data = [['Heading', row.get_text()] for row in h1_headings]
#print(*h1_headings_data, sep='\n')
h2_headings_data = [['Heading', row.get_text()] for row in h2_headings]
paragraph_data = [['Paragraph', row.get_text()] for row in paragraphs]
#print(*paragraph_data, sep='\n')
listing_data = [['Listing', row.get_text()] for row in listings]
#print(*listing_data, sep='\n')

# data to be saved into csv
extracted_data = header
extracted_data.extend(h1_headings_data)
extracted_data.extend(h2_headings_data)
extracted_data.extend(paragraph_data)
extracted_data.extend(listing_data)
#print(extracted_data)

In [13]:
# creating csv file to save data into
with open('task1_extract_text_data.csv', 'w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(extracted_data)

In [14]:
# clearing memory for task 1
del page
del h1_headings
del h2_headings
del paragraphs
del extracted_data
del paragraph_data
del listings
del listing_data
del header
del h1_headings_data
del h2_headings_data
gc.collect()

27

### Task 2 - Extract Table Data:
- Product Name.
- Price.
- Stock Status.us.

In [15]:
# preparing table data for csv file
tables = soup.find('table')
tables = tables.get_text().split('\n\n\n')
tables_data = [row.split('\n') for row in tables]
extracted_data = [[item for item in row if item != ""] for row in tables_data]
print(tables)
print(tables_data)

['\n\nProduct\nPrice\nIn Stock', 'Laptop\n$1000\nYes', 'Smartphone\n$800\nNo', 'Tablet\n$500\nYes\n\n']
[['', '', 'Product', 'Price', 'In Stock'], ['Laptop', '$1000', 'Yes'], ['Smartphone', '$800', 'No'], ['Tablet', '$500', 'Yes', '', '']]


In [16]:
# creating csv file for table data
with open('task2_extract_table_data.csv', 'w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(extracted_data)

In [17]:
# clearing memory for task 2
del tables
del tables_data
del extracted_data
gc.collect()

0

### Task 3 - Extract Product Information (Card Section):
- Extract data from the book cards at the bottom of the page, including:    
- Book Title.    
- Price    .
- Stock Availabil  y.
- Button text (e.g., "Add to basket"). 
- Save the dataproductiict_Informa` JSON fileary-


In [18]:
# Extracting card section
cards = soup.find('div')
cards

<div class="book-products" style="display: flex; justify-content: space-around; margin-top: 20px;">
<div style="text-align: center; width: 200px; border: 1px solid #ddd; padding: 10px; border-radius: 5px;">
<img alt="Sharp Objects" src="http://books.toscrape.com/media/cache/26/0c/260c6ae16bce31c8f8c95daddd9f4a1c.jpg" style="width: 100%; height: auto; border-radius: 5px;"/>
<p><strong>Sharp Objects</strong></p>
<p style="color: green;">£47.82</p>
<p style="color: green;">✔ In stock</p>
<button style="background-color: blue; color: white; border: none; padding: 10px 15px; border-radius: 5px; cursor: pointer;">Add to basket</button>
</div>
<div style="text-align: center; width: 200px; border: 1px solid #ddd; padding: 10px; border-radius: 5px;">
<img alt="In a Dark, Dark Wood" src="http://books.toscrape.com/media/cache/92/27/92274a95b7c251fea59a2b8a78275ab4.jpg" style="width: 100%; height: auto; border-radius: 5px;"/>
<p><strong>In a Dark, Dark Wood</strong></p>
<p style="color: green;">£1

In [19]:
# extracting card tags
cards_title = cards.find_all('p')
cards_title

[<p><strong>Sharp Objects</strong></p>,
 <p style="color: green;">£47.82</p>,
 <p style="color: green;">✔ In stock</p>,
 <p><strong>In a Dark, Dark Wood</strong></p>,
 <p style="color: green;">£19.63</p>,
 <p style="color: green;">✔ In stock</p>,
 <p><strong>The Past Never Ends</strong></p>,
 <p style="color: green;">£56.50</p>,
 <p style="color: green;">✔ In stock</p>,
 <p><strong>A Murder in Time</strong></p>,
 <p style="color: green;">£16.64</p>,
 <p style="color: green;"> Out stock</p>]

In [20]:
# Getting card information from tags
card_info = [title.get_text().split('\n') for title in cards]
card_info

[['', ''],
 ['', '', 'Sharp Objects', '£47.82', '✔ In stock', 'Add to basket', ''],
 ['', ''],
 ['', '', 'In a Dark, Dark Wood', '£19.63', '✔ In stock', 'Add to basket', ''],
 ['', ''],
 ['', '', 'The Past Never Ends', '£56.50', '✔ In stock', 'Add to basket', ''],
 ['', ''],
 ['', '', 'A Murder in Time', '£16.64', ' Out stock', 'Add to basket', ''],
 ['', '']]

In [21]:
# removing empty lists from card information
card_info = [[info for info in book if info !=""] for book in card_info]
card_info = [row for row in card_info if row]
card_info

[['Sharp Objects', '£47.82', '✔ In stock', 'Add to basket'],
 ['In a Dark, Dark Wood', '£19.63', '✔ In stock', 'Add to basket'],
 ['The Past Never Ends', '£56.50', '✔ In stock', 'Add to basket'],
 ['A Murder in Time', '£16.64', ' Out stock', 'Add to basket']]

In [22]:
# Preparing data for json file
dict_keys = ['item'+str(i) for i in range(1, len((card_info))+1)]
cards_data = {dict_keys[i] : card_info[i] for i in range(len(dict_keys))}
cards_data

{'item1': ['Sharp Objects', '£47.82', '✔ In stock', 'Add to basket'],
 'item2': ['In a Dark, Dark Wood', '£19.63', '✔ In stock', 'Add to basket'],
 'item3': ['The Past Never Ends', '£56.50', '✔ In stock', 'Add to basket'],
 'item4': ['A Murder in Time', '£16.64', ' Out stock', 'Add to basket']}

In [23]:
with open('task3_product_information.json', 'w') as json_file:
    json.dump(cards_data, json_file)
print(json.dumps(cards_data, indent=4, ensure_ascii=False))

{
    "item1": [
        "Sharp Objects",
        "£47.82",
        "✔ In stock",
        "Add to basket"
    ],
    "item2": [
        "In a Dark, Dark Wood",
        "£19.63",
        "✔ In stock",
        "Add to basket"
    ],
    "item3": [
        "The Past Never Ends",
        "£56.50",
        "✔ In stock",
        "Add to basket"
    ],
    "item4": [
        "A Murder in Time",
        "£16.64",
        " Out stock",
        "Add to basket"
    ]
}


In [24]:
# clearing memory for task 3
del cards
del cards_title
del card_info
del dict_keys
del cards_data
gc.collect()

32

### Task 4 - Extract From Details:
Extract all input fields from the form, including:
- Field name (e.g., username, password).
- Input type (e.g., text, password, checkbox, etc.).
- Default values, if any.
- Save the data into a `JSON` file.

In [25]:
form_tag = soup.find('form')
form_tag

<form>
<label for="username">Username:</label>
<input id="username" name="username" placeholder="Enter your username" type="text"/>
<label for="password">Password:</label>
<input id="password" name="password" placeholder="Enter your password" type="password"/>
<label for="options">Choose an option:</label>
<select id="options" name="options">
<option value="option1">Option 1</option>
<option value="option2">Option 2</option>
<option value="option3">Option 3</option>
</select>
<label>
<input name="terms" type="checkbox"/> I agree to the terms and conditions
            </label>
<input type="submit" value="Submit"/>
</form>

In [26]:
# Extracting Field Names:
name_tags = form_tag.find_all('label')
#print(name_tags)
name_tags = name_tags[:-1]
field_names = [name_tag.get_text().replace(":", "") for name_tag in name_tags if ":" in name_tag.get_text()]
#field_names = [name.replace(":", "") for name in field_names if ":" in name]
# adding options name
#field_names = field_names.append(options_tag.attrs['name'])
#field_names.append(options_tag)
print(field_names)

['Username', 'Password', 'Choose an option']


In [27]:
# Extracting Input Types:
input_tags = form_tag.find_all('input')
input_types = [input_tag['type'] for input_tag in input_tags if 'type' in input_tag.attrs]
input_types = input_types[:-1]
print(input_types)

['text', 'password', 'checkbox']


In [28]:
# extracting default values
input_tags = form_tag.find_all('input')[:-2]
names_def_val = [tag.attrs['value'] if 'value' in tag.attrs else "" for tag in input_tags] # only for name, password
options_def_val = form_tag.find('option').attrs['value']
names_def_val.append(options_def_val)
name_tags
input_tags
print(names_def_val)

['', '', 'option1']


In [29]:
# preparing data in dictionary of dictionaries format for json file

# The dictionary of dictionaries keys
dict_keys = ['field_'+str(i+1) for i in range(len(field_names))]
#print(dict_keys)

# Inner dictionaries keys 
sub_dict_keys = ['Name', 'Input Type', 'Default Value'] 

# Inner dictionary values: list of tuples from elements of field_names, input_types and default value 
sub_dict_vals = list(zip(field_names, input_types, names_def_val)) 
print(list(zip(sub_dict_keys, list(sub_dict_vals)[1])))

# Creating the large dictionaries from inner dictionries
data_dict = {}
for i in range(len(field_names)):
    data_dict[dict_keys[i]] = {k:v for k,v in zip(sub_dict_keys, sub_dict_vals[i])} 
    
data_dict

[('Name', 'Password'), ('Input Type', 'password'), ('Default Value', '')]


{'field_1': {'Name': 'Username', 'Input Type': 'text', 'Default Value': ''},
 'field_2': {'Name': 'Password',
  'Input Type': 'password',
  'Default Value': ''},
 'field_3': {'Name': 'Choose an option',
  'Input Type': 'checkbox',
  'Default Value': 'option1'}}

In [30]:
# dumping the data into the json file
with open('task4_field_details_info.json', 'w') as file:
    json.dump(data_dict, file)
print(json.dumps(data_dict, indent=4))

{
    "field_1": {
        "Name": "Username",
        "Input Type": "text",
        "Default Value": ""
    },
    "field_2": {
        "Name": "Password",
        "Input Type": "password",
        "Default Value": ""
    },
    "field_3": {
        "Name": "Choose an option",
        "Input Type": "checkbox",
        "Default Value": "option1"
    }
}


In [31]:
# clearing memory for task 4
del form_tag
del field_names
del input_types
del names_def_val
del dict_keys
del sub_dict_keys
del data_dict
gc.collect()

32

### Task 5 - Extract Links and Multimedia: 
- Extract the video link from the `<iframe>` tag.
- Save the data into a .JSON filery-


In [32]:
# Extracting media link and dumping it into the json file
media_tag = soup.find('iframe')
media_link = media_tag.attrs['src']
#print(media_link)
link_dict = {'link' : media_link}
with open('task5_media_links.json', 'w') as f:
    json.dump(link_dict, f)
print(json.dumps(link_dict, indent=4))

{
    "link": "https://www.youtube.com/watch?v=ujf9RNuBdCU"
}


In [33]:
# dumping the data into the json file
media_tag = soup.find_all('iframe')

In [34]:
# clearing memory for task 5
del media_tag
del media_link
del link_dict
gc.collect()

32

### Task 6 - Scraping Challenge: 
Students must write a script to extract data from the Featured Products section with the following requirements:
- Product Name: Located within `<span class="name">`.
- Hidden Price: Located within `<span class="price">`, which has `style="display: none;"`.
- Available Colors: Located within `<span class="colors">`.
- Product ID: The value stored in the data-id attribute.
- Example Output:
  
  `[`<br />
  ` {'id': '101', 'name': 'Wireless Headphones', 'price': '$49.99', 'colors': 'Black, White, Blue'},`<br />
  ` {'id': '102', 'name': 'Smart Speaker', 'price': '$89.99', 'colors': 'Grey, Black'},`<br />
  ` {'id': '103', 'name': 'Smart Watch', 'price': '$149.99', 'colors': 'Black, Silver, Gold'}`<br />
  `]`

In [35]:
# getting featured products tags
feat_prod_tags = soup.find_all('div', class_='products')
feat_prod_tags

[<div class="products">
 <div class="product-card" data-id="101">
 <img alt="Wireless Headphones" src="https://via.placeholder.com/250x150?text=Product+1"/>
 <p class="name">Wireless Headphones</p>
 <p class="price" style="display: none;">$49.99</p>
 <p class="colors">Available colors: Black, White, Blue</p>
 <button>Add to Basket</button>
 </div>
 <div class="product-card" data-id="102">
 <img alt="Smart Speaker" src="https://via.placeholder.com/250x150?text=Product+2"/>
 <p class="name">Smart Speaker</p>
 <p class="price" style="display: none;">$89.99</p>
 <p class="colors">Available colors: Grey, Black</p>
 <button>Add to Basket</button>
 </div>
 <div class="product-card" data-id="103">
 <img alt="Smart Watch" src="https://via.placeholder.com/250x150?text=Product+3"/>
 <p class="name">Smart Watch</p>
 <p class="price" style="display: none;">$149.99</p>
 <p class="colors">Available colors: Black, Silver, Gold</p>
 <button>Add to Basket</button>
 </div>
 </div>]

In [41]:
# getting featured products field values (i.e id, name, price, colors)
ids_list = [tag.attrs['data-id'] for tag in feat_prod_tags[0].find_all('div', class_='product-card')]
names_list = [tag.get_text() for tag in feat_prod_tags[0].find_all('p', class_='name')]
price_list = [tag.get_text() for tag in feat_prod_tags[0].find_all('p', class_='price')]
colors_list = [tag.get_text().replace("Available colors: ", "") \
               for tag in feat_prod_tags[0].find_all('p', class_='colors')]
# print(ids_list)
# print(names_list)
# print(price_list)
# print(colors_list)

dict_keys = ['id', 'name', 'price', 'colors'] 
dict_vals = list(zip(ids_list, names_list, price_list, colors_list))
#dict_vals = [list(val) for val in dict_vals]
print(dict_keys)
print(dict_vals[:])
print(list(zip(dict_keys, dict_vals[2])))

feat_prod_data = []
# constructing featured products data list of dictionaries
for i in range(len(ids_list)):
    feat_prod_data.append(dict(zip(dict_keys, dict_vals[i])))
    # feat_prod_data.append({k:v for k,v in zip(dict_keys, dict_vals[i])})

feat_prod_data




# # Inner dictionary values: list of tuples from elements of field_names, input_types and default value 
# sub_dict_vals = list(zip(field_names, input_types, names_def_val)) 
# print(list(zip(sub_dict_keys, list(sub_dict_vals)[1])))

# # Creating the large dictionaries from inner dictionries
# data_dict = {}
# for i in range(len(field_names)):
#     data_dict[dict_keys[i]] = {k:v for k,v in zip(sub_dict_keys, sub_dict_vals[i])} 
    
# data_dict

['id', 'name', 'price', 'colors']
[('101', 'Wireless Headphones', '$49.99', 'Black, White, Blue'), ('102', 'Smart Speaker', '$89.99', 'Grey, Black'), ('103', 'Smart Watch', '$149.99', 'Black, Silver, Gold')]
[('id', '103'), ('name', 'Smart Watch'), ('price', '$149.99'), ('colors', 'Black, Silver, Gold')]


[{'id': '101',
  'name': 'Wireless Headphones',
  'price': '$49.99',
  'colors': 'Black, White, Blue'},
 {'id': '102',
  'name': 'Smart Speaker',
  'price': '$89.99',
  'colors': 'Grey, Black'},
 {'id': '103',
  'name': 'Smart Watch',
  'price': '$149.99',
  'colors': 'Black, Silver, Gold'}]