# Web scraping Task
##### Student: Khalid Salim
##### Student Number: 2
##### Group: CAI3_AIS4_G1

## For the webpage: https://baraasalout.github.io/test.html, perform the following tasks

### Importing required libraries and modules

In [1]:
# Installing required libraries
#!pip install requests
#!pip install beautifulsoup4

In [2]:
# Importing required modules and functions
import requests
from bs4 import BeautifulSoup
import csv
import pandas as pd
import gc



### Task 1 - Extract Text Data:

In [3]:
# Getting page HTML code
url = 'https://baraasalout.github.io/test.html'

page = requests.get(url)
#print(page.content)

In [20]:
# Parsing HTML content
soup = BeautifulSoup(page.content, 'html.parser')
soup

<!DOCTYPE html>

<html>
<head>
<title>Web Scraping Task with Form</title>
<style>
        body {
            font-family: Arial, sans-serif;
            margin: 20px;
            background-color: #f5f5f5;
        }

        h1 {
            color: darkred;
            text-align: center;
        }

        h2 {
            color: darkblue;
            font-style: italic;
        }

        p {
            color: #555;
            font-size: 14px;
        }

        img {
            width: 250px;
            height: auto;
            border-radius: 10px;
        }

        table {
            width: 100%;
            border-collapse: collapse;
            margin: 20px 0;
        }

        table, th, td {
            border: 1px solid #ccc;
        }

        th {
            background-color: #333;
            color: white;
            padding: 10px;
        }

        td {
            text-align: center;
            padding: 10px;
        }

        .btn {
            background-col

In [5]:
#print(soup.text)

#### Extract all headings (`<h1>`, `<h2>`).

In [6]:
h1_headings = soup.find_all('h1')
h1_headings

[<h1>Web Scraping Practice</h1>]

In [7]:
#print(h1_headings[0].get_text())

In [8]:
h2_headings = soup.find_all('h2')
h2_headings

[<h2>Available Products</h2>,
 <h2>Product Table</h2>,
 <h2>Watch This Video</h2>,
 <h2>Contact Us</h2>,
 <h2>Product Information</h2>,
 <h2>Featured Products</h2>]

In [9]:
#for heading in h2_headings:
#    print(heading.get_text())

#### Extract all text content inside `<p>` and `<li>` tags

In [10]:
paragraphs = soup.find_all('p')
paragraphs

[<p>Welcome to the web scraping task! Use your skills to extract the required data from this page.</p>,
 <p><strong>Sharp Objects</strong></p>,
 <p style="color: green;">£47.82</p>,
 <p style="color: green;">✔ In stock</p>,
 <p><strong>In a Dark, Dark Wood</strong></p>,
 <p style="color: green;">£19.63</p>,
 <p style="color: green;">✔ In stock</p>,
 <p><strong>The Past Never Ends</strong></p>,
 <p style="color: green;">£56.50</p>,
 <p style="color: green;">✔ In stock</p>,
 <p><strong>A Murder in Time</strong></p>,
 <p style="color: green;">£16.64</p>,
 <p style="color: green;"> Out stock</p>,
 <p class="name">Wireless Headphones</p>,
 <p class="price" style="display: none;">$49.99</p>,
 <p class="colors">Available colors: Black, White, Blue</p>,
 <p class="name">Smart Speaker</p>,
 <p class="price" style="display: none;">$89.99</p>,
 <p class="colors">Available colors: Grey, Black</p>,
 <p class="name">Smart Watch</p>,
 <p class="price" style="display: none;">$149.99</p>,
 <p class="co

In [11]:
listings = soup.find_all('li')
listings

[<li class="highlight">Laptop</li>,
 <li>Smartphone</li>,
 <li>Tablet</li>,
 <li>Smartwatch</li>]

#### Save this data into a `Extract_Text_Data.CSV` file.

In [12]:
# Arranging data
header = [['Type', 'Content']] # csv file header
h1_headings_data = [['Heading', row.get_text()] for row in h1_headings]
#print(*h1_headings_data, sep='\n')
h2_headings_data = [['Heading', row.get_text()] for row in h2_headings]
paragraph_data = [['Paragraph', row.get_text()] for row in paragraphs]
#print(*paragraph_data, sep='\n')
listing_data = [['Listing', row.get_text()] for row in listings]
#print(*listing_data, sep='\n')

# data to be saved into csv
extracted_data = header
extracted_data.extend(h1_headings_data)
extracted_data.extend(h2_headings_data)
extracted_data.extend(paragraph_data)
extracted_data.extend(listing_data)
#print(extracted_data)

In [13]:
# creating csv file to save data into
with open('extract_text_data.csv', 'w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(extracted_data)

In [14]:
del extracted_data
del paragraph_data
del listing_data
del h1_headings_data
del h2_headings_data
gc.collect()

27

### Task 2 - Extract Table Data:

In [15]:
# preparing table data for csv file
tables = soup.find('table')
tables = tables.get_text().split('\n\n\n')
tables_data = [row.split('\n') for row in tables]
extracted_data = [[item for item in row if item != ""] for row in tables_data]
print(tables)
print(tables_data)

['\n\nProduct\nPrice\nIn Stock', 'Laptop\n$1000\nYes', 'Smartphone\n$800\nNo', 'Tablet\n$500\nYes\n\n']
[['', '', 'Product', 'Price', 'In Stock'], ['Laptop', '$1000', 'Yes'], ['Smartphone', '$800', 'No'], ['Tablet', '$500', 'Yes', '', '']]


In [16]:
# creating csv file for table data
with open('extract_table_data.csv', 'w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(extracted_data)

### Task 3 - Extract Product Information (Card Section):

In [None]:
# Extracting card section
cards = soup.find('div')
cards

In [34]:
cards_titles = cards.find_all('p')
cards_titles

[<p><strong>Sharp Objects</strong></p>,
 <p style="color: green;">£47.82</p>,
 <p style="color: green;">✔ In stock</p>,
 <p><strong>In a Dark, Dark Wood</strong></p>,
 <p style="color: green;">£19.63</p>,
 <p style="color: green;">✔ In stock</p>,
 <p><strong>The Past Never Ends</strong></p>,
 <p style="color: green;">£56.50</p>,
 <p style="color: green;">✔ In stock</p>,
 <p><strong>A Murder in Time</strong></p>,
 <p style="color: green;">£16.64</p>,
 <p style="color: green;"> Out stock</p>]

In [56]:
book_info = [title.get_text().split('\n') for title in cards]
book_info

[['', ''],
 ['', '', 'Sharp Objects', '£47.82', '✔ In stock', 'Add to basket', ''],
 ['', ''],
 ['', '', 'In a Dark, Dark Wood', '£19.63', '✔ In stock', 'Add to basket', ''],
 ['', ''],
 ['', '', 'The Past Never Ends', '£56.50', '✔ In stock', 'Add to basket', ''],
 ['', ''],
 ['', '', 'A Murder in Time', '£16.64', ' Out stock', 'Add to basket', ''],
 ['', '']]

In [57]:
book_info = [[info for info in book if info !=""] for book in book_info]
book_info = [row for row in book_info if row]
book_info

[['Sharp Objects', '£47.82', '✔ In stock', 'Add to basket'],
 ['In a Dark, Dark Wood', '£19.63', '✔ In stock', 'Add to basket'],
 ['The Past Never Ends', '£56.50', '✔ In stock', 'Add to basket'],
 ['A Murder in Time', '£16.64', ' Out stock', 'Add to basket']]