# Web scraping Task
##### Student: Khalid Salim
##### Student Number: 2
##### Group: CAI3_AIS4_G1

## For the webpage: https://baraasalout.github.io/test.html, perform the following tasks

### Importing required libraries and modules

In [None]:
# Installing required libraries
#!pip install requests
#!pip install beautifulsoup4

In [None]:
# Importing required modules and functions
import requests
from bs4 import BeautifulSoup
import csv
import pandas as pd
import gc
import json

### Task 1 - Extract Text Data:
* Extract all headings (`<h1>`, `<h2>`).
* Extract all text content inside `<p>` and `<li>` tags.
* Save this data into a `Extract_Text_Data.CSV` file.

In [None]:
# Getting page HTML code
url = 'https://baraasalout.github.io/test.html'

# A timeout keeps the notebook from hanging forever on a stalled connection,
# and raise_for_status() stops here on an HTTP error instead of letting the
# following cells parse an error page.
page = requests.get(url, timeout=30)
page.raise_for_status()

In [None]:
# Build the parse tree from the raw response bytes using the 'html.parser' backend
soup = BeautifulSoup(markup=page.content, features='html.parser')
soup

In [None]:
#print(soup.text)

#### Extract all headings (`<h1>`, `<h2>`).

In [None]:
# Every <h1> element of the parsed page
h1_headings = soup.find_all(name='h1')
h1_headings

In [None]:
#print(h1_headings[0].get_text())

In [None]:
# Every <h2> element of the parsed page
h2_headings = soup.find_all(name='h2')
h2_headings

In [None]:
#for heading in h2_headings:
#    print(heading.get_text())

#### Extract all text content inside `<p>` and `<li>` tags

In [None]:
# Every <p> element of the parsed page
paragraphs = soup.find_all(name='p')
paragraphs

In [None]:
# Every <li> element of the parsed page
listings = soup.find_all(name='li')
listings

#### Save this data into a `Extract_Text_Data.CSV` file.

In [None]:
# Arranging data: every csv row is a [Type, Content] pair
header = [['Type', 'Content']]  # csv file header
h1_headings_data = [['Heading', tag.get_text()] for tag in h1_headings]
h2_headings_data = [['Heading', tag.get_text()] for tag in h2_headings]
paragraph_data = [['Paragraph', tag.get_text()] for tag in paragraphs]
listing_data = [['Listing', tag.get_text()] for tag in listings]

# Data to be saved into csv. Concatenation builds a new list, so `header`
# keeps holding only the header row instead of being extended in place.
extracted_data = (
    header
    + h1_headings_data
    + h2_headings_data
    + paragraph_data
    + listing_data
)

In [None]:
# Write all Task 1 rows (header first) to the csv file in a single call
with open('task1_extract_text_data.csv', 'w', encoding='utf-8', newline='') as csv_file:
    csv.writer(csv_file).writerows(extracted_data)

In [None]:
# Drop the Task 1 intermediates, then run a garbage collection pass
del page
del (h1_headings, h2_headings, paragraphs, listings)
del (h1_headings_data, h2_headings_data, paragraph_data, listing_data)
del (header, extracted_data)
gc.collect()

### Task 2 - Extract Table Data:
- Product Name.
- Price.
- Stock Status.

In [None]:
# preparing table data for csv file
# Rows are read from the <tr>/<th>/<td> structure instead of splitting the
# table's flattened text on blank lines, so the result does not depend on how
# the HTML source is laid out, and an empty cell stays in its own column.
tables = soup.find('table')
tables_data = [
    [cell.get_text(strip=True) for cell in row.find_all(['th', 'td'])]
    for row in tables.find_all('tr')
]
extracted_data = [row for row in tables_data if row]  # skip rows without cells
print(*extracted_data, sep='\n')

In [None]:
# creating csv file for table data
with open('task2_extract_table_data.csv', 'w', encoding='utf-8', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(extracted_data)

In [None]:
# clearing memory for task 2
del tables
del tables_data
del extracted_data
gc.collect()

### Task 3 - Extract Product Information (Card Section):
- Extract data from the book cards at the bottom of the page, including:    
- Book Title.    
- Price.
- Stock Availability.
- Button text (e.g., "Add to basket").
- Save the data into a `Product_Information` JSON file.


In [None]:
# Extracting card section
cards = soup.find('div')
cards

In [None]:
# extracting card tags
cards_title = cards.find_all('p')
cards_title

In [None]:
# Getting card information from tags
card_info = [title.get_text().split('\n') for title in cards]
card_info

In [None]:
# removing empty lists from card information
card_info = [[info for info in book if info !=""] for book in card_info]
card_info = [row for row in card_info if row]
card_info

In [None]:
# Preparing data for json file
dict_keys = ['item'+str(i) for i in range(1, len((card_info))+1)]
cards_data = {dict_keys[i] : card_info[i] for i in range(len(dict_keys))}
cards_data

In [None]:
# Save the card data as JSON. The file is written as UTF-8 with
# ensure_ascii=False and indent=4 so it matches the preview printed below
# (non-ASCII characters are kept as-is instead of \uXXXX escapes).
with open('task3_product_information.json', 'w', encoding='utf-8') as json_file:
    json.dump(cards_data, json_file, indent=4, ensure_ascii=False)
print(json.dumps(cards_data, indent=4, ensure_ascii=False))

In [None]:
# Drop the Task 3 intermediates, then run a garbage collection pass
del (cards, cards_title, card_info)
del (dict_keys, cards_data)
gc.collect()

### Task 4 - Extract Form Details:
Extract all input fields from the form, including:
- Field name (e.g., username, password).
- Input type (e.g., text, password, checkbox, etc.).
- Default values, if any.
- Save the data into a `JSON` file.

In [None]:
# First <form> element of the parsed page
form_tag = soup.find(name='form')
form_tag

In [None]:
# Extracting Field Names: label texts containing ":" with the colon removed
name_tags = form_tag.find_all('label')[:-1]  # the last label is left out
label_texts = [label.get_text() for label in name_tags]
field_names = [text.replace(":", "") for text in label_texts if ":" in text]
print(field_names)

In [None]:
# Extracting Input Types: the type attribute of every <input> that has one
input_tags = form_tag.find_all('input')
typed_inputs = [tag for tag in input_tags if tag.has_attr('type')]
input_types = [tag['type'] for tag in typed_inputs][:-1]  # the last type is left out
print(input_types)

In [None]:
# extracting default values
# The last two <input> tags are left out
input_tags = form_tag.find_all('input')[:-2]
# An <input> without a value attribute gets an empty-string default
names_def_val = [tag.attrs.get('value', "") for tag in input_tags]
# The value of the first <option> is appended as one more default
options_def_val = form_tag.find('option').attrs['value']
names_def_val.append(options_def_val)
print(names_def_val)

In [None]:
# preparing data in dictionary of dictionaries format for json file

# Outer dictionary keys: field_1, field_2, ...
dict_keys = ['field_' + str(n) for n in range(1, len(field_names) + 1)]

# Inner dictionaries keys
sub_dict_keys = ['Name', 'Input Type', 'Default Value']

# Inner dictionary values: one (name, input type, default value) tuple per
# field. zip() stops at the shortest of the three lists.
sub_dict_vals = list(zip(field_names, input_types, names_def_val))

# Pairing keys with rows through zip() instead of indexing by
# len(field_names) avoids an IndexError when input_types or names_def_val
# is shorter than field_names.
data_dict = {
    key: dict(zip(sub_dict_keys, values))
    for key, values in zip(dict_keys, sub_dict_vals)
}

data_dict

In [None]:
# Save the form field details as JSON, then print an indented preview
with open('task4_field_details_info.json', 'w') as json_file:
    json_file.write(json.dumps(data_dict))
print(json.dumps(data_dict, indent=4))

In [None]:
# Drop the Task 4 intermediates, then run a garbage collection pass
del (form_tag, field_names, input_types, names_def_val)
del (dict_keys, sub_dict_keys, data_dict)
gc.collect()

### Task 5 - Extract Links and Multimedia: 
- Extract the video link from the `<iframe>` tag.
- Save the data into a JSON file.


In [None]:
# Read the src of the first <iframe>, store it as JSON, then print a preview
media_tag = soup.find(name='iframe')
media_link = media_tag['src']
link_dict = dict(link=media_link)
with open('task5_media_links.json', 'w') as json_file:
    json_file.write(json.dumps(link_dict))
print(json.dumps(link_dict, indent=4))

In [None]:
# Rebind media_tag to the list of every <iframe> on the page (nothing is written to disk here)
media_tag = soup.find_all(name='iframe')

In [None]:
# Drop the Task 5 intermediates, then run a garbage collection pass
del (media_tag, media_link, link_dict)
gc.collect()

### Task 6 - Scraping Challenge: 
Students must write a script to extract data from the Featured Products section with the following requirements:
- Product Name: Located within `<span class="name">`.
- Hidden Price: Located within `<span class="price">`, which has `style="display: none;"`.
- Available Colors: Located within `<span class="colors">`.
- Product ID: The value stored in the data-id attribute.
- Example Output:
  
  `[`<br />
  ` {'id': '101', 'name': 'Wireless Headphones', 'price': '$49.99', 'colors': 'Black, White, Blue'},`<br />
  ` {'id': '102', 'name': 'Smart Speaker', 'price': '$89.99', 'colors': 'Grey, Black'},`<br />
  ` {'id': '103', 'name': 'Smart Watch', 'price': '$149.99', 'colors': 'Black, Silver, Gold'}`<br />
  `]`

In [None]:
# Every <div> carrying the CSS class "products"
feat_prod_tags = soup.find_all(name='div', class_='products')
feat_prod_tags

In [None]:
# Collect each field of the featured products as a parallel list, all taken
# from the first "products" container
products_section = feat_prod_tags[0]
ids_list = [card.attrs['data-id']
            for card in products_section.find_all('div', class_='product-card')]
names_list = [p.get_text() for p in products_section.find_all('p', class_='name')]
price_list = [p.get_text() for p in products_section.find_all('p', class_='price')]
colors_list = [p.get_text().replace("Available colors: ", "")
               for p in products_section.find_all('p', class_='colors')]

dict_keys = ['id', 'name', 'price', 'colors']
# Position n of every list is combined into one (id, name, price, colors) tuple
dict_vals = list(zip(ids_list, names_list, price_list, colors_list))

# One {id, name, price, colors} dictionary per product id
feat_prod_data = []
for i in range(len(ids_list)):
    feat_prod_data.append({k: v for k, v in zip(dict_keys, dict_vals[i])})

feat_prod_data
