### Importing Packages

In [14]:
##!pip install beautifulsoup4
#!pip install requests

In [15]:
from bs4 import BeautifulSoup
import requests
import time
import datetime

from requests_html import HTMLSession

import pandas as pd
import numpy as np
import csv
import smtplib
import re
import random
import os

### Setup - Web-Scraping Headers & "robots.txt" Limits

#### Context:
The links that are "not explicitly disallowed" from being crawled and that can be of use ~~(for me)~~ are the following: <br>
"https://www.amazon.com/s?k={Category}" <br>
"https://www.amazon.com/gp/{productName}"


#### Algorithm:
+ The base page will be "https://www.amazon.com/s?k={Category}".
+ At the base page, extract the following:
    + Product Links
+ Wrap the previous step in a function that iterates over every results page, from the "1st" to the "Last" page, and extracts all product links.
+ After extracting all links, make a function that extracts the following from each product landing page:
    + ASIN (Amazon Standard Identification Number)
    + Product Name
    + Product Price
    + Product Rating
    + Product Availability
    + Product Sold

In [16]:
# Extract Product ASIN (Amazon Standard Identification Number)
def soup_product_ASIN(soup):
    """Return the product ASIN found in a parsed product page.

    Looks up the ``<input id="ASIN">`` element of ``soup`` and returns its
    ``value`` attribute.

    Parameters
    ----------
    soup : object exposing ``find(name, attrs=...)``
        The parsed product page (the caller passes a BeautifulSoup object).

    Returns
    -------
    str
        The ASIN, or ``""`` when the element or its value is missing.
    """
    try:
        # Search the page that was passed in, not a notebook-level global,
        # so the result never depends on leftover kernel state.
        asin_tag = soup.find("input", attrs={"id": "ASIN"})
        asin_value = asin_tag.get("value")
    except Exception:
        # Best-effort scraping: a missing tag yields an empty field.
        return ""
    # An <input> without a value attribute gives None; keep the "" fallback.
    return asin_value or ""

# Extract Product Sold
def soup_product_nSOLD(soup):
    """Return the "bought" text of a product page, or its out-of-stock text.

    The ``<span id="social-proofing-faceout-title-tk_bought">`` element is
    tried first; if it is absent, the ``<div id="outOfStock">`` element is
    used instead. Returns ``""`` when neither element can be read.
    """
    candidates = (
        ("span", "social-proofing-faceout-title-tk_bought"),
        ("div", "outOfStock"),
    )
    for tag_name, tag_id in candidates:
        try:
            return soup.find(tag_name, attrs={"id": tag_id}).text.strip()
        except AttributeError:
            # Element not present (find() returned None): try the next one.
            pass
    return ""

# Extract Product Availability
def soup_product_AVAILABILITY(soup):
    """Return the stripped text of the ``<div id="availability">`` element.

    Returns ``""`` if the element is missing or any error occurs while
    reading it.
    """
    try:
        return soup.find("div", attrs={"id": "availability"}).text.strip()
    except Exception:
        return ""

# Extract Product No. of Reviews
def soup_product_REVIEWS(soup):
    """Return the stripped text of ``<span id="acrCustomerReviewText">``.

    Returns ``""`` if the element is missing or any error occurs while
    reading it.
    """
    try:
        return soup.find("span", attrs={"id": "acrCustomerReviewText"}).text.strip()
    except Exception:
        return ""

# Extract Product Rating
def soup_product_RATING(soup):
    """Return the stripped text of ``<i data-hook="average-star-rating">``.

    Returns ``""`` if the element is missing or any error occurs while
    reading it.
    """
    try:
        return soup.find("i", attrs={"data-hook": "average-star-rating"}).text.strip()
    except Exception:
        return ""

# Extract Product Price
def soup_product_PRICE(soup):
    """Return the price text shown inside the core price block.

    Finds ``<div id="corePrice_feature_div">`` and, within it, the first
    ``<span class="a-offscreen">``; returns that span's stripped text.
    Returns ``""`` if either element is missing or any error occurs.
    """
    try:
        price_block = soup.find("div", attrs={"id": "corePrice_feature_div"})
        price_span = price_block.find("span", attrs={"class": "a-offscreen"})
        return price_span.text.strip()
    except Exception:
        return ""

# Extract Product Title
def soup_product_TITLE(soup):
    """Return the stripped text of the ``<span id="productTitle">`` element.

    Returns ``""`` if the element is missing or any error occurs while
    reading it.
    """
    try:
        return soup.find("span", attrs={"id": "productTitle"}).text.strip()
    except Exception:
        return ""

# HTML Parsing
def BS4_Parsing(raw_html):
    """Parse the ``content`` of a response with BeautifulSoup's "html.parser".

    ``raw_html`` must expose a ``content`` attribute (the callers pass the
    object returned by ``requests.get``). Returns the BeautifulSoup object.
    """
    return BeautifulSoup(raw_html.content, "html.parser")

# HTML Request
def BS4_HTMLrequest(link, timeout=30):
    """Fetch ``link`` after a random polite delay and return the parsed HTML.

    Parameters
    ----------
    link : str
        URL to request.
    timeout : float or tuple, optional
        Seconds to wait for the server before giving up (passed to
        ``requests.get``). Without a timeout a stalled connection would
        block the whole scrape indefinitely.

    Returns
    -------
    BeautifulSoup
        The response body parsed with "html.parser". The HTTP status code
        is not checked here, so an error page is parsed like any other.

    Raises
    ------
    requests.RequestException
        Propagated from ``requests.get`` (connection errors, timeouts).
    """
    # Wait 1-4 seconds so consecutive requests are not fired back-to-back.
    time.sleep(random.uniform(1, 4))
    # HEADERS is the notebook-level dict defined in the main cell.
    response = requests.get(link, headers=HEADERS, timeout=timeout)
    # Reuse the shared parsing helper rather than duplicating it.
    return BS4_Parsing(response)

# File path and file name to save the CSV file
def export_toCSV(df, output_dir=None):
    """Write ``df`` to the three project CSV files and report the outcome.

    Files written (all without the index):

    * ``<output_dir>/Webscrape_Amazon_Archive/product_details_masterData.csv``
      - appended to if it exists, created with a header otherwise.
    * ``<output_dir>/Webscrape_Amazon_Archive/product_details_<date>_<time>_GMT+8.csv``
      - always (re)written as a new file.
    * ``<output_dir>/product_details_updateSQL_<date>.csv``
      - appended to if today's file exists, created with a header otherwise.

    The encodings 'latin1', 'utf-8' and 'Windows-1252' are tried in that
    order. The first one able to encode the whole frame is selected
    *before* anything is written, so a failed attempt can no longer leave
    partial or duplicated rows in the append-mode files.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to export.
    output_dir : str, optional
        Root "CSV Files" directory. Defaults to the project's original
        hard-coded location.

    Returns
    -------
    str
        A message listing the saved files, or a failure message when no
        encoding could represent the data (in which case nothing is written).
    """
    now = datetime.datetime.now()
    current_date = now.strftime("%Y-%m-%d")
    current_time = now.strftime("%H-%M")

    if output_dir is None:
        output_dir = r"G:\00 Work Files\02 Work Trainings & Apps\00 Project Portfolio\Ecommerce_Amazon (Python, SQL, PowerBI)\CSV Files"
    archive_dir = os.path.join(output_dir, "Webscrape_Amazon_Archive")

    csv_file_path_a = os.path.join(archive_dir, "product_details_masterData.csv")
    csv_file_path_b = os.path.join(archive_dir, f"product_details_{current_date}_{current_time}_GMT+8.csv")
    csv_file_path_c = os.path.join(output_dir, f"product_details_updateSQL_{current_date}.csv")

    # Render once in memory; the header row is included so the check covers
    # everything that could be written to any of the three files.
    csv_text = df.to_csv(index=False)

    chosen_encoding = None
    for encodingTRY in ['latin1', 'utf-8', 'Windows-1252']:
        print(f"Trying encoding: {encodingTRY}")
        try:
            csv_text.encode(encodingTRY)
        except UnicodeEncodeError as e:
            # If encoding fails, try the next encoding
            print(f"Error writing to file with encoding {encodingTRY}: {e}")
            continue
        chosen_encoding = encodingTRY
        break

    if chosen_encoding is None:
        # Do not claim success when nothing could be written.
        print("Failed to write DataFrame to CSV file with any of the encodings.")
        return "No CSV files were saved: the data could not be encoded with any of the attempted encodings."

    os.makedirs(archive_dir, exist_ok=True)

    # NOTE(review): the append-mode files may end up with mixed encodings
    # across runs if different runs select different encodings.
    # path_a = append to the master file, or create it with a header.
    if os.path.exists(csv_file_path_a):
        df.to_csv(csv_file_path_a, mode='a', index=False, header=False, encoding=chosen_encoding)
    else:
        df.to_csv(csv_file_path_a, index=False, encoding=chosen_encoding)

    # path_b = always a new timestamped CSV file.
    df.to_csv(csv_file_path_b, index=False, encoding=chosen_encoding)

    # path_c = append if today's file already exists, else a new CSV file.
    if os.path.exists(csv_file_path_c):
        df.to_csv(csv_file_path_c, mode='a', index=False, header=False, encoding=chosen_encoding)
    else:
        df.to_csv(csv_file_path_c, index=False, encoding=chosen_encoding)

    success_string = f"""
    CSV files saved successfully: 
    • {csv_file_path_a}
    • {csv_file_path_b}
    • {csv_file_path_c}
    """
    return success_string

In [17]:
# ---------------------------------- Main Code
# ---------------------------------- HTTP REQUEST SETUP & EXCEPTION HANDLING
# For every category in the reference CSV: find an Amazon regional site that
# answers the search, follow the "next page" links to collect all result
# pages, scrape the product pages linked from each result page, and export
# the collected details to CSV.
if __name__ == '__main__':
    # Local import: only this cell builds/joins URLs.
    from urllib.parse import quote_plus, urljoin

    # Webcrawling Best Practice = Send "User-Agent" info to website.
    HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36", "Accept-Encoding":"gzip, deflate", "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", "DNT":"1","Connection":"close", "Upgrade-Insecure-Requests":"1"}
    BASE_URL = "https://www.amazon"
    Amazon_branchSites = [".sg", ".com", ".com.au", ".ca", ".co.uk"]
    Base_URL_sites = [BASE_URL + site for site in Amazon_branchSites]

    # Seconds to wait for a server before giving up; without it a stalled
    # connection would hang the whole run.
    REQUEST_TIMEOUT = 30
    # Testing limit to shorten web scraping time: number of products scraped
    # per results page. Set to None to scrape every product on the page.
    MAX_PRODUCTS_PER_PAGE = 2

    # Extract href in category list.
    csv_FilePaTH = r"G:\00 Work Files\02 Work Trainings & Apps\00 Project Portfolio\Ecommerce_Amazon (Python, SQL, PowerBI)\CSV Files\Amazon_Olist\product_category_name_translation"
    csv_FileName = r"product_category_name_translation.csv"
    csv_filePaTH_REF = os.path.join(csv_FilePaTH, csv_FileName)

    df_productDetails = pd.read_csv(csv_filePaTH_REF, encoding='latin1', na_values=[""])
    # Blank cells are read as NaN; drop them so "nan" is never searched.
    categoryNames = df_productDetails['product_category_name_english'].dropna()

    for Category in categoryNames:
        # Reset per-category state so a failed request can never fall back
        # on the previous category's response.
        Base_URL_site = None
        Base_URL_Category = None
        response_category = None

        # Try each regional site in turn until one answers with HTTP 200.
        for base_indexURL, Base_URL_site in enumerate(Base_URL_sites):
            # URL-encode the category so spaces, "&" etc. stay inside "k=".
            Base_URL_Category = f"{Base_URL_site}/s?k={quote_plus(str(Category))}"
            delay = random.uniform(1,4)
            statusCode = None
            try:
                response = requests.get(Base_URL_Category, headers = HEADERS, timeout = REQUEST_TIMEOUT)
                response_category = response
                statusCode = response.status_code
            except requests.RequestException as e:
                # One unreachable site must not stop the remaining sites
                # from being tried.
                print(f"Error: {e}")
            print(f"Base URL: {Base_URL_site}")
            print(f"URL Requested: {Base_URL_Category}")
            print(f"Status Code: {statusCode}")
            print(f"Base URL Index: {base_indexURL}")

            if statusCode == 200:
                break
            print(f"Retrying Connection: {delay:.2f} seconds.")
            print("-----------")
            time.sleep(delay)

        if response_category is None:
            # Every site raised: there is nothing to parse for this category.
            print(f"No response received for {Category} Category. Skipping.")
            continue

        # ---------------------------------- CATEGORY PRODUCT LINKS
        # Start with the initial page
        Category_URL = Base_URL_Category
        category_link_list = [Category_URL]

        while Category_URL:
            # Find the link to the next page
            soup_category_links_aTag = BS4_Parsing(response_category).find("a",\
              attrs={"class":"s-pagination-item s-pagination-next s-pagination-button s-pagination-separator"})

            if not soup_category_links_aTag:
                # If there is no next page, exit the loop
                break

            # Extract the URL of the next page
            next_page_url = soup_category_links_aTag.get('href')
            if not next_page_url:
                # A "next" button without a target cannot be followed.
                break

            # Update the Category_URL to the URL of the next page
            Category_URL = urljoin(Base_URL_site, next_page_url)

            # Append the URL to the category_link_list
            category_link_list.append(Category_URL)

            # Request HTML
            try:
                response_category = requests.get(Category_URL, headers = HEADERS, timeout = REQUEST_TIMEOUT)
            except requests.RequestException as e:
                # Keep the pages collected so far and stop paginating.
                print(f"Error: {e}")
                break
            if response_category.status_code == 200:
                delay = random.uniform(1,4)
                print(f"URL Request Success on {Category_URL}")
                print(f"Next Extraction will be after {delay:.2f} seconds.")
                time.sleep(delay)

        # Now you have a list of all the category links
        print(f"----------------")
        print(f"No. of Pages Extracted: {len(category_link_list)} for {Category} Category.")

        # ---------------------------------- PRODUCT DETAILS EXTRACTION
        # Create list for the product details value holder
        productDetails = {"TIMESTAMP":[], "ASIN":[], "CATEGORY":[], "TITLE":[], "PRICE":[], "RATING":[],\
                          "NO. OF REVIEWS":[],"AVAILABILITY":[], "NO. OF SOLD PRODUCTS":[], "LINK":[]}

        for nLink_category in category_link_list:
            # <a class="a-link-normal s-underline-text s-underline-link-text s-link-style a-text-normal" href=**>
            try:
                soup_product_links_aTag = BS4_HTMLrequest(nLink_category).find_all("a",\
                      attrs={"class":"a-link-normal s-underline-text s-underline-link-text s-link-style a-text-normal"})
            except requests.RequestException as e:
                # Skip a results page that cannot be fetched.
                print(f"Error: {e}")
                continue

            # Apply the testing limit once instead of counting in two loops.
            soup_product_links = [links.get("href") for links in soup_product_links_aTag[:MAX_PRODUCTS_PER_PAGE]]

            for index, product_href in enumerate(soup_product_links):
                if not product_href:
                    # An anchor without href would produce a "...None" URL.
                    continue

                delay = random.uniform(1,4)
                Base_URL_Product = urljoin(Base_URL_site, product_href)
                try:
                    response_product = requests.get(Base_URL_Product, headers = HEADERS, timeout = REQUEST_TIMEOUT)
                except requests.RequestException as e:
                    print(f"Error: {e}")
                    time.sleep(delay)
                    continue
                # Keep this name: it is also visible at notebook level.
                soup_product_content = BeautifulSoup(response_product.content, "html.parser")

                current_time = datetime.datetime.now()
                timestamp = current_time.strftime("%Y-%m-%d %H:%M:%S")

                productDetails["LINK"].append(Base_URL_Product)
                productDetails["TIMESTAMP"].append(timestamp)
                productDetails["CATEGORY"].append(Category)
                productDetails["ASIN"].append(soup_product_ASIN(soup_product_content))
                productDetails["TITLE"].append(soup_product_TITLE(soup_product_content))
                productDetails["PRICE"].append(soup_product_PRICE(soup_product_content))
                productDetails["RATING"].append(soup_product_RATING(soup_product_content))
                productDetails["NO. OF REVIEWS"].append(soup_product_REVIEWS(soup_product_content))
                productDetails["AVAILABILITY"].append(soup_product_AVAILABILITY(soup_product_content))
                productDetails["NO. OF SOLD PRODUCTS"].append(soup_product_nSOLD(soup_product_content))
                print(f"----------------")
                print(f"URL Requested: {nLink_category}")
                print(f"Appending Success for Index {index} out of {len(soup_product_links_aTag)-1}.")
                print(f"Extracted Page: {Base_URL_Product}")
                print(f"Please wait for next product extraction after {delay:.2f} seconds.")
                print(f"Thank you!")
                time.sleep(delay)
        # Check Output
        df_products = pd.DataFrame(productDetails)
        print(f"----------------")
        print(df_products)
        # Export to CSV files
        print(export_toCSV(df_products))


Base URL: https://www.amazon.sg
URL Requested: https://www.amazon.sg/s?k=health_beauty
Status Code: 200
Base URL Index: 0
URL Request Success on https://www.amazon.sg/s?k=health_beauty&page=2&qid=1709199404&ref=sr_pg_1
Next Extraction will be after 3.77 seconds.
URL Request Success on https://www.amazon.sg/s?k=health_beauty&page=3&qid=1709199407&ref=sr_pg_2
Next Extraction will be after 3.35 seconds.
URL Request Success on https://www.amazon.sg/s?k=health_beauty&page=4&qid=1709199413&ref=sr_pg_3
Next Extraction will be after 3.12 seconds.
URL Request Success on https://www.amazon.sg/s?k=health_beauty&page=5&qid=1709199417&ref=sr_pg_4
Next Extraction will be after 3.31 seconds.
URL Request Success on https://www.amazon.sg/s?k=health_beauty&page=6&qid=1709199422&ref=sr_pg_5
Next Extraction will be after 2.08 seconds.
URL Request Success on https://www.amazon.sg/s?k=health_beauty&page=7&qid=1709199427&ref=sr_pg_6
Next Extraction will be after 2.98 seconds.
----------------
No. of Pages Ex

KeyboardInterrupt: 