In [63]:
import requests

In [69]:
def get_sitemap(website):
    # get the website's sitemap
    
    # if url has -sitemap.xml in it, it is a sitemap
    if 'sitemap.xml' in website:
        response = requests.get(website)
        response.raise_for_status()
        return response.text
        
    response = requests.get(website + '/sitemap.xml')
    response.raise_for_status()  # Raise an error for bad status codes
    sitemap = response.text

    return sitemap

# usage
website = 'https://southshorefinelinens.com/'
sitemap = get_sitemap(website)
sitemap

'<?xml version="1.0" encoding="UTF-8"?>\n<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n  <!-- This is the parent sitemap linking to additional sitemaps for products, collections and pages as shown below. The sitemap can not be edited manually, but is kept up to date in real time. -->\n  <sitemap>\n    <loc>https://southshorefinelinens.com/sitemap_products_1.xml?from=1714667421763&amp;to=8687205351650</loc>\n  </sitemap>\n  <sitemap>\n    <loc>https://southshorefinelinens.com/sitemap_pages_1.xml</loc>\n  </sitemap>\n  <sitemap>\n    <loc>https://southshorefinelinens.com/sitemap_collections_1.xml</loc>\n  </sitemap>\n  <sitemap>\n    <loc>https://southshorefinelinens.com/sitemap_blogs_1.xml</loc>\n  </sitemap>\n</sitemapindex>\n'

In [71]:
def get_urls(sitemap):
    urls = []
    startloc = 0
    while True:
        startloc = sitemap.find('<loc>', startloc)
        if startloc == -1:
            break
        endloc = sitemap.find('</loc>', startloc)
        urls.append(sitemap[startloc+5:endloc])
        startloc = endloc
    return urls
    
urls = get_urls(sitemap)
urls

['https://southshorefinelinens.com/sitemap_products_1.xml?from=1714667421763&amp;to=8687205351650',
 'https://southshorefinelinens.com/sitemap_pages_1.xml',
 'https://southshorefinelinens.com/sitemap_collections_1.xml',
 'https://southshorefinelinens.com/sitemap_blogs_1.xml']

In [72]:
import xml.etree.ElementTree as ET

def fetch_and_parse_sitemap(url):
    response = requests.get(url)
    response.raise_for_status()  # Ensure the request was successful
    return response.text

def get_urls_from_sitemap(sitemap_content):
    root = ET.fromstring(sitemap_content)
    urls = []

    # Extract URLs from <loc> tags
    for loc in root.iter('{http://www.sitemaps.org/schemas/sitemap/0.9}loc'):
        urls.append(loc.text)
    
    return urls

def find_all_urls(sitemap_urls):
    all_urls = []
    for sitemap_url in sitemap_urls:
        sitemap_content = fetch_and_parse_sitemap(sitemap_url)
        urls = get_urls_from_sitemap(sitemap_content)
        all_urls.extend(urls)
    
    return all_urls
all_urls = find_all_urls(urls)
# print(all_urls)

# for urls in all_urls:
#     print(urls)


In [76]:
import os
def save_html_content(url, folder):
    try:
        response = requests.get(url)
        response.raise_for_status()

        # # Create a safe filename by replacing non-alphanumeric characters
        filename = url.replace("https://", "").replace("http://", "").replace("/", "_").replace("?", "_") + ".html"
        filepath = os.path.join(folder, filename)

        # Save the HTML content
        with open(filepath, 'w', encoding='utf-8') as file:
            file.write(response.text)
        
        print(f"Saved: {url} to {filepath}")
    except requests.exceptions.RequestException as e:
        print(f"Failed to fetch {url}: {e}")

def find_all_urls_and_save_html(sitemap_urls):
    folder = website.replace("https://", "").replace("http://", "").replace("/", "_").replace("?", "_") 
    os.makedirs(folder, exist_ok=True)  # Create the folder if it doesn't exist

    all_urls = []
    for sitemap_url in sitemap_urls:
        sitemap_content = fetch_and_parse_sitemap(sitemap_url)
        urls = get_urls_from_sitemap(sitemap_content)
        all_urls.extend(urls)
    
    for url in all_urls:
        save_html_content(url, folder)


# sitemap_urls =["https://appifycommerce.myshopify.com/sitemap.xml"]

# find_all_urls_and_save_html(sitemap_urls)

find_all_urls_and_save_html(urls)


Saved: https://southshorefinelinens.com/ to southshorefinelinens.com_\southshorefinelinens.com_.html
Saved: https://southshorefinelinens.com/products/winter-brush-comforter-set to southshorefinelinens.com_\southshorefinelinens.com_products_winter-brush-comforter-set.html
Saved: https://southshorefinelinens.com/products/southshore-basics-microfiber-duvet-cover-set to southshorefinelinens.com_\southshorefinelinens.com_products_southshore-basics-microfiber-duvet-cover-set.html
Saved: https://southshorefinelinens.com/products/extra-deep-pocket-sheet-set-vilano-springs to southshorefinelinens.com_\southshorefinelinens.com_products_extra-deep-pocket-sheet-set-vilano-springs.html
Saved: https://southshorefinelinens.com/products/duvet-cover-set-vilano-springs to southshorefinelinens.com_\southshorefinelinens.com_products_duvet-cover-set-vilano-springs.html
Saved: https://southshorefinelinens.com/products/winter-brush-quilt-set to southshorefinelinens.com_\southshorefinelinens.com_products_wint

In [77]:
# # 
# import os
# import requests

# def save_html_content(url, base_folder=r"CO"):
#     try:
#         # Create a folder name based on the URL
#         folder_name = url.replace("https://", "").replace("http://", "").replace("/", "_").replace("?", "_").replace(":", "_")
#         folder_path = os.path.join(base_folder, folder_name)
        
#         # Create the folder if it doesn't exist
#         os.makedirs(folder_path, exist_ok=True)
        
#         # Fetch the HTML content
#         response = requests.get(url)
#         response.raise_for_status()

#         # Define the file path
#         filename = "index.html"
#         filepath = os.path.join(folder_path, filename)

#         # Save the HTML content
#         with open(filepath, 'w', encoding='utf-8') as file:
#             file.write(response.text)
        
#         print(f"Saved: {url} to {filepath}")
#     except requests.exceptions.RequestException as e:
#         print(f"Failed to fetch {url}: {e}")

# def find_all_urls_and_save_html(sitemap_urls):
#     base_folder = "SHANI"
#     os.makedirs(base_folder, exist_ok=True)  # Create the base folder if it doesn't exist

#     all_urls = []
#     for sitemap_url in sitemap_urls:
#         sitemap_content = fetch_and_parse_sitemap(sitemap_url)
#         urls = get_urls_from_sitemap(sitemap_content)
#         all_urls.extend(urls)
    
#     for url in all_urls:
#         save_html_content(url, base_folder)

# # Example usage:
# # sitemap_urls = ["https://appifycommerce.myshopify.com/sitemap.xml"]
# find_all_urls_and_save_html(urls)
