In [16]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse

In [17]:
# Sitemap to audit; this address is passed to process_sitemap() further down.
sitemap_url = 'https://hydrotruewater.com/page-sitemap.xml'  

In [18]:
# Function to fetch and parse sitemap URLs
def fetch_sitemap_urls(sitemap_url, timeout=30):
    """Download an XML sitemap and return the page URLs it lists.

    Args:
        sitemap_url: Address of the sitemap XML document.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests blocks indefinitely on an unresponsive host.

    Returns:
        A list with the text of every <loc> element, stripped of surrounding
        whitespace, skipping empty entries and any URL containing '/wp'.
        If the download or the parse fails, the error is printed and an
        empty list is returned.
    """
    try:
        response = requests.get(sitemap_url, timeout=timeout)
        response.raise_for_status()  # Turn HTTP 4xx/5xx responses into exceptions
        soup = BeautifulSoup(response.content, 'xml')
        # <loc> values may be padded with whitespace/newlines in pretty-printed
        # sitemaps; normalise them before filtering.
        candidates = (loc.text.strip() for loc in soup.find_all('loc'))
        # Filter out URLs that contain '/wp'
        return [url for url in candidates if url and '/wp' not in url]
    except Exception as e:
        # Best-effort: report the problem and let the caller continue with no URLs.
        print(f"Failed to fetch or parse sitemap: {e}")
        return []

# Function to fetch data from each URL
def fetch_url_data(url, timeout=30):
    """Fetch one page and extract its title, meta description and first H1.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests blocks indefinitely on an unresponsive host.

    Returns:
        A dict with the keys 'URL', 'Title', 'Meta Description' and 'H1'.
        Elements missing from the page are reported as 'No Title',
        'No Description' and 'No H1'. If the request or the parse fails, the
        error is printed and all three fields are set to 'Failed to retrieve'.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise an exception for HTTP errors
        soup = BeautifulSoup(response.text, 'html.parser')

        # Extract title (look the tag up once instead of searching twice)
        title_tag = soup.find('title')
        title = title_tag.text if title_tag is not None else 'No Title'

        # Extract meta description. A description tag without a content
        # attribute counts as missing; indexing it directly would raise
        # KeyError and throw away the title and H1 that were found.
        meta_tag = soup.find('meta', attrs={'name': 'description'})
        if meta_tag is None:
            meta_desc = 'No Description'
        else:
            meta_desc = meta_tag.get('content', 'No Description')

        # Extract the first H1 tag
        h1_tag = soup.find('h1')
        h1 = h1_tag.text if h1_tag is not None else 'No H1'

        return {'URL': url, 'Title': title, 'Meta Description': meta_desc, 'H1': h1}
    except Exception as e:
        # Best-effort: keep the row so the page still shows up in the report,
        # but say why it failed instead of discarding the error.
        print(f"Failed to retrieve {url}: {e}")
        return {'URL': url, 'Title': 'Failed to retrieve', 'Meta Description': 'Failed to retrieve', 'H1': 'Failed to retrieve'}

# Function to process all URLs in the sitemap
def process_sitemap(sitemap_url):
    """Fetch every page listed in a sitemap and tabulate the extracted fields.

    Args:
        sitemap_url: Address of the sitemap XML document.

    Returns:
        A pandas DataFrame with one row per sitemap URL and the columns
        'URL', 'Title', 'Meta Description' and 'H1'. The columns are present
        even when the sitemap yields no URLs.
    """
    url_list = fetch_sitemap_urls(sitemap_url)
    data = [fetch_url_data(url) for url in url_list]
    # Name the columns explicitly: with an empty list pandas would otherwise
    # build a frame with no columns, and later df['URL'] lookups would raise
    # KeyError.
    return pd.DataFrame(data, columns=['URL', 'Title', 'Meta Description', 'H1'])


In [19]:
# Crawl the sitemap and collect the per-page results into a DataFrame.
df = process_sitemap(sitemap_url)

# Helper function to calculate URL depth
def url_depth(url):
    """Return how deep a URL sits below the site root.

    The root has depth 1 and every non-empty path segment adds one, so
    'https://example.com/' is 1 and 'https://example.com/a/b/' is 3. The
    result is the same with or without a trailing slash; query strings and
    fragments are ignored.

    Args:
        url: Absolute URL of the page.

    Returns:
        The depth as an int (always >= 1).
    """
    # Remove the protocol and domain part of the URL
    path = urlparse(url).path
    # Count segments rather than '/' characters: a raw slash count gives
    # '/about-us' and '/about-us/' different depths, and rates a bare domain
    # one level shallower than the same domain followed by '/'.
    segments = [segment for segment in path.split('/') if segment]
    return len(segments) + 1

In [20]:
## Formatting

# Tag each page with its path depth, then order shallow pages first
# (pages at the same depth are ordered by URL).
df['Depth'] = df['URL'].apply(url_depth)
df_sorted = df.sort_values(by=['Depth', 'URL'])

# Plain-text dump of the report columns followed by the row count.
total_rows = len(df_sorted.index)
print(df_sorted.loc[:, ['URL', 'Title', 'Meta Description', 'H1', 'Depth']])
print(f"Total number of rows: {total_rows}")

df_sorted.head()

                                                  URL  \
0                         https://hydrotruewater.com/   
4                https://hydrotruewater.com/about-us/   
8      https://hydrotruewater.com/application-briefs/   
14           https://hydrotruewater.com/applications/   
1              https://hydrotruewater.com/contact-us/   
5    https://hydrotruewater.com/free-site-assessment/   
3          https://hydrotruewater.com/privacy-policy/   
16               https://hydrotruewater.com/products/   
7               https://hydrotruewater.com/resources/   
2                 https://hydrotruewater.com/service/   
13  https://hydrotruewater.com/applications/agricu...   
6   https://hydrotruewater.com/applications/car-wa...   
12  https://hydrotruewater.com/applications/commer...   
15  https://hydrotruewater.com/applications/coolin...   
9   https://hydrotruewater.com/applications/genera...   
10  https://hydrotruewater.com/applications/labora...   
11  https://hydrotruewater.com/

Unnamed: 0,URL,Title,Meta Description,H1,Depth
0,https://hydrotruewater.com/,Home - www.hydrotruewater.com,No Description,Water treatment systems andservices you can co...,1
4,https://hydrotruewater.com/about-us/,About Us - www.hydrotruewater.com,No Description,About Us,2
8,https://hydrotruewater.com/application-briefs/,Application Briefs - www.hydrotruewater.com,No Description,No H1,2
14,https://hydrotruewater.com/applications/,Applications - www.hydrotruewater.com,No Description,Applications,2
1,https://hydrotruewater.com/contact-us/,Contact Us - www.hydrotruewater.com,No Description,Contact Us,2


In [23]:
# Export as CSV
csv_filename = 'sitemap_data.csv'

# index=False leaves the DataFrame's row labels out of the file.
df_sorted.to_csv(path_or_buf=csv_filename, index=False)
print(f"DataFrame exported successfully to {csv_filename}")

DataFrame exported successfully to sitemap_data.csv
