In [28]:
from datetime import datetime
from pathlib import Path
from urllib.parse import urlparse

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import ChromiumOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait




In [29]:
# HTTP headers passed to the `requests.get` calls in this notebook; the
# User-Agent string identifies the client as a desktop Chrome browser.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/58.0.3029.110 Safari/537.3"
    ),
}

In [30]:
def set_chrome_options() -> ChromiumOptions:
    """Build the browser options used for the Selenium-driven scraping.

    Args:
        None

    Returns:
        A ``ChromiumOptions`` instance carrying the ``--headless=new``,
        ``--no-sandbox`` and ``--disable-dev-shm-usage`` arguments, plus a
        ``prefs`` experimental option whose
        ``profile.default_content_settings`` entry is ``{"images": 2}``.
    """
    launch_flags = ("--headless=new", "--no-sandbox", "--disable-dev-shm-usage")
    # NOTE(review): images=2 presumably tells Chrome not to load images - confirm.
    browser_prefs = {"profile.default_content_settings": {"images": 2}}

    opts = ChromiumOptions()
    for flag in launch_flags:
        opts.add_argument(flag)
    opts.experimental_options["prefs"] = browser_prefs
    return opts

In [37]:
def _extract_locs(xml_text):
    """Return the stripped, non-empty text of every `loc` element in a sitemap document."""
    soup = BeautifulSoup(xml_text, 'lxml-xml')
    locs = (tag.text.strip() for tag in soup.find_all('loc'))
    # Filter on the text itself: a Tag object is always truthy, so testing the
    # tag would never drop an empty entry.
    return [loc for loc in locs if loc]


def scrape_sitemap(url, timeout=30):
    """
    Extracts all the links from a company's sitemap.xml.

    Entries of the sitemap that themselves end in 'xml' are treated as nested
    sitemaps: each one is fetched and its `loc` entries are collected instead
    of the nested sitemap's own URL. Only one level of nesting is followed.

    Args:
    url (str): The URL for a company sitemap.xml.
    timeout (float): Seconds to wait for each HTTP request before giving up.

    Returns:
    pd.Series: A pandas Series with all the de-duplicated links, or None if the
               initial request fails or no links are found.
    """
    try:
        # A timeout keeps one unresponsive server from stalling the whole run.
        with requests.get(url, headers=headers, timeout=timeout) as response:
            response.raise_for_status()
            top_level_links = _extract_locs(response.text)
    except requests.RequestException as e:
        print(f"Error occurred: {e}")
        return None  # the initial request failed

    if not top_level_links:
        return None

    page_links = []
    for link in top_level_links:
        if not link.endswith('xml'):
            page_links.append(link)
            continue

        # Nested sitemap: parsed with the same XML parser as the index.
        try:
            with requests.get(link, headers=headers, timeout=timeout) as nested_response:
                nested_response.raise_for_status()
                page_links.extend(_extract_locs(nested_response.text))
        except requests.RequestException as e:
            # Best effort: a broken nested sitemap must not discard the others.
            print(f"Error occurred while processing {link}: {e}")

    if not page_links:
        return None

    return pd.Series(page_links).drop_duplicates().str.strip()


In [38]:


def _page_text(html):
    """Parse HTML, remove header/nav/footer elements and return the remaining text."""
    soup = BeautifulSoup(html, 'lxml')
    for tag in soup.find_all(['header', 'nav', 'footer']):
        tag.decompose()
    return soup.get_text(separator=' ', strip=True)


def _render_page_text(link, options, timeout):
    """Load `link` in a fresh Chrome instance and return the rendered page's text."""
    # Created outside the try block so that a failed start-up never reaches
    # the cleanup code with an unbound (or stale) driver.
    browser = webdriver.Chrome(options=options)
    try:
        browser.get(link)
        # Wait until the root element is displayed. The previous, longer
        # condition short-circuited to exactly this check because of how the
        # chained conditional expressions were parsed.
        WebDriverWait(browser, timeout=timeout).until(
            lambda d: d.find_element(By.TAG_NAME, 'html').is_displayed()
        )
        return _page_text(browser.page_source)
    finally:
        # quit() ends the whole driver session; close() only closes the window.
        browser.quit()


def scrape_website(all_links, options, timeout=30):
    """
    Extracts all the text data from the webpages of a company.

    Each page is first fetched with `requests`. When the response status is not
    200, or the extracted text has fewer than 50 words, the page is loaded
    again through Selenium and the rendered text is used instead.

    Args:
    all_links (pd.Series): A pandas Series with all the links in the company's website.
    options: Chrome options to apply to the browser
    timeout (float): Seconds allowed for each HTTP request and for the Selenium wait.

    Returns:
    tuple: (df, df_log)
        df (pd.DataFrame): columns 'key' (webpage link), 'text' (lower-cased text
            of the webpage) and 'timestamp' (date the data was scraped). Empty
            when no text was extracted.
        df_log (pd.DataFrame): columns 'key' (webpage link) and 'error' (date
            followed by the error, or by the page text when Selenium returned
            fewer than 20 words). Empty when nothing was logged.
    """
    log_dict = {}
    text_dict = {}

    for link in all_links.to_list():
        today = pd.to_datetime(datetime.today().date())
        text_only_requests = ''

        # First, scrape the page using requests
        try:
            with requests.get(link, headers=headers, timeout=timeout) as response:
                status_ok = response.status_code == 200
                if status_ok:
                    text_only_requests = _page_text(response.text)
                    print(link)
        except requests.RequestException as e:
            print(f"Error occurred while processing {link}: {e}")
            log_dict[link] = f'{today}  {e}'
            continue

        if status_ok and len(text_only_requests.split()) >= 50:
            text_dict[link] = text_only_requests.lower()
            continue

        # Content seems too short or the response code is not 200: use Selenium
        try:
            text_only_selenium = _render_page_text(link, options, timeout).lower()
        except Exception as e:
            # Deliberately broad: any browser failure is logged and the loop
            # moves on to the next link.
            print(f"Error occurred while processing {link} in selenium: {e}")
            log_dict[link] = f'{today}  {e}'
            continue

        text_dict[link] = text_only_selenium
        if len(text_only_selenium.split()) < 20:
            # Still suspiciously short: keep the text but also record it in the log.
            log_dict[link] = f'{today} {text_only_selenium}'

    if log_dict:
        df_log = pd.DataFrame(list(log_dict.items()), columns=['key', 'error'])
    else:
        df_log = pd.DataFrame()

    if not text_dict:
        return pd.DataFrame(), df_log   # return empty DataFrame if no text is extracted

    df = pd.DataFrame(list(text_dict.items()), columns=['key', 'text'])
    df['timestamp'] = pd.to_datetime(datetime.today().date())

    return df, df_log

In [41]:
def main(max_pages=3):
    """
    Runs the scraping engine, extracts data from the specified sitemaps, and saves
    the extracted data to CSV files.

    Sitemap URLs are read from the 'sitemap' column of 'sitemap.csv'. For each
    sitemap, the first `max_pages` links are scraped; the text is written to
    'data/<host>_<date>.csv' and any logged errors to 'log/<host>_<date>.csv'.

    Args:
    max_pages (int): Number of links scraped per sitemap. The default matches
                     the slice this function has always used.

    Returns:
    None
    """
    try:
        sitemap_df = pd.read_csv("sitemap.csv")
    except FileNotFoundError:
        print("Error: The file 'sitemap.csv' was not found.")
        return
    except pd.errors.EmptyDataError:
        print("Error: The file 'sitemap.csv' is empty.")
        return

    if 'sitemap' not in sitemap_df.columns:
        print("Error: The expected 'sitemap' column is missing in 'sitemap.csv'.")
        return

    # Built only after the input file has been validated.
    options = set_chrome_options()

    print("Starting scraping:\n")
    for raw_item in sitemap_df['sitemap'].dropna():
        # Cells may carry stray whitespace around the URL.
        item = str(raw_item).strip()
        if not item:
            continue

        print(f"Working on {item} ...")
        links = scrape_sitemap(item)
        if links is None or links.empty:
            print(f"No links were found in {item}.\n")
            continue

        # Host part of the sitemap URL names the output files; fall back to a
        # path-safe form of the URL if no host can be parsed.
        website_name = urlparse(item).netloc or item.replace('/', '_').replace(':', '_')
        print(f"{links.shape[0]} webpages found for scraping. "
              f"For demo, scraping only the first {max_pages} webpages.\n")

        scraped_df, log_df = scrape_website(links.iloc[:max_pages], options)
        timestamp = datetime.now().strftime('%Y-%m-%d')

        # Write the log before the empty check: when every page failed, the log
        # is the only record of what went wrong.
        if not log_df.empty:
            Path('log').mkdir(parents=True, exist_ok=True)
            log_file = f'log/{website_name}_{timestamp}.csv'
            log_df.to_csv(log_file, index=False)

        if scraped_df.empty:
            print(f"No data was scraped from the first {max_pages} links of {item}.\n")
            continue

        Path('data').mkdir(parents=True, exist_ok=True)
        output_file = f'data/{website_name}_{timestamp}.csv'
        scraped_df.to_csv(output_file, index=False)

        print(f"Finished scraping. Data stored in {output_file}\n")
        
        

In [42]:
# Run the pipeline: read sitemap.csv, scrape each listed sitemap and write the CSV files.
main()

Starting scraping:

Working on https://cohere.com/sitemap.xml ...
30 webpages found for scraping. For demo, scraping only the first 10 webpages.

https://cohere.com/responsibility
https://cohere.com/deployment-options/oracle
https://cohere.com/news
Finished scraping. Data stored in data/cohere.com_2023-10-04.csv

Working on https://ai21.com/sitemap.xml ...
91 webpages found for scraping. For demo, scraping only the first 10 webpages.

https://www.ai21.com/
https://www.ai21.com/about
https://www.ai21.com/ai-co-writing
Finished scraping. Data stored in data/ai21.com_2023-10-04.csv

Working on https://descript.com/sitemap.xml ...
187 webpages found for scraping. For demo, scraping only the first 10 webpages.

https://www.descript.com/about
https://www.descript.com/affiliate
https://www.descript.com/affiliate-terms
Finished scraping. Data stored in data/descript.com_2023-10-04.csv

Working on https://weaviate.io/sitemap.xml ...
335 webpages found for scraping. For demo, scraping only the f



63 webpages found for scraping. For demo, scraping only the first 10 webpages.

https://www.anthropic.com/amazon-bedrock
https://www.anthropic.com/claude-in-slack
https://www.anthropic.com/claude-in-slack/support
Finished scraping. Data stored in data/anthropic.com_2023-10-04.csv

Working on https://inflection.ai/sitemap.xml ...
15 webpages found for scraping. For demo, scraping only the first 10 webpages.

https://inflection.ai
https://inflection.ai/about
https://inflection.ai/safety
Finished scraping. Data stored in data/inflection.ai_2023-10-04.csv

Working on https://h2o.ai/sitemap.xml ) ...
983 webpages found for scraping. For demo, scraping only the first 10 webpages.

https://h2o.ai/
https://h2o.ai/br/
https://h2o.ai/br/agradecemos-seu-contato/
Finished scraping. Data stored in data/h2o.ai_2023-10-04.csv

Working on https://harver.com/sitemap_index.xml  ...




1004 webpages found for scraping. For demo, scraping only the first 10 webpages.

https://harver.com/blog/
https://harver.com/blog/hr-tech-5-skills-modern-day-hr-professionals-need/
https://harver.com/blog/hr-tech-5-recruitment-hacks/
Finished scraping. Data stored in data/harver.com_2023-10-04.csv

Working on https://dataminr.com/sitemap.xml ...
451 webpages found for scraping. For demo, scraping only the first 10 webpages.

https://www.dataminr.com/blog/dataminr-celebrates-pride-month
https://www.dataminr.com/resources/storms-sweep-the-southeast
https://www.dataminr.com/press/embracing-broad-data-sets-protecting-brand-reputation-at-the-world-cup
Finished scraping. Data stored in data/dataminr.com_2023-10-04.csv

Working on https://shield.ai/sitemap_index.xml ...




285 webpages found for scraping. For demo, scraping only the first 10 webpages.

https://shield.ai/blog/
https://shield.ai/america-must-accelerate-scale-and-innovate-win-future-wars/
https://shield.ai/on-overcoming-obstacles/
Finished scraping. Data stored in data/shield.ai_2023-10-04.csv

Working on https://kymeratx.com/sitemap.xml ...




11 webpages found for scraping. For demo, scraping only the first 10 webpages.

https://www.kymeratx.com/
https://www.kymeratx.com/terms/
https://www.kymeratx.com/privacy-policy/
Finished scraping. Data stored in data/kymeratx.com_2023-10-04.csv

Working on https://arvinas.com/sitemap.xml ...




191 webpages found for scraping. For demo, scraping only the first 10 webpages.

https://www.arvinas.com/
https://www.arvinas.com/terms-of-use/
https://www.arvinas.com/patients/
Finished scraping. Data stored in data/arvinas.com_2023-10-04.csv

Working on https://ardelyx.com/sitemap.xml ...




13 webpages found for scraping. For demo, scraping only the first 10 webpages.

https://ardelyx.com/
https://ardelyx.com/our-pipeline/
https://ardelyx.com/products/
Finished scraping. Data stored in data/ardelyx.com_2023-10-04.csv

Working on https://monterosatx.com/sitemap.xml ...




21 webpages found for scraping. For demo, scraping only the first 10 webpages.

https://www.monterosatx.com/news/monte-rosa-therapeutics-strengthens-leadership-with-key-executive-and-board-appointments/
https://www.monterosatx.com/news/monte-rosa-therapeutics-to-participate-in-upcoming-investor-conferences/
https://www.monterosatx.com/news/monte-rosa-therapeutics-expands-senior-management-team/
Finished scraping. Data stored in data/monterosatx.com_2023-10-04.csv

Working on https://trianabio.com/sitemap.xml ...
31 webpages found for scraping. For demo, scraping only the first 10 webpages.

https://trianabio.com/press-release-4-6-2022
https://trianabio.com/job-posting-senior-scientist-lead-discovery
https://images.squarespace-cdn.com/content/v1/61d5e7afb1ff8b1bd358de61/5855e10d-77f0-42cc-aa0f-5d4792c9244a/TRIANA_ID_Logomark_72.png
Finished scraping. Data stored in data/trianabio.com_2023-10-04.csv

Working on https://tangotx.com/sitemap.xml ...




108 webpages found for scraping. For demo, scraping only the first 10 webpages.

https://www.tangotx.com/
https://www.tangotx.com/science/our-approach/
https://www.tangotx.com/pipeline/
Finished scraping. Data stored in data/tangotx.com_2023-10-04.csv

Working on https://vertex.com/sitemap.xml ...




43 webpages found for scraping. For demo, scraping only the first 10 webpages.

https://www.vertex.com/our-blog-news-updates-trends-vertex-software/
https://www.vertex.com/american-companies-saving-10m-annually-process-automation/
https://www.vertex.com/hr-management-in-the-age-of-technology/
Finished scraping. Data stored in data/vertex.com_2023-10-04.csv

Working on https://vervetx.com/sitemap.xml ...
28 webpages found for scraping. For demo, scraping only the first 10 webpages.

https://www.vervetx.com/about-us/our-story
https://www.vervetx.com/about-us/why-cardiovascular-disease
https://www.vervetx.com/careers
Finished scraping. Data stored in data/vervetx.com_2023-10-04.csv

Working on https://novonordisk.com/sitemap.xml ...
319 webpages found for scraping. For demo, scraping only the first 10 webpages.

https://www.novonordisk.com/
https://www.novonordisk.com/disease-areas.html
https://www.novonordisk.com/disease-areas/type-1-diabetes.html
Finished scraping. Data stored in data/n



67 webpages found for scraping. For demo, scraping only the first 10 webpages.

https://relaytx.com/
https://relaytx.com/terms-of-use/
https://relaytx.com/privacy-policy/
Finished scraping. Data stored in data/relaytx.com_2023-10-04.csv

Working on https://neumoratx.com/sitemap.xml ...




81 webpages found for scraping. For demo, scraping only the first 10 webpages.

https://neumoratx.com/?page_id=30
https://neumoratx.com/news/amgen-and-neumora-therapeutics-announce-strategic-rd-collaboration-to-accelerate-novel-precision-therapies-for-brain-diseases/
https://neumoratx.com/news/neumora-therapeutics-launches-to-pioneer-a-new-era-of-precision-medicines-for-brain-diseases/
Finished scraping. Data stored in data/neumoratx.com_2023-10-04.csv

Working on https://verily.com/sitemap.xml ...
210 webpages found for scraping. For demo, scraping only the first 10 webpages.

https://verily.com/about-us
https://verily.com/about-us/careers
https://verily.com/about-us/careers/open-roles
Finished scraping. Data stored in data/verily.com_2023-10-04.csv

Working on https://kojintx.com/sitemap.xml ...




7 webpages found for scraping. For demo, scraping only the first 10 webpages.

https://kojintx.com/
https://kojintx.com/sample-page/
https://kojintx.com/privacy-policy/
Finished scraping. Data stored in data/kojintx.com_2023-10-04.csv

Working on https://bland.ai/sitemap.xml ...
12 webpages found for scraping. For demo, scraping only the first 10 webpages.

http://www.bland.ai/
http://www.bland.ai/blog
https://www.bland.ai/blog/enterprise-ai-phone-call-use-cases
Finished scraping. Data stored in data/bland.ai_2023-10-04.csv

