In [6]:
#!pip install google-cloud-storage

In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from datetime import datetime
import requests
from pathlib import Path
from selenium import webdriver
from selenium.webdriver.chrome.options import ChromiumOptions
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [2]:
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
}

In [21]:
def set_chrome_options() -> ChromiumOptions:
    """Sets chrome options for Selenium.Chrome options for headless browser is enabled.
    Args: None
    
    returns:
        Chrome options that can work headless i.e. without actually launching the browser.
    """
    chrome_options = ChromiumOptions()
    chrome_options.add_argument("--headless=new")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_prefs = {}
    chrome_options.experimental_options["prefs"] = chrome_prefs
    chrome_prefs["profile.default_content_settings"] = {"images": 2}
    return chrome_options

In [22]:
def scrape_sitemap(url):
    """
    Extracts all the links from a company's sitemap.xml.
    
    Args:
    url (str): The URL for a company sitemap.xml.
    
    Returns:
    pd.Series: A pandas Series with all the links, or None if no links found.
    """
    try:
        with requests.get(url, headers=headers) as response:
            response.raise_for_status()  # Check if the request was successful
            soup = BeautifulSoup(response.text, 'lxml-xml')
            urls = [link.text.strip() for link in soup.find_all('loc') if link]
            
            if not urls:
                return None  # Return None if no URLs found
            
            extended_urls = []
            for link in urls:
                if link.endswith('xml'):
                    try:
                        with requests.get(link, headers=headers) as response:
                            response.raise_for_status()  # Check if the request was successful
                            nested_soup = BeautifulSoup(response.text, 'lxml')
                            nested_urls = [url.text.strip() for url in nested_soup.find_all('loc') if url]
                            extended_urls.extend(nested_urls)
                    except requests.RequestException as e:
                        print(f"Error occurred while processing {link}: {e}")
                else:
                    extended_urls.append(link)
            
            if not extended_urls:
                return None  # Return None if no extended URLs found
            
            return pd.Series(extended_urls).drop_duplicates().str.strip()
    
    except requests.RequestException as e:
        print(f"Error occurred: {e}")
        return None  # Return None if the initial request fails


In [112]:
def scrape_website(all_links, options):
    """
    Extracts all the text data from the webpages of a company.

    Args:
    all_links (pd.Series): A pandas Series with all the links in the company's website.
    options: Chrome options to apply to the browser

    Returns:
    pd.DataFrame: A pandas DataFrame with the columns 'key' (webpage link), 'text' (includes 
                  the text of the webpage), and 'timestamp' (when the data was scraped).
    
    pd.DataFrame : A pandas dataframe with columns 'key' (webpage link), 'error with timestamp' .
    """
    log_dict = {}
    text_dict = {}
    wait_condition = (By.TAG_NAME,['html','div','body'])

    for link in all_links.to_list():
        try:
            # First, scrape the page using requests
            with requests.get(link, headers=headers) as response:
                if response.status_code == 200:
                    soup = BeautifulSoup(response.text, 'lxml')
                    
                    [tag.decompose() for tag in soup.find_all(['header', 'nav', 'footer'])]
                    text_only_requests = soup.get_text(separator=' ', strip=True)
                    print(link)
                
                # If content seems too short or response code is not 200, use Selenium
                if response.status_code != 200 or len(text_only_requests.split()) < 50:
                    print("using selenium to scrape..\n")
                    try:
                        browser = webdriver.Chrome(options)
                        browser.get(link)
                        wait = WebDriverWait(browser, timeout=30)

                        wait.until(lambda d: browser.find_element(By.TAG_NAME, 'html').is_displayed() \
                                   if browser.find_element(By.TAG_NAME, 'html') else True &\
                                   
                                  np.all(np.array([i.is_displayed() \
                                                       for i in browser.find_elements(By.TAG_NAME, 'div')])) \
                                   
                                   if browser.find_elements(By.TAG_NAME, 'div') else True)
                                   
                               
                                  
                        soup_selenium = BeautifulSoup(browser.page_source, 'lxml')

                        [tag.decompose() for tag in soup_selenium.find_all(['header', 'nav', 'footer'])]
                        text_only_selenium = soup_selenium.get_text(separator=' ', strip=True).lower()

                        text_dict[link] = text_only_selenium
                        
                        if len(text_only_selenium.lower().split()) < 20:
                            log_dict[link] = str(pd.to_datetime(datetime.today().date())) + \
                            " "+text_only_selenium
                            
                    except Exception as e:
                        print(f"Error occurred while processing {link} in selenium: {e.with_traceback}")
                        log_dict[link] = f'{pd.to_datetime(datetime.today().date())}  {e.with_traceback}'
                       
                    finally:
                        browser.close()
                                
                else:
                    text_dict[link] = text_only_requests.lower()
        
        except requests.RequestException as e:
            print(f"Error occurred while processing {link}: {e}")
            log_dict[link] = f'pd.to_datetime(datetime.today().date())  {e.with_traceback}'
    if not log_dict:
        df_log = pd.DataFrame()
    else:
        
        df_log = pd.DataFrame(list(log_dict.items()), columns=['key', 'error'])
    
    if not text_dict:
        return pd.DataFrame(), df_log   # return empty DataFrame if no text is extracted

    df = pd.DataFrame(list(text_dict.items()), columns=['key', 'text'])    

    return df, df_log

In [109]:
def main():
    """
    Runs the scraping engine, extracts data from the specified sitemaps, and saves
    the extracted data to a CSV file.
    
    The CSV file contains the scraped data from the first 10 webpages of each sitemap.
    """
    p =Path('../../data')
    path = str(p)
    options = set_chrome_options()
    try:
        sitemap_df = pd.read_csv("sitemap.csv")
    except FileNotFoundError:
        print("Error: The file 'sitemap.csv' was not found.")
        return
    except pd.errors.EmptyDataError:
        print("Error: The file 'sitemap.csv' is empty.")
        return

    if 'sitemap' not in sitemap_df.columns:
        print("Error: The expected 'sitemap' column is missing in 'sitemap.csv'.")
        return

    print("Starting scraping:\n")
    for index, item in enumerate(sitemap_df['sitemap'], start=1):
        print(f"Working on {item} ...")
        links = scrape_sitemap(item)
        if links is None or links.empty:
            print(f"No links were found in {item}.\n")
            continue

        link_split = item.split('/')
        if link_split:
            website_name = link_split[2]
        print(f"{links.shape[0]} webpages found for scraping. For demo, scraping only the first 10 webpages.\n")
        
        scraped_df, log_df = scrape_website(links[:3], options)
        if scraped_df.empty:
            print(f"No data was scraped from the first 10 links of {item}.\n")
            continue
            
        split=str(datetime.now()).split()
        date = str(split[0])
        ms= str(split[1].split('.')[1])
        timestamp = date+"-"+ms
     
        output_file = path+'/'+website_name+'_'+timestamp+'.csv'
        print(output_file)
        scraped_df.to_csv(output_file, index=False)
        if not log_df.empty:
            log_file = f'log/{website_name}_{timestamp}.csv'
            log_df.to_csv(log_file, index=False)
        
        
        print(f"Finished scraping. Data stored in {output_file}\n")

In [103]:
main()

Starting scraping:

Working on https://bland.ai/sitemap.xml ...
12 webpages found for scraping. For demo, scraping only the first 10 webpages.

http://www.bland.ai/
http://www.bland.ai/blog
https://www.bland.ai/blog/enterprise-ai-phone-call-use-cases
../../data/bland.ai_2023-10-05-242917.csv
Finished scraping. Data stored in ../../data/bland.ai_2023-10-05-242917.csv

Working on https://cohere.com/sitemap.xml ...
30 webpages found for scraping. For demo, scraping only the first 10 webpages.

https://cohere.com/responsibility
https://cohere.com/deployment-options/oracle
https://cohere.com/news
../../data/cohere.com_2023-10-05-839812.csv
Finished scraping. Data stored in ../../data/cohere.com_2023-10-05-839812.csv

Working on https://ai21.com/sitemap.xml ...
91 webpages found for scraping. For demo, scraping only the first 10 webpages.

https://www.ai21.com/
https://www.ai21.com/about
https://www.ai21.com/ai-co-writing
../../data/ai21.com_2023-10-05-189942.csv
Finished scraping. Data stor



66 webpages found for scraping. For demo, scraping only the first 10 webpages.

https://www.anthropic.com/amazon-bedrock
https://www.anthropic.com/claude-in-slack
https://www.anthropic.com/claude-in-slack/support
../../data/anthropic.com_2023-10-05-373333.csv
Finished scraping. Data stored in ../../data/anthropic.com_2023-10-05-373333.csv

Working on https://inflection.ai/sitemap.xml ...
15 webpages found for scraping. For demo, scraping only the first 10 webpages.

https://inflection.ai
https://inflection.ai/about
https://inflection.ai/safety
../../data/inflection.ai_2023-10-05-219168.csv
Finished scraping. Data stored in ../../data/inflection.ai_2023-10-05-219168.csv

Working on https://h2o.ai/sitemap.xml ) ...
983 webpages found for scraping. For demo, scraping only the first 10 webpages.

https://h2o.ai/
https://h2o.ai/br/
https://h2o.ai/br/agradecemos-seu-contato/
../../data/h2o.ai_2023-10-05-725298.csv
Finished scraping. Data stored in ../../data/h2o.ai_2023-10-05-725298.csv

Wor



1004 webpages found for scraping. For demo, scraping only the first 10 webpages.

https://harver.com/blog/
https://harver.com/blog/hr-tech-5-skills-modern-day-hr-professionals-need/
https://harver.com/blog/hr-tech-5-recruitment-hacks/
../../data/harver.com_2023-10-05-178292.csv
Finished scraping. Data stored in ../../data/harver.com_2023-10-05-178292.csv

Working on https://dataminr.com/sitemap.xml ...
451 webpages found for scraping. For demo, scraping only the first 10 webpages.

https://www.dataminr.com/blog/dataminr-celebrates-pride-month
https://www.dataminr.com/resources/storms-sweep-the-southeast
https://www.dataminr.com/press/embracing-broad-data-sets-protecting-brand-reputation-at-the-world-cup
../../data/dataminr.com_2023-10-05-411142.csv
Finished scraping. Data stored in ../../data/dataminr.com_2023-10-05-411142.csv

Working on https://shield.ai/sitemap_index.xml ...




285 webpages found for scraping. For demo, scraping only the first 10 webpages.

https://shield.ai/blog/
https://shield.ai/america-must-accelerate-scale-and-innovate-win-future-wars/
https://shield.ai/on-overcoming-obstacles/
../../data/shield.ai_2023-10-05-657874.csv
Finished scraping. Data stored in ../../data/shield.ai_2023-10-05-657874.csv

Working on https://kymeratx.com/sitemap.xml ...




11 webpages found for scraping. For demo, scraping only the first 10 webpages.

https://www.kymeratx.com/
https://www.kymeratx.com/terms/
https://www.kymeratx.com/privacy-policy/
../../data/kymeratx.com_2023-10-05-299174.csv
Finished scraping. Data stored in ../../data/kymeratx.com_2023-10-05-299174.csv

Working on https://arvinas.com/sitemap.xml ...




191 webpages found for scraping. For demo, scraping only the first 10 webpages.

https://www.arvinas.com/
https://www.arvinas.com/terms-of-use/
https://www.arvinas.com/patients/
../../data/arvinas.com_2023-10-05-949037.csv
Finished scraping. Data stored in ../../data/arvinas.com_2023-10-05-949037.csv

Working on https://ardelyx.com/sitemap.xml ...




13 webpages found for scraping. For demo, scraping only the first 10 webpages.

https://ardelyx.com/
https://ardelyx.com/our-pipeline/
https://ardelyx.com/products/
../../data/ardelyx.com_2023-10-05-267887.csv
Finished scraping. Data stored in ../../data/ardelyx.com_2023-10-05-267887.csv

Working on https://monterosatx.com/sitemap.xml ...




21 webpages found for scraping. For demo, scraping only the first 10 webpages.

https://www.monterosatx.com/news/monte-rosa-therapeutics-strengthens-leadership-with-key-executive-and-board-appointments/
https://www.monterosatx.com/news/monte-rosa-therapeutics-to-participate-in-upcoming-investor-conferences/
https://www.monterosatx.com/news/monte-rosa-therapeutics-expands-senior-management-team/
../../data/monterosatx.com_2023-10-05-817657.csv
Finished scraping. Data stored in ../../data/monterosatx.com_2023-10-05-817657.csv

Working on https://trianabio.com/sitemap.xml ...
31 webpages found for scraping. For demo, scraping only the first 10 webpages.

https://trianabio.com/press-release-4-6-2022
https://trianabio.com/job-posting-senior-scientist-lead-discovery
https://images.squarespace-cdn.com/content/v1/61d5e7afb1ff8b1bd358de61/5855e10d-77f0-42cc-aa0f-5d4792c9244a/TRIANA_ID_Logomark_72.png
../../data/trianabio.com_2023-10-05-639980.csv
Finished scraping. Data stored in ../../data/tri



108 webpages found for scraping. For demo, scraping only the first 10 webpages.

https://www.tangotx.com/
https://www.tangotx.com/science/our-approach/
https://www.tangotx.com/pipeline/
../../data/tangotx.com_2023-10-05-413127.csv
Finished scraping. Data stored in ../../data/tangotx.com_2023-10-05-413127.csv

Working on https://vertex.com/sitemap.xml ...




43 webpages found for scraping. For demo, scraping only the first 10 webpages.

https://www.vertex.com/our-blog-news-updates-trends-vertex-software/
https://www.vertex.com/american-companies-saving-10m-annually-process-automation/
https://www.vertex.com/hr-management-in-the-age-of-technology/
../../data/vertex.com_2023-10-05-312111.csv
Finished scraping. Data stored in ../../data/vertex.com_2023-10-05-312111.csv

Working on https://vervetx.com/sitemap.xml ...
28 webpages found for scraping. For demo, scraping only the first 10 webpages.

https://www.vervetx.com/about-us/our-story
https://www.vervetx.com/about-us/why-cardiovascular-disease
https://www.vervetx.com/careers
../../data/vervetx.com_2023-10-05-853328.csv
Finished scraping. Data stored in ../../data/vervetx.com_2023-10-05-853328.csv

Working on https://novonordisk.com/sitemap.xml ...
318 webpages found for scraping. For demo, scraping only the first 10 webpages.

https://www.novonordisk.com/
https://www.novonordisk.com/disease



67 webpages found for scraping. For demo, scraping only the first 10 webpages.

https://relaytx.com/
https://relaytx.com/terms-of-use/
https://relaytx.com/privacy-policy/
../../data/relaytx.com_2023-10-05-520388.csv
Finished scraping. Data stored in ../../data/relaytx.com_2023-10-05-520388.csv

Working on https://neumoratx.com/sitemap.xml ...




KeyboardInterrupt: 

In [2]:
datetime.now()

datetime.datetime(2023, 10, 5, 18, 59, 24, 957549)

In [26]:
from datetime import datetime

# Get the timestamp string in the desired format
timestamp = datetime.now().strftime('%Y-%m-%dT%H-%M-%S')

website_name = 'www.chooch.com'

# Create the filename
filename = f"{website_name}_{timestamp}.csv"

print(filename)

www.chooch.com_2023-10-06T11-15-17.csv


In [4]:
timestamp

'2023-10-05T19-00-37'

In [7]:
import os

In [18]:
p = Path('../../data/')
if  os.path.exists(str(p)):
    print(p)

../../data


In [21]:
a =pd.DataFrame([1,2,3], columns=['key'])
a

Unnamed: 0,key
0,1
1,2
2,3


In [22]:
csv = a.to_csv(index=False)

In [23]:
csv

'key\n1\n2\n3\n'

In [24]:
os.mkdir('data')

In [25]:
path = str(Path('../../data'))

In [26]:
filename ="abc.com_111"

In [27]:
f"{path}/{filename}"

'../../data/abc.com_111'

In [28]:
f'data/{filename}'

'data/abc.com_111'

In [13]:
path = str(Path('data/'))
path

'data'

In [22]:
Path("./rag-detective-2ed9f2d52fde.json")

PosixPath('rag-detective-2ed9f2d52fde.json')

In [30]:
f"{path}/{filename}"

'data/abc.com_111'

In [7]:
from google.cloud import storage

In [19]:
import os

In [18]:
!export GOOGLE_APPLICATION_CREDENTIALS="./rag-detective-2ed9f2d52fde.json"

In [25]:
try:
    storage_client = storage.Client(project='rag-detective')

except Exception as e:
    print("Error")

Error


In [30]:
mydict = [{'a': pd.NA, 'b': 2, 'c': 3, 'd': 4},
          {'a': 100, 'b': 200, 'c': 300, 'd': 400},
          {'a': 1000, 'b': 2000, 'c': 3000, 'd': 4000 }]
df = pd.DataFrame(mydict)
df

Unnamed: 0,a,b,c,d
0,,2,3,4
1,100.0,200,300,400
2,1000.0,2000,3000,4000


In [31]:
df.iloc[0]

a    <NA>
b       2
c       3
d       4
Name: 0, dtype: object

In [34]:
df.dropna(inplace=True)

In [37]:
df.index

Index([1, 2], dtype='int64')