### Test URLS
https://sunstonepartners.com \
https://www.appliedlearning.com \
https://www.forsalebyowner.com login \
https://ecmins.com/ \
http://www.iconnect-corp.com \
https://cessco.ca/ ROBOT \
http://www.ticss.net \
https://www.tyremarket.com/Car-Tyres \
https://www.dentalxchange.com/ \

**TODOList**:

-script defers to selenium if bs4 does nav_scrape but nav still empty (ex: https://www.forsalebyowner.com)

-File output with all columns

-Enhance href relevance function (both contain base_url)

-Four columns of information for each website

-Improve speed of sel nav_tree recursion

-retry if page_result is empty after page scrape

-page scrape for bs4 (page scrape working for sel)

-assess whitespace split to help headers

-requests 200 requirement for first href selection

-account for more options in 'assess' functions

-add website_url parameter into sel_nav_scrape for consistency

-FIX nav scrape for sel (nav scrape working for bs4). Specifically, first_href - not critical because very slow.

> Figure out alternative when no nav (all a tags' hrefs or first relevant href?) (ex: https://www.forsalebyowner.com) \
> Make sure that first_href returned is a URL in both bs4_nav_scrape() and sel_nav_scrape() \

-first text on page (home/about)

-mistral given as many pages as possible (via nav) -> really slow (~15 mins) save until next step

-utilize irrelevant in asses_href


### All imports

In [2]:
import requests
import pandas as pd
import validators
import time
from openai import OpenAI
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium_stealth import stealth
from selenium import webdriver
import re
import urllib.parse as up
from bs4 import BeautifulSoup

### Important meta tags

In [None]:
def get_meta_tags(url): #: str) -> dict[str,str]:

    api_key = 'pk_d22cfd76694dc6566ccbf7ed6f50cdc66215091c'    

#     url = 'https://cessco.ca/'
    # url = 'https://www.appliedlearning.com'

    params = {'url': url, 'api_key': api_key}
    
    # Free JsonLink limit is 30 req/minute so wait 3 seconds just in case
    time.sleep(3)
    
    response = requests.get('https://jsonlink.io/api/extract', params=params)

    if response.status_code == 200:
        data = response.json()
#         print(data)
        print('Title: ', data['title'], '\nDescription: ', data['description'], '\nDomain: ', data['domain'])
        return {k: data.get(k,None) for k in ('title', 'description', 'domain')}
    else:
        print(f'JSONLink Error for {url}: {response.status_code} - {response.text}')
        return {}

In [None]:
get_meta_tags('https://www.dentalxchange.com/')

### Handle navigation

#### Get all a tags (get best nav)
1. test if href valid url \
2. test if url + (optional /) + href valid url \
3. likely a bust

####  Selenium

In [60]:
def sel_normalize_whitespace(text):
    # Replace one or more whitespace characters (including spaces, tabs, and newlines) with a single space
    return re.sub(r'\s+', ' ', text).strip()

def sel_assess_href(base_url: str, href: str) -> str:
    if not validators.url(href):
        href = up.urljoin(base_url,href)
    # Add functionality here to compare if one is contained in the other
    return [href, 'relevant' if up.urlparse(href).netloc == up.urlparse(base_url).netloc else 'irrelevant']

def sel_find_relevant_hrefs(driver):
    atags = driver.find_elements("xpath","//a")
    relevant_hrefs = []
    for a in atags:
        text, href = sel_normalize_whitespace(a.get_attribute('textContent')),a.get_attribute('href')
        website_url = driver.current_url
        if sel_assess_href(website_url, href)[1] == 'relevant' and (href != website_url and text != 'Skip to content'):
            relevant_hrefs += [(text, href)]
    return relevant_hrefs

def sel_find_first_href(nested_list) -> str:
    for item in nested_list:    
        if isinstance(item, list):
                    # Recursively search within the list
                    result = sel_find_first_href(item)
                    if result:  # If a valid URL is found in the recursion, return it
                        return result
        elif isinstance(item, tuple) and len(item) == 2:
            # If the item is a tuple with 2 elements, check the second element for a valid URL
            if validators.url(item[1]):
                return item[1]  # Return the URL if it's valid
    return 'No href found'

def sel_build_tree(base_url,element):
    # Initialize the node with tag name and text content
    node_contents = {'text': sel_normalize_whitespace(element.get_attribute('textContent')),
                     'href': sel_assess_href(base_url,element.get_attribute('href'))[0]} if element.tag_name == 'a' else {}
    node = {
        **node_contents,
        'children': []
    }

    # Recursively build the tree for each child element
    children = element.find_elements(By.XPATH, "./*")  # Only direct children
    for child in children:
        node['children'].append(sel_build_tree(base_url,child))

    if not node['children'] or all(not obj for obj in node['children']):
        del node['children']

    return node

def sel_convert_tree(root) -> list[list,int]:
    ans, total_hrefs = [], 0
    if 'children' not in root:
        if 'text' in root and 'href' in root:
            return (root['text'], root['href'], 1)
    else:
        for child in root['children']:
            links = sel_convert_tree(child)
            if links:
                total_hrefs += links[-1]
                ans += [links[:-1]]

    return [*list(filter(None, ans)), total_hrefs]

# TODO: Fix this - first_href must be a URL (even if no nav, find first relevant href but need to broaden def of relevant)
# Returns the nav tree (either advanced nested or basic list of hrefs) and first href
def sel_nav_scrape(driver) -> list[list[tuple],str]:
    sel_nav_return, nav_trees = [], []

    # Find navs, construct trees and find max
    navs = driver.find_elements("xpath","//nav")
    for nav in navs:
        nav_trees.append(sel_build_tree(driver.current_url, nav))
    max_nav = ({}, 0)
    for tree in nav_trees:
        converted = sel_convert_tree(tree)
        if converted[-1] > max_nav[-1]:
            max_nav = converted
    # [:-1] to account for nested tree
    max_nav_tree = max_nav[:-1]

    # Construct return
    if not max_nav[0]:
    # if max_nav == ({}, 0): #or len(max_nav[-1]) < x:
        # If no/not enough navs, find all relevant atags
        relevant_hrefs = sel_find_relevant_hrefs(driver)  
        sel_nav_return.append(relevant_hrefs)
        first_href = relevant_hrefs[0][1] if relevant_hrefs else ''
    else:
        sel_nav_return.append(max_nav_tree)
        first_href = sel_find_first_href(max_nav)

    sel_nav_return.append(first_href)
    return sel_nav_return

driver = webdriver.Chrome()
driver.get('https://forsalebyowner.com/')
sel_nav = sel_nav_scrape(driver)
driver.close()
sel_nav

[[('Sell Your Home', 'https://www.forsalebyowner.com/sellyourhome/package'),
  ("What's My Home Worth",
   'https://www.forsalebyowner.com/what-is-my-home-worth'),
  ("Seller's Guide",
   'https://www.forsalebyowner.com/seller-guide/should-you-sell-your-house'),
  ('Closing Assistance',
   'https://www.forsalebyowner.com/sellyourhome/closing-assistance'),
  ('Search For Homes', 'https://www.forsalebyowner.com/homes-for-sale'),
  ('Learning Center', 'https://www.forsalebyowner.com/blog'),
  ('Frequently Asked Questions',
   'https://www.forsalebyowner.com/support/faq/forsalebyowner-basics'),
  ('USA Property Directory', 'https://www.forsalebyowner.com/property'),
  ('Sign In', None),
  ('Create My Listing', 'https://www.forsalebyowner.com/sellyourhome/package'),
  ('Whitney, TX130Active Listings1New Listing',
   'https://www.forsalebyowner.com/search/list/whitney-texas/fsbo-source/for_sale-status'),
  ('Granbury, TX68Active Listings0New Listings',
   'https://www.forsalebyowner.com/sear

#### bs4

In [65]:
def bs4_build_tree(base_url, element):
    # Initialize the node with tag name and text content
    node_text = {'text': element.get_text(strip=True)} if element.name == 'a' else {}
    node = {
        **node_text,
        'children': []
    }

    # If it's an <a> tag, include the href attribute
    if element.name == 'a':
        node['href'] = bs4_assess_href(base_url,element.get('href'))[0]

    # Recursively build the tree for each child element
    for child in element.find_all(recursive=False):  # Only direct children
        node['children'].append(bs4_build_tree(base_url, child))
        
    if not node['children'] or all(not obj for obj in node['children']):
        del node['children']

    return node

def bs4_convert_tree(root):
    ans, total_hrefs = [], 0
    if 'children' not in root:
        if 'text' in root and 'href' in root:
            return (root['text'],'' if root['href'] == 'javascript:void(0);' else root['href'],1)
            # Aesthetic output
            # return (f"root['text']}-> {root['href']}",1)
        # else:
        #     return ['','',0]
    else:
        for child in root['children']:
            # print(child)
            links = bs4_convert_tree(child)
            if links:
                # print(links)
                total_hrefs += links[-1]
                ans += [links[:-1]]
            
    return [*list(filter(None,ans)),total_hrefs]

# Need to handle javascript:void(0); case
def bs4_assess_href(base_url, href) -> str:
    if not validators.url(href):
        href = up.urljoin(base_url,href)
    return [href, 'relevant' if up.urlparse(href).netloc == up.urlparse(base_url).netloc else 'irrelevant']

def bs4_find_relevant_hrefs(soup, website_url: str) -> list[tuple[str, str]]:
    atags = soup.find_all('a')
    relevant_hrefs = []
    for a in atags:
        text, href = a.get_text(strip=True),a.get('href')
        if bs4_assess_href(website_url, href)[1] == 'relevant' and (href != website_url and text != 'Skip to content'):
            relevant_hrefs += [(text, href)]
    return relevant_hrefs

def bs4_find_first_href(nested_list) -> str:
    for item in nested_list:    
        if isinstance(item, list):
                    # Recursively search within the list
                    result = bs4_find_first_href(item)
                    if result:  # If a valid URL is found in the recursion, return it
                        return result
        elif isinstance(item, tuple) and len(item) == 2:
            if validators.url(item[1]):
                return item[1]  # Return the URL if it's valid
    return 'No href found'

def bs4_nav_scrape(website_url: str, soup) -> list[tuple[str,str],str]:
    bs4_nav_return, nav_trees = [], []

    # Find navs, construct trees, find max
    navs = soup.find_all('nav')
    for nav in navs:
        nav_trees.append(bs4_build_tree(website_url, nav))
    max_nav = ({},0)
    for tree in nav_trees:
        converted = bs4_convert_tree(tree)
        if converted[-1] > max_nav[-1]:
            max_nav = converted
    bs4_max_nav_tree = max_nav[:-1]

    # Construct return
    if not max_nav[0]: #or len(max_nav[-1]) < x:
        # If no/not enough navs, find all relevant atags
        relevant_hrefs = bs4_find_relevant_hrefs(soup, website_url)  
        bs4_nav_return.append(relevant_hrefs)
        first_href = relevant_hrefs[0][1] if relevant_hrefs else ''
    else:
        bs4_nav_return.append(bs4_max_nav_tree)
        first_href = bs4_find_first_href(max_nav)

    bs4_nav_return.append(first_href)
    return bs4_nav_return

# url = 'https://www.dentalxchange.com/'
# url = 'https://ecmins.com/'
# url = 'https://iquartic.com/' # blocked on requests
# url = 'https://www.ripoffreportremovalhelp.com/' # blocked on requests
# url = 'https://pulseca.com/'
url_test = 'https://www.scorpion.co/'

html = requests.get(url_test).content
soupt = BeautifulSoup(html, 'html.parser')

url_test = 'https://www.cessco.ca/'
response = requests.get(url_test)
soupy = BeautifulSoup(response.text, 'html.parser')

# response = requests.get(url_test).content
# soupr = BeautifulSoup(response, 'html.parser')
bs4_nav_scrape(url_test, soupy)
# soupy.find_all('a')
# print(soupr.find('h2'))


200


[[('Scroll To Top', '#')], '#']

**Report:** it seems as though the javascript:void(0); case is handled because validators.url thinks it's valid, but the netloc's are not the same, so it's labelled irrelevant. \
**TODO:** Need to figure out nav name (the text only in the nav element, not in the contained a's, create tree-like structure. 

### Scraping Methods

In [43]:
def word_count(seg):
    count = 0
    for i in seg:
        if i == ' ':
            count += 1
    return count+1

#### bs4

In [12]:
def bs4_pages_scrape(urls: list[str]) -> list[dict]:
    pages = []
    for url in urls:
        if url:
            response = requests.get(url).text
            soup = BeautifulSoup(response, 'html.parser')
            # Split on any whitespace (\n and \t) -> maybe this is causing weird headers
            page_text = soup.get_text("|",strip=True).split("|")
            # Extract the first two pieces of text with more than (7) words -> to be tested
            first_relevant = {'first_relevant': [i for i in page_text if word_count(i) > 7][:2]}
            # Two longest pieces of text on the page. Test if this produces relevant results
            two_longest = {'two_longest': sorted(page_text,key=len)[-2:]}
            # Find all h1s and h2s
            h1s = soup.find_all('h1')
            h2s = soup.find_all('h2')
            h1_texts = [h1.get_text(strip=True) for h1 in h1s]
            h2_texts = [h2.get_text(strip=True) for h2 in h2s]
            headers = {'headers': list(filter(None,h1_texts+h2_texts))}
            pages.append({**first_relevant, **two_longest,**headers})
        else:
            pages.append('Could not access nav')

    return pages

# Takes response_text instead of a URL since the request is required to determine bs4/sel
def bs4_scrape(website_url, response_text):
    soup = BeautifulSoup(response_text,'html.parser')
    url_results = {}

    #Scrape nav
    # print(website_url,soup)
    nav_list = bs4_nav_scrape(website_url, soup)
    url_results['nav'] = nav_list if nav_list[0] else 'Could not find nav'
    print(f'{up.urlparse(website_url).netloc} bs4 naver',nav_list)

    # Scrape home page and if there, first page
    urls = [website_url]
    if nav_list[1]:
        urls.append(nav_list[1])
    print('asdf',[website_url].append(nav_list[1]) if nav_list[1] else [website_url])
    pages = bs4_pages_scrape(urls)
    home_page_obj = pages[0]
    first_page_obj = pages[1] if nav_list[1] else 'Could not find nav'
    url_results['home_page'], url_results['first_page'] = home_page_obj, first_page_obj

    return url_results

urler = 'https://www.forsalebyowner.com/'
response = requests.get(urler)
print(response.url)
souper = BeautifulSoup(response.content, 'html.parser')
bs4_scrape(urler, response.text)

https://www.forsalebyowner.com/
www.forsalebyowner.com bs4 naver [[], '']
asdf ['https://www.forsalebyowner.com/']


{'nav': 'Could not find nav',
 'home_page': {'first_relevant': ['FSBO Real Estate Listings: Buy or Sell a House '],
  'two_longest': [' ForSaleByOwner',
   'FSBO Real Estate Listings: Buy or Sell a House '],
  'headers': []},
 'first_page': 'Could not find nav'}

#### Selenium

In [42]:
# Gathers first two relevant chunks of texts, two longest chunks of text and all h1s and h2s from every url in list then closes stealth driver fed in
def sel_pages_scrape(driver, urls: list[str]) -> dict:
    pages = []
    for url in urls:
        if url:
            driver.get(url)    
            time.sleep(2)
            page_text = driver.find_element("xpath","/html/body").text
            # Split on any whitespace (\n and \t)
            page_array = re.split(r'[\n\t]+',page_text)
            # Extract the first two pieces of text with more than (7) words -> to be tested
            first_relevant = {'first_relevant': [i for i in page_array if word_count(i) > 7][:2]}
            # Two longest pieces of text on the page. Test if this produces relevant results
            two_longest = {'two_longest': sorted(page_array,key=len)[-2:]}
            h1s = driver.find_elements("xpath","//h1")
            h2s = driver.find_elements("xpath","//h2")
            h1_texts = [h1.text for h1 in h1s if h1]
            h2_texts = [h2.text for h2 in h2s if h2]
            headers = {'headers':list(filter(None,h1_texts+h2_texts))}
            pages.append({**first_relevant, **two_longest,**headers})
        else:
            pages.append({'first_page':'Page not available'})
    driver.close()
    return pages

def sel_scrape(url: str) -> dict[str,str]:
    url_results = {}

    #Scrape nav
    driver = webdriver.Chrome()
    driver.get(url)
    nav_list = sel_nav_scrape(driver)
    driver.close()
    url_results['nav'] = nav_list
    print(f'{up.urlparse(url).netloc} sel naver', nav_list)

    # Configure driver to be passed throughout
    options = webdriver.ChromeOptions()
    options.add_argument("--start-maximized")
    stealth_driver = webdriver.Chrome(options=options)
    stealth(stealth_driver,
            languages=["en-US", "en"],
            vendor="Google Inc.",
            platform="Win32",
            webgl_vendor="Intel Inc.",
            renderer="Intel Iris OpenGL Engine",
            fix_hairline=True,
            )
    # stealth_driver.get(url)
    
    # Scrape home and first pages (requires both of these to have urls).
    home_page_obj, first_page_obj = sel_pages_scrape(stealth_driver, [url, nav_list[1]])
    url_results['home_page'], url_results['first_page'] = home_page_obj, first_page_obj

    # stealth_driver.close()
    
    return url_results

# https://www.appliedlearning.com
# https://sunstonepartners.com
# https://ecmins.com
# https://www.dentalxchange.com/
# https://pulseca.com/
# https://cessco.ca/
# sel_scrape('https://www.dentalxchange.com/')
# options = Options()

driver = webdriver.Chrome()
sel_pages_scrape(driver,['https://www.forsalebyowner.com/','https://www.forsalebyowner.com/sellyourhome/package'])
# navs = driver.find_elements("xpath","//nav")
# driver.get('https://pulseca.com/')
# sel_nav1 = sel_nav_scrape(driver)
# driver.get('https://cessco.ca/')
# sel_nav2 = sel_nav_scrape(driver)
# driver.close()
# # navs
# print(sel_nav1,'BKEH',sel_nav2)
# # driver.get('https://cessco.ca/')
# # naver = sel_nav_scrape(driver)
# # driver.close()

# options = webdriver.ChromeOptions()
# # #     # Set any desired options for the Chrome driver here
# options.add_argument("--start-maximized")
# # # options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36")
# stealth_driver = webdriver.Chrome(options=options)
# stealth(stealth_driver,
#             languages=["en-US", "en"],
#             vendor="Google Inc.",
#             platform="Win32",
#             webgl_vendor="Intel Inc.",
#             renderer="Intel Iris OpenGL Engine",
#             fix_hairline=True,
#             )
# pgs = sel_pages_scrape(stealth_driver,['https://pulseca.com/','https://cessco.ca/','https://cessco.ca/pressure-vessel-fabrication/'])
# stealth_driver.close()
# pgs
# print(pgs,naver)
# stealth_driver.close()

# atages = driver.find_elements('xpath','//a')
# sel_nav = sel_nav_scrape(driver)
# driver.close()
# atages

# nav_driver = webdriver.Chrome()
# nav_driver.get('https://cessco.ca/')
# dd = sel_nav_scrape(nav_driver)
# nav_driver.close()
# dd

[{'first_relevant': ['Search our exclusive home inventory. Enter an address, neighborhood, or city',
   'From pricing your home to fielding offers, be in complete control of your home sale from your personalized dashboard.'],
  'two_longest': ['Taking steps, no matter how small, toward a sustainable lifestyle can feel incredibly gratifying – which is probably why...',
   'Selling a home yourself, without a big real estate agent’s network to help you, comes down to marketing your asset to bring in the best offers. You need to know your buyers and their needs.'],
  'headers': ['List Your Home With Confidence & Save',
   'Sell Your Home On Your Terms',
   'The Ultimate FSBO Seller Guide',
   'From pricing your home to fielding offers, be in complete control of your home sale from your personalized dashboard.',
   'Explore Top Marketplaces',
   'View Listings For Sale',
   'Take advantage of end to end support so you can spend more time on the things that matter.',
   'See Recently Sold & 

In [40]:
urlt = 'https://www.forsalebyowner.com/'
repsonse = requests.get(urlt)
# bs4_scrape(urlt,response.text)
# sel_scrape(urlt)

[{'first_relevant': ['FSBO Real Estate Listings: Buy or Sell a House '],
  'two_longest': [' ForSaleByOwner',
   'FSBO Real Estate Listings: Buy or Sell a House '],
  'headers': []},
 {'first_relevant': ['FSBO Real Estate Listings: Buy or Sell a House '],
  'two_longest': [' ForSaleByOwner',
   'FSBO Real Estate Listings: Buy or Sell a House '],
  'headers': []}]

### Mistral exploration
Get HTML from a couple pages, see what GPT 3.5 gives. Mistral has 32k context window.

In [None]:
def query_mistral(website_object: str):
    # Point to the local server
    client = OpenAI(base_url="http://localhost:1234/v1", api_key="not-needed")
    messages=[
            {"role": "system", "content": "As a master company assessment officer with 3000 years of experience, you have a task. Given an object with a data structure representing the navigation tree of a website and a handful of textual data like first relevant chunks of text and header values for the home page and first product page in the navigation tree, you have been instructed by the Prime Chancellor to tell me about the company and what your advice on investing in it would be. For example, information like the types of products that it sells and the sector that it is in."},
            # {"role": "system", "content": "{'nav': [[[[('Your Challenges', 'https://www.appliedlearning.com/your-challenges/'), [[('Overview', 'https://www.appliedlearning.com/your-challenges/')], [('Strategy Execution', 'https://www.appliedlearning.com/your-challenges/strategy-execution/'), [[('Overview', 'https://www.appliedlearning.com/your-challenges/strategy-execution/')], [('Achieve Strategy Alignment Within Executive Teams', 'https://www.appliedlearning.com/your-challenges/strategy-execution/achieve-strategy-alignment-within-executive-teams/')], [('Equip Managers to Turn Strategy into Action', 'https://www.appliedlearning.com/your-challenges/strategy-execution/equip-managers-to-turn-strategy-into-action/')], [('Gain Buy-in and Emotional Connection By Employees', 'https://www.appliedlearning.com/your-challenges/strategy-execution/gain-buy-in-and-emotional-connection-by-employees/')], [('Implement Strategy by Changing Key Processes', 'https://www.appliedlearning.com/your-challenges/strategy-execution/implement-strategy-by-changing-key-processes/')]]], [('Leadership Development', 'https://www.appliedlearning.com/your-challenges/leadership-development/')], [('Business Acumen', 'https://www.appliedlearning.com/your-challenges/business-acumen/')], [('Culture & Performance', 'https://www.appliedlearning.com/your-challenges/high-performance-culture/')], [('Operational Excellence', 'https://www.appliedlearning.com/your-challenges/operational-excellence/')]]], [('Our Approach', 'https://www.appliedlearning.com/authentic-employee-engagement/'), [[('Overview', 'https://www.appliedlearning.com/authentic-employee-engagement/')], [('Knowledge Maps', 'https://www.appliedlearning.com/authentic-employee-engagement/knowledge-maps/')], [('Visual Narratives', 'https://www.appliedlearning.com/authentic-employee-engagement/visual-narratives/')], [('Spark! Discussion Starters', 'https://www.appliedlearning.com/authentic-employee-engagement/discussion-starters/')], [('Now I See Workshops', 'https://www.appliedlearning.com/authentic-employee-engagement/now-i-see-workshops/')], [('eLearning', 'https://www.appliedlearning.com/authentic-employee-engagement/elearning/')]]], [('Clients', 'https://www.appliedlearning.com/clients/')], [('Insights', 'https://www.appliedlearning.com/insights/')], [('About', 'https://www.appliedlearning.com/about/')], [('Contact Us', 'https://www.appliedlearning.com/contact/')], [('', 'https://www.appliedlearning.com/#')]]], 'https://www.appliedlearning.com/your-challenges/'], 'home_page': {'first_relevant': ['Applied Learning has extensive experience addressing key business challenges. Our authentic employee engagement approach produces fresh solutions tailored to each clientâ€™s goals.', 'We believe that strategy execution requires leadership alignment as well as emotional buy-in at the employee level. We help align leadership teams and equip them to address the social side of strategy across your organization.'], 'two_longest': ['We provide expertise in knowing how to create authentic and engaging learning experiences that instill sound judgment at all levels of your organization. You can specify the program details or choose from proven business acumen frameworks that we offer.', 'We partner with you to develop custom solutions that deliver authentic employee engagement. Each expert-designed product described below is an effective stand-alone solution; when integrated together this blended solution will multiply and magnify results.'], 'headers': ['Applied Learning has extensive experience addressing key business challenges. Our authentic employee engagement approach produces fresh solutions tailored to each clientâ€™s goals.', 'Strategy Execution', 'Leadership Development', 'Business Acumen', 'Culture &\nPerformance', 'Operational Excellence', 'Authentic employee engagement is our people-centered approach to training and communication that integrates the needs of people, the possibilities of learning methodologies, and the business needs of our clients.']}, 'first_page': {'first_relevant': ['Applied Learning has extensive experience empowering organizations. Our authentic employee engagement approach will produce fresh solutions tailored to address your challenges and opportunities.', 'We believe that strategy execution requires leadership alignment as well as emotional buy-in at the employee level. We help align leadership teams and equip them to address the social side of strategy across your organization.'], 'two_longest': ['We have supported operational initiatives with hundreds of clients over the last 17 years. We offer expertise in creating tailored, experiential tools and programs that guide employees to personalize and act on operational initiatives.', 'We provide expertise in knowing how to create authentic and engaging learning experiences that instill sound judgment at all levels of your organization. You can specify the program details or choose from proven business acumen frameworks that we offer.'], 'headers': ['Applied Learning has extensive experience empowering organizations. Our authentic employee engagement approach will produce fresh solutions tailored to address your challenges and opportunities.', 'Strategy Execution', 'Leadership Development', 'Business Acumen', 'Culture &\nPerformance', 'Operational Excellence']}}"},
            # {"role": "user", "content": "['Accelerate your business and unlock new potential with the dental payments platform that connects your data, workflows, teams, and patients like never before.','Say goodbye to complexity. We address the full lifecycle of dental payments. From verifying eligibility to filing accurate claims and beyond, our powerful platform, intelligent data, and massive network ensure payers get exactly what they need so dentists get exactly what they’ve earned. Quickly, reliably, and easily.'],'two_longest': ['By integrating DentalXChange into your software, you can offer a more comprehensive product and give your users a seamless interface for submitting claims and tracking payments. And with our easy-to-use APIs, you’ve got quick access to trusted data and a vast payments network that can scale—with added value just around the corner.','You’re constantly seeking ways to improve your practice so you can focus on what you do best—dentistry. You need to get paid on time. And you want the payment process to be as smooth, seamless, and simple as possible. That’s where we come in. We help reduce your costs and stress—by streamlining your workflows and eliminating manual processes and paperwork. Submit claims correctly the first time. Know where you stand at any time. And create better experiences across the board for your patients.'"}
            {"role": "user", "content": "Here is the object containing information about the website: " + website_object + ". Please help me out and give me a well-informed output. Thank you."}
        ]
    while input != 'exit':

        completion = client.chat.completions.create(
        model="local-model", # this field is currently unused
        messages=messages,
        temperature=0.7,
        )

        new_message = {"role":"assistant","content":""}

        # type(completion) == <class 'openai.types.chat.chat_completion.ChatCompletion'>; type(chunk) == tuple
        for chunk in completion:
            print('chUNK:',chunk,type(chunk))
            if chunk[0] == 'choices':#.delta.content:
                new_message["content"] = chunk[1][0].message.content
                return new_message["content"]
                # TODO: Potentially continue the chat here but return repsonse for now. Also means change while loop to assess most recent message
                # messages.append(new_message)

        
        # print()
        # messages.append({"role":"user", "content": input("> ")})

objer = {'nav': [[[[('Your Challenges', 'https://www.appliedlearning.com/your-challenges/'), [[('Overview', 'https://www.appliedlearning.com/your-challenges/')], [('Strategy Execution', 'https://www.appliedlearning.com/your-challenges/strategy-execution/'), [[('Overview', 'https://www.appliedlearning.com/your-challenges/strategy-execution/')], [('Achieve Strategy Alignment Within Executive Teams', 'https://www.appliedlearning.com/your-challenges/strategy-execution/achieve-strategy-alignment-within-executive-teams/')], [('Equip Managers to Turn Strategy into Action', 'https://www.appliedlearning.com/your-challenges/strategy-execution/equip-managers-to-turn-strategy-into-action/')], [('Gain Buy-in and Emotional Connection By Employees', 'https://www.appliedlearning.com/your-challenges/strategy-execution/gain-buy-in-and-emotional-connection-by-employees/')], [('Implement Strategy by Changing Key Processes', 'https://www.appliedlearning.com/your-challenges/strategy-execution/implement-strategy-by-changing-key-processes/')]]], [('Leadership Development', 'https://www.appliedlearning.com/your-challenges/leadership-development/')], [('Business Acumen', 'https://www.appliedlearning.com/your-challenges/business-acumen/')], [('Culture & Performance', 'https://www.appliedlearning.com/your-challenges/high-performance-culture/')], [('Operational Excellence', 'https://www.appliedlearning.com/your-challenges/operational-excellence/')]]], [('Our Approach', 'https://www.appliedlearning.com/authentic-employee-engagement/'), [[('Overview', 'https://www.appliedlearning.com/authentic-employee-engagement/')], [('Knowledge Maps', 'https://www.appliedlearning.com/authentic-employee-engagement/knowledge-maps/')], [('Visual Narratives', 'https://www.appliedlearning.com/authentic-employee-engagement/visual-narratives/')], [('Spark! Discussion Starters', 'https://www.appliedlearning.com/authentic-employee-engagement/discussion-starters/')], [('Now I See Workshops', 'https://www.appliedlearning.com/authentic-employee-engagement/now-i-see-workshops/')], [('eLearning', 'https://www.appliedlearning.com/authentic-employee-engagement/elearning/')]]], [('Clients', 'https://www.appliedlearning.com/clients/')], [('Insights', 'https://www.appliedlearning.com/insights/')], [('About', 'https://www.appliedlearning.com/about/')], [('Contact Us', 'https://www.appliedlearning.com/contact/')], [('', 'https://www.appliedlearning.com/#')]]], 'https://www.appliedlearning.com/your-challenges/'], 'home_page': {'first_relevant': ['Applied Learning has extensive experience addressing key business challenges. Our authentic employee engagement approach produces fresh solutions tailored to each clientâ€™s goals.', 'We believe that strategy execution requires leadership alignment as well as emotional buy-in at the employee level. We help align leadership teams and equip them to address the social side of strategy across your organization.'], 'two_longest': ['We provide expertise in knowing how to create authentic and engaging learning experiences that instill sound judgment at all levels of your organization. You can specify the program details or choose from proven business acumen frameworks that we offer.', 'We partner with you to develop custom solutions that deliver authentic employee engagement. Each expert-designed product described below is an effective stand-alone solution; when integrated together this blended solution will multiply and magnify results.'], 'headers': ['Applied Learning has extensive experience addressing key business challenges. Our authentic employee engagement approach produces fresh solutions tailored to each clientâ€™s goals.', 'Strategy Execution', 'Leadership Development', 'Business Acumen', 'Culture &\nPerformance', 'Operational Excellence', 'Authentic employee engagement is our people-centered approach to training and communication that integrates the needs of people, the possibilities of learning methodologies, and the business needs of our clients.']}, 'first_page': {'first_relevant': ['Applied Learning has extensive experience empowering organizations. Our authentic employee engagement approach will produce fresh solutions tailored to address your challenges and opportunities.', 'We believe that strategy execution requires leadership alignment as well as emotional buy-in at the employee level. We help align leadership teams and equip them to address the social side of strategy across your organization.'], 'two_longest': ['We have supported operational initiatives with hundreds of clients over the last 17 years. We offer expertise in creating tailored, experiential tools and programs that guide employees to personalize and act on operational initiatives.', 'We provide expertise in knowing how to create authentic and engaging learning experiences that instill sound judgment at all levels of your organization. You can specify the program details or choose from proven business acumen frameworks that we offer.'], 'headers': ['Applied Learning has extensive experience empowering organizations. Our authentic employee engagement approach will produce fresh solutions tailored to address your challenges and opportunities.', 'Strategy Execution', 'Leadership Development', 'Business Acumen', 'Culture &\nPerformance', 'Operational Excellence']}}
query_mistral(str(objer))


chUNK: ('id', 'chatcmpl-oln4hbaaaug0fdvq7eg37bl') <class 'tuple'>
chUNK: ('choices', [Choice(finish_reason='stop', index=0, message=ChatCompletionMessage(content='Based on the information provided, Applied Learning appears to be a company that specializes in addressing business challenges and providing authentic employee engagement solutions. Their approach focuses on strategy execution, leadership development, business acumen, culture and performance, and operational excellence. They also offer expertise in creating tailored, experiential tools and programs that guide employees to personalize and act on operational initiatives.\nGiven this information, I would advise investing in Applied Learning if you are looking for a company that can help your organization address key business challenges and empower your employees through authentic employee engagement solutions. Their expertise in strategy execution, leadership development, business acumen, culture and performance, and operational

'Based on the information provided, Applied Learning appears to be a company that specializes in addressing business challenges and providing authentic employee engagement solutions. Their approach focuses on strategy execution, leadership development, business acumen, culture and performance, and operational excellence. They also offer expertise in creating tailored, experiential tools and programs that guide employees to personalize and act on operational initiatives.\nGiven this information, I would advise investing in Applied Learning if you are looking for a company that can help your organization address key business challenges and empower your employees through authentic employee engagement solutions. Their expertise in strategy execution, leadership development, business acumen, culture and performance, and operational excellence suggests that they have a strong understanding of the factors that contribute to organizational success. Additionally, their ability to create tailore

#### Demonstration

In [9]:
urly = 'https://www.dentalxchange.com/'
response = requests.get(urly)
soupy = BeautifulSoup(response.text, 'html.parser')
soupy
# bs4_scrape(urly,response.text)
sel_scrape(urly)

atags [<selenium.webdriver.remote.webelement.WebElement (session="b9c41f14c9bb116069321cf5d110b3df", element="2AE9D3B91A92398A31CB0F578013975F_element_300")>, <selenium.webdriver.remote.webelement.WebElement (session="b9c41f14c9bb116069321cf5d110b3df", element="2AE9D3B91A92398A31CB0F578013975F_element_24")>, <selenium.webdriver.remote.webelement.WebElement (session="b9c41f14c9bb116069321cf5d110b3df", element="2AE9D3B91A92398A31CB0F578013975F_element_26")>, <selenium.webdriver.remote.webelement.WebElement (session="b9c41f14c9bb116069321cf5d110b3df", element="2AE9D3B91A92398A31CB0F578013975F_element_301")>, <selenium.webdriver.remote.webelement.WebElement (session="b9c41f14c9bb116069321cf5d110b3df", element="2AE9D3B91A92398A31CB0F578013975F_element_75")>, <selenium.webdriver.remote.webelement.WebElement (session="b9c41f14c9bb116069321cf5d110b3df", element="2AE9D3B91A92398A31CB0F578013975F_element_87")>, <selenium.webdriver.remote.webelement.WebElement (session="b9c41f14c9bb116069321cf5d1

{'nav': [[[[[[('Independent Practices',
        'https://www.dentalxchange.com/solutions/for-providers')],
      [[('Credentialing',
         'https://www.dentalxchange.com/product/credentialconnect')],
       [('Eligibility',
         'https://www.dentalxchange.com/product/eligibilityconnect')],
       [('Eligibility AI',
         'https://www.dentalxchange.com/product/eligibility-ai')],
       [('Claims', 'https://www.dentalxchange.com/product/claimconnect')],
       [('Attachments',
         'https://www.dentalxchange.com/product/attachmentconnect')],
       [('Claim Status',
         'https://www.dentalxchange.com/product/statusconnect')],
       [('ERA', 'https://www.dentalxchange.com/product/eraconnect')],
       [('Merchant Services',
         'https://www.dentalxchange.com/product/payconnect')],
       [('Patient Statements',
         'https://www.dentalxchange.com/product/billconnect')],
       [('Dental Billing Services',
         'https://www.dentalxchange.com/product/dental

### Loop through excel

For each url imported from the excel:
0. Construct an output object
1. Get then add meta tags with JsonLink API
2. Try requests.get(url) and get output object with bs4 scraping methods
3. If response is not good, scrape with Selenium
4. Potentially add some color with local instance

In [15]:
def update_scrape_results(file_path: str, scrape_results: list[dict], index_range: slice):
    df = pd.read_csv(file_path, low_memory=False)
    if len(df) > len(scrape_results):
        scrape_results += ['']*(len(df)-len(scrape_results))
    if 'Website Scrape' in df:
        print(len(df),index_range,len(scrape_results))
        df['Website Scrape'] = scrape_results #[index_range] = scrape_results
    else:
        # print(len(df),len(scrape_results))
        df['Website Scrape'] = scrape_results
    df.to_csv(file_path, index=False)

def main():    
    start_time = time.time()
    index_range = slice(0,5)

    file_path = './Website_Redirects_230919.csv'
    df = pd.read_csv(file_path, low_memory=False)

    if 'Website' not in df.columns:
        print("The CSV file must have a 'Website' column containing the URLs.")
    else:
        backup_urls = df['Website'][index_range].tolist() 
        redirect_urls = df.get('Website Redirect', pd.Series(dtype=str)).tolist()[index_range]
        
        scrapes = []

        # Check if 'Website Redirect' column is already populated (with valid URL)
        for i, redirect_url in enumerate(redirect_urls):
            current_url, current_data = '', {}
            if redirect_url and validators.url(redirect_url):
                current_url = redirect_url
            elif backup_urls[i] and validators.url(backup_urls[i]):
                current_url = backup_urls[i]
            else:
                # Both the redirect_url and the backup_url are either nonexistent or invalid
                scrapes += [current_data]
                continue
            
            # Do scraping here

            # minimum layer is JsonLink
            # current_data = get_meta_tags(current_url)
            
            # Try request, if don't receive 200 status, use selenium
            # Either way, merge the result with current_data
            current_response = requests.get(current_url)
            if current_response.status_code//100 == 2:
                #BeautifulSoup scraping
                current_data = current_data | bs4_scrape(current_url, current_response.text)
            else:
                current_data = current_data | sel_scrape(current_url)

            # TODO: try mistral here
            # Requires server loaded and started in LM Studio
            # current_data["AI scrape"] = query_mistral(current_data)

            scrapes += [current_data]
            print(f'processed {current_url}')
        
        update_scrape_results(file_path, scrapes, index_range)
            
    
        # Asynchronous infrastructure

        # # Run the asynchronous function using asyncio.run()
        # loop = asyncio.get_event_loop()
        # final_urls = loop.run_until_complete(process_urls(valid_urls))

        # # Update 'Website Redirect' column in the CSV file with final URLs
        # update_scrape_results(file_path, valid_urls, final_urls, index_range)

        print(f"'Website Redirect' column updated in {time.time()-start_time} seconds.")

main()

sunstonepartners.com bs4 naver [[[[[('Why Us', 'https://sunstonepartners.com/why-us/')], [('Team', 'https://sunstonepartners.com/team/')], [('Portfolio', 'https://sunstonepartners.com/portfolio/')], [('News', 'https://sunstonepartners.com/news/')], [('Contact', 'https://sunstonepartners.com/contact/'), [[('LP Relations', 'https://sunstonepartners.com/lprelations/')]]]]]], 'https://sunstonepartners.com/why-us/']
asdf None
processed https://sunstonepartners.com
atags [<selenium.webdriver.remote.webelement.WebElement (session="6760c86f504b0aab78f59a25d9491887", element="073DE53AD37A4F60D15A450EB47740A6_element_338")>, <selenium.webdriver.remote.webelement.WebElement (session="6760c86f504b0aab78f59a25d9491887", element="073DE53AD37A4F60D15A450EB47740A6_element_339")>, <selenium.webdriver.remote.webelement.WebElement (session="6760c86f504b0aab78f59a25d9491887", element="073DE53AD37A4F60D15A450EB47740A6_element_30")>, <selenium.webdriver.remote.webelement.WebElement (session="6760c86f504b0aa