In [1]:
from dotenv import load_dotenv

# Load the .env file
load_dotenv()

True

In [317]:
import os
from autogen import AssistantAgent, UserProxyAgent, ConversableAgent
import tempfile
from autogen import ConversableAgent
from autogen.coding import LocalCommandLineCodeExecutor

llm_config = {"model": "gpt-3.5-turbo", "api_key": os.environ["OPENAI_API_KEY"]}
assistant = AssistantAgent("assistant", llm_config=llm_config)
conv_agent = ConversableAgent("conv_agent", llm_config=llm_config)



# Create a temporary directory to store the code files.
temp_dir = tempfile.TemporaryDirectory()

# Create a local command line code executor.
executor = LocalCommandLineCodeExecutor(
    timeout=10,  # Timeout for each code execution in seconds.
    work_dir=temp_dir.name,  # Use the temporary directory to store the code files.
)

# Create an agent with code executor configuration.
user_proxy = UserProxyAgent(
    "code_executor_agent",
    code_execution_config={"executor": executor}  # Use the local command line code executor.
)

## Defining functions for agents

### One Dry Run Manually

In [342]:
from googlesearch import search

def fetch_top_search_results(query: str, num_results: int = 3) -> list[str]:
    """
    Performs a Google Search Query of the given string and displays the top X results (by default 3).
    
    Parameters:
    query (str): The search text for the google search, e.g. "Best remote working companies".
    num_results (int): The number of results to be displayed, e.g. by default the top 3 links.
    
    Returns:
    list: A list of the top X (num_results) links that were found when perforing the Google Query.
    """
    search_results = search(query, num_results=num_results)
    results_list = []
    for items in search_results:
        results_list.append(items)
    return results_list

In [367]:
from bs4 import BeautifulSoup  # Import the BeautifulSoup library for parsing HTML
import requests  # Import the requests library for making HTTP requests

def extract_text_html_companies(url: str) -> str:
    """
    Extracts and cleans text content from a given URL.
    
    Parameters:
    url (str): The URL of the webpage from which to extract text.
    
    Returns:
    str: The cleaned text content from the webpage, or an error message if the URL could not be fetched.
    """
    try:
        # Set headers to mimic a browser visit, which can help avoid being blocked by the website
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        # Make an HTTP GET request to the specified URL with the given headers and a timeout of 30 seconds
        response = requests.get(url, headers=headers, timeout=30)
        
        # Raise an HTTPError if the HTTP request returned an unsuccessful status code
        response.raise_for_status()
        
        # Parse the HTML content of the response
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Extract and clean text content from the HTML
        # The separator '|' ensures spaces between the text of different tags
        text_content = soup.get_text(separator='|')
        
        # Remove extra spaces from the extracted text content
        clean_text = ' '.join(text_content.split())

        if len(clean_text)>25000:
            pattern = r'\|([^|]+)\|'
    
            # Use re.findall to extract all occurrences of the pattern
            tokens = re.findall(pattern, clean_text)

            clean_text = "|".join([word for word in tokens if len(word)<50])
            
        
        # Return the cleaned text content
        return clean_text
    
    except requests.exceptions.RequestException as e:
        # Return an error message if an exception occurs during the HTTP request
        return f"Error fetching URL: {e}"

### Fetch Results for top lists of companies for remote work

In [334]:
results = fetch_top_search_results("best remote working companies")

In [335]:
results

['https://www.indeed.com/career-advice/finding-a-job/best-remote-work-companies',
 'https://weworkremotely.com/top-remote-companies',
 'https://www.forbes.com/sites/laurabegleybloom/2024/01/25/work-from-home-or-anywhere-top-30-companies-for-remote-jobs-in-2024/']

### In Example extract HTML from one of those links (later will be loop)

In [282]:
extract = extract_text_html("https://www.forbes.com/sites/laurabegleybloom/2024/01/25/work-from-home-or-anywhere-top-30-companies-for-remote-jobs-in-2024/")

In [283]:
len(extract)

11719

### Different measures to compress and reduce text length

In [286]:
#temp
len(extract)

11719

In [297]:
import re

def custom_tokenizer(text):
    # Define the pattern to find content within | |
    pattern = r'\|([^|]+)\|'
    
    # Use re.findall to extract all occurrences of the pattern
    tokens = re.findall(pattern, text)
    
    return tokens

tokens = custom_tokenizer(extract)

In [290]:
clean_text = " | ".join(token for token in tokens)

In [293]:
len(clean_text)

4236

### Checking with Chat Bot to extract company names (usually losing a few companies here)

In [296]:
reply = conv_agent.generate_reply(messages=[{"content": f"""This text is sequential and after some point a top list of companies to
work remotely for is given. Please list to me all of these companies that are great for working remote. Don't use any other words than the company name, dont use a bullet list, 
dont use numbers, just comma separate all companies you find: {clean_text}""", "role": "user"}])
print(reply)

[31m
>>>>>>>> USING AUTO REPLY...[0m
FluentU, Static Media, Kraken, Chainlink Labs, Veeva, Invisible Technologies, Wikimedia Foundation, Finixio, Oyster HR, Canonical, Remote Technology, Inc., Study.com, Magic Media & Entertainment Group, Superside, Yodo1, Outliant, Cozymeal, Nethermind, Sourcegraph, Verra, Carry1st, Consensys, Hypixel Studios, Screen Rant, Crimson Education, e2f, Xapo Bank, Cash App, Scopic Software, Binance.


### Now we need to somehow store this response in a list

In [298]:
# Split the response string into individual company names
company_names = reply.split(", ")

# Print the list of company names
print(company_names)

['FluentU', 'Static Media', 'Kraken', 'Chainlink Labs', 'Veeva', 'Invisible Technologies', 'Wikimedia Foundation', 'Finixio', 'Oyster HR', 'Canonical', 'Remote Technology', 'Inc.', 'Study.com', 'Magic Media & Entertainment Group', 'Superside', 'Yodo1', 'Outliant', 'Cozymeal', 'Nethermind', 'Sourcegraph', 'Verra', 'Carry1st', 'Consensys', 'Hypixel Studios', 'Screen Rant', 'Crimson Education', 'e2f', 'Xapo Bank', 'Cash App', 'Scopic Software', 'Binance.']


### Next step would be iterating and finding career page of each company

In [301]:
example_company = company_names[3]

In [302]:
example_company

'Chainlink Labs'

In [303]:
search_results = fetch_top_search_results(f"{example_company} career page",5)

In [304]:
results_list = []

for items in search_results:
    results_list.append(items)
    print(items)

https://chainlinklabs.com/careers
https://jobs.lever.co/chainlink
https://www.linkedin.com/jobs/chainlink-labs-jobs-worldwide
https://chainlinklabs.com/
https://web3.career/web3-companies/chainlink
https://www.linkedin.com/company/chainlink-labs/jobs
https://cryptocurrencyjobs.co/startups/chainlink-labs/


In [305]:
jobs_link = results_list[0]

### Again fetch the HTML Page to find open positions

In [306]:
jobs_extract = extract_text_html(jobs_link)

In [307]:
jobs_extract

'Careers | Chainlink Labs| | | | | | | |Research|See open roles|NEW PILOT|Sygnum and Fidelity International Collaborate With Chainlink To Provide Fund NAV Data Onchain.|Read now.|Join us to help build a| world powered by truth|Explore the open positions at Chainlink Labs| | | | |Chainlink Labs|Research|Careers|WE ARE HIRING!|Resources|Chainlink Blog|Contact Us|research@chainlinklabs.com|Legal Support|Legal|© Chainlink Labs – |Code of Conduct| Privacy Policy|Terms of Use|Language| | |English|Spanish| | | |'

In [372]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.common.action_chains import ActionChains
from bs4 import BeautifulSoup
import time
import random

def scrape_career_website(url: str) -> str:
    """
    This function scrapes the career page of a company website for all text.

    Parameters:
    url (str): The URL of the website that needs to be scraped.

    Returns:
    str: The whole webpage formatted as text of the career page, useful for further analysis (to find open data positions).
    """

    # Selenium options
    options = Options()
    # Add the flag to disable webdriver detection
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option('useAutomationExtension', False)
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.3")
    #options.add_argument("--headless")

    # Initialize WebDriver
    driver = webdriver.Chrome(options=options)

    try:
        # Open the URL
        driver.get(url)
        
        # Simulate human-like scrolling behavior
        scroll_count = 5
        for _ in range(scroll_count):
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(random.uniform(1, 3))  # Random delay between 1 to 3 seconds
        
        # Wait for the entire page to load
        wait = WebDriverWait(driver, 30)
        
        # Parse the HTML after page is fully loaded
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        
        # Extract and clean text content
        text_content = soup.get_text(separator=' | ')  # Ensure spaces between tags' text
        clean_text = ' '.join(text_content.split())  # Remove extra spaces
        
        return clean_text
    
    finally:
        # Close the WebDriver session
        driver.quit()

In [309]:
# Parse the HTML
soup = BeautifulSoup(driver.page_source, 'html.parser')

# Extract and clean text content
text_content = soup.get_text(separator=' | ')  # Ensure spaces between tags' text
clean_text = ' '.join(text_content.split())  # Remove extra spaces

print(clean_text)

Careers | Chainlink Labs | | | | | | | | Research | See open roles | NEW PILOT | Sygnum and Fidelity International Collaborate With Chainlink To Provide Fund NAV Data Onchain. | Read now. | Join us to help build a | world powered by truth | Explore the open positions at Chainlink Labs | | Engineering | Capital Markets | Finance | Go To Market | Marketing | Operations | People | Product | Research | Security | | Engineering | Engineering Manager, Blockchain Integrations | Engineering | United Kingdom / Remote | Remote - Full-time | Engineering Manager, Cross-Chain Interoperability Protocol (CCIP) | Engineering | United States / Remote | Remote - Full-time | Engineering Manager, Economics | Engineering | United States / Remote | Remote - Full-time | Engineering Manager, Payments | Engineering | United States / Remote | Remote - Full-time | Engineering Manager, Release Engineering | Engineering | United Kingdom / Remote | Remote - Full-time | Engineering Manager, SRE | Engineering | Germa

In [310]:
len(clean_text)

6270

In [311]:
job_title_examples = """
Data Scientist
Machine Learning Engineer
AI Research Scientist
Data Science Manager
Senior Data Scientist
NLP Data Scientist
Computer Vision Scientist
Deep Learning Engineer
Data Scientist - Healthcare
Data Scientist - Finance
Data Science Consultant
Research Analyst - Data Science
Data Mining Specialist
Predictive Modeling Analyst
Quantitative Analyst
Statistical Modeler
Business Intelligence Analyst
Data Science Instructor
Data Scientist - Marketing Analytics
Data Science Intern
Data Scientist - Remote Sensing
Data Science Team Lead
Data Science Architect
Data Scientist - Supply Chain
Data Engineer - Data Science
Data Engineer
Big Data Engineer
Cloud Data Engineer
Senior Data Engineer
Data Engineering Manager
ETL Developer
Data Integration Engineer
Database Engineer
Hadoop Engineer
Data Pipeline Engineer
Data Warehouse Architect
Data Platform Engineer
Real-time Data Engineer
Data Infrastructure Specialist
Streaming Data Engineer
Data Operations Engineer
Data Engineering Consultant
Data Engineering Analyst
Data Engineering Intern
Data Engineering Lead
Data Analyst
Business Data Analyst
Financial Data Analyst
Marketing Data Analyst
Healthcare Data Analyst
Operations Data Analyst
Senior Data Analyst
Reporting Analyst
Data Quality Analyst
Quantitative Data Analyst
Data Visualization Analyst
Research Data Analyst
Statistical Analyst
Data Insights Analyst
Data Analysis Consultant
Data Analysis Specialist
Data Analysis Manager
Data Analysis Intern
Data Analysis Team Lead
Solution Architect
Enterprise Solution Architect
Cloud Solution Architect
Software Solution Architect
Data Solution Architect
Infrastructure Solution Architect
Application Solution Architect
Senior Solution Architect
Technical Solution Architect
Solution Architecture Manager
Solution Design Architect
Business Solution Architect
Solution Architect Consultant
IoT Solution Architect
AI Solution Architect
Solution Architect - Healthcare
Solution Architect - Finance
Solution Architect - Telecom
Solution Architect - Retail
Solution Architect - Government
Solution Architect - Digital Transformation
Solution Architect - E-commerce
Solution Architect - ERP
Solution Architect - CRM
Solution Architect - Security
Solution Architect - DevOps
Solution Architect - Data Science
Data Science Engineer
Data Science Analyst
Data Science Solutions Architect
Data Engineering Scientist
Data Analytics Engineer
Data Analytics Architect
AI Data Scientist
Machine Learning Data Engineer
Big Data Solution Architect

"""

In [312]:
reply = conv_agent.generate_reply(messages=[{"content": f"From the given text find for me all open positions that are at all related to data science, data engineering, data analysis or similar. Any given job HAS to be within |, this is the seperator (but there can be weird text separated as well, but a job itself is never separated by |): {clean_text}", "role": "user"}])
print(reply)

[31m
>>>>>>>> USING AUTO REPLY...[0m
Here are the open positions related to data science, data engineering, data analysis, or similar at Chainlink Labs:

1. Senior Engineer, Data | Engineering | Spain / Remote
2. Growth Marketer, Data Analytics | Marketing | Spain / Remote
3. Senior Software Engineer/Analyst - Blockchain Integrations (North America / South America based) | Engineering | Canada / Remote
4. Senior Software Engineer, Golang | Engineering | Spain / Remote
5. People Data Analyst | People | Netherlands / Remote
6. Product Management Director, Data Feeds | Product | United Kingdom / Remote
7. Product Manager- Data Feeds | Product | United Kingdom / Remote


# Now trying to build the agents with the tools to run this in a loop

In [369]:
import os

from autogen import ConversableAgent

# Let's first define the assistant agent that suggests tool calls.
assistant = ConversableAgent(
    name="Assistant",
    system_message="You are a helpful AI assistant. "
    "You can help with extracting text from HTMLs and searching things on Google. "
    "Return 'TERMINATE' when the task is done.",
    llm_config={"config_list": [{"model": "gpt-3.5-turbo", "api_key": os.environ["OPENAI_API_KEY"]}]},
)

# The user proxy agent is used for interacting with the assistant agent
# and executes tool calls.
user_proxy = ConversableAgent(
    name="User",
    llm_config=False,
    is_termination_msg=lambda msg: msg.get("content") is not None and "TERMINATE" in msg["content"],
    human_input_mode="NEVER",
)

# Register the tool signature with the assistant agent.
assistant.register_for_llm(name="html_text_scraper_companies", description="Scraping HTML Link for Text for finding Company names")(extract_text_html_companies)
assistant.register_for_llm(name="fetch_top_search_results", description="Google Search for Top Results of best remote jobs")(fetch_top_search_results)
assistant.register_for_llm(name="scrape_career_website", description="Scrape all text from given URL of the career site of a company to find data related jobs")(scrape_career_website)

# Register the tool function with the user proxy agent.
user_proxy.register_for_execution(name="html_text_scraper_companies")(extract_text_html_companies)
user_proxy.register_for_execution(name="fetch_top_search_results")(fetch_top_search_results)
user_proxy.register_for_execution(name="scrape_career_website")(scrape_career_website)


<function __main__.scrape_career_website(url: str) -> str>

In [371]:
chat_result = user_proxy.initiate_chat(assistant, message="""Do NOT parallelize the workflow, follow this from top to bottom and if it 
fails check the previous output to go to another company or another link:

Please search Google for 'best fully remote companys (or similar)' and take the 
first link you receive to extract the html text from that url, try one link after but wait 15 seconds in between at least for it to process 
until one works. 

Once you have the cleaned text: 
This text is sequential and after some point a top list of companies to
work remotely for is given. Please list all of these companies that are great for working remote. 
Don't use any other words than the company name, dont use a bullet list, 
dont use numbers, just comma separate all companies you find. 

Then use this first company name to perform another Google Search with "{company_name} career site" and decide which of the resulted links is
most likely to be the company's career site. If none seem to make senes go with the next company from the previous list.

Then use that URL to perform the scrape_career_website tool in order to extract all text from the website. 

From the given scraped text find for me all open positions that are at all related to data science, data engineering, 
data analysis or similar. EXTREMELY IMPORTANT: Any given job HAS to be within |, this is the seperator (but there can be weird text separated as well, 
but a job itself is never separated by |). E.g.: | this is an example text | department | roles | data | product specialist | engineering manager | abc: 
in here the roles are "product specialist" and "engineering manager" the rest are not roles.

Only do this until it works once
and then TERMINATE.""")

[33mUser[0m (to Assistant):

Do NOT parallelize the workflow, follow this from top to bottom and if it 
fails check the previous output to go to another company or another link:

Please search Google for 'best fully remote companys (or similar)' and take the 
first link you receive to extract the html text from that url, try one link after but wait 15 seconds in between at least for it to process 
until one works. 

Once you have the cleaned text: 
This text is sequential and after some point a top list of companies to
work remotely for is given. Please list all of these companies that are great for working remote. 
Don't use any other words than the company name, dont use a bullet list, 
dont use numbers, just comma separate all companies you find. 

Then use this first company name to perform another Google Search with "{company_name} career site" and decide which of the resulted links is
most likely to be the company's career site. If none seem to make senes go with the next compa

KeyboardInterrupt: 