In [79]:
from dotenv import load_dotenv

# Load the .env file
load_dotenv()

True

In [80]:
import os
from autogen import AssistantAgent, UserProxyAgent, ConversableAgent
import tempfile
from autogen import ConversableAgent
from autogen.coding import LocalCommandLineCodeExecutor
import re

llm_config = {"model": "gpt-3.5-turbo", "api_key": os.environ["OPENAI_API_KEY"]}
assistant = AssistantAgent("assistant", llm_config=llm_config)
conv_agent = ConversableAgent("conv_agent", llm_config=llm_config)



# Create a temporary directory to store the code files.
temp_dir = tempfile.TemporaryDirectory()

# Create a local command line code executor.
executor = LocalCommandLineCodeExecutor(
    timeout=10,  # Timeout for each code execution in seconds.
    work_dir=temp_dir.name,  # Use the temporary directory to store the code files.
)

# Create an agent with code executor configuration.
user_proxy = UserProxyAgent(
    "code_executor_agent",
    code_execution_config={"executor": executor}  # Use the local command line code executor.
)

## Defining functions for agents

In [97]:
from googlesearch import search

COMPANY_SET = set()

def fetch_top_search_results(query: str, num_results: int = 10) -> list[str]:
    """
    Performs a Google Search Query of the given string and displays the top X results (by default 10).
    
    Parameters:
    query (str): The search text for the google search, e.g. "Best remote working companies".
    num_results (int): The number of results to be displayed, e.g. by default the top 10 links.
    
    Returns:
    list: A list of the top X (num_results) links that were found when perforing the Google Query.
    """
    global COMPANY_SET
    if "career" in query:
        company_set.add(query.split(" ")[0])
    search_results = search(query, num_results=num_results)
    results_list = []
    for items in search_results:
        results_list.append(items)
    return results_list

In [82]:
from bs4 import BeautifulSoup  # Import the BeautifulSoup library for parsing HTML
import requests  # Import the requests library for making HTTP requests

def extract_text_html_companies(url: str) -> str:
    """
    Extracts and cleans text content from a given URL.
    
    Parameters:
    url (str): The URL of the webpage from which to extract text.
    
    Returns:
    str: The cleaned text content from the webpage, or an error message if the URL could not be fetched.
    """
    try:
        # Set headers to mimic a browser visit, which can help avoid being blocked by the website
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        # Make an HTTP GET request to the specified URL with the given headers and a timeout of 30 seconds
        response = requests.get(url, headers=headers, timeout=30)
        
        # Raise an HTTPError if the HTTP request returned an unsuccessful status code
        response.raise_for_status()
        
        # Parse the HTML content of the response
        soup = BeautifulSoup(response.content, 'html.parser')
        
        # Extract and clean text content from the HTML
        # The separator '|' ensures spaces between the text of different tags
        text_content = soup.get_text(separator=' | ')
        
        # Remove extra spaces from the extracted text content
        clean_text = ' '.join(text_content.split())

        if len(clean_text)>12500:
            pattern = r'\|([^|]+)\|'
    
            # Use re.findall to extract all occurrences of the pattern
            tokens = re.findall(pattern, clean_text)

            clean_text = " | ".join([word for word in tokens if len(word)<50])
            
        
        # Return the cleaned text content
        return clean_text
    
    except requests.exceptions.RequestException as e:
        # Return an error message if an exception occurs during the HTTP request
        return f"Error fetching URL: {e}"

### Now we need to somehow store this response in a list

In [83]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.common.action_chains import ActionChains
from bs4 import BeautifulSoup
import time
import random

def scrape_career_website(url: str) -> str:
    """
    This function scrapes the career page of a company website for all text.

    Parameters:
    url (str): The URL of the website that needs to be scraped.

    Returns:
    str: The whole webpage formatted as text of the career page, useful for further analysis (to find open data positions).
    """

    # Selenium options
    options = Options()
    # Add the flag to disable webdriver detection
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option('useAutomationExtension', False)
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.3")
    #options.add_argument("--headless")

    # Initialize WebDriver
    driver = webdriver.Chrome(options=options)

    try:
        # Open the URL
        driver.get(url)
        
        # Simulate human-like scrolling behavior
        scroll_count = 5
        for _ in range(scroll_count):
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(random.uniform(1, 3))  # Random delay between 1 to 3 seconds
        
        # Wait for the entire page to load
        wait = WebDriverWait(driver, 30)
        
        # Parse the HTML after page is fully loaded
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        
        # Extract and clean text content
        text_content = soup.get_text(separator=' | ')  # Ensure spaces between tags' text
        clean_text = ' '.join(text_content.split())  # Remove extra spaces
        
        return clean_text
    
    finally:
        # Close the WebDriver session
        driver.quit()

In [84]:
import pandas as pd
from datetime import datetime

# Initialize the global DataFrame
GLOBAL_DATAFRAME = pd.DataFrame(columns=["titles", "company", "url", "date"])

def store_jobs_in_global_df(job_titles: list, company: str, url: str) -> str:
    """
    This function is used to store the found job titles, their company, and the url into a global DataFrame for later use.

    Parameters:
    job_titles (list): input a list object of all job titles you found in this search on this company site.
    company (str): input the company name of this current search.
    url (str): input the url of the current search that you found all the jobs on.

    Output:
    str - A message indicating the completion of the appending to the GLOBAL_DATAFRAME.
    """
    global GLOBAL_DATAFRAME  # Declare the global DataFrame

    # Create a new DataFrame
    new_df = pd.DataFrame()
    new_df["titles"] = job_titles
    new_df["company"] = company
    new_df["url"] = url
    new_df["date"] = datetime.today().strftime('%Y-%m-%d')

    # Append the new DataFrame to the global DataFrame
    GLOBAL_DATAFRAME = pd.concat([GLOBAL_DATAFRAME, new_df], ignore_index=True)

    return ("Appended jobs to GLOBAL_DATAFRAME - you can continue!")

In [98]:
def recall_short_memory_companies() -> str:
    """
    Returns the global variable SEARCHED_COMPANIES to check what companies were already searched

    Output:
    String - list of unique companies
    """
    return str(COMPANY_SET)

In [99]:
import pandas as pd
from datetime import datetime

def store_jobs_in_csv() -> str:
    """
    This function is used to store the dataframe with all results into a .csv.

    Parameters:
    None

    Output:
    str - A message indicating the completion of the CSV storage.
    """
    # Get today's date in YYYY-MM-DD format
    today_date = datetime.today().strftime('%Y-%m-%d')

    # Create the filename with today's date
    filename = f"JobSearch_{today_date}.csv"

    # Save the DataFrame to a CSV file with the filename
    GLOBAL_DATAFRAME.to_csv(filename, index=False)

    return "Stored all results in .csv - you can Terminate!"

In [52]:
import pandas as pd
import glob

def find_unique_companies_from_csv() -> list:
    """
    This function reads the stored CSV file and returns a unique list of companies.

    Returns:
    list - Unique list of company names.
    """
    # Define the pattern to search for the CSV file
    pattern = "JobSearch_*.csv"

    # Find all matching files
    files = glob.glob(pattern)

    if not files:
        return []

    # Assume we want the latest file (most recent date)
    latest_file = max(files, key=os.path.getctime)

    # Read the CSV file into a DataFrame
    df = pd.read_csv(latest_file)

    # Get unique company names
    unique_companies = df['company'].unique().tolist()

    return unique_companies

### Initiating RAG System to retrieve past information about searches

In [69]:
import autogen
from autogen.agentchat.contrib.retrieve_assistant_agent import RetrieveAssistantAgent
from autogen.agentchat.contrib.retrieve_user_proxy_agent import RetrieveUserProxyAgent



from chromadb.utils import embedding_functions

openai_ef = embedding_functions.OpenAIEmbeddingFunction(
                api_key=os.environ["OPENAI_API_KEY"],
                model_name="text-embedding-ada-002"
            )

assistant_aid = RetrieveUserProxyAgent(
    name="Assistant_Aid",
    is_termination_msg=lambda msg: msg.get('content') is not None and 'TERMINATE' in msg['content'],
    system_message="""Assistant who has extra content retrieval power for solving difficult problems. 
    You will retrieve which companies were already searched. At the end of your retrieval message, add 'DO NOT SEARCH THESE COMPANIES'""",
    human_input_mode="NEVER",
    max_consecutive_auto_reply=3,
    retrieve_config={
        "task": "qa",
    },
    code_execution_config=False  # We don't want to execute code in this case.
)





In [70]:
from typing import Annotated

def call_searched_companies(
    message: Annotated[
        str,
        "Refined message which keeps the original meaning and can be used to retrieve content for code generation and question answering.",
    ],
    n_results: Annotated[int, "number of results"] = 3,
) -> str:
    assistant_aid.n_results = n_results  # Set the number of results to be retrieved.
    # Check if we need to update the context.
    update_context_case1, update_context_case2 = assistant_aid._check_update_context(message)
    if (update_context_case1 or update_context_case2) and assistant_aid.update_context:
        assistant_aid.problem = message if not hasattr(assistant_aid, "problem") else assistant_aid.problem
        _, ret_msg = assistant_aid._generate_retrieve_user_reply(message)
        ret_msg = ret_msg + " DO NOT SEARCH THESE COMPANIES! Continue your task but exclude those."
    else:
        _context = {"problem": message, "n_results": n_results}
        ret_msg = assistant_aid.message_generator(assistant_aid, None, _context)
    return ret_msg if ret_msg else message

In [71]:
#assistant.reset()
#ragproxyagent.initiate_chat(assistant, message=ragproxyagent.message_generator, problem="Return the csv to me as a dataframe and tell me a unique list of companies.")

# Now trying to build the agents with the tools to run this in a loop

In [75]:
import os

from autogen import ConversableAgent

assistant = ConversableAgent(
    name="Assistant",
    system_message="""You are a highly efficient AI Job Finding Assistant. Your primary task is to help find remote job opportunities 
    in the data science domain. Do not parallelize any function calls. Always wait for the output from user_proxy before proceeding with the next step.

Your workflow is as follows:


1. **Retrieve Previously Searched Companies**: Use `call_searched_companies` to get the list of companies that were already searched. 
Avoid these companies throughout your tasks.

2. **Search for Remote Companies**: Search Google for "fully remote companies". Adjust the query if necessary. 
Extract the HTML text from the first link, and if it fails, try another link after waiting at least 15 seconds.

3. **Identify Remote Companies**: From the extracted text, identify and list companies that are noted for remote work. The companies have to be separated
by two |
List only the company names, separated by commas.

4. **Compare previously searched companies and possible remote companies** To identify companys that still remain un-searched.

5. **Find Career Sites**: For each company that has not yet been searched, search Google by performing 'fetch_top_search_results'
with the query "{company_name} career site". Select the most likely link to be the company's career site. If none are suitable, 
proceed to the next company.

6. **Scrape Career Sites**: Use the `scrape_career_website` tool to extract all text from the identified career site by querying its url.

7. **Identify Relevant Job Positions**: From the scraped text, identify open positions related to data science, data engineering,
or data analysis. Only include job titles such as "Data Scientist", "Data Analyst", "Data Engineer", or similar roles. 
Do not include roles like "Software Engineer - Data".

8. **Store Jobs**: Store the identified job positions into the GLOBAL_DATAFRAME with the company and URL by performing 'store_jobs_in_global_df'.

1-8 Is a loop. After you are done with that loop follow the next steps. Do this loop 2 times.

9. **Save Results**: At the end of the process, save the results into a CSV file using the `store_jobs_in_csv` function. 
This step should only be done once and at the very end of the task.

10. **Termination**: Once all tasks are completed successfully, return 'TERMINATE'.

Remember, do not parallelize the workflow. Follow the steps sequentially and if any step fails, check the previous output to decide the next action.

""",
    llm_config={"config_list": [{"model": "gpt-3.5-turbo", "api_key": os.environ["OPENAI_API_KEY"]}]},
)

# The user proxy agent is used for interacting with the assistant agent
# and executes tool calls.
user_proxy = ConversableAgent(
    name="User",
    llm_config=False,
    is_termination_msg=lambda msg: msg.get("content") is not None and "TERMINATE" in msg["content"],
    human_input_mode="NEVER",
)


# Register the tool signature with the assistant agent.
#assistant.register_for_llm(name="find_unique_companies_from_csv", description="Finding previously searched companies from previous searches that should not be searched again.")(find_unique_companies_from_csv)
assistant.register_for_llm(name="html_text_scraper_companies", description="Scraping HTML Link for Text for finding Company names")(extract_text_html_companies)
assistant.register_for_llm(name="fetch_top_search_results", description="Google Search for Top Results of best remote jobs")(fetch_top_search_results)
assistant.register_for_llm(name="scrape_career_website", description="Scrape all text from given URL of the career site of a company to find data related jobs")(scrape_career_website)
assistant.register_for_llm(name="store_jobs_in_global_df", description="Append the found jobs as a comma separated list into the GLOBAL_DATAFRAME")(store_jobs_in_global_df)
assistant.register_for_llm(name="store_jobs_in_csv", description="Store the found jobs as a comma separated list into a csv for later use")(store_jobs_in_csv)
assistant.register_for_llm(name="call_searched_companies", description="To retrieve which companies were already searche, Exclude these from any further search and check this before conducting a search.")(call_searched_companies)
#assistant.register_for_llm(name="store_searched_companies", description="Store the already searched companies provided by assistant_aid")(store_searched_companies)
#assistant.register_for_llm(name="call_searched_companies", description="To retrieve which companies were already searche, Exclude these from any further search and check this before conducting a search.")(call_searched_companies)


# Register the tool function with the user proxy agent.
#user_proxy.register_for_execution(name="find_unique_companies_from_csv")(extract_text_html_companies)
user_proxy.register_for_execution(name="html_text_scraper_companies")(extract_text_html_companies)
user_proxy.register_for_execution(name="fetch_top_search_results")(fetch_top_search_results)
user_proxy.register_for_execution(name="scrape_career_website")(scrape_career_website)
user_proxy.register_for_execution(name="store_jobs_in_global_df")(store_jobs_in_global_df)
user_proxy.register_for_execution(name="store_jobs_in_csv")(store_jobs_in_csv)
#user_proxy.register_for_execution(name="store_searched_companies")(store_searched_companies)
#user_proxy.register_for_execution(name="call_searched_companies")(call_searched_companies)


assistant_aid.register_for_execution(name="call_searched_companies")(call_searched_companies)





<function __main__.call_searched_companies(message: typing.Annotated[str, 'Refined message which keeps the original meaning and can be used to retrieve content for code generation and question answering.'], n_results: typing.Annotated[int, 'number of results'] = 3) -> str>

In [76]:
from autogen.agentchat import GroupChat

# Set up a group chat with all agents
group_chat = GroupChat(
    agents=[assistant, assistant_aid, user_proxy],
    messages=[],  # You can define messages to start with if needed
    max_round=45,  # Example: limit the chat to 12 rounds
    speaker_selection_method="auto",  # Example: alternate speakers in a round-robin manner
    allow_repeat_speaker=False,  # Example: do not allow repeating the same speaker consecutively
)

In [77]:
manager = autogen.GroupChatManager(groupchat=group_chat, llm_config=llm_config
                                  )

assistant.initiate_chat(
    manager,
    message="""Pease start finding jobs for me. Start with the assistant and retrieving 'call_searched_companies'.""",
    summary_method="reflection_with_llm",
)


[33mAssistant[0m (to chat_manager):

Pease start finding jobs for me. Start with the assistant and retrieving 'call_searched_companies'.

--------------------------------------------------------------------------------
[32m
Next speaker: User
[0m
[33mUser[0m (to chat_manager):



--------------------------------------------------------------------------------
[32m
Next speaker: Assistant
[0m
[31m
>>>>>>>> USING AUTO REPLY...[0m
[33mAssistant[0m (to chat_manager):

[32m***** Suggested tool call (call_ayxtddCl0WAHV2Pwp3I8do4C): call_searched_companies *****[0m
Arguments: 
{"message":"Please retrieve the list of previously searched companies."}
[32m****************************************************************************************[0m

--------------------------------------------------------------------------------
[32m
Next speaker: Assistant_Aid
[0m
[35m
>>>>>>>> EXECUTING FUNCTION call_searched_companies...[0m


Number of requested results 3 is greater than number of elements in index 1, updating n_results = 1


VectorDB returns doc_ids:  [['562ec7d5']]
[32mAdding content of doc 562ec7d5 to context.[0m
[33mAssistant_Aid[0m (to chat_manager):

[33mAssistant_Aid[0m (to chat_manager):

[32m***** Response from calling tool (call_ayxtddCl0WAHV2Pwp3I8do4C) *****[0m
You're a retrieve augmented chatbot. You answer user's questions based on your own knowledge and the
context provided by the user.
If you can't answer the question with or without the current context, you should reply exactly `UPDATE CONTEXT`.
You must give as short an answer as possible.

User's question is: Please retrieve the list of previously searched companies.

Context is: titles,company,url,date
Client Success Lead (Toronto),Contra,https://contra.com/careers,2024-07-10
Support Lead (Toronto),Contra,https://contra.com/careers,2024-07-10
Sales Manager (Toronto),Contra,https://contra.com/careers,2024-07-10
Client Success Lead (Toronto),Contra,https://contra.com/careers,2024-07-10
Support Lead (Toronto),Contra,https://contra.c

Number of requested results 3 is greater than number of elements in index 1, updating n_results = 1


VectorDB returns doc_ids:  [['562ec7d5']]
[32mAdding content of doc 562ec7d5 to context.[0m
[33mAssistant_Aid[0m (to chat_manager):

[33mAssistant_Aid[0m (to chat_manager):

[32m***** Response from calling tool (call_gtbmoaFyQ9zxNY3gZclocDBj) *****[0m
You're a retrieve augmented chatbot. You answer user's questions based on your own knowledge and the
context provided by the user.
If you can't answer the question with or without the current context, you should reply exactly `UPDATE CONTEXT`.
You must give as short an answer as possible.

User's question is: Please retrieve the list of previously searched companies.

Context is: titles,company,url,date
Client Success Lead (Toronto),Contra,https://contra.com/careers,2024-07-10
Support Lead (Toronto),Contra,https://contra.com/careers,2024-07-10
Sales Manager (Toronto),Contra,https://contra.com/careers,2024-07-10
Client Success Lead (Toronto),Contra,https://contra.com/careers,2024-07-10
Support Lead (Toronto),Contra,https://contra.c

Number of requested results 3 is greater than number of elements in index 1, updating n_results = 1


VectorDB returns doc_ids:  [['562ec7d5']]
[32mAdding content of doc 562ec7d5 to context.[0m
[33mAssistant_Aid[0m (to chat_manager):

[33mAssistant_Aid[0m (to chat_manager):

[32m***** Response from calling tool (call_0nRmBj9ZMNnT0W8ogYHM5JRN) *****[0m
You're a retrieve augmented chatbot. You answer user's questions based on your own knowledge and the
context provided by the user.
If you can't answer the question with or without the current context, you should reply exactly `UPDATE CONTEXT`.
You must give as short an answer as possible.

User's question is: Please retrieve the list of previously searched companies.

Context is: titles,company,url,date
Client Success Lead (Toronto),Contra,https://contra.com/careers,2024-07-10
Support Lead (Toronto),Contra,https://contra.com/careers,2024-07-10
Sales Manager (Toronto),Contra,https://contra.com/careers,2024-07-10
Client Success Lead (Toronto),Contra,https://contra.com/careers,2024-07-10
Support Lead (Toronto),Contra,https://contra.c

BadRequestError: Error code: 400 - {'error': {'message': "This model's maximum context length is 16385 tokens. However, your messages resulted in 18361 tokens. Please reduce the length of the messages.", 'type': 'invalid_request_error', 'param': 'messages', 'code': 'context_length_exceeded'}}

## Next step is to collect all data related positions and store them in a file (csv?) combined with company name and link to career page

## Another problem to fix is that my selenium does not find all jobs?

In [None]:
# Maybe put extract_text_html_companies as an alterntive in case Selenium fails? Also look into fixing Selenium

In [None]:
# Another idea: it could google search the role and company name to find the URL page of the role?

In [None]:
# It does not properly identify data science positions

## Trying to split up my agents

In [35]:
company_finder_agent = ConversableAgent(
    name="CompanyFinder",
    system_message="""You are an AI Company Finder. Your primary task is to identify remote companies and find their career sites.

Your workflow is as follows:

1. **Retrieve Previously Searched Companies**: Use `call_searched_companies` to get the list of companies that were already searched. Avoid these companies throughout your tasks.

2. **Search for Remote Companies**: Search Google for "fully remote companies". Adjust the query if necessary. Extract the HTML text from the first link, and if it fails, try another link after waiting at least 15 seconds.

3. **Identify Remote Companies**: From the extracted text, identify and list companies that are noted for remote work. List only the company names, separated by commas.

4. **Compare Previously Searched Companies and Possible Remote Companies**: Identify companies that have not yet been searched.

5. **Find Career Sites**: For each unsearched company, search Google with the query "{company_name} career site". Select the most likely link to be the company's career site. If none are suitable, proceed to the next company.

6. **Store Career Site URLs**: Store the company names and their career site URLs in a global variable.

Remember, do not parallelize the workflow. Follow the steps sequentially and if any step fails, check the previous output to decide the next action.

""",
    llm_config={"config_list": [{"model": "gpt-3.5-turbo", "api_key": os.environ["OPENAI_API_KEY"]}]},
)


In [37]:
job_finder_agent = ConversableAgent(
    name="JobFinder",
    system_message="""You are an AI Job Finder. Your primary task is to scrape career sites and identify relevant job positions in the data science domain.

Your workflow is as follows:

1. **Retrieve Career Site URLs**: Get the list of career site URLs from the global variable set by the Company Finder agent.

2. **Scrape Career Sites**: Use the `scrape_career_website` tool to extract all text from the identified career sites.

3. **Identify Relevant Job Positions**: From the scraped text, identify open positions related to data science, data engineering, or data analysis. Only include job titles such as "Data Scientist", "Data Analyst", "Data Engineer", or similar roles. Do not include roles like "Software Engineer - Data".

4. **Store Jobs**: Store the identified job positions into the GLOBAL_DATAFRAME with the company and URL.

5. **Loop**: Repeat steps 2-4 until all career sites are processed. Do this loop 2 times.

6. **Save Results**: At the end of the process, save the results into a CSV file using the `store_jobs_in_csv` function. This step should only be done once and at the very end of the task.

7. **Termination**: Once all tasks are completed successfully, return 'TERMINATE'.

Remember, do not parallelize the workflow. Follow the steps sequentially and if any step fails, check the previous output to decide the next action.

""",
    llm_config={"config_list": [{"model": "gpt-3.5-turbo", "api_key": os.environ["OPENAI_API_KEY"]}]},
)
