# Proof of concept search module

The idea behind this module is to run Google searches that return 100+ results, then use an LLM to organize and re-rank them.

## Ideas to consider

- Multiple searches: the company alone, company + product, product reviews, and all-time vs. recent-only results
- Provide a markdown template to fill out with an "other" category, and refine the "other" listings

In [1]:
from core import CompanyProduct, init_requests_cache, init_langchain_cache, make_experiment_dir

# Set up the requests and LangChain caches (helpers imported from `core`)
# before any search or LLM call is made.
init_requests_cache()
# Kept as the cell's last expression: its return value appears to be the
# cache path echoed in the cell output.
init_langchain_cache()

'/home/keith/company-detective/.cache/langchain.sqlite'

In [2]:
# Company/product under investigation. `same` presumably sets the product
# name equal to the company name -- confirm against `core.CompanyProduct`.
target = CompanyProduct.same("Pomelo Care")
# Used later as the directory prefix for the files this notebook writes.
experiment_dir = make_experiment_dir(target)

In [3]:
from search import search
from pprint import pprint

# Always query the quoted company name; add a company + product query only
# when the product is named differently from the company.
queries = [f'"{target.company}"']
if target.product != target.company:
    queries.append(f'"{target.company}" "{target.product}"')

# Collect up to 100 results per query into one flat list, in query order.
search_results = []
for query in queries:
    search_results.extend(search(query, num=100))
pprint(search_results)

[SearchResult(title='Pomelo Care | Virtual Maternity Care Program', link='https://www.pomelocare.com/', snippet='“I just want you to know how grateful I am for Pomelo Care. ... does not provide any medical, nursing, or other healthcare provider services. © Pomelo Care, Inc.', formattedUrl='https://www.pomelocare.com/'),
 SearchResult(title='Pomelo Care | LinkedIn', link='https://www.linkedin.com/company/pomelo-care', snippet='Jul 20, 2024 ... Pomelo Care. Hospitals and Health Care. New York, NY 15,485 followers. Transforming outcomes for moms and babies through personalized,\xa0...', formattedUrl='https://www.linkedin.com/company/pomelo-care'),
 SearchResult(title='Careers', link='https://www.pomelocare.com/careers', snippet='Pomelo Care. Current Job Openings. Department. All Departments, Business ... Pomelo Care, Inc. does not provide any medical, nursing, or other healthcare\xa0...', formattedUrl='https://www.pomelocare.com/careers'),
 SearchResult(title='Pomelo Care scores $46M for 

In [4]:
from typing import List
from search import SearchResult

def result_to_markdown(search_result: SearchResult) -> str:
    """Render one search result as a markdown link line followed by its snippet.

    The first line is ``[title](link)``; the snippet follows on the next line.
    """
    title_link = f"[{search_result.title}]({search_result.link})"
    return f"{title_link}\n{search_result.snippet}"

def results_to_markdown(search_results: List[SearchResult]) -> str:
    """Render every search result as markdown, separated by one blank line.

    Each entry is formatted by ``result_to_markdown``; an empty input yields
    an empty string.
    """
    rendered_entries = map(result_to_markdown, search_results)
    return "\n\n".join(rendered_entries)

# Preview the markdown rendering of all collected search results.
print(results_to_markdown(search_results))

[Pomelo Care | Virtual Maternity Care Program](https://www.pomelocare.com/)
“I just want you to know how grateful I am for Pomelo Care. ... does not provide any medical, nursing, or other healthcare provider services. © Pomelo Care, Inc.

[Pomelo Care | LinkedIn](https://www.linkedin.com/company/pomelo-care)
Jul 20, 2024 ... Pomelo Care. Hospitals and Health Care. New York, NY 15,485 followers. Transforming outcomes for moms and babies through personalized, ...

[Careers](https://www.pomelocare.com/careers)
Pomelo Care. Current Job Openings. Department. All Departments, Business ... Pomelo Care, Inc. does not provide any medical, nursing, or other healthcare ...

[Pomelo Care scores $46M for virtual maternity care platform and ...](https://www.mobihealthnews.com/news/pomelo-care-scores-46m-virtual-maternity-care-platform-and-more-digital-health-funding)
Jun 20, 2024 ... Pomelo Care, a virtual maternity care platform, announced it secured $46 million in Series B funding. Existing invest

In [15]:
from typing import List
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.messages.ai import AIMessage
from langchain_openai import ChatOpenAI

from core import CompanyProduct, URLShortener
from dotenv import load_dotenv

# Load variables from a .env file into the process environment
# (presumably where the API key used by ChatOpenAI lives -- confirm).
load_dotenv()


# Two-message chat prompt:
#   * the system message fixes the section headers and the ordering/formatting
#     rules the model must follow;
#   * the human message supplies {company_name}, {product_name} and the
#     markdown-rendered search results as {text}.
# NOTE: the leading indentation inside the human template is part of the
# string that is sent to the model.
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """
You're an expert at organizing search results.
Given search results for a company or product, organize them into the following headers:

# Official social media
# Job boards
# App stores
# Product reviews
# News articles (most recent first, grouped by event)
# Key employees (with subheaders by employee)
# Other pages on the company website
# Other

Include the publication date after the link, if available.

Unless otherwise specified, order the results in each section from most to least relevant.
Format the output as a markdown document, preserving any links in the source.
Organize ALL search results into these headers; do not omit any results.
            """,
        ),
        (
            "human",
            """
            Company: {company_name}
            Product: {product_name}
            
            Search results: 
            {text}
            """,
        ),
    ]
)

from loguru import logger

def summarize(
    target: CompanyProduct, search_results: List[SearchResult], debug=True, shorten_urls=False
) -> AIMessage:
    """Ask the LLM to organize raw search results into a categorized markdown document.

    Args:
        target: Supplies the company and product names filled into the prompt.
        search_results: Results rendered to markdown and sent as the prompt text.
        debug: Currently unused; kept so existing callers keep working.
        shorten_urls: When true, the markdown is passed through
            ``URLShortener.shorten_markdown`` before the LLM call and the
            response through ``URLShortener.unshorten_markdown`` afterwards.

    Returns:
        The LLM response message with ``content`` replaced by the cleaned
        markdown (surrounding code fence and whitespace removed).
    """
    llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
    unified_markdown = results_to_markdown(search_results)
    # Measured before any URL shortening so the logged ratio is relative to
    # the full, un-shortened input.
    input_len = len(unified_markdown)

    if shorten_urls:
        url_shortener = URLShortener()
        unified_markdown = url_shortener.shorten_markdown(unified_markdown)

    runnable = prompt | llm
    result = runnable.invoke(
        {"text": unified_markdown, "company_name": target.company, "product_name": target.product}
    )

    # Remove an optional wrapping code fence. str.strip("```markdown") must not
    # be used for this: it treats its argument as a *set of characters* and
    # would also eat leading/trailing letters such as m, a, r, k, d, o, w, n
    # from the actual content.
    content = result.content.strip()
    for fence_open in ("```markdown", "```"):
        if content.startswith(fence_open):
            content = content[len(fence_open):]
            break
    if content.endswith("```"):
        content = content[: -len("```")]
    # Drop the newlines that were adjacent to the removed fence markers.
    result.content = content.strip()

    if shorten_urls:
        result.content = url_shortener.unshorten_markdown(result.content)

    # Empty input (no search results) would otherwise divide by zero.
    ratio = len(result.content) / input_len if input_len else 0.0
    logger.info(f"{input_len:,} -> {len(result.content):,} chars ({ratio:.0%})")

    return result

# summary = summarize(target, search_results)
# print(summary.content)

# with open(f"{experiment_dir}/search_results.md", "w") as f:
#     f.write(summary.content)

#     f.write("\n# Sources\n")
#     for result in search_results:
#         f.write(result_to_markdown(result) + "\n\n")

In [16]:
# Summarize with URL shortening enabled, then save the summary followed by
# the raw sources so the LLM output can be checked against its input.
summary = summarize(target, search_results, shorten_urls=True)
# print(summary.content)

# Explicit UTF-8: the snippets contain non-ASCII characters (curly quotes,
# the copyright sign, non-breaking spaces), which would fail or be mangled
# when the platform's default encoding is not UTF-8.
with open(f"{experiment_dir}/search_results_url_shortener_v2.md", "w", encoding="utf-8") as f:
    f.write(summary.content)

    f.write("\n# Sources\n")
    f.writelines(result_to_markdown(result) + "\n\n" for result in search_results)

[32m2024-08-18 14:23:14.457[0m | [1mINFO    [0m | [36mcore[0m:[36mshorten_markdown[0m:[36m146[0m - [1m19,034 -> 15,270 chars (80% of original)[0m
[32m2024-08-18 14:23:14.463[0m | [1mINFO    [0m | [36mcore[0m:[36munshorten_markdown[0m:[36m159[0m - [1m3,809 -> 5,900 chars (155% of original)[0m
[32m2024-08-18 14:23:14.464[0m | [1mINFO    [0m | [36m__main__[0m:[36msummarize[0m:[36m70[0m - [1m19,034 -> 5,900 chars (31%)[0m


In [None]:
# Compare the token cost of several candidate URL compression schemes:
# the original long URL first, then progressively shorter stand-ins.

llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
candidate_urls = (
    "https://www.apollo.io/companies/Pomelo-Care/6196af0887796a008c77f450",
    "cache://www.apollo.io/1",
    "cache://www.apollo.io/15",
    "cache://apollo/15",
    "http://apollo/15",
)
for url in candidate_urls:
    token_count = llm.get_num_tokens(url)
    print(f"{token_count:,} tokens: {url}")

25 tokens: https://www.apollo.io/companies/Pomelo-Care/6196af0887796a008c77f450
8 tokens: cache://www.apollo.io/1
8 tokens: cache://www.apollo.io/15
5 tokens: cache://apollo/15
5 tokens: http://apollo/15


In [None]:
from core import test_extract_core_domain

# Run the self-test imported from `core` (presumably exercises core-domain
# extraction from URLs -- confirm in `core`).
test_extract_core_domain()