# Unified summary, Version 2!

Key changes from version 1:
- Organized by topic rather than data source
- More data sources: 
    - Indeed job descriptions
    - Crunchbase
    - General search results
- Technical
    - Permalinks are attached to sources and piped through every stage, rather than each pipeline handling them differently
    - Extract, organize, then abstract
    - Heavy use of caching

In [2]:
from core import CompanyProduct, init_langchain_cache, init_requests_cache

# Initialise the project's request cache and LangChain cache in the first cell,
# ahead of every pipeline cell below.
# NOTE(review): presumably this lets notebook re-runs reuse earlier HTTP/LLM
# responses — confirm in `core`.
init_requests_cache()
init_langchain_cache()

# Default target for ad-hoc cells.
# NOTE(review): `same` presumably builds a CompanyProduct whose company and
# product share one name — confirm in `core`.
target = CompanyProduct.same("98point6")

In [3]:
from pprint import pprint
import src.reddit.summarizer
import reddit.search
import reddit.fetch

from core import CompanyProduct
from search import SearchResult

from dataclasses import dataclass
from typing import List, Optional
from langchain_core.documents import Document

@dataclass
class RedditSummary:
    """Everything process_reddit produces for one company/product target."""
    # Search results that the Reddit submissions were fetched from
    sources: List[SearchResult]
    # Submissions kept after the minimum-comment filter and the thread limit
    threads: List[reddit.fetch.Submission]
    # The summarizer result's "output_text" entry
    summary_markdown: str
    # The summarizer result's "intermediate_steps" entry
    summary_intermediate_steps: List[str]
    # The summarizer result's "input_documents" entry
    summary_input_documents: List[Document]

def process_reddit(target: CompanyProduct, num_threads=2, min_comments=2, num_search_results=10) -> Optional[RedditSummary]:
    """Search Reddit for threads about *target* and summarize them.

    Args:
        target: Company/product to search for.
        num_threads: Maximum number of threads passed to the summarizer.
        min_comments: Threads with fewer comments than this are dropped.
        num_search_results: Number of search results to request. This is an
            upper bound on the threads that can ever be summarized, so raise
            it together with ``num_threads`` when asking for more than 10
            threads. Defaults to the previously hard-coded value of 10.

    Returns:
        A RedditSummary, or None when no search result has at least
        ``min_comments`` comments.
    """
    reddit_client = reddit.fetch.init()

    # Search for URLs
    search_results = reddit.search.find_submissions(target, num_results=num_search_results)

    # Fetch the Submissions from Reddit, keeping only those with enough comments
    candidates = (reddit_client.submission(url=result.link) for result in search_results)
    post_submissions = [submission for submission in candidates if submission.num_comments >= min_comments]

    if not post_submissions:
        print(f"No posts with enough comments found for {target}")
        return None

    # Limit the number of threads
    post_submissions = post_submissions[:num_threads]

    # Aggregate the summaries
    result = reddit.summarizer.summarize(target, post_submissions)

    return RedditSummary(
        sources=search_results,
        threads=post_submissions,
        summary_markdown=result["output_text"],
        summary_intermediate_steps=result["intermediate_steps"],
        summary_input_documents=result["input_documents"],
    )



In [3]:
from dataclasses import dataclass
from pprint import pprint
from typing import List

from core import CompanyProduct
from search import SearchResult


from glassdoor.search import find_review
from glassdoor.summarizer import summarize

import scrapfly_scrapers.glassdoor
from scrapfly_scrapers.glassdoor import scrape_reviews, scrape_jobs
from glassdoor.models import UrlBuilder, GlassdoorReview, GlassdoorJob

# Set the "cache" flag on the Glassdoor scraper's module-level BASE_CONFIG.
# NOTE(review): presumably enables Scrapfly-side response caching — confirm
# against scrapfly_scrapers.
scrapfly_scrapers.glassdoor.BASE_CONFIG["cache"] = True

@dataclass
class GlassdoorResult:
    """Inputs, intermediate data and outputs of one process_glassdoor run.

    Field order matters: process_glassdoor constructs this class positionally.
    """
    # inputs
    company: CompanyProduct

    # intermediate data
    # Search result pointing at the company's Glassdoor review page
    review_page: SearchResult
    # Unparsed response from the review scraper
    raw_reviews: dict
    # Reviews parsed out of raw_reviews
    reviews: List[GlassdoorReview]

    # outputs
    # Scraped job listings; empty when job scraping is skipped
    jobs: List[GlassdoorJob]
    summary_markdown: str
    
    @property
    def num_parsed_reviews(self) -> int:
        """Number of reviews that were parsed."""
        return len(self.reviews)
    
    @property
    def num_raw_reviews(self):
        """The raw response's 'allReviewsCount' value, or 0 when that key is absent."""
        return self.raw_reviews.get('allReviewsCount', 0)


async def process_glassdoor(target: CompanyProduct, max_review_pages=1, max_job_pages=0, debug=False) -> GlassdoorResult:
    """Locate the Glassdoor review page for *target*, scrape it and summarize the reviews.

    Args:
        target: Company/product to look up.
        max_review_pages: Page limit handed to the review scraper.
        max_job_pages: Page limit handed to the job scraper; job scraping is
            skipped entirely unless this is greater than 0.
        debug: When true, pretty-print the raw review response.

    Returns:
        A GlassdoorResult holding the raw and parsed reviews, the (possibly
        empty) job list sorted by job title, and the review summary text.
    """
    review_page = find_review(target)
    company, company_id = UrlBuilder.parse_review_url(review_page.link)

    # Job listings are optional and not fully used downstream yet.
    jobs = []
    if max_job_pages > 0:
        jobs_url = UrlBuilder.jobs(company, company_id)
        scraped_jobs = await scrape_jobs(jobs_url, max_pages=max_job_pages)
        jobs = [GlassdoorJob(**fields) for fields in scraped_jobs]
        jobs.sort(key=lambda job: job.jobTitleText)

    raw_reviews = await scrape_reviews(review_page.link, max_pages=max_review_pages)
    if debug:
        pprint(raw_reviews)

    parsed_reviews = GlassdoorReview.parse_reviews(company, raw_reviews)
    summary = summarize(target, parsed_reviews)

    return GlassdoorResult(
        company=target,
        review_page=review_page,
        raw_reviews=raw_reviews,
        reviews=parsed_reviews,
        jobs=jobs,
        summary_markdown=summary.content,
    )


In [4]:
# Rename it to the old function name
from news import run as process_news


In [5]:
import re
from datetime import datetime
import os

def eval_filename(target: CompanyProduct, create_folder=True, extension="html") -> str:
    """Build a timestamped output path under ``evaluation/`` for *target*.

    The folder name is "<company> <product>" with every character outside
    [a-zA-Z0-9] replaced by an underscore; the file name is the current local
    time formatted as YYYYmmdd_HHMMSS plus *extension*.

    Args:
        target: Supplies ``company`` and ``product`` for the folder name.
        create_folder: When true, create the folder (and parents) if missing.
        extension: File extension, without the leading dot.

    Returns:
        The path as a "/"-separated string.
    """
    safe_name = re.sub(r"[^a-zA-Z0-9]", "_", f"{target.company} {target.product}")
    output_dir = f"evaluation/{safe_name}"

    if create_folder:
        os.makedirs(output_dir, exist_ok=True)

    # The current timestamp gives every run its own file
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    return f"{output_dir}/{stamp}.{extension}"

In [6]:
from crunchbase import find_people_url, parse
import scrapfly_scrapers.crunchbase
import jinja2
from datetime import datetime

# Set the "cache" flag on the Crunchbase scraper's module-level BASE_CONFIG.
# NOTE(review): presumably enables Scrapfly-side response caching — confirm
# against scrapfly_scrapers.
scrapfly_scrapers.crunchbase.BASE_CONFIG["cache"] = True
# Jinja2 environment that loads templates from the local "templates" directory
templates = jinja2.Environment(loader=jinja2.FileSystemLoader("templates"))

# In-memory cache of raw Crunchbase scrape responses, keyed by URL; it is
# filled and read by process_crunchbase.
_response_cache = {}

async def process_crunchbase(target: CompanyProduct, debug=False) -> str:
    """Render the Crunchbase organization and people data for *target* as markdown.

    Args:
        target: Company/product whose Crunchbase people page is looked up.
        debug: When true, pretty-print the raw scrape response.

    Returns:
        The rendered "crunchbase.md" template.
    """
    people_url = find_people_url(target)

    # Scrapfly doesn't cache reliably, so keep our own per-URL copy in memory
    try:
        raw_response = _response_cache[people_url]
    except KeyError:
        raw_response = await scrapfly_scrapers.crunchbase.scrape_company(people_url)
        _response_cache[people_url] = raw_response

    if debug:
        pprint(raw_response)

    organization, employees = parse(raw_response)
    template = templates.get_template("crunchbase.md")
    return template.render(
        organization=organization,
        employees=employees,
        current_year=datetime.now().year,
    )


In [7]:
import jinja2

# Jinja2 environment that loads templates from the local "templates" directory.
# NOTE(review): this rebinds `templates`, which the Crunchbase cell already
# defines with the same loader.
templates = jinja2.Environment(
    loader=jinja2.FileSystemLoader("templates"),
)


In [8]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

from core import CompanyProduct
from dotenv import load_dotenv

# Load variables from a .env file into the environment.
# NOTE(review): presumably this is where the OpenAI API key comes from — confirm.
load_dotenv()


# Two-message chat prompt for the final report.
# The system message carries the report template and needs {company_name} and
# {product_name}; the human message carries the per-source summaries and needs
# {company_name}, {product_name}, {reddit_text}, {glassdoor_text}, {news_text}
# and {crunchbase_text}.
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """
You're an expert in business and product development, writing a comprehensive report about a company and its products for a prospective candidate.
You're detail-oriented and seek to discover and share the truth of any information you find.
Your commitment to intellectual honesty and rigor is expressed in your analysis by providing a balanced view of both positive and negative information, providing citations to your sources, favoring information from more reliable sources, and sharing direct quotes as appropriate.


Carefully review all of the following information about a company and its product.
Write a comprehensive report of all information with citations to the original sources for reference.
Citations should follow the format [(Author, Source, Date)](url).

Follow this template to summarize the most important information about a company and its product. Each markdown section has tips on what information is most critical.

# About {company_name}

The About section should provide all the essential information about the company.
An ideal summary should incorporate the answers to the following questions:
- When was the company founded?
- Approximately how many employees work at the company?
- What products does the company produce? What services does the company offer?
- How does the company make money? Who are their customers in general? Is it B2B, B2C? If B2B, include example customers.
- Approximately how much revenue does the company generate annually?
- Describe the scale of the company if possible, including the number of customers, users, or clients.
- How are the company's products distributed or sold to users?

# Key personnel

# News (reverse chronological, grouped by event)

# Working at {company_name}

Questions that should be answered by this summary:
- Is the leadership team good?
- What benefits are provided?
- Is the company good at DEI?
- Whats's the work-life balance like and workload?
- How has working at the company changed over time?
- How does employee satisfaction vary by job function?
- Why do people like working here?
- Why do people dislike working here?

## Positive sentiments and experiences

## Negative sentiments and experiences

## Verifyable statements about working at {company_name}

# User reviews, sentiments, and feedback about {product_name}

## Positive sentiments and experiences

## Negative sentiments and experiences

## Verifyable statements about using {product_name}

# Bibliography

The Bibliography should include a list of all the sources used to compile the summary. If there are many sources, group them by type (e.g., Reddit, Glassdoor, News, Crunchbase).


Feel free to create subheadings or additional sections as needed to capture the most important information about the company and its product.
Format the output as a markdown document, using markdown links for citations.
            """,
        ),
        (
            "human",
            """
            Company: {company_name}
            Product: {product_name}
            
            Reddit sources: 
            {reddit_text}

            Glassdoor sources:
            {glassdoor_text}

            News sources:
            {news_text}

            Crunchbase information:
            {crunchbase_text}
            """,
        ),
    ]
)





def _strip_markdown_fence(text: str) -> str:
    """Remove a wrapping ```markdown ... ``` fence from *text*, if there is one.

    ``str.strip("```markdown")`` treats its argument as a set of characters,
    so it also eats leading/trailing letters of the real content (any of
    "m", "a", "r", "k", "d", "o", "w", "n"). Only whole fence markers are
    removed here.
    """
    text = text.strip()
    for opening_fence in ("```markdown", "```"):
        if text.startswith(opening_fence):
            text = text[len(opening_fence):]
            break
    if text.endswith("```"):
        text = text[: -len("```")]
    return text


async def unified_summary(target: CompanyProduct, num_reddit_threads=2, max_glassdoor_review_pages=1, max_glassdoor_job_pages=1, max_news_articles=10):
    """Run every source pipeline for *target* and merge the results into one report.

    The Crunchbase, Reddit, Glassdoor and news summaries are fed to the LLM
    through ``prompt``; the generated report, followed by each raw
    per-source summary, is written to a timestamped markdown file chosen by
    ``eval_filename``.

    Args:
        target: Company/product to report on.
        num_reddit_threads: Thread limit passed to process_reddit.
        max_glassdoor_review_pages: Review page limit passed to process_glassdoor.
        max_glassdoor_job_pages: Job page limit passed to process_glassdoor.
        max_news_articles: Result limit passed to process_news.

    Returns:
        None. The report is written to disk and progress is printed.
    """
    crunchbase_markdown = await process_crunchbase(target)
    reddit_result = process_reddit(target, num_threads=num_reddit_threads)
    glassdoor_result = await process_glassdoor(target, max_review_pages=max_glassdoor_review_pages, max_job_pages=max_glassdoor_job_pages)
    news_result = process_news(target, max_results=max_news_articles)

    # process_reddit returns None when no thread has enough comments; treat
    # that as "no Reddit content" instead of failing on attribute access.
    reddit_markdown = reddit_result.summary_markdown if reddit_result is not None else ""
    glassdoor_markdown = glassdoor_result.summary_markdown
    news_markdown = news_result.summary_markdown

    # feed results into LLM for summarization
    llm = ChatOpenAI(model="gpt-4o", temperature=0)

    runnable = prompt | llm
    result = runnable.invoke({
        "company_name": target.company,
        "product_name": target.product,
        "reddit_text": reddit_markdown,
        "glassdoor_text": glassdoor_markdown,
        "news_text": news_markdown,
        "crunchbase_text": crunchbase_markdown,
        })
    result.content = _strip_markdown_fence(result.content)

    input_content_length = len(reddit_markdown) + len(glassdoor_markdown) + len(news_markdown) + len(crunchbase_markdown)
    output_content_length = len(result.content)
    # Guard the ratio so an all-empty input cannot raise ZeroDivisionError
    ratio = output_content_length / input_content_length if input_content_length else 0.0

    print(f"unified_summary: input_content_length={input_content_length:,} chars, output_content_length={output_content_length:,} chars ({ratio:.0%})")

    # Explicit UTF-8: the report is LLM/web text and may hold characters the
    # platform's default encoding cannot represent.
    with open(eval_filename(target, extension="md"), "w", encoding="utf-8") as f:
        f.write(result.content)

        # Write the raw Reddit summary too
        f.write(f"\n----\n## Reddit Summary\n{reddit_markdown}\n\n")

        # Write the individual Reddit threads
        # for thread in reddit_result.threads:
        #     f.write(f"{reddit.fetch.submission_to_markdown(thread)}\n\n")

        # Write the raw Glassdoor summary too
        f.write(f"\n----\n## Glassdoor Summary\n{glassdoor_markdown}\n\n")

        # Write the individual Glassdoor reviews
        # for review in glassdoor_result.reviews:
        #     review_md = templates.get_template("glassdoor_review.md").render(review=review)
        #     f.write(f"{review_md}\n\n")

        # Write the raw News summary too
        f.write(f"\n----\n## News Summary\n{news_markdown}\n\n")

        # Write the raw Crunchbase summary too
        f.write(f"\n----\n## Crunchbase\n{crunchbase_markdown}\n\n")

        print(f"Written to {f.name}")

# Generate the report for Singularity 6 / Palia. This is a top-level await, so
# it only runs in a notebook or async REPL. Glassdoor job scraping is skipped
# because max_glassdoor_job_pages=0.
await unified_summary(
    CompanyProduct("Singularity 6", "Palia"), 
    num_reddit_threads=20, 
    max_glassdoor_review_pages=3, 
    max_glassdoor_job_pages=0,
    max_news_articles=50
    )

[32m2024-08-13 11:42:06.956[0m | [1mINFO    [0m | [36mscrapfly_scrapers.crunchbase[0m:[36mscrape_company[0m:[36m60[0m - [1mscraping company: https://www.crunchbase.com/organization/singularity-6/people[0m


Reddit: The prompt context has 309,441 characters in 10 threads


[1m> Entering new MapReduceDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
Please read the following Reddit thread and extract all opinions and facts relating to the user experience of the PRODUCT Palia by the COMPANY Singularity 6 from the perspective of current users.
Only include information about the COMPANY Singularity 6 and PRODUCT Palia. 
Do not extract information about other companies or products.
If the text does not contain any relevant information about the COMPANY or PRODUCT, please return an empty string.

Format the results as a Markdown list of quotes, each with a permalink to the source of the quote like so:
- "quote" [Author, Reddit, Date](permalink)

For example:

Input comment:
## Comment ID hrmpl3t with +3 score by [MarketWorldly9908 on 2022-01-07](https://www.reddit.com/r/povertyfinance/comments/bg7ip2/internet_medicine_is_awesome_98poin

[32m2024-08-13 11:44:21.509[0m | [1mINFO    [0m | [36mscrapfly_scrapers.glassdoor[0m:[36mscrape_reviews[0m:[36m105[0m - [1mscraping reviews from https://www.glassdoor.com/Reviews/Singularity-6-Reviews-E3342660.htm[0m



[1m> Finished chain.[0m

[1m> Finished chain.[0m
Reddit: The summary has 4,725 characters, 2% of the input


[32m2024-08-13 11:44:21.543[0m | [1mINFO    [0m | [36mscrapfly_scrapers.glassdoor[0m:[36mscrape_reviews[0m:[36m113[0m - [1mscraped first page of reviews of https://www.glassdoor.com/Reviews/Singularity-6-Reviews-E3342660.htm, scraping remaining 2 pages[0m
[32m2024-08-13 11:44:23.066[0m | [1mINFO    [0m | [36mscrapfly_scrapers.glassdoor[0m:[36mscrape_reviews[0m:[36m123[0m - [1mscraped 21 reviews from https://www.glassdoor.com/Reviews/Singularity-6-Reviews-E3342660.htm in 3 pages[0m


Glassdoor: The context has 23,610 characters in 21 reviews
Glassdoor: The summary has 4,416 characters, 19% of the input
Failed to get article from https://www.mmorpg.com/news/daybreak-games-has-acquired-palia-developer-singularity-6-2000132085: 403
Failed to get article from https://forums.mmorpg.com/discussion/504931/palia-developer-singularity-6-rocked-by-more-layoffs-affecting-40-of-staff-mmorpg-com: 403
Failed to get article from https://www.pcgamesn.com/palia/new-coziest-mmorpg: 403
Failed to get article from https://www.sportskeeda.com/mmo/news-everything-coming-palia-june-25-update-that-know-far: 403
Failed to get article from https://support.palia.com/hc/en-us/articles/21548837853844--Nintendo-Switch-Account-Creation-Linking-and-Rewards: 403
Failed to get article from https://support.palia.com/hc/en-us/articles/25051277773332-Patch-Known-Issues: 403
News: 82,374 characters of context, 29 articles
News: The summary has 4,136 characters, 5% of the input
unified_summary: input_co

In [4]:
reddit_result = process_reddit(CompanyProduct("Singularity 6", "Palia"), num_threads=20)

Reddit: The prompt context has 310,437 characters in 10 threads


[1m> Entering new MapReduceDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
Please read the following Reddit thread and extract all opinions and facts relating to the user experience of the PRODUCT Palia by the COMPANY Singularity 6 from the perspective of current users.
Only include information about the COMPANY Singularity 6 and PRODUCT Palia. 
Do not extract information about other companies or products.
If the text does not contain any relevant information about the COMPANY or PRODUCT, please return an empty string.

Format the results as a Markdown list of quotes, each with a permalink to the source of the quote like so:
- "quote" [Author, Reddit, Date](permalink)

For example:

Input comment:
## Comment ID hrmpl3t with +3 score by [MarketWorldly9908 on 2022-01-07](https://www.reddit.com/r/povertyfinance/comments/bg7ip2/internet_medicine_is_awesome_98poin