# Unified summary, Version 2!

Key changes from version 1:
- Organized by topic rather than data source
- More data sources: 
    - Indeed job descriptions
    - Crunchbase
    - General search results
- Technical
    - Permalinks are carried in the sources and piped through end to end, rather than each pipeline handling links differently
    - Extract, organize, then abstract
    - Heavy use of caching

In [1]:
from core import CompanyProduct, init_langchain_cache, init_requests_cache

# Set up the caches provided by `core` before any pipeline cell runs
# (presumably HTTP-request and LangChain/LLM-call caching -- confirm in `core`).
init_requests_cache()
init_langchain_cache()

# Company/product under evaluation. NOTE(review): `CompanyProduct.same` presumably
# builds a target whose company and product share one name -- confirm in `core`.
target = CompanyProduct.same("98point6")

In [1]:
from pprint import pprint
import reddit.summarizer
import reddit.map_reduce_summarizer
import reddit.search
import reddit.fetch

from core import CompanyProduct
from search import SearchResult

from dataclasses import dataclass
from typing import List, Optional
from langchain_core.documents import Document

@dataclass
class RedditSummary:
    """Result bundle produced by ``process_reddit``."""
    # Every search hit that was considered, including hits whose threads were
    # later filtered out or cut by the thread limit.
    sources: List[SearchResult]
    # The submissions that were actually summarized (after filtering/limiting).
    threads: List[reddit.fetch.Submission]
    # The summarizer's "output_text" entry.
    summary_markdown: str
    # The summarizer's "intermediate_steps" entry.
    summary_intermediate_steps: List[str]
    # The summarizer's "input_documents" entry.
    summary_input_documents: List[Document]

def process_reddit(target: CompanyProduct, num_threads=2, min_comments=2) -> Optional[RedditSummary]:
    """Search Reddit for the target, keep the busiest threads and summarize them.

    Args:
        target: Company/product to search for.
        num_threads: Maximum number of threads passed to the summarizer.
        min_comments: Threads with fewer comments than this are discarded.

    Returns:
        A ``RedditSummary``, or ``None`` when no thread has enough comments.
    """
    client = reddit.fetch.init()

    # Step 1: find candidate thread URLs.
    hits = reddit.search.find_submissions(target, num_results=10)

    # Step 2: resolve every hit to a Submission before any filtering happens.
    fetched = []
    for hit in hits:
        fetched.append(client.submission(url=hit.link))

    # Step 3: drop threads that do not have enough discussion.
    eligible = []
    for thread in fetched:
        if thread.num_comments >= min_comments:
            eligible.append(thread)

    if not eligible:
        print(f"No posts with enough comments found for {target}")
        return None

    # Step 4: cap the number of threads, then run the map-reduce summarizer.
    selected = eligible[:num_threads]
    summary = reddit.map_reduce_summarizer.summarize(target, selected)

    return RedditSummary(
        sources=hits,
        threads=selected,
        summary_markdown=summary["output_text"],
        summary_intermediate_steps=summary["intermediate_steps"],
        summary_input_documents=summary["input_documents"],
    )



In [3]:
from dataclasses import dataclass
from pprint import pprint
from typing import List

from core import CompanyProduct
from search import SearchResult

import markdown
import re

from glassdoor.search import find_review
from glassdoor.scraper import scrape_reviews, GlassdoorJob, GlassdoorReview, Url
from glassdoor.summarizer import summarize

@dataclass
class GlassdoorResult:
    """Inputs, intermediate data and outputs of one ``process_glassdoor`` run."""
    # inputs
    company: CompanyProduct

    # intermediate data
    # Search hit for the company's Glassdoor review page.
    review_page: SearchResult
    # Raw scraper output; read with ``.get('allReviewsCount', 0)`` below.
    raw_reviews: dict
    # Reviews parsed out of ``raw_reviews``.
    reviews: List[GlassdoorReview]

    # outputs
    # NOTE(review): process_glassdoor currently passes None here because job
    # scraping is commented out, so callers must not assume a list.
    jobs: List[GlassdoorJob]
    summary_markdown: str
    
    @property
    def num_parsed_reviews(self):
        """Number of reviews that were successfully parsed."""
        return len(self.reviews)
    
    @property
    def num_raw_reviews(self):
        """Value of the ``'allReviewsCount'`` key in the raw data, or 0 if it is absent."""
        return self.raw_reviews.get('allReviewsCount', 0)


async def process_glassdoor(target: CompanyProduct, max_review_pages=1, max_job_pages=1, debug=False) -> GlassdoorResult:
    """Find the target's Glassdoor review page, scrape reviews and summarize them.

    Args:
        target: Company/product to look up.
        max_review_pages: Passed to ``scrape_reviews`` as ``max_pages``.
        max_job_pages: Currently unused; job scraping is disabled below.
        debug: When true, pretty-print the raw scraper output.

    Returns:
        A ``GlassdoorResult`` whose ``jobs`` field is ``None``.
    """
    review_page = find_review(target)
    parsed_company, _company_id = Url.parse_review_url(review_page.link)

    # Job scraping is disabled for now ("not 100% used yet"); kept for reference:
    # job_results = await scrape_jobs(Url.jobs(company, company_id), max_pages=max_job_pages)
    # jobs = [GlassdoorJob(**result) for result in job_results]
    # jobs = sorted(jobs, key=lambda job: job.jobTitleText)

    raw_reviews = await scrape_reviews(review_page.link, max_pages=max_review_pages)
    if debug:
        pprint(raw_reviews)

    parsed_reviews = GlassdoorReview.parse_reviews(parsed_company, raw_reviews)
    llm_summary = summarize(target, parsed_reviews)

    # TODO: Pull out allReviewsCount from the raw scraper output
    return GlassdoorResult(
        company=target,
        review_page=review_page,
        raw_reviews=raw_reviews,
        reviews=parsed_reviews,
        jobs=None,
        summary_markdown=llm_summary.content,
    )


In [4]:
from typing import NamedTuple, Mapping, List

from core import CompanyProduct
from search import SearchResult
import news.search
import news.scrape
import news.summarize

class NewsSummary(NamedTuple):
    """Inputs, intermediate data and output of one ``process_news`` run."""
    # input
    target: CompanyProduct

    # intermediates
    # All search hits, in the order the search returned them.
    search_results: List[SearchResult]
    # Maps each hit's link to the scraper's result for that link; entries may be
    # falsy, and process_news skips those when building the summarizer input.
    article_markdowns: Mapping[str, str]

    # output
    # The ``content`` of the summarizer's reply.
    summary_markdown: str


def process_news(target: CompanyProduct) -> NewsSummary:
    """Search for news about the target, scrape each article and summarize them.

    Args:
        target: Company/product to search news for.

    Returns:
        A ``NewsSummary`` holding the search hits, the per-URL article markdown
        (including falsy entries) and the summary text.
    """
    hits = news.search.find_news_articles(target, num_results=30)
    print(f"{len(hits)} URLs found: {hits}")

    # Scrape every hit; keep falsy results in the mapping so they stay visible.
    markdown_by_url = {}
    for hit in hits:
        markdown_by_url[hit.link] = news.scrape.get_article_markdown(hit.link)

    # Only articles with actual content are sent to the summarizer.
    usable_articles = list(filter(None, markdown_by_url.values()))
    summary = news.summarize.summarize(target, usable_articles)

    return NewsSummary(target, hits, markdown_by_url, summary.content)



In [5]:
import re
from datetime import datetime
import os

def eval_filename(target: CompanyProduct, create_folder=True, extension="html") -> str:
    """Build a timestamped output path under ``evaluation/`` for the target.

    The folder name is "<company> <product>" with every character outside
    ``[a-zA-Z0-9]`` replaced by an underscore; the file name is the current
    local time formatted as ``YYYYMMDD_HHMMSS``.

    Args:
        target: Object providing ``company`` and ``product`` attributes.
        create_folder: When true, create the folder if it does not exist yet.
        extension: File extension, without the leading dot.

    Returns:
        The relative path ``evaluation/<folder>/<timestamp>.<extension>``.
    """
    label = f"{target.company} {target.product}"
    safe_label = re.sub(r"[^a-zA-Z0-9]", "_", label)
    directory = "evaluation/" + safe_label

    if create_folder:
        os.makedirs(directory, exist_ok=True)

    # Second-level resolution: two calls within the same second share a path.
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    return "/".join((directory, f"{stamp}.{extension}"))

In [6]:
import jinja2

# Shared Jinja environment; templates are looked up in the relative "templates"
# directory. Autoescaping is left at Jinja's default. unified_summary uses this
# to render "glassdoor_review.md" for each review.
templates = jinja2.Environment(
    loader=jinja2.FileSystemLoader("templates"),
)


In [13]:
from typing import List
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.messages.ai import AIMessage
from langchain_openai import ChatOpenAI

from core import CompanyProduct
from dotenv import load_dotenv

# Load variables from a .env file into the environment (presumably the OpenAI
# credentials that ChatOpenAI needs -- confirm against the project's .env).
load_dotenv()


# Chat prompt for the final, cross-source summary.
# Template variables: company_name, product_name, reddit_text, glassdoor_text,
# news_text -- all supplied by unified_summary.
# NOTE(review): the headings below spell "Verifyable" (should be "Verifiable").
# The text is sent to the model verbatim, so fixing it changes the prompt and
# likely the generated headings; do that as a deliberate change.
prompt = ChatPromptTemplate.from_messages(
    [
        # System message: role description plus the output template to follow.
        (
            "system",
            """
You're an expert at summarizing business and product information for prospective employees, and you're able to distill and integrate information from a variety of sources while maintaining links to the original sources for reference.

Follow this template to summarize the most important information about a company and its product, adding subheadings as needed:

# About {company_name}

# Key personnel

# News (reverse chronological, grouped by event)

# Working at {company_name}

## Positive sentiments and experiences

## Negative sentiments and experiences

## Verifyable statements

# User reviews, sentiments, and feedback about {product_name}

## Positive sentiments and experiences

## Negative sentiments and experiences

## Verifyable statements



Format the output as a markdown document, preserving any links in the source.
            """,
        ),
        # Human message: the per-source summaries to integrate.
        (
            "human",
            """
            Company: {company_name}
            Product: {product_name}
            
            Reddit sources: 
            {reddit_text}

            Glassdoor sources:
            {glassdoor_text}

            News sources:
            {news_text}
            """,
        ),
    ]
)





def _strip_markdown_fence(text: str) -> str:
    """Remove one surrounding Markdown code fence from an LLM reply, if present."""
    text = text.strip()
    # Check the longer opener first so "```markdown" is not half-removed.
    for opener in ("```markdown", "```"):
        if text.startswith(opener):
            text = text[len(opener):]
            break
    if text.endswith("```"):
        text = text[: -len("```")]
    return text.strip()


async def unified_summary(target: CompanyProduct, num_reddit_threads=2, max_glassdoor_review_pages=1, max_glassdoor_job_pages=1):
    """Run all source pipelines, merge them with the LLM and write a report file.

    The report is written to the path returned by ``eval_filename(target,
    extension="md")`` and contains the unified summary followed by the raw
    per-source summaries, the Reddit threads and the Glassdoor reviews.

    Args:
        target: Company/product to research.
        num_reddit_threads: Maximum number of Reddit threads to summarize.
        max_glassdoor_review_pages: Forwarded to ``process_glassdoor``.
        max_glassdoor_job_pages: Forwarded to ``process_glassdoor``.

    Returns:
        None. The result is the file written to disk; its path is printed.
    """
    reddit_result = process_reddit(target, num_threads=num_reddit_threads)
    glassdoor_result = await process_glassdoor(target, max_review_pages=max_glassdoor_review_pages, max_job_pages=max_glassdoor_job_pages)
    news_result = process_news(target)

    # process_reddit returns None when no thread has enough comments; fall back
    # to placeholder text instead of failing with AttributeError.
    if reddit_result is None:
        reddit_text = "No Reddit discussions with enough comments were found."
        reddit_threads = []
    else:
        reddit_text = reddit_result.summary_markdown
        reddit_threads = reddit_result.threads

    # feed results into LLM for summarization
    llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

    runnable = prompt | llm
    result = runnable.invoke({
        "company_name": target.company,
        "product_name": target.product,
        "reddit_text": reddit_text,
        "glassdoor_text": glassdoor_result.summary_markdown,
        "news_text": news_result.summary_markdown,
    })

    # str.strip("```markdown") would strip a *set of characters* and eat real
    # leading/trailing letters of the summary; remove the fence as a prefix/suffix.
    summary_text = _strip_markdown_fence(result.content)

    # Explicit UTF-8: LLM output and scraped text routinely contain non-ASCII.
    with open(eval_filename(target, extension="md"), "w", encoding="utf-8") as f:
        f.write(summary_text)

        # Write the raw Reddit summary too
        f.write(f"\n----\n## Reddit Summary\n{reddit_text}\n\n")

        # Write the individual Reddit threads
        for thread in reddit_threads:
            f.write(f"{reddit.fetch.submission_to_markdown(thread)}\n\n")

        # Write the raw Glassdoor summary too
        f.write(f"\n----\n## Glassdoor Summary\n{glassdoor_result.summary_markdown}\n\n")

        # Write the individual Glassdoor reviews
        for review in glassdoor_result.reviews:
            review_md = templates.get_template("glassdoor_review.md").render(review=review)
            f.write(f"{review_md}\n\n")

        # Write the raw News summary too
        f.write(f"\n----\n## News Summary\n{news_result.summary_markdown}\n\n")

        print(f"Written to {f.name}")

# Run the full pipeline for 98point6 with larger limits than the defaults.
# NOTE: a bare top-level `await` only works where the host already runs an
# event loop (e.g. IPython/Jupyter); in a plain script use asyncio.run(...).
await unified_summary(
    CompanyProduct.same("98point6"), 
    num_reddit_threads=5, 
    max_glassdoor_review_pages=3, 
    max_glassdoor_job_pages=2
    )

[32m2024-08-12 14:02:56.225[0m | [1mINFO    [0m | [36mglassdoor.scraper[0m:[36mscrape_reviews[0m:[36m127[0m - [1mscraping reviews from https://www.glassdoor.com/Reviews/98point6-Reviews-E1181484.htm[0m
[32m2024-08-12 14:02:56.245[0m | [1mINFO    [0m | [36mglassdoor.scraper[0m:[36mscrape_reviews[0m:[36m135[0m - [1mscraped first page of reviews of https://www.glassdoor.com/Reviews/98point6-Reviews-E1181484.htm, scraping remaining 2 pages[0m


Reddit pipeline output:
{'company': '98point6',
 'input_documents': [Document(page_content='# Post ID bg7ip2: Internet medicine is awesome, 98point6 was so so helpful for me with +55 score by [FrugalChef13 on 2019-04-22](https://www.reddit.com/r/povertyfinance/comments/bg7ip2/internet_medicine_is_awesome_98point6_was_so_so/)\n**TL;DR- $20 got me an awesome appointment with a nice doctor and a prescription for a medication I could afford that solved my issue.**\n\n*Disclaimer: This particular thing worked well for me so I\'m going to tell you about it. Everyone is different, so it might not work as well (or at all) for you.  Take what you find useful from this post and ignore the rest.  I\'m not compensated or connected to the website I\'m discussing.*\n\nSo like a lot of people on here I\'m usually either uninsured or underinsured.  Right now it\'s underinsured with a high deductible, so when I messed my back up badly enough that I could barely move I freaked.  I\'ve got scoliosis, a f

[32m2024-08-12 14:02:57.774[0m | [1mINFO    [0m | [36mglassdoor.scraper[0m:[36mscrape_reviews[0m:[36m153[0m - [1mscraped 30 reviews from https://www.glassdoor.com/Reviews/98point6-Reviews-E1181484.htm in 3 pages[0m


The prompt context has 16,256 characters in 30 reviews
30 URLs found: [SearchResult(title='98point6 Technologies Announces the Acquisition of Bright.md to ...', link='https://www.prnewswire.com/news-releases/98point6-technologies-announces-the-acquisition-of-brightmd-to-accelerate-the-launch-of-its-asynchronous-care-module-302034295.html', snippet='Jan 16, 2024 ... SEATTLE, Jan. 16, 2024 /PRNewswire/ -- 98point6 Technologies, a leader in licensed on-demand virtual care software, announced the addition of a new asynchronous\xa0...', formattedUrl='https://www.prnewswire.com/news.../98point6-technologies-announces-the-...'), SearchResult(title='98point6 hit by new layoffs in latest change at health tech startup ...', link='https://www.geekwire.com/2024/98point6-hit-by-new-layoffs-in-latest-change-at-health-tech-startup/', snippet='Apr 23, 2024 ... In March of last year, 98point6 announced that it was selling its virtual care platform and primary care business to Transcarent for $100 milli