In [2]:
from core import CompanyProduct

# Company/product pair analysed by the Reddit and Glassdoor cells further down.
target = CompanyProduct.same("98point6")

In [13]:
import reddit.summarizer
import reddit.search
import reddit.fetch

from core import CompanyProduct
from search import SearchResult

from collections import ChainMap
from dataclasses import dataclass
from typing import List, Mapping, Optional

@dataclass
class RedditSummary:
    """Result bundle returned by process_reddit: the search hits plus per-thread and aggregate summaries."""

    # Every search result returned by reddit.search.find_submissions, including
    # hits whose threads were later filtered out or never summarized.
    sources: List[SearchResult]

    # Aggregate produced by reddit.summarizer.summarize_summaries over `summaries`.
    overall_summary: reddit.summarizer.AggregatedSummaryResult
    # Per-thread summaries that passed the hallucination check.
    summaries: List[reddit.summarizer.ThreadSummaryResult]
    # Merged view of reddit.fetch.index_permalinks for each summarized submission.
    # NOTE(review): presumably maps a comment id to its permalink URL; confirm against index_permalinks.
    permalinks: Mapping[str, str]

def process_reddit(target: CompanyProduct, num_threads=2, min_comments=2) -> Optional[RedditSummary]:
    """
    Find Reddit threads about ``target``, summarize a few of them, and aggregate the summaries.

    Args:
        target: Company/product to search for.
        num_threads: Maximum number of qualifying threads to summarize (taken in search order).
        min_comments: Threads with fewer comments than this are dropped.

    Returns:
        A RedditSummary, or None (after printing a message) when no thread has
        enough comments or no thread summary passes the hallucination check.
    """
    client = reddit.fetch.init()

    # Stage 1: look up candidate thread URLs.
    hits = reddit.search.find_submissions(target, num_results=10)

    # Stage 2: resolve every hit to a Submission object.
    candidates = []
    for hit in hits:
        candidates.append(client.submission(url=hit.link))

    # Stage 3: discard threads that do not have enough comments.
    active = list(filter(lambda thread: thread.num_comments >= min_comments, candidates))
    if not active:
        print(f"No posts with enough comments found for {target}")
        return None

    # Stage 4: cap the number of threads, then summarize each one.
    selected = active[:num_threads]
    thread_summaries = []
    for thread in selected:
        thread_summaries.append(reddit.summarizer.summarize_submission(target, thread))

    # Stage 5: drop summaries that cite more than one hallucinated comment id.
    trusted = [item for item in thread_summaries if item.is_under_max_hallucinations(1, debug=True)]
    if not trusted:
        print(f"No valid summaries found for {target}")
        return None

    # Stage 6: merge the permalink indexes of the surviving threads into one lookup.
    per_thread_indexes = [reddit.fetch.index_permalinks(item.submission) for item in trusted]
    permalink_lookup = ChainMap(*per_thread_indexes)

    # Stage 7: build the overall summary from the per-thread summaries.
    overall = reddit.summarizer.summarize_summaries(target, trusted)

    return RedditSummary(
        sources=hits,
        overall_summary=overall,
        summaries=trusted,
        permalinks=permalink_lookup,
    )

# Run the Reddit pipeline for the target; process_reddit may return None when nothing usable is found.
reddit_result = process_reddit(target)
# Bare expression so the notebook displays the result.
reddit_result

RedditSummary(sources=[SearchResult(title='Internet medicine is awesome, 98point6 was so so helpful for me : r ...', link='https://www.reddit.com/r/povertyfinance/comments/bg7ip2/internet_medicine_is_awesome_98point6_was_so_so/', snippet='Apr 22, 2019 ... So the 98point6 doctor could prescribe me the long-acting prescription anti-inflammatory meloxicam, but not the controlled substance Vicodin or\xa0...', formattedUrl='https://www.reddit.com/.../internet_medicine_is_awesome_98point6_was_s...'), SearchResult(title='Has anyone used Amazon care app or 98point6 app to get a doctors ...', link='https://www.reddit.com/r/AmazonFC/comments/rgxxbw/has_anyone_used_amazon_care_app_or_98point6_app/', snippet='Dec 15, 2021 ... Has anyone used Amazon care app or 98point6 app to get a doctors note to be excused? FOr like food poisoning, really bad stomach aches,\xa0...', formattedUrl='https://www.reddit.com/.../has_anyone_used_amazon_care_app_or_98poin...'), SearchResult(title='Has anyone used the 98

In [19]:
from dataclasses import dataclass
from typing import List

from core import CompanyProduct
from search import SearchResult

import markdown

from glassdoor.search import find_review
from glassdoor.scraper import scrape_reviews, scrape_jobs, GlassdoorJob, GlassdoorReview, Url
from glassdoor.summarizer import summarize

@dataclass
class GlassdoorResult:
    """Inputs, intermediate data, and summary output of one Glassdoor review run."""

    # inputs
    company: CompanyProduct

    # intermediate data
    # Search hit for the company's review page; its `link` is parsed for the employer name.
    review_page: SearchResult
    # Raw scraper payload; `num_raw_reviews` reads its 'allReviewsCount' key.
    raw_reviews: dict
    # Parsed reviews; `jobTitle`, `dateTime` and `reviewId` are read when linking the summary.
    reviews: List[GlassdoorReview]

    # outputs
    # process_glassdoor currently passes None here (job scraping is disabled).
    jobs: List[GlassdoorJob]
    summary_markdown: str

    def __add_links(self, html):
        """
        Wrap every "<job title> on <date>" / "<job title>, <date>" mention in a link to that review.

        All mentions are substituted in a single pass so that text inserted for
        one review (anchor markup, URLs) is never rewritten for another, and a
        short gloss such as "Engineer on 2024-01-02" cannot hijack part of a
        longer one such as "Software Engineer on 2024-01-02".

        Args:
            html: HTML text to decorate.

        Returns:
            The HTML with matching mentions wrapped in ``<a href="...">`` tags.
        """
        import re  # local import: this cell does not import re itself

        employer, _ = Url.parse_review_url(self.review_page.link)
        gloss_to_link = {}
        for review in self.reviews:
            title = review.jobTitle or "Anonymous"
            date = review.dateTime.strftime('%Y-%m-%d')
            url = Url.review(employer, review.reviewId)

            # Both phrasings are registered; a later review with the same
            # title and date overrides an earlier one.
            gloss_to_link[f"{title} on {date}"] = url
            gloss_to_link[f"{title}, {date}"] = url

        if not gloss_to_link:
            # An empty alternation would match at every position.
            return html

        # Longest gloss first so the most specific mention wins at any position.
        ordered_glosses = sorted(gloss_to_link, key=len, reverse=True)
        pattern = re.compile("|".join(re.escape(gloss) for gloss in ordered_glosses))

        def linkify(match):
            gloss = match.group(0)
            return f'<a href="{gloss_to_link[gloss]}">{gloss}</a>'

        return pattern.sub(linkify, html)

    @property
    def summary_html(self):
        """The markdown summary rendered to HTML, with review mentions turned into links."""
        html_out = markdown.markdown(self.summary_markdown)

        html_out = self.__add_links(html_out)

        return html_out

    @property
    def num_parsed_reviews(self):
        """Number of parsed reviews held in `reviews`."""
        return len(self.reviews)

    @property
    def num_raw_reviews(self):
        """Value of 'allReviewsCount' in the raw payload, or 0 when the key is absent."""
        return self.raw_reviews.get('allReviewsCount', 0)


async def process_glassdoor(target: CompanyProduct, max_pages=1) -> GlassdoorResult:
    """
    Locate the Glassdoor review page for ``target``, scrape its reviews, and summarize them.

    Args:
        target: Company/product to look up.
        max_pages: Forwarded to scrape_reviews as its ``max_pages`` argument.

    Returns:
        A GlassdoorResult holding the search hit, the raw scrape payload, the
        parsed reviews, and the summary text; ``jobs`` is None.
    """
    search_hit = find_review(target)

    # The parsed pieces are only needed by the disabled job scraping below.
    company, company_id = Url.parse_review_url(search_hit.link)

    # job results, not 100% used yet
    # job_results = await scrape_jobs(Url.jobs(company, company_id), max_pages=1)
    # jobs = [GlassdoorJob.from_dict(result) for result in job_results]
    # jobs = sorted(jobs, key=lambda job: job.jobTitleText)

    scraped_payload = await scrape_reviews(search_hit.link, max_pages=max_pages)
    parsed_reviews = GlassdoorReview.parse_reviews(scraped_payload)

    review_digest = summarize(target, parsed_reviews)

    # TODO: Pull out allReviewsCount from glassdoor_results
    return GlassdoorResult(
        company=target,
        review_page=search_hit,
        raw_reviews=scraped_payload,
        reviews=parsed_reviews,
        jobs=None,
        summary_markdown=review_digest.content,
    )

# process_glassdoor is a coroutine, so it is awaited directly at cell level.
glassdoor_result = await process_glassdoor(target)
# Bare expression so the notebook displays the result.
glassdoor_result

[32m2024-08-01 13:07:57.701[0m | [1mINFO    [0m | [36mglassdoor.scraper[0m:[36mscrape_reviews[0m:[36m110[0m - [1mscraping reviews from https://www.glassdoor.com/Reviews/98point6-Reviews-E1181484.htm[0m
[32m2024-08-01 13:07:58.261[0m | [1mINFO    [0m | [36mglassdoor.scraper[0m:[36mscrape_reviews[0m:[36m118[0m - [1mscraped first page of reviews of https://www.glassdoor.com/Reviews/98point6-Reviews-E1181484.htm, scraping remaining 0 pages[0m
[32m2024-08-01 13:07:58.262[0m | [1mINFO    [0m | [36mglassdoor.scraper[0m:[36mscrape_reviews[0m:[36m128[0m - [1mscraped 10 reviews from https://www.glassdoor.com/Reviews/98point6-Reviews-E1181484.htm in 1 pages[0m


The prompt context has 3,500 characters in 10 reviews


GlassdoorResult(company=CompanyProduct(company='98point6', product='98point6'), review_page=SearchResult(title='98point6 Reviews: What Is It Like to Work At 98point6? | Glassdoor', link='https://www.glassdoor.com/Reviews/98point6-Reviews-E1181484.htm', snippet='167 98point6 reviews. A free inside look at company reviews and salaries posted anonymously by employees.', formattedUrl='https://www.glassdoor.com/Reviews/98point6-Reviews-E1181484.htm'), raw_reviews={'__typename': 'EmployerReviewsRG', 'allReviewsCount': 178, 'currentPage': 1, 'filteredReviewsCount': 167, 'lastReviewDateTime': '2024-06-27T00:39:17.030', 'numberOfPages': 17, 'queryJobTitle': None, 'queryLocation': None, 'ratedReviewsCount': 167, 'ratings': {'__typename': 'EmployerRatings', 'businessOutlookRating': 0.62, 'careerOpportunitiesRating': 3.5, 'ceoRating': 0.42, 'compensationAndBenefitsRating': 4, 'cultureAndValuesRating': 4, 'diversityAndInclusionRating': 3.8, 'overallRating': 3.8, 'ratedCeo': {'__typename': 'Ceo', 'i

In [None]:
# Example links to individual Glassdoor reviews:
# https://www.glassdoor.com/Reviews/Employee-Review-Singularity-6-RVW86375944.htm
# https://www.glassdoor.com/Reviews/Employee-Review-Singularity-6-RVW82917491.htm

In [5]:
import re
from datetime import datetime
import os

def eval_filename(target: CompanyProduct, create_folder=True) -> str:
    """
    Build a timestamped HTML report path under ``evaluation/<company>_<product>/``.

    Args:
        target: Supplies the ``company`` and ``product`` used for the folder name.
        create_folder: When true, create the folder (and parents) if missing.

    Returns:
        A path of the form ``evaluation/<sanitized name>/<YYYYmmdd_HHMMSS>.html``.
    """
    # Every character outside [a-zA-Z0-9] becomes an underscore.
    raw_label = f"{target.company} {target.product}"
    safe_label = re.sub(r"[^a-zA-Z0-9]", "_", raw_label)
    output_dir = f"evaluation/{safe_label}"

    if create_folder:
        os.makedirs(output_dir, exist_ok=True)

    # Name the file after the current local time, to the second.
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    return f"{output_dir}/{stamp}.html"

In [21]:
import jinja2

# Templates are loaded from the local "templates" folder.
# NOTE(review): the Environment is created without autoescape, so scraped Reddit/Glassdoor
# text reaches the HTML unescaped unless the templates escape it themselves; confirm.
templates = jinja2.Environment(
    loader=jinja2.FileSystemLoader("templates"),
)

# Render before opening the output file so a template error does not leave an empty report behind.
template = templates.get_template("overall.html")
html = template.render(
    reddit_summary=reddit_result,
    glassdoor_summary=glassdoor_result
)

# Reuse `target` so the report folder always matches the company that was analysed,
# and write UTF-8 explicitly because the summaries can contain non-ASCII text.
with open(eval_filename(target), "w", encoding="utf-8") as f:
    f.write(html)
    print(f"Written to {f.name}")

Written to evaluation/98point6_98point6/20240801_130957.html
