In [1]:
from core import Seed, init

# Run the setup hook imported from `core` in the first cell, ahead of everything else.
# NOTE(review): what init() actually configures is not visible from this notebook — confirm.
init()

In [2]:
# Load all the companies from Airtable into pandas.
import airtable

# NOTE(review): the saved output shows an `Unnamed: 0` column and a gap in the
# index (no row 4) — presumably a CSV round-trip and some filtering happen
# inside load_into_pandas(); confirm.
companies = airtable.load_into_pandas()
# Bare last expression: rendered as the cell's output.
companies

Unnamed: 0,id,createdTime,fields.Name,fields.Website,fields.Status,fields.Key Product Name,domain
0,rec8leUf8FyDrIa8x,2024-09-30T20:37:50.000Z,Akili Interactive,https://www.akiliinteractive.com/,Approved,,akiliinteractive.com
1,recC1pqGYRxzcmoFs,2024-09-30T20:37:09.000Z,Synthesize Bio,https://www.synthesize.bio/,Approved,,synthesize.bio
2,recE8NhvREVz0cNIa,2024-09-30T20:34:11.000Z,98point6,98point6.com,Approved,,98point6.com
3,recFDpfbo41ICr8O8,2024-09-30T20:35:54.000Z,Kevala,kevala.care,Approved,,kevala.care
5,recSxlXd9ZOPUr8dL,2024-09-30T20:34:55.000Z,Imagine Pediatrics,imaginepediatrics.org,Approved,,imaginepediatrics.org
6,recU56sRD2MLNN9lz,2024-09-30T20:35:03.000Z,Pomelo Care,pomelocare.com,Approved,,pomelocare.com
7,recaVe1KAvdyjHkR6,2024-09-30T20:35:22.000Z,Singularity 6,singularity6.com,Approved,Palia,singularity6.com
8,recwZi2dVuPxfaDnd,2024-09-30T20:36:13.000Z,Rippling,rippling.com,Approved,,rippling.com


In [3]:
# Turn the companies table into seed objects; the result displays as a pandas
# Series with dtype object.
# NOTE(review): judging by the saved output, each entry is (company, product,
# domain) and the product falls back to the company name when
# "Key Product Name" is empty — confirm in airtable.pandas_to_seeds.
company_objects = airtable.pandas_to_seeds(companies)
# Bare last expression: rendered as the cell's output.
company_objects

0    (Akili Interactive, Akili Interactive, akiliin...
1     (Synthesize Bio, Synthesize Bio, synthesize.bio)
2                   (98point6, 98point6, 98point6.com)
3                        (Kevala, Kevala, kevala.care)
5    (Imagine Pediatrics, Imagine Pediatrics, imagi...
6           (Pomelo Care, Pomelo Care, pomelocare.com)
7             (Singularity 6, Palia, singularity6.com)
8                   (Rippling, Rippling, rippling.com)
dtype: object

In [4]:
from datetime import datetime, timedelta
import os
from typing import Optional

# Single example company used to try out the cache-path and staleness helpers below.
target = Seed.init("Imagine Pediatrics", domain="imaginepediatrics.org")
# Folder for the generated JSON files, relative to the notebook's working directory.
data_folder = "../output/data"

# Path of the JSON file for this target; the file stem comes from Seed.as_path_v2().
output_json = f"{data_folder}/{target.as_path_v2()}.json"
# Not the last expression in the cell, so this bare name displays nothing.
output_json

def get_file_age(file_path: str) -> Optional[timedelta]:
    """Return how long ago `file_path` was last modified.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        The time elapsed since the file's modification time, or None when the
        file does not exist or its metadata cannot be read.
    """
    import time  # local import so the function does not rely on another cell

    try:
        # Ask for the mtime directly instead of exists()-then-getmtime(): the
        # file can disappear between the two calls, which used to raise.
        modified_at = os.path.getmtime(file_path)
    except (OSError, ValueError):
        # ValueError covers malformed paths (e.g. an embedded NUL), which
        # os.path.exists() also reports as "does not exist".
        return None

    # Subtract epoch seconds rather than two naive local datetimes; the latter
    # is off by an hour when a DST change falls between the two instants.
    return timedelta(seconds=time.time() - modified_at)
    
def should_rebuild(file_path: str, max_age: timedelta = timedelta(days=7)) -> bool:
    """Decide whether the file at `file_path` has to be regenerated.

    Args:
        file_path: Path of the cached file to check.
        max_age: Oldest age at which the file still counts as up to date.

    Returns:
        True when get_file_age() reports no age (None) for the file, or when
        its age is strictly greater than `max_age`; False otherwise.
    """
    elapsed = get_file_age(file_path)
    return elapsed is None or elapsed > max_age

# Last expression in the cell: displays whether the example target's JSON file
# is missing or older than the default max_age (True) or still fresh (False).
should_rebuild(output_json)


False

In [5]:
# Set the log level
import sys
from loguru import logger
# Drop the handlers registered so far, then log to stderr at INFO and above.
logger.remove()
logger.add(sys.stderr, level="INFO")

import unified

for target in company_objects:
    output_json = f"{data_folder}/{target.as_path_v2()}.json"

    if not should_rebuild(output_json):
        logger.info(f"Skipping {output_json} as it is up to date.")
    else:
        logger.info(f"Building {output_json}...")

        # TODO: Catch exceptions and keep running
        unified_result = await unified.run(
            target, 
            # TODO: Allow some customization of these parameters
            num_reddit_threads=10, 
            max_glassdoor_review_pages=5, 
            max_glassdoor_job_pages=0,
            max_news_articles=20,
            )
        
        with open(output_json, 'w') as json_file:
            json_file.write(unified_result.model_dump_json(indent=2))



[32m2024-10-02 10:35:05.939[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m15[0m - [1mBuilding ../output/data/Akili_Interactive.json...[0m
[32m2024-10-02 10:35:10.717[0m | [1mINFO    [0m | [36mcompany_webpage[0m:[36mrun[0m:[36m66[0m - [1m68,358 -> 3,770 chars (6%) [0m
[32m2024-10-02 10:35:10.722[0m | [1mINFO    [0m | [36mcompany_webpage[0m:[36mrun[0m:[36m66[0m - [1mExtractive fraction: 12% [0m
[32m2024-10-02 10:35:10.723[0m | [1mINFO    [0m | [36mcompany_webpage[0m:[36mrun[0m:[36m66[0m - [1mPercent of URLs in sources: 100% ✅[0m
[32m2024-10-02 10:35:10.723[0m | [1mINFO    [0m | [36mcompany_webpage[0m:[36mrun[0m:[36m66[0m - [1mCitation density: 7.5% (percent of output used by URLs/link syntax) [0m
[32m2024-10-02 10:35:10.724[0m | [1mINFO    [0m | [36mcompany_webpage[0m:[36mrun[0m:[36m66[0m - [1mDistinct URLs (summary / input): 6 / 30[0m
[32m2024-10-02 10:35:10.724[0m | [1mINFO    [0m | [36mcompany_web