In [1]:
from core import Seed, init

# Call the project's `init` from `core` in the first cell, before any other cell runs.
# NOTE(review): what init() actually configures is not visible in this notebook — confirm in core.
init()

In [2]:
# Load all the companies from Airtable.
# NOTE(review): presumably returns a pandas DataFrame with one row per Airtable
# record (later cells call sort_values/iterrows on it) — confirm in airtable.py.
import airtable

companies = airtable.load_into_pandas()
# Preview the first rows as the cell's output to sanity-check the load.
companies.head()

Unnamed: 0,id,createdTime,fields.extra_json,fields.Name,fields.Website,fields.Status,fields.Created,fields.Last Modified,fields.Created By.id,fields.Created By.email,fields.Created By.name,fields.Last Modified By.id,fields.Last Modified By.email,fields.Last Modified By.name,fields.Key Product Name,fields.Keywords,fields.Refresh Days,domain
3,rec5nPLxKw5vzPAgP,2024-10-02T16:56:55.000Z,"{""name"":""TBD"",""description"":""TBD is Block\u201...",TBD,https://tbd.website/,Approved,2024-10-02T16:56:55.000Z,2024-10-02T20:34:40.000Z,usrlUO0c7FWWC9lIZ,keith.trnka@gmail.com,Keith Trnka,usrlUO0c7FWWC9lIZ,keith.trnka@gmail.com,Keith Trnka,,,,tbd.website
5,rec7STu2pyWvo9cIp,2024-10-02T16:56:58.000Z,"{""name"":""Veritone"",""description"":""Veritone has...",Veritone,http://www.veritone.com/,Approved,2024-10-02T16:56:58.000Z,2024-10-07T21:34:58.000Z,usrlUO0c7FWWC9lIZ,keith.trnka@gmail.com,Keith Trnka,usrlUO0c7FWWC9lIZ,keith.trnka@gmail.com,Keith Trnka,,,,veritone.com
6,rec7Xm4DXzYQ9uoeb,2024-10-02T16:56:56.000Z,"{""name"":""DoubleDown Interactive LLC"",""descript...",DoubleDown Interactive,https://www.doubledowninteractive.com/jobs/,Approved,2024-10-02T16:56:56.000Z,2024-10-04T19:39:49.000Z,usrlUO0c7FWWC9lIZ,keith.trnka@gmail.com,Keith Trnka,usrlUO0c7FWWC9lIZ,keith.trnka@gmail.com,Keith Trnka,,,,doubledowninteractive.com
7,rec7zrOPWtNbHBjUp,2024-10-08T18:11:30.000Z,,Sourcegraph,https://sourcegraph.com/,Approved,2024-10-08T18:11:30.000Z,2024-10-10T15:02:54.000Z,usrPAGESHARE00000,anonymous+formpage@noreply.airtable.com,Anonymous,usrlUO0c7FWWC9lIZ,keith.trnka@gmail.com,Keith Trnka,,,,sourcegraph.com
8,rec8VeVqBxJsUpNaD,2024-10-02T16:56:56.000Z,"{""name"":""ThousandEyes (part of Cisco)"",""descri...",ThousandEyes,http://www.thousandeyes.com,Approved,2024-10-02T16:56:56.000Z,2024-10-08T18:31:51.000Z,usrlUO0c7FWWC9lIZ,keith.trnka@gmail.com,Keith Trnka,usrlUO0c7FWWC9lIZ,keith.trnka@gmail.com,Keith Trnka,,,,thousandeyes.com


In [3]:
# Order companies from most to least recently modified, then display the
# name and timestamp of the ten newest entries as the cell output.
sorted_companies = companies.sort_values("fields.Last Modified", ascending=False)
sorted_companies.loc[:, ["fields.Name", "fields.Last Modified"]].head(10)

Unnamed: 0,fields.Name,fields.Last Modified
80,Optimize Health,2024-10-14T19:21:57.000Z
33,The Coalition,2024-10-11T16:03:02.000Z
7,Sourcegraph,2024-10-10T15:02:54.000Z
48,Roam,2024-10-10T15:02:51.000Z
65,Sudowrite,2024-10-10T15:02:35.000Z
30,promptfoo,2024-10-10T15:02:34.000Z
81,Pave.dev,2024-10-10T15:02:31.000Z
74,Disney,2024-10-08T18:32:09.000Z
8,ThousandEyes,2024-10-08T18:31:51.000Z
47,Verkada Inc,2024-10-07T21:38:25.000Z


In [4]:
from datetime import datetime, timedelta
import os
from typing import Optional

def get_file_age(file_path: str) -> Optional[timedelta]:
    """Return how long ago ``file_path`` was last modified.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        The time elapsed since the file's modification time, or ``None`` when
        the file does not exist or cannot be stat'ed.
    """
    # Imported locally so this function does not rely on another import cell.
    import time

    try:
        # Stat once instead of exists()-then-getmtime(): the file may disappear
        # between the two calls, which would otherwise raise.
        modified_at = os.path.getmtime(file_path)
    except (OSError, ValueError):
        # The same failures for which os.path.exists() reports False.
        return None

    # Subtract epoch seconds rather than naive local datetimes; the latter are
    # off by an hour when a daylight-saving transition falls in between.
    return timedelta(seconds=time.time() - modified_at)

# Company names listed here are matched by the rebuild check in this notebook.
# NOTE: It does exact matching on the company name
# Built with set([...]) because empty braces ({}) would create a dict, not a
# set, whenever every entry below is commented out.
force_rebuild_companies = set([
    # "Sudowrite",
])

def should_rebuild(target: Seed, file_path: str, max_age: timedelta = timedelta(days=7)) -> bool:
    """Decide whether the output file for ``target`` needs to be regenerated.

    Args:
        target: Seed whose ``company`` is checked against
            ``force_rebuild_companies``.
        file_path: Path of the previously generated output file.
        max_age: Oldest the file may be before it counts as stale.

    Returns:
        True when the company is force-listed, the file age is unavailable
        (``get_file_age`` returned ``None``), or the file is older than
        ``max_age``; False otherwise.
    """
    if target.company not in force_rebuild_companies:
        # Not forced: fall back to the age of the existing output, if any.
        current_age = get_file_age(file_path)
        return current_age is None or current_age > max_age

    return True


In [5]:
from loguru import logger

import unified

data_folder = "../output/data"

for _, row in companies.sort_values("fields.Name").iterrows():
    target = airtable.row_to_seed(row)
    output_json = f"{data_folder}/{target.as_path_v2()}.json"

    # NOTE: Without the dropna, it returns NaN which then causes the timedelta to fail
    refresh_days = row.dropna().get("fields.Refresh Days", 21)

    if should_rebuild(target, output_json, max_age=timedelta(days=refresh_days)):
        logger.info(f"Building {output_json}...")

        try:
            unified_result = await unified.run(
                target, 
                # TODO: Allow some customization of these parameters
                num_reddit_threads=10, 
                max_glassdoor_review_pages=5, 
                max_glassdoor_job_pages=0,
                max_news_articles=20,
                )
            
            with open(output_json, 'w') as json_file:
                json_file.write(unified_result.model_dump_json(indent=2))
        except IndexError as e:
            logger.error(f"Error, skipping {target.company}: {e}")



[32m2024-10-14 12:46:30.499[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m15[0m - [1mBuilding ../output/data/Optimize_Health.json...[0m
[32m2024-10-14 12:46:31.062[0m | [1mINFO    [0m | [36mdata_sources.company_webpage[0m:[36mrun[0m:[36m53[0m - [1msearch took 0.6 seconds[0m
[32m2024-10-14 12:46:31.331[0m | [1mINFO    [0m | [36mdata_sources.company_webpage[0m:[36mrun[0m:[36m56[0m - [1mscrape took 0.3 seconds[0m
[32m2024-10-14 12:46:34.666[0m | [1mINFO    [0m | [36mdata_sources.company_webpage[0m:[36mrun[0m:[36m60[0m - [1mparse took 3.3 seconds[0m
[32m2024-10-14 12:46:34.868[0m | [1mINFO    [0m | [36mdata_sources.company_webpage[0m:[36mrun[0m:[36m66[0m - [1msummarize took 0.2 seconds[0m
[32m2024-10-14 12:46:34.869[0m | [1mINFO    [0m | [36mdata_sources.company_webpage[0m:[36mrun[0m:[36m77[0m - [1m140,009 -> 4,061 chars (3%) [0m
[32m2024-10-14 12:46:34.879[0m | [1mINFO    [0m | [36mdata_sources.compan