In [1]:
from dotenv import load_dotenv
import os

# Pull variables from a local .env file into the process environment,
# then read the OpenAI key from there.
load_dotenv()
api_key = os.getenv('OPENAI_API_KEY')

# The key is accepted only if it is non-empty and starts with 'sk-';
# anything else (missing, empty, different prefix) is reported as a problem.
if not api_key or not api_key.startswith('sk-'):
    print("Check your .env file")
else:
    print("API key loaded successfully")

API key loaded successfully


In [2]:
pip install openai beautifulsoup4 requests python-dotenv tldextract

Note: you may need to restart the kernel to use updated packages.


In [5]:
"""
Test for web_scraper.

Builds a Website for a single URL and prints whatever its summary() returns.
"""
from src.web_scraper import Website

# Just an example from a startup!
url = "https://www.sapien.io/"
site = Website(url)

site_summary = site.summary()
print(site_summary)

{'url': 'https://www.sapien.io/', 'title': 'Fine-tune LLM Models With Human Feedback & Data Labelling | Sapien', 'num_links': 32, 'text_preview': "Check out the Sapien's Guide to AI in 2025\nRead Now >\nSapien.io: A Decentralized Data Foundry Raises $10.5M,\nRead More\nSchedule a Consult\nLanguage\nLanguage\nSchedule a Consult\nTrain AI with Expert Human Feedback\nAccuracy. Scalability. Expertise.\nCustom data collection and labeling services, powered b..."}


In [6]:
"""
Test for prompt_engine.

Builds a Website for one URL and passes its url and links attributes to
classify_links, then prints the result.
"""
from src.web_scraper import Website
from src.prompt_engine import classify_links

# Scrape the website
homepage_url = "https://www.sapien.io/"
site = Website(homepage_url)

# Filter the links
filtered = classify_links(site.url, site.links)

print(filtered)

{'links': [{'type': 'about', 'url': 'https://www.sapien.io/'}, {'type': 'blog', 'url': 'https://www.sapien.io/blog/ai-in-2025-navigating-the-future-of-innovation-with-sapiens-guide'}, {'type': 'blog', 'url': 'https://www.sapien.io/blog/sapien-io-a-decentralized-data-foundry-raises-10-5m'}, {'type': 'careers', 'url': 'https://wellfound.com/jobs/3042964-demand-generation-specialist?utm_campaign=startup_share&utm_content=startup_share_module&utm_medium=social&utm_term=sapien-io'}, {'type': 'careers', 'url': 'https://wellfound.com/jobs/3034338-junior-software-engineer?utm_campaign=startup_share&utm_content=startup_share_module&utm_medium=social&utm_term=sapien-io'}, {'type': 'careers', 'url': 'https://wellfound.com/jobs/3034337-senior-software-engineer-web3?utm_campaign=startup_share&utm_content=startup_share_module&utm_medium=social&utm_term=sapien-io'}]}


In [7]:
pip install google-search-results

Note: you may need to restart the kernel to use updated packages.


In [8]:
'''
Test for the TechCrunch scraper.

Calls find_techcrunch_articles("PostHog") and prints what it returns. When the
result is non-empty, the first entry's "url" is passed to
scrape_techcrunch_article, and the returned title, date and the first 500
characters of the text are printed.
'''
from src.techcrunch_scraper import find_techcrunch_articles, scrape_techcrunch_article

articles = find_techcrunch_articles("PostHog")
print("Found articles:\n", articles)

# Get info for first article; an empty result skips the scrape entirely.
if articles:
    first_url = articles[0]["url"]
    full_article = scrape_techcrunch_article(first_url)
    print("\n--- FULL ARTICLE ---")
    # Title and date are printed on their own lines, in this order.
    for field in ("title", "date"):
        print(full_article[field])
    print(full_article["text"][:500], "...")

Found articles:
 [{'title': "All the companies from Y Combinator's W20 Demo Day ...", 'url': 'https://techcrunch.com/2020/03/17/all-the-companies-from-y-combinators-w20-demo-day-part-iii-hardware-robots-ai-and-developer-tools/', 'snippet': "PostHog: PostHog is a software service that lets developers understand how their users are actually working with their products. It's a ..."}, {'title': 'Tyson Clark, a general partner with GV, has passed away', 'url': 'https://techcrunch.com/2021/12/09/tyson-clark-a-general-partner-with-gv-has-passed-away/', 'snippet': 'In December of last year, for example, he led a Series A round for PostHog, a product analytics platform whose founder he had not met in ...'}]

--- FULL ARTICLE ---
All the companies from Y Combinator’s W20 Demo Day, Part III: Hardware, Robots, AI and Developer Tools
2020-03-17T12:43:46-07:00
 ...


In [None]:
"""
Test for analyzer.

Runs collect_all_details once and prints a truncated preview of the homepage,
of every entry under 'subpages', and of every entry under
"techcrunch_articles" in the returned structure.
"""
from src.analyzer import collect_all_details
from IPython.display import Markdown, display  # not referenced in this cell

# NOTE(review): the report-generation cell passes "Scale AI" (with a space) —
# confirm which spelling is intended here.
summary_data = collect_all_details("ScaleAI", "https://scale.com/")

# Preview a portion of each section
print(f"Company: {summary_data['company_name']}")
homepage = summary_data['homepage']
print(f"Homepage title: {homepage['title']}")
print(f"Homepage preview: {homepage['text'][:300]}...\n")

# One header line plus the first 300 characters of text per subpage.
for page in summary_data['subpages']:
    page_kind = page['type'].upper()
    print(f"--- {page_kind} PAGE ---")
    print(page['text'][:300], "...\n")

# Title, date and the first 500 characters of text per article.
for article in summary_data["techcrunch_articles"]:
    print(f"\n {article['title']} ({article['date']})\n{article['text'][:500]}...\n")

In [14]:
from src.analyzer import collect_all_details
from src.prompt_engine import generate_markdown_brief
from IPython.display import Markdown, display

# Get the company data, then generate the report from it.
summary = collect_all_details("Scale AI", "https://scale.com/")
markdown_report = generate_markdown_brief(summary)

# Wrap the report in a Markdown object so the notebook renders it
# rather than showing the raw text.
rendered_report = Markdown(markdown_report)
display(rendered_report)

# Scale AI Company Report

## Company Snapshot
**Company Name**: Scale AI  
**Headquarters**: San Francisco, CA  
**Founded**: 2016  
**Founders**: Alexandr Wang, Lucy Guo  
**Employees**: Approximately 900  
**Latest Funding**: $1.35 billion  

## Product & Technology
Scale AI specializes in providing a comprehensive data-centric solution for managing the entire machine learning (ML) lifecycle. Their flagship offerings include:

- **Scale Data Engine**: A platform for collecting, curating, and annotating data. This engine is trusted by leading machine learning teams to accelerate model development through high-quality labeling and cost-effective optimization of data spend.
  
- **GenAI Platform**: Allows businesses to build, test, and optimize generative AI applications. It utilizes advanced retrieval-augmented generation (RAG) pipelines to enhance domain-specific LLM performance and improve productivity through custom copilots for employees.

- **SEAL (Safety, Evaluations, and Alignment Lab)**: A research initiative aimed at improving model capabilities through rigorous evaluations and research on AI safety and alignment.

## Market Opportunity and Traction
Scale AI is positioned at the intersection of significant growth trends in AI and machine learning application deployment. They serve diverse sectors, including government, private enterprises, and generative AI companies, indicating a broad market opportunity. 

The demand for high-quality labeled datasets continues to grow in tandem with the acceleration in AI adoption across industries, and Scale AI's operational capabilities uniquely enable them to meet this demand. The firm has annotated a staggering 13 billion data points to date, with 87 million of those pertaining to generative AI, reflecting strong traction and a robust client base. 

Recent mentions in the media, such as their initiatives to influence policymakers on AI regulations and the unveiling of the 'Defense Llama' model for national security, underscore their strategic importance and visibility in the AI ecosystem.

## Competitive Landscape
Scale AI operates in a competitive landscape characterized by other data labeling and AI service providers, including companies like Appen, Labelbox, and AWS SageMaker Ground Truth. While these competitors offer similar services, Scale AI differentiates itself with its strong focus on quality, scalability, and expertise, along with a robust platform that integrates end-to-end solutions for customers across various sectors. 

Moreover, their continuous investment in research and development through initiatives like SEAL positions them as thought leaders in AI safety and model evaluation, which can provide them a competitive edge in gaining customer trust.

## Investment View
- **Base Case**: Scale AI embodies a strong growth potential underpinned by increasing reliance on AI technologies across businesses and government. Its end-to-end solutions and significant dataset generation capabilities position it favorably in a growing market.

- **Upside Drivers**: 
  - Expanding partnerships with government entities and large enterprises can accelerate growth.
  - Continuous advancements in their data engine and GenAI platform can lead to enhancements in model performance and customer satisfaction.

- **Key Risks**: 
  - The ever-evolving regulatory environment around AI could impose constraints on operations.
  - Rising competition from both established tech giants and emerging startups might pressure pricing and margins.

- **Milestones to Watch**: 
  - Updates on government contracts or partnerships, particularly with the U.S. government and defense sectors.
  - Announcements related to product advancements or new customer acquisitions.

## Bottom Line
Scale AI has established itself as a leader in the data-centric AI landscape, enabling a diverse set of enterprises and government agencies to unlock the true value of their data. With a strong foundation in quality and scalability, alongside its innovations in generative AI, Scale AI is well-positioned for continued growth. An investment in Scale AI appears justified given its market position and the critical role it plays in the AI development ecosystem. However, investors should keep a vigilant eye on competitive pressures and regulatory developments that may affect its operational landscape.