In [2]:
import requests

def fetch_html(url: str, timeout: float = 30.0) -> bytes:
    """Download a page and return its raw (undecoded) body.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled connection.

    Returns:
        The response body as ``bytes`` (``response.content``). The parsers in
        this notebook hand it straight to BeautifulSoup.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on error pages instead of passing their HTML to the parsers.
    response.raise_for_status()
    return response.content

# Fetch the company directory page once; later cells parse this HTML.
company_list_html = fetch_html('https://www.builtinseattle.com/companies')
# Show only a short preview: the full page would flood the cell output.
company_list_html[:500]

b'<!DOCTYPE html>\n<html lang="en">\n<head>\n    <meta charset="utf-8" />\n    <meta name="viewport" content="width=device-width, initial-scale=1.0" />\n    <meta name="description" content="See the complete list of Seattle, WA technology companies, many of which are hiring now. See company benefits, info, interviews and more at Built In Seattle." />\n        <meta name="robots" content="INDEX,FOLLOW" />\n    <title>Top Tech Companies in Seattle, WA 2024 | Built In Seattle</title>\n    <link rel="icon" type="image/x-icon" href="/companies/images/favicon.png">\n        <link rel="canonical" href="https://www.builtinseattle.com/companies" />\n    <link rel="dns-prefetch" href="https://www.googletagmanager.com/">\n    <link rel="preload" href="https://static.builtin.com/dist/fonts/barlow-condensed-v12-latin-500.woff2" as="font" type="font/woff2" crossorigin>\n    <link rel="preload" href="https://static.builtin.com/dist/fonts/barlow-condensed-v12-latin-regular.woff2" as="font" type="font/

In [14]:
from typing import List, Dict
from bs4 import BeautifulSoup


def _text_or_default(tag, default: str = 'N/A') -> str:
    """Return the stripped text of ``tag``, or ``default`` when the tag is missing."""
    return tag.text.strip() if tag is not None else default


def parse_companies(html: str) -> List[Dict[str, str]]:
    """Extract one record per company from the company listing page.

    Args:
        html: Markup of the listing page.

    Returns:
        A list of dicts with the keys ``name``, ``description``, ``link``,
        ``industry`` and ``size``. Optional fields that cannot be located are
        reported as ``'N/A'``. Rows without a company name are skipped. An
        empty list is returned when the page has no ``#main-container``.
    """
    soup = BeautifulSoup(html, 'html.parser')

    company_list_section = soup.select_one('#main-container')
    if company_list_section is None:
        # Empty page or changed layout: there is nothing to parse.
        return []

    companies = []
    for row in company_list_section.select('div.company-unbounded-responsive'):
        name_tag = row.select_one('div:nth-of-type(2) > div:nth-of-type(1) > a:nth-of-type(2) > h2')
        if name_tag is None:
            # A row without a name is unusable; skip it rather than abort
            # the whole parse on one non-conforming card.
            continue
        name = name_tag.text.strip()

        # The heading sits inside the anchor that links to the detail page.
        link_tag = name_tag.find_parent('a')
        link = link_tag.get('href', 'N/A') if link_tag is not None else 'N/A'

        description = _text_or_default(row.select_one('div:nth-of-type(2) > p'))
        industry = _text_or_default(row.select_one('div:nth-of-type(2) > div:nth-of-type(2)'))
        size = _text_or_default(
            row.select_one('div:nth-of-type(2) > div:nth-of-type(3) > div:nth-of-type(2) > span')
        )

        companies.append({'name': name, 'description': description, 'link': link, 'industry': industry, 'size': size})

    return companies

from pprint import pprint

# Parse the listing page fetched earlier and pretty-print every extracted record.
companies = parse_companies(html=company_list_html)
pprint(companies)

[{'description': '2K is headquartered in Novato, California and is a wholly '
                 'owned label of Take-Two Interactive Software, Inc. (NASDAQ: '
                 'TTWO). Founded in 2005, 2K Games is a global video game '
                 'company, publishing titles developed by some of the most '
                 'influential game development studios in the world. Our '
                 'studios responsible for developing 2K’s portfolio of '
                 'world-class games across multiple platforms, include Visual '
                 'Concepts, Firaxis, Hangar 13, CatDaddy, Cloud Chamber, 31st '
                 'Union, and HB Studios. Our portfolio of titles is expanding '
                 'due to our global strategic plan, building and acquiring '
                 'exciting studios whose content continues to inspire all of '
                 'us! 2K publishes titles in today’s most popular gaming '
                 'genres, including sports, shooters, action, role-pla

In [10]:
# Download a single company's detail page to use as input for the detail parser.
detail_url = 'https://www.builtinseattle.com/company/2k'
company_html = fetch_html(url=detail_url)

In [15]:
import re
def clean_text(text: str) -> str:
    """Collapse every run of whitespace into one space and trim both ends.

    Mainly needed for the employee count, whose text is spread over several
    lines and padded with indentation.
    """
    return re.sub(r'\s+', ' ', text).strip()

def parse_company_details(html: str) -> Dict[str, object]:
    """Extract the headline facts and open-role counts from a company detail page.

    Args:
        html: Markup of a single company's detail page.

    Returns:
        A dict with the string fields ``name``, ``location``, ``employees``,
        ``founded`` and ``website``, plus ``hiring_by_category``, which maps
        each job category name to ``{'num_roles': ..., 'link': ...}``.
        String fields that cannot be located are reported as ``'N/A'``;
        ``hiring_by_category`` is empty when the page has no hiring card.
    """
    soup = BeautifulSoup(html, 'html.parser')

    def text_of(tag, default='N/A'):
        # Stripped text of a tag, or the placeholder when the tag is absent.
        return tag.text.strip() if tag is not None else default

    # The header part with the name
    name = text_of(soup.select_one('div.card-head > h1'))

    # The grid part with basic company info
    location = employees = founded = website = 'N/A'
    company_info = soup.select_one('div.company-info')
    if company_info is not None:
        location = text_of(company_info.select_one('div:nth-of-type(1)'))

        employees_tag = company_info.select_one('div:nth-of-type(2)')
        if employees_tag is not None:
            # The employee count is spread over several padded lines.
            employees = clean_text(employees_tag.text)

        founded = text_of(company_info.select_one('div:nth-of-type(3)'))

        website_tag = company_info.select_one('div:nth-of-type(4) > a')
        if website_tag is not None:
            website = website_tag.get('href', 'N/A').strip()

    # The we-are-hiring part
    hiring_by_category = {}
    hiring_card = soup.select_one('#we-are-hiring')
    if hiring_card is not None:
        for category_row in hiring_card.select('a'):
            category_tag = category_row.select_one('span:nth-of-type(1)')
            roles_tag = category_row.select_one('span:nth-of-type(2) > b')
            link = category_row.get('href')
            if category_tag is None or roles_tag is None or link is None:
                # Links that don't have the span structure (or a target) in there
                continue

            hiring_by_category[category_tag.text.strip()] = {
                'num_roles': roles_tag.text.strip(),
                'link': link,
            }

    # Info that isn't parsed:
    # The description from the top and industries
    # The benefits section (note: It's shortened on the main page, need to go to another page to get the full list)

    return {'name': name, 'location': location, 'employees': employees, 'founded': founded, 'website': website, 'hiring_by_category': hiring_by_category}

# Parse the detail page fetched above and pretty-print the resulting dict.
pprint(parse_company_details(company_html))

{'employees': '3,505 Total Employees',
 'founded': 'Year Founded: 2005',
 'hiring_by_category': {'Content': {'link': '/jobs/content?companyId=177680&allLocations=true',
                                    'num_roles': '1'},
                        'Design + UX': {'link': '/jobs/design-ux?companyId=177680&allLocations=true',
                                        'num_roles': '1'},
                        'Developer + Engineer': {'link': '/jobs/dev-engineering?companyId=177680&allLocations=true',
                                                 'num_roles': '1'},
                        'Operations': {'link': '/jobs/operations?companyId=177680&allLocations=true',
                                       'num_roles': '1'},
                        'Product': {'link': '/jobs/product?companyId=177680&allLocations=true',
                                    'num_roles': '1'}},
 'location': 'Kirkland, WA',
 'name': '2K',
 'website': 'https://2k.com/en-us/jobs/'}
