# Companies House Public Data API Project Using PySpark

In [33]:
# for making HTTP requests to the Companies House API
import requests
# for working with JSON data (parsing API responses, encoding addresses)
import json
# for adding delays between API calls to avoid rate limiting
import time
# for writing data to a csv file in a structured format
import csv
# for interacting with the operating system (managing file paths or environment variables)
import os

import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [19]:
def load_api_key(filepath="config.txt"):
    """
    Load an API key from a config file.

    The file must contain a line like: API_KEY=your-key-here
    The first line that starts with "API_KEY=" wins; other lines are ignored.

    Args:
        filepath: path of the config file to read.

    Returns:
        Everything after the first "=" on the matching line, with the
        line's leading/trailing whitespace (including the newline) removed.

    Raises:
        ValueError: if no line in the file starts with "API_KEY=".
        OSError: if the file cannot be opened.
    """
    with open(filepath, "r") as file:
        for line in file:
            if line.startswith("API_KEY="):
                # split on the first "=" only, so a key that itself contains
                # "=" (e.g. base64 padding) is returned in full
                return line.strip().split("=", 1)[1]
    raise ValueError("API_KEY not found in config file.")

In [20]:
# load the API key from the config file instead of hardcoding it in the script;
# this runs as soon as the cell/module executes, so the config file must be readable
API_KEY = load_api_key()
# root URL that every Companies House endpoint path is appended to
BASE_URL = "https://api.company-information.service.gov.uk"
# request header telling the API that the client expects JSON responses
HEADERS = {"Accept": "application/json"}
# CSV file that the fetched company profile rows are written to
OUTPUT_FILE = "company_profiles.csv"
# pause (in seconds) passed to time.sleep between companies to avoid hitting rate limits
DELAY_BETWEEN_REQUESTS = 1

In [34]:
def get_ftse100_from_lse(headless=True, save_to_csv=False):
    """
    Scrape the FTSE 100 constituents table from the London Stock Exchange site.

    Drives a Chrome browser through the paginated table, reading the first
    two cells of every table row as (name, ticker).

    Args:
        headless: when True, Chrome is started with the "--headless" flag.
        save_to_csv: when True, the collected pairs are also written to
            "ftse100_lse.csv" with columns "Company" and "Ticker".

    Returns:
        A list of (name, ticker) tuples in the order they were read. The list
        may be shorter than expected if pagination fails part-way through.

    Raises:
        Whatever Selenium raises when the page cannot be loaded or the table
        does not appear within the 10 second wait; the browser is closed
        before the exception propagates.
    """
    url = "https://www.londonstockexchange.com/indices/ftse-100/constituents/table"
    page_count = 5  # 5 pages, 20 companies each
    row_selector = "table tbody tr"

    # Setup ChromeDriver
    options = Options()
    if headless:
        options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")

    driver = webdriver.Chrome(options=options)
    companies = []

    # everything that touches the browser lives inside try/finally so the
    # Chrome process is shut down even when a wait times out
    try:
        driver.get(url)
        wait = WebDriverWait(driver, 10)

        print("Loading FTSE 100 companies from all pages...")

        for page in range(page_count):
            # Wait for table to be present
            wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, row_selector)))
            time.sleep(2)  # slight buffer so the rows can finish rendering

            for row in driver.find_elements(By.CSS_SELECTOR, row_selector):
                try:
                    cells = row.find_elements(By.TAG_NAME, "td")
                    name = cells[0].text.strip()
                    ticker = cells[1].text.strip()
                    companies.append((name, ticker))
                except Exception as e:
                    # best effort: a malformed or stale row is reported and skipped
                    print(f"[WARN] Skipped row: {e}")

            # try clicking the "Next" pagination button if not on last page
            if page < page_count - 1:
                try:
                    next_button = wait.until(
                        EC.element_to_be_clickable((By.CSS_SELECTOR, "button.pagination__btn--next"))
                    )
                    next_button.click()
                except Exception as e:
                    # keep what has been collected so far rather than failing outright
                    print(f"[ERROR] Failed to click 'Next' on page {page+1}: {e}")
                    break
    finally:
        driver.quit()

    if save_to_csv:
        pd.DataFrame(companies, columns=["Company", "Ticker"]).to_csv("ftse100_lse.csv", index=False)
        print("Saved all FTSE 100 companies to ftse100_lse.csv")

    return companies

In [None]:
if __name__ == "__main__":
    # scrape the constituents and also persist them to ftse100_lse.csv
    ftse_companies = get_ftse100_from_lse(save_to_csv=True)
    print(f"Retrieved {len(ftse_companies)} FTSE 100 companies.")
    # preview only the first 40 (name, ticker) pairs
    for name, ticker in ftse_companies[:40]:
        print(f"- {name} ({ticker})")

In [22]:
# fetches the core company profile from the Companies House API
def fetch_company_profile(company_number, timeout=10):
    """
    Fetch the main profile record for one company.

    Args:
        company_number: company number inserted into the /company/{number} URL.
        timeout: seconds to wait for the server before giving up; without a
            timeout a stalled connection would block the whole run.

    Returns:
        The decoded JSON body on HTTP 200, otherwise None. Failures are
        printed rather than raised so a caller looping over many companies
        can carry on.
    """
    # construct the API URL for the company's main profile
    url = f"{BASE_URL}/company/{company_number}"
    try:
        # send GET request with basic auth (API key as username, empty password)
        response = requests.get(url, auth=(API_KEY, ""), headers=HEADERS, timeout=timeout)
        if response.status_code != 200:
            # log status code if not successful
            print(f"[{response.status_code}] Failed to fetch {company_number}")
            return None
        # return JSON data if successful
        return response.json()
    except (requests.RequestException, ValueError) as e:
        # connection/timeout errors, or a body that is not valid JSON
        print(f"[ERROR] {company_number}: {str(e)}")
        return None

# retrieves a list of up to 3 officer names for the company
def fetch_officers(company_number, timeout=10):
    """
    Fetch officer names from the company's /officers endpoint.

    Only the first 3 entries of the response's "items" list are considered,
    and entries without a name are dropped, so fewer than 3 names may be
    returned.

    Args:
        company_number: company number inserted into the request URL.
        timeout: seconds to wait for the server before giving up.

    Returns:
        A list of officer names as strings; an empty list if the request
        fails, the status is not 200, or there are no items.
    """
    url = f"{BASE_URL}/company/{company_number}/officers"
    try:
        response = requests.get(url, auth=(API_KEY, ""), headers=HEADERS, timeout=timeout)
        if response.status_code != 200:
            return []
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        # network failure or non-JSON body: report it and treat as "no data"
        # so one bad lookup does not abort the whole run
        print(f"[ERROR] {company_number} officers: {str(e)}")
        return []
    # limit to first 3 officers; "or []" also covers an explicit null
    officers = (data.get("items") or [])[:3]
    # return only officer names, filtering out any missing/empty ones
    return [str(officer["name"]) for officer in officers if officer.get("name")]

# retrieves up to 5 filing history entries
def fetch_filing_history(company_number, timeout=10):
    """
    Fetch a short summary of the company's filing history.

    Takes the first 5 entries of the response's "items" list.
    NOTE(review): these are presumably the most recent filings, which
    depends on the API's ordering — confirm.

    Args:
        company_number: company number inserted into the request URL.
        timeout: seconds to wait for the server before giving up.

    Returns:
        A list of strings formatted "<type> on <date>", with "UNKNOWN"
        substituted for a missing field; an empty list if the request fails,
        the status is not 200, or there are no items.
    """
    url = f"{BASE_URL}/company/{company_number}/filing-history"
    try:
        response = requests.get(url, auth=(API_KEY, ""), headers=HEADERS, timeout=timeout)
        if response.status_code != 200:
            return []
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        # network failure or non-JSON body: report it and treat as "no data"
        print(f"[ERROR] {company_number} filing-history: {str(e)}")
        return []
    # limit to 5 filings; "or []" also covers an explicit null
    filings = (data.get("items") or [])[:5]
    # format: "filing_type on date"
    return [
        f"{filing.get('type', 'UNKNOWN')} on {filing.get('date', 'UNKNOWN')}"
        for filing in filings
    ]

# retrieves up to 3 charge entries (e.g. mortgage or secured lending)
def fetch_charges(company_number, timeout=10):
    """
    Fetch charge codes from the company's /charges endpoint.

    Only the first 3 entries of the response's "items" list are considered,
    and entries without a "charge_code" are dropped.

    Args:
        company_number: company number inserted into the request URL.
        timeout: seconds to wait for the server before giving up.

    Returns:
        A list of charge codes as strings; an empty list if the request
        fails, the status is not 200, or there are no items.
    """
    url = f"{BASE_URL}/company/{company_number}/charges"
    try:
        response = requests.get(url, auth=(API_KEY, ""), headers=HEADERS, timeout=timeout)
        if response.status_code != 200:
            return []
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        # network failure or non-JSON body: report it and treat as "no data"
        print(f"[ERROR] {company_number} charges: {str(e)}")
        return []
    # limit to 3 charges; "or []" also covers an explicit null
    charges = (data.get("items") or [])[:3]
    # get charge codes, safely convert to string
    return [str(charge["charge_code"]) for charge in charges if charge.get("charge_code")]

# retrieves up to 3 names of Persons with Significant Control (PSC)
def fetch_psc(company_number, timeout=10):
    """
    Fetch PSC names from the company's /persons-with-significant-control endpoint.

    Only the first 3 entries of the response's "items" list are considered,
    and entries without a name are dropped.

    Args:
        company_number: company number inserted into the request URL.
        timeout: seconds to wait for the server before giving up.

    Returns:
        A list of PSC names as strings; an empty list if the request fails,
        the status is not 200, or there are no items.
    """
    url = f"{BASE_URL}/company/{company_number}/persons-with-significant-control"
    try:
        response = requests.get(url, auth=(API_KEY, ""), headers=HEADERS, timeout=timeout)
        if response.status_code != 200:
            return []
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        # network failure or non-JSON body: report it and treat as "no data"
        print(f"[ERROR] {company_number} psc: {str(e)}")
        return []
    # limit to 3 PSC entries; "or []" also covers an explicit null
    pscs = (data.get("items") or [])[:3]
    # return PSC names if present
    return [str(psc["name"]) for psc in pscs if psc.get("name")]

In [23]:
def _build_company_record(number, data):
    """Flatten one profile response plus the enrichment lookups into a single CSV row dict."""
    # "or {}" guards against the API sending an explicit null for a nested
    # object, which would otherwise break the chained .get() calls
    accounts = data.get("accounts") or {}
    last_accounts = accounts.get("last_accounts") or {}
    confirmation = data.get("confirmation_statement") or {}

    return {
        # basic company information from the profile
        "company_number": data.get("company_number"),
        "company_name": data.get("company_name"),
        "status": data.get("company_status"),
        "date_of_creation": data.get("date_of_creation"),
        "type": data.get("type"),
        "company_category": data.get("company_category"),
        "jurisdiction": data.get("jurisdiction"),

        # financial reporting data
        "last_accounts_date": last_accounts.get("made_up_to"),
        "next_accounts_due": accounts.get("next_due"),

        # latest confirmation statement date
        "confirmation_statement_date": confirmation.get("last_made_up_to"),

        # whether the company has a history of insolvency
        "has_insolvency_history": data.get("has_insolvency_history"),

        # keep the nested address as a JSON string so it fits in one CSV cell
        "registered_office_address": json.dumps(data.get("registered_office_address", {})),

        # list of SIC (Standard Industrial Classification) codes, joined into one string
        "sic_codes": ",".join(data.get("sic_codes") or []),

        # enriched data from additional endpoints
        "officers": "; ".join(fetch_officers(number)),  # up to 3 officers
        "recent_filings": "; ".join(fetch_filing_history(number)),  # up to 5 filings
        "charges": "; ".join(fetch_charges(number)),  # up to 3 charge codes
        "psc": "; ".join(fetch_psc(number)),  # up to 3 persons with significant control
    }


def _save_results(results):
    """Rewrite OUTPUT_FILE from scratch with a header row and every record collected so far."""
    # column headers come from the first record; every record has the same keys
    fieldnames = results[0].keys() if results else []
    with open(OUTPUT_FILE, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(results)


def main():
    """
    Fetch and enrich the profile of every company in COMPANY_NUMBERS and save them to CSV.

    The CSV is rewritten after each company, so an interrupted run still
    leaves the rows collected so far on disk. Companies whose profile cannot
    be fetched are skipped.

    NOTE(review): COMPANY_NUMBERS is not defined in the visible part of this
    file — it must exist at module level before main() is called.
    """
    # print the number of companies to be processed
    print(f"Fetching data for {len(COMPANY_NUMBERS)} companies...")

    # one dict per successfully fetched company
    results = []

    for number in COMPANY_NUMBERS:
        # fetch the company's profile (core metadata)
        data = fetch_company_profile(number)

        # if data was successfully retrieved, enrich and store it
        if data:
            results.append(_build_company_record(number, data))

        # wait between companies to avoid hitting rate limits
        time.sleep(DELAY_BETWEEN_REQUESTS)

        # checkpoint the current state of the results after each company
        _save_results(results)

        # log the progress
        print(f"Saved {len(results)} company profiles to {OUTPUT_FILE}")

# entry point of the script - runs the full fetch-and-save pipeline only when
# the file is executed directly, not when it is imported as a module
if __name__ == "__main__":
    main()

Fetching data for 20 companies...
Saved 1 company profiles to company_profiles.csv
Saved 2 company profiles to company_profiles.csv
Saved 3 company profiles to company_profiles.csv
Saved 4 company profiles to company_profiles.csv
Saved 5 company profiles to company_profiles.csv
Saved 6 company profiles to company_profiles.csv
Saved 7 company profiles to company_profiles.csv
Saved 8 company profiles to company_profiles.csv
Saved 9 company profiles to company_profiles.csv
Saved 10 company profiles to company_profiles.csv
Saved 11 company profiles to company_profiles.csv
Saved 12 company profiles to company_profiles.csv
Saved 13 company profiles to company_profiles.csv
Saved 14 company profiles to company_profiles.csv
Saved 15 company profiles to company_profiles.csv
Saved 16 company profiles to company_profiles.csv
Saved 17 company profiles to company_profiles.csv
Saved 18 company profiles to company_profiles.csv
Saved 19 company profiles to company_profiles.csv
Saved 20 company profiles