# Import libraries

In [1]:
import sqlite3
import requests
import time
import os
import time
import ollama
from dotenv import load_dotenv
import subprocess
import signal
from datetime import datetime
import json

# Database

In [2]:
# Open the SQLite database file, creating it if it does not exist yet
conn = sqlite3.connect(database="seq_data.db")
cur = conn.cursor()

# Switch on foreign-key enforcement for this connection
cur.execute("PRAGMA foreign_keys = ON;")

<sqlite3.Cursor at 0x1f67e3cf440>

In [3]:
# DDL for the company dimension table, keyed by the company's cik
dim_company_ddl = """
CREATE TABLE IF NOT EXISTS dim_company (
    cik INTEGER PRIMARY KEY,
    cik_padded TEXT NOT NULL,
    ticker TEXT,
    company_name TEXT NOT NULL,
    mgt_insert_timestamp TEXT DEFAULT (datetime('now'))
);
"""

# DDL for the documents table, keyed by accession number; every row
# references its company in dim_company through the cik foreign key
fct_form_ddl = """
CREATE TABLE IF NOT EXISTS fct_form (
	accession_number TEXT PRIMARY KEY,
	cik INTEGER NOT NULL,
	acceptance_timestamp TEXT,
	form_type TEXT,
	investor TEXT,
	ownership REAL,
	form TEXT,
	mgt_insert_timestamp TEXT DEFAULT (datetime('now')),
	FOREIGN KEY (cik) REFERENCES dim_company(cik)
);
"""

# Create the referenced table first, then the table that references it;
# IF NOT EXISTS makes this cell safe to re-run against an existing database
cur.execute(dim_company_ddl)
cur.execute(fct_form_ddl)

<sqlite3.Cursor at 0x1f67e3cf440>

# Get data and extract information

In [4]:
# Get our user agent header data from .env
# Has format "name e-mail address"
load_dotenv()
user_agent = os.getenv('user_agent')

# Fail fast with a clear message when the variable is missing or empty.
# Otherwise the User-Agent header would be built from None and the problem
# would only surface later as an opaque error on the first SEC request.
if not user_agent:
	raise RuntimeError(
		"Environment variable 'user_agent' is not set; "
		"add it to .env in the format \"name e-mail address\"."
	)

In [5]:
# Get all the different companies listed at the SEC
headers = {"User-Agent": user_agent}
url = "https://www.sec.gov/files/company_tickers.json"

# The timeout keeps the notebook from hanging forever on a stalled connection
res = requests.get(url, headers=headers, timeout=30)

# Surface HTTP errors (e.g. 403 or 429) right here instead of letting them
# show up as a confusing JSON decode error on the next line
res.raise_for_status()
companies = res.json()

In [6]:
# Build one (cik, cik_padded, ticker, company_name) tuple per listed entry and
# store the tuples in the dimension table.
# - cik: unique identifier for the company
# - cik_padded: the cik left-padded with 0s to length 10, required by certain endpoints
# - ticker: short string identifier for the company (None if the entry has none)
# - company_name: full name of the company
company_data = [
	(
		entry['cik_str'],
		str(entry['cik_str']).zfill(10),
		entry.get('ticker'),
		entry['title'],
	)
	for entry in companies.values()
]

# A row whose cik already exists is replaced by the incoming row
insert_company_sql = """
INSERT OR REPLACE INTO dim_company (cik, cik_padded, ticker, company_name)
VALUES (?, ?, ?, ?);
"""
cur.executemany(insert_company_sql, company_data)

conn.commit()

In [7]:
# Start the Ollama server as a background subprocess.
# On Windows the server gets its own process group, which is what allows it to
# be stopped later with CTRL_BREAK_EVENT. CREATE_NEW_PROCESS_GROUP only exists
# on Windows, so fall back to 0 (no flags) elsewhere instead of raising
# AttributeError.
proc = subprocess.Popen(
	["ollama", "serve"],
	creationflags=getattr(subprocess, "CREATE_NEW_PROCESS_GROUP", 0),
)

# Name of the Ollama model used for information extraction
model = "qwen3:4b"

In [None]:
# We only want to extract relevant forms
# NOTE(review): newer EDGAR filings may be labelled "SCHEDULE 13D" / "SCHEDULE 13G"
# (and "/A" variants) instead of "SC 13D" / "SC 13G" - confirm and extend if needed
relevant_forms = (
	"SC 13D", # Schedule 13D: ownership >5% with intent to influence/control
	"SC 13D/A", # Amendment to Schedule 13D (updates ownership/intent changes)
	"SC 13G", # Schedule 13G: ownership >5% filed by passive investors
	"SC 13G/A" # Amendment to Schedule 13G (updates to prior 13G filing)
)

# We can only make 5 requests per second
min_wait = 0.2

# Give up on an SEC request that has not responded within this many seconds
request_timeout = 30

# Check how many new records we've inserted into the database, for logging purposes
num_forms = 0

# Monotonic time at which the previous SEC request was started (None = no request yet)
last_request_time = None


def sec_get(url):
	"""Send a rate-limited GET request to the SEC.

	Sleeps as long as needed so that consecutive calls start at least
	`min_wait` seconds apart, then issues the request with the notebook's
	`headers` and a timeout of `request_timeout` seconds.

	Returns the `requests.Response`, or None when the request failed at the
	network level (connection error, timeout, ...).
	"""
	global last_request_time
	if last_request_time is not None:
		remaining = min_wait - (time.monotonic() - last_request_time)
		if remaining > 0:
			time.sleep(remaining)
	last_request_time = time.monotonic()

	try:
		return requests.get(url, headers=headers, timeout=request_timeout)
	except requests.RequestException as exc:
		print(f"Request failed for {url}: {exc}")
		return None


def parse_extraction(response):
	"""Validate the model's reply and return it as `(investor, ownership)`.

	`investor` is a str or None, `ownership` is a float or None. A reply that
	is missing, is not valid JSON, is not a JSON object, or carries a value
	of the wrong type yields None for the affected field(s).
	"""
	try:
		raw = json.loads(response['message']['content'])
	except (KeyError, TypeError, ValueError):
		raw = {}

	# Valid JSON is not necessarily an object (could be a list, a number, ...)
	if not isinstance(raw, dict):
		raw = {}

	# Investor must be string
	investor = raw.get("investor")
	if not isinstance(investor, str):
		investor = None

	# Ownership must be a number; bool is a subclass of int, so rule it out
	# explicitly to avoid storing True/False as 1.0/0.0
	ownership = raw.get("ownership")
	if isinstance(ownership, (int, float)) and not isinstance(ownership, bool):
		ownership = float(ownership)
	else:
		ownership = None

	return investor, ownership


# Loop through data array and get all forms
for idx, (cik, cik_padded, ticker, title) in enumerate(company_data, start=1):
	print(f"Processing company {idx}/{len(company_data)}: {title}")

	url = f"https://data.sec.gov/submissions/CIK{cik_padded}.json"
	res = sec_get(url)

	# Skip the company when its metadata could not be fetched. Carrying on
	# would either hit undefined variables or reuse the previous company's
	# filings and store them under the wrong cik.
	if res is None or res.status_code != 200:
		reason = "no response" if res is None else f"HTTP {res.status_code}"
		print(f"Skipping {title}: could not fetch submissions ({reason})")
		continue

	# Get the form metadata
	form_data = res.json()
	recent = form_data['filings']['recent']
	forms = recent['form']
	accession_numbers = recent['accessionNumber']
	acceptance_datetimes = recent['acceptanceDateTime']

	# Loop through all forms
	for form, acc_number, acc_timestamp in zip(forms, accession_numbers, acceptance_datetimes):

		# Only relevant forms (cheap in-memory check before touching the database)
		if form not in relevant_forms:
			continue

		# Skip forms that are already present in the database
		cur.execute("SELECT 1 FROM fct_form WHERE accession_number = ?", (acc_number,))
		if cur.fetchone() is not None:
			continue

		# Retrieve form data
		acc_no = acc_number.replace("-", "")
		filing_url = f"https://www.sec.gov/Archives/edgar/data/{cik}/{acc_no}/{acc_number}.txt"
		filing_res = sec_get(filing_url)

		# Do not feed an error page to the LLM or store it as the form text;
		# the filing is not inserted, so a later run will retry it
		if filing_res is None or filing_res.status_code != 200:
			print(f"Skipping filing {acc_number}: could not download document")
			continue
		doc = filing_res.text

		# Prompt used to extract information
		prompt = f"""
		Generate a JSON object with the following keys only: 
		- "investor": string
		- "ownership": float

		You are analysing the text from an SEC filing document.
		An investory has purchased a significant percentage of {title}'s voting stock, causing them to file this document with the SEC.
		Your job is to find the name of the investor, and to find the ownership that was acquired.
		Ownership must be a percentage, value between 0 and 1.
		If a piece of information is not present, return null for that field.

		This is the SEC document you need to analyse:
		\"\"\"{doc}\"\"\"
		"""

		# Use LLM to extract information
		response = ollama.chat(
			model=model,
			messages=[{"role": "user", "content": prompt}],
			format='json'
		)

		# Ensure the correct JSON formatting and field types
		investor, ownership = parse_extraction(response)

		# Convert timestamp data to a format used in SQLite3 database
		dt = datetime.strptime(acc_timestamp, "%Y-%m-%dT%H:%M:%S.%fZ")
		sqlite_timestamp = dt.strftime("%Y-%m-%d %H:%M:%S")

		# Insert data into database
		cur.execute("""
		INSERT INTO fct_form (accession_number, cik, acceptance_timestamp, form_type, investor, ownership, form)
		VALUES (?, ?, ?, ?, ?, ?, ?);
		""", (acc_number, cik, sqlite_timestamp, form, investor, ownership, doc)
		)

		# Commit per filing so finished work survives an interrupted run
		conn.commit()

		num_forms += 1

# Logging
print(f"Number of new forms: {num_forms}")

Processing company 1/10123: NVIDIA CORP
Processing company 2/10123: MICROSOFT CORP
Processing company 3/10123: Apple Inc.
Processing company 4/10123: Alphabet Inc.
Processing company 5/10123: AMAZON COM INC
Processing company 6/10123: Meta Platforms, Inc.
Processing company 7/10123: Broadcom Inc.
Processing company 8/10123: Tesla, Inc.
Processing company 9/10123: BERKSHIRE HATHAWAY INC
Processing company 10/10123: ORACLE CORP
Processing company 11/10123: JPMORGAN CHASE & CO
Processing company 12/10123: Walmart Inc.
Processing company 13/10123: ELI LILLY & Co
Processing company 14/10123: VISA INC.
Processing company 15/10123: SPDR S&P 500 ETF TRUST
Processing company 16/10123: Mastercard Inc
Processing company 17/10123: NETFLIX INC
Processing company 18/10123: EXXON MOBIL CORP
Processing company 19/10123: JOHNSON & JOHNSON
Processing company 20/10123: COSTCO WHOLESALE CORP /NEW
Processing company 21/10123: HOME DEPOT, INC.
Processing company 22/10123: Palantir Technologies Inc.
Processi

In [None]:
# Stop the Ollama server started earlier in this notebook.
# Only signal the process if it is still running, so this cell does not raise
# when the server has already exited.
if proc.poll() is None:
	if os.name == "nt":
		# CTRL_BREAK_EVENT exists only on Windows and targets the server's process group
		proc.send_signal(signal.CTRL_BREAK_EVENT)
	else:
		proc.terminate()

	# Wait for the process to exit; force-kill it if it ignores the request
	try:
		proc.wait(timeout=10)
	except subprocess.TimeoutExpired:
		proc.kill()
		proc.wait()

In [None]:
# Close the database connection.
# The extraction loop commits after every inserted form, so nothing written
# there is still pending at this point.
conn.close()