# Import libraries

In [36]:
import sqlite3
import requests
import time
import os
import time
import ollama
from dotenv import load_dotenv
import subprocess
import signal

# Database

In [37]:
# Open the SQLite database file (sqlite3 creates it when it is missing)
# and obtain a cursor for executing statements on that connection
db_file = "seq_data.db"
conn = sqlite3.connect(db_file)
cur = conn.cursor()

In [38]:
# Schema objects for the company dimension, executed in order:
# the table itself, then the two triggers that maintain the mgt_* timestamps
company_schema_statements = (
	# Company dimension table keyed on cik
	"""
CREATE TABLE IF NOT EXISTS dim_company (
    cik INTEGER PRIMARY KEY,
    cik_padded TEXT NOT NULL,
    ticker TEXT,
    company_name TEXT NOT NULL,
    mgt_insert_timestamp TEXT DEFAULT (datetime('now')),
    mgt_update_timestamp TEXT DEFAULT (datetime('now'))
);
""",
	# After an insert, stamp both the insert and the update timestamp of the new row
	"""
CREATE TRIGGER IF NOT EXISTS trg_dim_company_insert
AFTER INSERT ON dim_company
BEGIN
    UPDATE dim_company
    SET mgt_insert_timestamp = datetime('now'),
        mgt_update_timestamp = datetime('now')
    WHERE cik = NEW.cik;
END;
""",
	# After an update, refresh only the update timestamp of the changed row
	"""
CREATE TRIGGER IF NOT EXISTS trg_dim_company_update
AFTER UPDATE ON dim_company
BEGIN
    UPDATE dim_company
    SET mgt_update_timestamp = datetime('now')
    WHERE cik = NEW.cik;
END;
""",
)

for ddl in company_schema_statements:
	cur.execute(ddl)

<sqlite3.Cursor at 0x27664df81c0>

# Get data and extract information

In [39]:
# Load the variables from .env into the process environment, then read the
# value used for the User-Agent header
# It has the format "name e-mail address"
load_dotenv()
user_agent = os.environ.get('user_agent')

In [40]:
# Get all the different companies listed at the SEC
headers = {"User-Agent": user_agent}
url = "https://www.sec.gov/files/company_tickers.json"
# The timeout keeps the cell from hanging forever on a stalled connection
res = requests.get(url, headers=headers, timeout=30)
# Surface an HTTP error status directly instead of letting res.json()
# fail later with a harder-to-read JSON decode error
res.raise_for_status()
companies = res.json()

In [41]:
# Build one (cik, cik_padded, ticker, company_name) tuple per company and
# upsert the tuples into our dimension table
# The cik number is a unique identifier for the company
# A padded cik number with 0s to length 10 is required for certain endpoints
# The ticker is a short string identifier for the company
# Company name is the full name of the company
company_data = [
	(
		c['cik_str'],
		str(c['cik_str']).zfill(10),
		c.get('ticker'),
		c['title'],
	)
	for c in companies.values()
]

# Insert new companies; for an existing cik only rewrite the row when a value
# actually differs. The comparison uses IS NOT rather than != because the
# ticker may be NULL: "x != NULL" evaluates to NULL, which the WHERE clause
# treats as false, so a ticker changing to or from NULL would never be stored.
cur.executemany("""
INSERT INTO dim_company (cik, cik_padded, ticker, company_name)
VALUES (?, ?, ?, ?)
ON CONFLICT(cik) DO UPDATE SET
    cik_padded = excluded.cik_padded,
    ticker = excluded.ticker,
    company_name = excluded.company_name
WHERE dim_company.cik_padded IS NOT excluded.cik_padded
   OR dim_company.ticker IS NOT excluded.ticker
   OR dim_company.company_name IS NOT excluded.company_name;
""", company_data)

conn.commit()

In [42]:
# Launch "ollama serve" as a child process in a new process group
# (subprocess.CREATE_NEW_PROCESS_GROUP is only available on Windows)
ollama_cmd = ["ollama", "serve"]
proc = subprocess.Popen(ollama_cmd, creationflags=subprocess.CREATE_NEW_PROCESS_GROUP)

# Name of the model that is passed to the Ollama client
model = "qwen3:30b"

In [44]:
# We only want to extract relevant forms
# NOTE(review): newer structured filings may be listed by EDGAR under
# "SCHEDULE 13D" / "SCHEDULE 13G" form names - confirm and extend if needed
relevant_forms = (
	"SC 13D", # Schedule 13D: ownership >5% with intent to influence/control
	"SC 13D/A", # Amendment to Schedule 13D (updates ownership/intent changes)
	"SC 13G", # Schedule 13G: ownership >5% filed by passive investors
	"SC 13G/A" # Amendment to Schedule 13G (updates to prior 13G filing)
)

# We allow ourselves at most 5 requests per second,
# i.e. at least 0.2 seconds between two consecutive requests
min_wait = 0.2


def _wait_for_rate_limit(last_request, min_interval):
	"""Sleep until `min_interval` seconds have passed since `last_request`.

	`last_request` is a time.time() timestamp (or -inf when no request has
	been made yet). Returns the timestamp to record for the request that is
	about to be sent.
	"""
	# Elapsed time must be subtracted as a whole: min - (now - last)
	remaining = min_interval - (time.time() - last_request)
	if remaining > 0:
		time.sleep(remaining)
	return time.time()


# Timestamp of the most recent SEC request; -inf means "never", so the very
# first request is sent without waiting
last_request = float("-inf")

# Loop through data array and get all forms
for cik, cik_padded, ticker, title in company_data[:5]:
	url = f"https://data.sec.gov/submissions/CIK{cik_padded}.json"
	last_request = _wait_for_rate_limit(last_request, min_wait)
	res = requests.get(url, headers=headers, timeout=30)

	# Without a successful response there is no form metadata for this
	# company; skip it instead of reusing the lists of the previous company
	# (or failing with a NameError on the first one)
	if res.status_code != 200:
		print(f"Skipping {title}: submissions request returned HTTP {res.status_code}")
		continue

	# Get the form metadata (parallel lists, one entry per filing)
	form_data = res.json()
	forms = form_data['filings']['recent']['form']
	accession_numbers = form_data['filings']['recent']['accessionNumber']
	acceptance_datetimes = form_data['filings']['recent']['acceptanceDateTime']

	# Loop through all forms
	for form, acc_number, acc_datetime in zip(forms, accession_numbers, acceptance_datetimes):

		# Only relevant forms
		if form not in relevant_forms:
			continue

		# Retrieve form data; the archive path uses the accession number
		# without dashes as directory and with dashes as file name
		acc_no = acc_number.replace("-", "")
		filing_url = f"https://www.sec.gov/Archives/edgar/data/{cik}/{acc_no}/{acc_number}.txt"
		last_request = _wait_for_rate_limit(last_request, min_wait)
		filing_res = requests.get(filing_url, headers=headers, timeout=30)

		# Do not hand an error page to the LLM as if it were a filing
		if filing_res.status_code != 200:
			print(f"Skipping {filing_url}: HTTP {filing_res.status_code}")
			continue
		doc = filing_res.text

		# Prompt used to extract information
		prompt = f"""
		Generate a JSON object with the following keys only: 
		- "company": {title}
		- "targetCompany": string
		- "ownership": float

		You are analysing the text from an SEC filing document from company {title}.
		Your task is to extract details of share purchases:
		company - {title}
		targetCompany - name of the company that {title} bought into
		ownership - percentage between (value between 0 and 1) of ownership that was acquired

		If a piece of information is not present, return null for that field.

		So in the end you must return a JSON object.

		Text to analyse:
		\"\"\"{doc}\"\"\"
		"""

		# Use LLM to extract information
		response = ollama.chat(
			model=model,
			messages=[{"role": "user", "content": prompt}],
			format='json'
		)
		print(response['message']['content'])

{}
{}
{}
{}
{}
{"company":"NVIDIA CORP","targetCompany":null,"ownership":null}
{"role": "user", "content": "I need to extract information about a company from this SEC filing. The company is NVIDIA Corp. I need to find the following information: 1. The company's address of principal executive offices, 2. The name of the person filing, and 3. The address of the person filing's principal business office. I need to provide the extracted information in JSON format with keys 'company_address', 'filer_name', and 'filer_address'. Please make sure to extract the exact information from the document, not from my description."}
{

  "company": "NVIDIA CORP",
  "targetCompany": null,
  "ownership": null

}
{}
{}
{}
{}
{}
{}
{}
{"company": "NVIDIA CORP", "targetCompany": null, "ownership": null}
{}
{}
{}
{}
{}
{}
{}
{}
{"role": "user", "content": "Analyze the provided SEC filing (13G) for Vanguard's holdings in Microsoft. Extract key information including:\n\n1. Date of filing\n2. Number of shares 

In [None]:
# Stop the Ollama server: send CTRL+BREAK to the pid of the process started
# for "ollama serve"
ollama_pid = proc.pid
os.kill(ollama_pid, signal.CTRL_BREAK_EVENT)