# Import libraries

In [29]:
import sqlite3
import requests
import time
import os
import time
import ollama
from dotenv import load_dotenv
import subprocess
import signal

# Database

In [14]:
# Create and/or connect to database
conn = sqlite3.connect("seq_data.db")
cur = conn.cursor()

In [15]:
# Create a company dimension table
cur.execute("""
CREATE TABLE IF NOT EXISTS dim_company (
    cik INTEGER PRIMARY KEY,
    cik_padded TEXT NOT NULL,
    ticker TEXT,
    company_name TEXT NOT NULL,
    mgt_insert_timestamp TEXT DEFAULT (datetime('now')),
    mgt_update_timestamp TEXT DEFAULT (datetime('now'))
);
""")

# Create trigger where we update timestamps on first insert
cur.execute("""
CREATE TRIGGER IF NOT EXISTS trg_dim_company_insert
AFTER INSERT ON dim_company
BEGIN
    UPDATE dim_company
    SET mgt_insert_timestamp = datetime('now'),
        mgt_update_timestamp = datetime('now')
    WHERE cik = NEW.cik;
END;
""")

# Create trigger where we change the update timestamp on row update
cur.execute("""
CREATE TRIGGER IF NOT EXISTS trg_dim_company_update
AFTER UPDATE ON dim_company
BEGIN
    UPDATE dim_company
    SET mgt_update_timestamp = datetime('now')
    WHERE cik = NEW.cik;
END;
""")

<sqlite3.Cursor at 0x2024d996c40>

# Get data

In [16]:
# Get our user agent header data from .env
# Has format "name e-mail address"
load_dotenv()
user_agent = os.getenv('user_agent')

In [17]:
# Get all the different companies listed at the SEC
headers = {"User-Agent": user_agent}
url = "https://www.sec.gov/files/company_tickers.json"
res = requests.get(url, headers=headers)
companies = res.json()

In [18]:
# Make an array with the company data and insert it into our dimension table
# The cik number is a unique identifier for the company
# A padded cik number with 0s to length 10 is required for certain endpoints
# The ticker is a short string identifier for the company
# Company name is the full name of the company
company_data = []
for _, c in companies.items():
	company_data.append((
		c['cik_str'],
		str(c['cik_str']).zfill(10),
		c.get('ticker'),
		c['title']
	))

cur.executemany("""
INSERT INTO dim_company (cik, cik_padded, ticker, company_name)
VALUES (?, ?, ?, ?)
ON CONFLICT(cik) DO UPDATE SET
    cik_padded = excluded.cik_padded,
    ticker = excluded.ticker,
    company_name = excluded.company_name
WHERE cik_padded != excluded.cik_padded
   OR ticker != excluded.ticker
   OR company_name != excluded.company_name;
""", company_data)

conn.commit()

In [46]:
relevant_forms = (
	"SC 13 D", # Schedule 13D: ownership >5% with intent to influence/control
	"SC 13D/A", # Amendment to Schedule 13D (updates ownership/intent changes)
	# "SC 13G", # Schedule 13G: ownership >5% filed by passive investors
	# "SC 13G/A" # Amendment to Schedule 13G (updates to prior 13G filing)
)

# Loop through data array and get all forms
for cik, cik_padded, ticker, title in company_data[:20]:
	url = f"https://data.sec.gov/submissions/CIK{cik_padded}.json"
	res = requests.get(url, headers=headers)

	# Get the form metadata
	if res.status_code == 200:
		form_data = res.json()
		forms = form_data['filings']['recent']['form']
		accession_numbers = form_data['filings']['recent']['accessionNumber']
		acceptance_datetimes = form_data['filings']['recent']['acceptanceDateTime']
	
	# Loop through all forms
	for form, acc_number, acc_datetime in zip(forms, accession_numbers, acceptance_datetimes):

		# Only relevant forms
		if form not in relevant_forms:
			continue

		# Retrieve form data
		acc_no = acc_number.replace("-", "")
		filing_url = f"https://www.sec.gov/Archives/edgar/data/{cik}/{acc_no}/{acc_number}.txt"
		filing_res = requests.get(filing_url, headers=headers)

		print(filing_url)

		# Only 5 requests per second are allowed
		time.sleep(0.2)

https://www.sec.gov/Archives/edgar/data/1067983/000119312524265100/0001193125-24-265100.txt
https://www.sec.gov/Archives/edgar/data/1067983/000119312524197170/0001193125-24-197170.txt
https://www.sec.gov/Archives/edgar/data/1067983/000119312524172349/0001193125-24-172349.txt
https://www.sec.gov/Archives/edgar/data/1067983/000119312524038549/0001193125-24-038549.txt
https://www.sec.gov/Archives/edgar/data/1067983/000119312523283406/0001193125-23-283406.txt
https://www.sec.gov/Archives/edgar/data/1067983/000119312523281502/0001193125-23-281502.txt
https://www.sec.gov/Archives/edgar/data/1067983/000119312523172282/0001193125-23-172282.txt
https://www.sec.gov/Archives/edgar/data/1067983/000119312522292023/0001193125-22-292023.txt
https://www.sec.gov/Archives/edgar/data/1067983/000119312522211399/0001193125-22-211399.txt
https://www.sec.gov/Archives/edgar/data/1067983/000119312522174841/0001193125-22-174841.txt
https://www.sec.gov/Archives/edgar/data/1067983/000119312522041102/0001193125-22

# Extracting information

In [47]:
# Start sub process group which starts Ollama server
proc = subprocess.Popen(["ollama", "serve"], creationflags=subprocess.CREATE_NEW_PROCESS_GROUP)

In [48]:
doc = filing_res.text

prompt = f"""
You are analysing the text from an SEC filing document.
Your task is to extract details of stock purchases.

These are the rules for the output:
- You may only output a valid JSON object.
- If a piece of information is not present, return null for that field
- The output format must be exactly:
{{
	"company": <string or null>,
	"stock_price": <float or null>,
	"number_of_stocks": <int or null>
}}

{title} bought "number_of_stocks" of "company" for "stock_price"

So the output is for instance:
{{
	"company": "ING",
	"stock_price": 2.56,
	"number_of_stocks": 48930
}}

Or if this information is not available in the document:
{{
	"company": "ING",
	"stock_price": null,
	"number_of_stocks": null
}}

Text to analyse:
\"\"\"{doc}\"\"\"
"""

In [49]:
response = ollama.chat(
	model="qwen3:30b",
	messages=[{"role": "user", "content": prompt}]
)

In [50]:
response['message']['content']

'<think>\nWe are given an HTML document that appears to be a Registration Rights Agreement between a company and an investor.\n The task is to extract the key information from this document.\n\n Steps:\n 1. Identify the company name, investor name, and the date of the agreement.\n 2. Extract the address for notice (as provided in the signature page for the investor).\n 3. Note the names and titles of the signatories.\n\n However, note that the document is provided in a string that is part of a SEC filing.\n\n Let\'s break down the document:\n\n - The company is "Global Clean Energy Holdings, Inc."\n - The investor is "ExxonMobil Renewables LLC"\n - The date is not explicitly stated in the main body, but the agreement is "as of the date first set forth above". \n   However, in the signature block, the company\'s signature has:\n        Name: Richard Palmer\n        Title: Chief Executive Officer\n   and the investor\'s signature has:\n        Name: Gloria Moncada\n        Title: Preside

In [32]:
# Kill the Ollama process group
os.kill(proc.pid, signal.CTRL_BREAK_EVENT)