# Initial prompts

Creating the initial prompt

In [1]:
#%pip install --upgrade --quiet pydantic-ai-slim[anthropic,openai]

In [1]:
# Model identifier used as the default `model_id` by the agent helper functions below.
GEMINI = "gemini-2.0-flash"

import os
from dotenv import load_dotenv

# Load the API keys stored in keys.env into os.environ.
load_dotenv("../keys.env")

# Look the key up with .get() and a default so that a missing key produces the
# helpful message below instead of a bare KeyError.
assert os.environ.get("GEMINI_API_KEY", "").startswith("AI"), \
       "Please specify the GEMINI_API_KEY access token in keys.env file"

In [2]:
# Needed in a Jupyter environment. See: https://ai.pydantic.dev/troubleshooting/
# Applied once here, before any agent is run with run_sync() further down.
import nest_asyncio
nest_asyncio.apply()

## CIK to Symbol lookup

EDGAR filings are keyed by a numeric company identifier, the CIK (Central Index Key). Build a lookup from CIK to stock symbol for the companies we care about.

In [3]:
!wget --user-agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36" "https://httpbin.io/user-agent" https://www.sec.gov/files/company_tickers.json

--2025-04-14 18:18:48--  https://httpbin.io/user-agent
Resolving httpbin.io (httpbin.io)... 3.224.228.208, 52.70.33.41, 44.211.11.205
Connecting to httpbin.io (httpbin.io)|3.224.228.208|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 138 [application/json]
Saving to: ‘user-agent.1’


2025-04-14 18:18:48 (139 MB/s) - ‘user-agent.1’ saved [138/138]

--2025-04-14 18:18:48--  https://www.sec.gov/files/company_tickers.json
Resolving www.sec.gov (www.sec.gov)... 23.196.153.179, 2600:1409:9800:1687::17b2, 2600:1409:9800:1689::17b2
Connecting to www.sec.gov (www.sec.gov)|23.196.153.179|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [application/json]
Saving to: ‘company_tickers.json.1’

company_tickers.jso     [ <=>                ] 723.08K  --.-KB/s    in 0.03s   

2025-04-14 18:18:49 (21.5 MB/s) - ‘company_tickers.json.1’ saved [740434]

FINISHED --2025-04-14 18:18:49--
Total wall clock time: 0.5s
Downloaded: 2 files, 723K in 0

In [4]:
import json

# Read the raw ticker listing. It is kept under its own name so that `lookup`
# only ever refers to the CIK -> ticker mapping.
with open("company_tickers.json", encoding="utf-8") as ifp:
    company_tickers = json.load(ifp)

# The outer keys of the JSON object are not used, so iterate over the values only.
# If the same CIK appears more than once, the last ticker listed for it wins.
lookup = {entry['cik_str']: entry['ticker'] for entry in company_tickers.values()}

In [5]:
import pandas as pd
# One row per entry of `lookup`: the dict keys (CIKs) become the index and the
# tickers go into a single 'symbol' column.
lookup_df = pd.DataFrame.from_dict(lookup, orient='index', columns=['symbol'])
lookup_df

Unnamed: 0,symbol
320193,AAPL
789019,MSFT
1045810,NVDA
1018724,AMZN
1652044,GOOG
...,...
1839586,DMXCF
1663712,NRSAX
1762400,LVCE
1506721,BAFBF


In [6]:
# Write the index (CIK) and the symbol column without a header row,
# producing lines of the form "cik,symbol".
lookup_df.to_csv('symbol_lookup.csv', index=True, header=False)

In [7]:
# Show the first three lines of the CSV that was just written.
!head -3 symbol_lookup.csv

320193,AAPL
789019,MSFT
1045810,NVDA


## Question creation

Create questions based on a single filing.

In [8]:
import pandas as pd
# symbol_lookup.csv was written without a header row, so the column names are supplied here.
lookup_df = pd.read_csv('symbol_lookup.csv', names=['cik', 'symbol'])
lookup_df

Unnamed: 0,cik,symbol
0,320193,AAPL
1,789019,MSFT
2,1045810,NVDA
3,1018724,AMZN
4,1652044,GOOG
...,...,...
7624,1839586,DMXCF
7625,1663712,NRSAX
7626,1762400,LVCE
7627,1506721,BAFBF


In [36]:
from dataclasses import dataclass
from typing import List
from pydantic_ai import Agent

# A question paired with its answer; write_answers() requests a list of these
# from the agent via result_type.
@dataclass
class QuestionAnswer:
    question: str  # text of the question
    answer: str    # text of the answer to that question

def create_questions(filing, key, num_questions=3, model_id=GEMINI) -> List[str]:
    """Ask an LLM agent to write analytical questions about one section of a filing.

    Args:
        filing: Mapping describing a single filing. The entries 'cik', 'company'
            and 'filing_date' are read, along with the section named by `key`.
        key: Name of the filing section whose text is sent to the agent
            (for example 'item_7').
        num_questions: Number of questions requested in the system prompt.
        model_id: Model identifier passed to the Agent.

    Returns:
        The questions produced by the agent.

    Raises:
        IndexError: If the filing's CIK has no row in `lookup_df`.
    """
    cik = int(filing['cik'])
    matches = lookup_df.loc[lookup_df['cik'] == cik, 'symbol']
    if matches.empty:
        # Same exception type that indexing the empty match raised before,
        # but with a message that names the CIK that could not be resolved.
        raise IndexError(f"No stock symbol found in lookup_df for CIK {cik}")
    symbol = matches.iloc[0]
    system_prompt=f"""
    You are a professor in a MBA program.
    You will be given a passage from a SEC filing from {filing['company']} (symbol: {symbol}) made on {filing['filing_date']}
    Create {num_questions} analytical questions suitable for students of a class on company strategy based on this filing.
    
    Good questions should be:
    * Standalone. For example, make sure the question includes the name of the company, product, and year being referenced.
    * Avoid asking for factual numerical information such as revenue or capital expenditures
    * Ask "how", "why", "compare", etc.
    
    Example question: How might Google's (GOOG) reorganization of its hardware divisions affect its ability to grow Pixel phones' marketshare in 2023?"
    """.strip()
    # NOTE(review): newer pydantic_ai releases appear to name these `output_type` and
    # `.output` — confirm against the installed version before upgrading.
    agent = Agent(model_id,
                  result_type=List[str],
                  system_prompt=system_prompt)
    # The text of the selected section is the user prompt.
    result = agent.run_sync(filing[key])
    return result.data

In [37]:
import json

# Load one extracted 10-K filing and list the fields it provides.
filing_path = "edgar-crawler/datasets/EXTRACTED_FILINGS/10-K/2969_10K_2021_0000002969-21-000055.json"
with open(filing_path) as ifp:
    filing = json.load(ifp)
print(filing.keys())

dict_keys(['cik', 'company', 'filing_type', 'filing_date', 'period_of_report', 'sic', 'state_of_inc', 'state_location', 'fiscal_year_end', 'filing_html_index', 'htm_filing_link', 'complete_text_filing_link', 'filename', 'item_1', 'item_1A', 'item_1B', 'item_1C', 'item_2', 'item_3', 'item_4', 'item_5', 'item_6', 'item_7', 'item_7A', 'item_8', 'item_9', 'item_9A', 'item_9B', 'item_9C', 'item_10', 'item_11', 'item_12', 'item_13', 'item_14', 'item_15', 'item_16'])


In [38]:
# Generate questions (default count) from the 'item_7' section of the loaded filing.
questions = create_questions(filing, 'item_7') # management discussion
questions

["How might Air Products & Chemicals (APD)'s strategy of focusing on gasification, carbon capture, and hydrogen projects, as mentioned in their 2021 10-K filing, impact their competitive positioning in the industrial gases market compared to competitors with different strategic priorities?",
 "Considering Air Products & Chemicals' (APD) reorganization of its industrial gases segments announced in November 2021 and detailed in their 2021 10-K filing, how could this restructuring affect the company's ability to respond to regional market dynamics and customer needs in the Americas, EMEA, and Asia?",
 "Air Products & Chemicals (APD) states in its 2021 10-K filing that it expects higher costs from planned maintenance activities and higher pension expenses in fiscal year 2022; how might these rising costs impact the company's profitability and its ability to invest in new growth opportunities, such as carbon capture and hydrogen projects?"]

In [39]:
def write_answers(questions, filing, model_id=GEMINI, key='item_7') -> List[QuestionAnswer]:
    """Ask an LLM agent to answer a list of questions about a filing.

    The system prompt tells the model that it will be given the filing, so the
    text of the section named by `key` is included in the user prompt ahead of
    the questions. Previously only the questions were sent.

    Args:
        questions: Question strings to answer.
        filing: Mapping describing a single filing. The entries 'company' and
            'filing_date' are read, along with the section named by `key`.
        model_id: Model identifier passed to the Agent.
        key: Name of the filing section to include as context. Pass None to
            send the questions alone.

    Returns:
        The question/answer pairs produced by the agent, or an empty list when
        no questions are given.
    """
    if len(questions) == 0:
        # Nothing to answer; avoid a model call with an empty prompt.
        return []

    system_prompt=f"""
    You are a top student in a highly-ranked MBA program.
    You will be given an SEC filing from {filing['company']} made on {filing['filing_date']}
    Use that filing to answer the following questions.
    Each answer should be 2-3 sentences and be informed by your market insights and knowledge of business strategy.
    """.strip()
    agent = Agent(model_id,
                  result_type=List[QuestionAnswer],
                  system_prompt=system_prompt)

    numbered_questions = "\n".join(
        f"Question {number}: {question}"
        for number, question in enumerate(questions, start=1)
    )
    # Fall back to the questions alone when the section is missing or empty.
    passage = filing.get(key) if key is not None else None
    if passage:
        prompt = f"Filing text:\n{passage}\n\nQuestions:\n{numbered_questions}"
    else:
        prompt = numbered_questions
    result = agent.run_sync(prompt)
    return result.data

# Answer the questions generated above, using the same filing.
answers = write_answers(questions, filing)
answers

[QuestionAnswer(question="How might Air Products & Chemicals (APD)'s strategy of focusing on gasification, carbon capture, and hydrogen projects, as mentioned in their 2021 10-K filing, impact their competitive positioning in the industrial gases market compared to competitors with different strategic priorities?", answer="Air Products & Chemicals (APD)'s focus on gasification, carbon capture, and hydrogen projects could significantly enhance its competitive positioning. By specializing in these high-growth areas, APD can differentiate itself from competitors, particularly as sustainability becomes a key market driver. This focus allows APD to attract environmentally conscious customers and potentially benefit from government incentives and regulations favoring clean energy solutions, creating a strategic advantage."),
 QuestionAnswer(question="Considering Air Products & Chemicals' (APD) reorganization of its industrial gases segments announced in November 2021 and detailed in their 20

## Put it all together

In [28]:
def generate_question_answers(filename, num_questions=3) -> List[QuestionAnswer]:
    """Generate question/answer pairs for one extracted 10-K filing.

    Args:
        filename: Name of a JSON file inside
            edgar-crawler/datasets/EXTRACTED_FILINGS/10-K/.
        num_questions: Number of questions to request from create_questions().

    Returns:
        The question/answer pairs returned by write_answers().
    """
    import json
    with open(f"edgar-crawler/datasets/EXTRACTED_FILINGS/10-K/{filename}") as ifp:
        filing = json.load(ifp)
    # Forward num_questions: it used to be accepted but never passed on, so the
    # default number of questions was generated regardless of the argument.
    questions = create_questions(filing, 'item_7', num_questions=num_questions) # management discussion
    return write_answers(questions, filing)

# Run the whole pipeline on another filing with the default number of questions.
generate_question_answers("2488_10K_2022_0000002488-23-000047.json")

[QuestionAnswer(question="1.  In 2022, AMD acquired both Xilinx and Pensando. How might these acquisitions affect AMD's competitive positioning relative to Intel (INTC) and Nvidia (NVDA) in the data center market, considering the breadth of solutions AMD can now offer?", answer="The acquisitions of Xilinx and Pensando significantly bolster AMD's competitive stance against Intel and Nvidia in the data center. Xilinx brings adaptable FPGA solutions, while Pensando adds advanced networking capabilities. This enables AMD to offer a more comprehensive suite of products, potentially attracting customers seeking diverse solutions beyond CPUs and GPUs, allowing them to compete more effectively for a larger share of the data center market.\n\n2. AMD should strategically balance investments between its Client and Gaming segments by diversifying its product offerings and targeting high-growth sub-segments within the PC market, such as premium laptops and workstations. Simultaneously, AMD should l