Construct a dataset of Microsoft's directors and principal officers over time. If you already have the extracted CSV, you can skip the data wrangling and head straight to the `DatasetBuilder` code.

In [None]:
# First, let's extract every Item 5.02 (Departure of Directors or Principal Officers; Election of Directors; Appointment of Principal Officers) section from the 8-K disclosures.

from datamule import Portfolio

# Portfolio rooted at 'data/msft_8k'; the submissions requested below are downloaded through it.
portfolio = Portfolio('data/msft_8k')
# Request the 8-K filings for ticker MSFT from the 'sec' provider.
# Note: this step was slow when originally run because of a slow hotel connection.
portfolio.download_submissions(
    ticker='MSFT',
    submission_type=['8-K'],
    provider='sec',
)

In [None]:
# Now we extract item 5.02 into a new csv. We will have three columns, accession_number, filing_date, and text.
# Note: this workflow may change in the future, as I update the datamule parser to parse more documents.
def extract_item_5_02(submission):
    """Build one CSV row holding the Item 5.02 text of an 8-K submission.

    Only the first document yielded by ``submission.document_type('8-K')`` is
    inspected: the function returns as soon as that document has been handled.

    Args:
        submission: Object exposing ``metadata['submission']`` (with the keys
            ``'FILED AS OF DATE'`` and ``'ACCESSION NUMBER'``) and a
            ``document_type('8-K')`` iterable of documents. Each document must
            provide ``parse()`` and, after parsing,
            ``data['document']['item502']``.

    Returns:
        A dict with the keys ``accession_number``, ``filing_date`` and ``text``
        on success; ``None`` when extraction raises (for example when the
        parsed document has no ``item502`` entry); an empty dict when the
        submission yields no 8-K document at all. Both failure values are
        falsy, so callers can filter the results on truthiness.
    """
    try:
        submission_metadata = submission.metadata['submission']
        filing_date = submission_metadata['FILED AS OF DATE']
        accession_number = submission_metadata['ACCESSION NUMBER']
        for document in submission.document_type('8-K'):
            document.parse()
            return {
                'accession_number': accession_number,
                'filing_date': filing_date,
                'text': document.data['document']['item502'],
            }
    except Exception:
        # Best-effort extraction: a filing that cannot be parsed or that lacks
        # Item 5.02 is skipped. Catching Exception (not a bare except) lets
        # KeyboardInterrupt / SystemExit propagate so a long run can be stopped.
        return None

    # No 8-K document was yielded; keep the original falsy empty-dict result.
    return {}

# Apply the extractor to every submission in the portfolio and keep only the
# truthy results (the extractor returns a falsy value when nothing was extracted).
rows = list(filter(None, portfolio.process_submissions(extract_item_5_02)))

# we get 49 rows as of 1/14/25
print(len(rows))

In [31]:
# save to csv
import pandas as pd

# Turn the extracted rows into a DataFrame and write it to CSV without the index column.
df = pd.DataFrame(data=rows)
df.to_csv(path_or_buf='data/msft_8k_item_5_02.csv', index=False)

In [2]:
# Now, we build the dataset
# Note: You can skip previous steps if you have the csv file already.
from txt2dataset import DatasetBuilder
import os


# Create the dataset builder that the following cells configure and run.
builder = DatasetBuilder()

# The API key is read from the environment rather than hard-coded; a missing
# GEMINI_API_KEY variable raises KeyError on this line.
builder.set_api_key(os.environ["GEMINI_API_KEY"])

# Base prompt, i.e. what the model should look for in each Item 5.02 text.
# The action list mirrors the enum in response_schema. Because that schema's
# top-level type is ARRAY, the "nothing found" instruction asks for an empty
# list rather than an empty dict.
base_prompt = """Extract officer changes and movements to JSON format.
    Track when officers join, leave, or change roles.
    Provide the following information:
    - date (YYYYMMDD)
    - name (First Middle Last)
    - title
    - action (one of: ["HIRED", "RESIGNED", "TERMINATED", "PROMOTED", "TITLE_CHANGE"])
    Return an empty list if info unavailable."""

# Structured-output schema: an array of officer-event objects whose four
# string fields are all required.
_officer_event_fields = {
    "date": dict(type="STRING", description="Date of action in YYYYMMDD format"),
    "name": dict(type="STRING", description="Full name (First Middle Last)"),
    "title": dict(type="STRING", description="Official title/position"),
    "action": dict(
        type="STRING",
        enum=["HIRED", "RESIGNED", "TERMINATED", "PROMOTED", "TITLE_CHANGE"],
        description="Type of personnel action",
    ),
}

response_schema = dict(
    type="ARRAY",
    items=dict(
        type="OBJECT",
        properties=_officer_event_fields,
        # Every declared property is mandatory, listed in declaration order.
        required=list(_officer_event_fields),
    ),
)

# Rate limit and model selection for the builder.
# NOTE(review): 1500 is presumably a requests-per-minute cap — confirm against the txt2dataset docs.
builder.set_rpm(1500)
# Last expression of the cell, so its return value is what the cell displays.
builder.set_model('gemini-1.5-flash-8b')

  from .autonotebook import tqdm as notebook_tqdm


<txt2dataset.dataset_builder.DatasetBuilder at 0x265145aab90>

In [1]:
# Build the dataset: read the Item 5.02 CSV, apply the prompt and schema to the
# 'text' column, and write the extracted officer events to a new CSV.
builder.build(
    input_path="data/msft_8k_item_5_02.csv",
    output_path='data/msft_officers.csv',
    text_column='text',
    # index_column is the unique identifier; if none is specified, the row index is used.
    index_column='accession_number',
    base_prompt=base_prompt,
    response_schema=response_schema,
)

NameError: name 'builder' is not defined

In [3]:
# Standardize the values of the 'name' column and write the result to a separate file.
builder.standardize(
    columns=['name'],
    input_path='data/msft_officers.csv',
    output_path='data/msft_officers_standardized.csv',
    response_schema=response_schema,
)

Loading data...
Standardized 28 unique values in name
Saved standardized data to data/msft_officers_standardized.csv


In [4]:
# Standardize the 'title' column. Input and output paths are the same file,
# so the standardized CSV is overwritten in place.
builder.standardize(
    columns=['title'],
    input_path="data/msft_officers_standardized.csv",
    output_path='data/msft_officers_standardized.csv',
    response_schema=response_schema,
)

Loading data...
Standardized 23 unique values in title
Saved standardized data to data/msft_officers_standardized.csv


In [3]:
# Validate the standardized output against the source Item 5.02 texts.
# NOTE(review): n=5 is presumably the number of rows checked — confirm against the txt2dataset docs.
# Kept as the cell's last expression so the validation result is displayed.
builder.validate(
    base_prompt=base_prompt,
    response_schema=response_schema,
    text_column='text',
    input_path='data/msft_8k_item_5_02.csv',
    output_path='data/msft_officers_standardized.csv',
    n=5,
    quiet=False,
)

[{'input_text': 'Item 5.02 Compensatory Arrangements of Certain Officers The Compensation Committee of the Microsoft Corporation ("Company") Board of Directors has approved a new executive officer incentive plan ("Plan") for the Company\'s executive officers. The Plan replaces the existing annual cash bonus and equity award programs for the Company\'s executive officers beginning with fiscal year 2009. The Plan allows the Compensation Committee to establish award programs for specified performance periods (e.g., one or more fiscal years). The maximum amount payable to a participating executive officer is a percentage of an incentive pool for a performance period. For fiscal year 2009, awards will be granted from an incentive pool with maximum funding of 0.35% of Microsoft\'s fiscal year 2009 corporate operating income. The awards granted to each participating executive officer will be limited to a fixed share of the incentive pool, and these awards may be further reduced or eliminated 