In [16]:
# Exploring what documents the SEC has both human readable and machine readable versions of
# Human:   https://www.sec.gov/Archives/edgar/data/1318605/000128825718000026/xslFormDX01/primary_doc.xml
# Machine: https://www.sec.gov/Archives/edgar/data/1318605/000128825718000026/primary_doc.xml

import polars as pl

# Read the CSV file
df = pl.read_csv('data/submissions_index.csv')

# Clean up the data
df = df.with_columns(
    pl.col('accession_number').str.replace_all('-', '').alias('accession_number')
)

# Filter rows where primary_doc_url contains a slash and create item_before_slash column
df_filtered = df.filter(pl.col('primary_doc_url').str.contains('/')).with_columns(
    pl.col('primary_doc_url').str.split('/').list.first().alias('item_before_slash')
)

# Get unique items
unique_items = df_filtered.select('item_before_slash').unique()
for item in unique_items['item_before_slash'].to_list():
    print(item)

xslF345X02
xsl1-Z_X01
xslSBSE-C_X01
xsl144X01
xsl1-A_X01
xslF345X03
xslFormDX01
xslQUALIFX01
xslSCHEDULE_13D_X01
xslFormNPORT-P_X01
xslEFFECTX01
xslFormMA-I_X01
xslForm13F_X02
xslC_X01
xslSBSE-A_X01
xslN-PX_X01
xslSCHEDULE_13G_X01
xsl1-K_X01
xslF345X01
xslF25X02
xslF345X05
xslFormN-CEN_X01
xslF345X04


In [18]:

example_urls = df_filtered.group_by('item_before_slash').agg(
    pl.col('cik').first().alias('cik'),
    pl.col('accession_number').first().alias('accession_number'),
    pl.col('primary_doc_url').first().alias('primary_doc_url')
).with_columns(
    pl.concat_str(
        pl.lit("https://www.sec.gov/Archives/edgar/data/"),
        pl.col('cik'),
        pl.lit("/"),
        pl.col('accession_number'),
        pl.lit("/"),
        pl.col('primary_doc_url')
    ).alias('full_url')
)

# Print the results
for row in example_urls.iter_rows(named=True):
    print(row['full_url'])

# If you want to see the count of each item
item_counts = df_filtered.group_by('item_before_slash').count().sort('count', descending=True)
print("Item Counts:")
print(item_counts)

https://www.sec.gov/Archives/edgar/data/1651721/000116169721000148/xsl1-K_X01/primary_doc.xml
https://www.sec.gov/Archives/edgar/data/1827087/000095017024034150/xslF345X02/ownership.xml
https://www.sec.gov/Archives/edgar/data/58411/000139390523000189/xsl1-Z_X01/primary_doc.xml
https://www.sec.gov/Archives/edgar/data/1827087/000180280622000010/xslFormDX01/primary_doc.xml
https://www.sec.gov/Archives/edgar/data/1729427/000168316820001280/xslC_X01/primary_doc.xml
https://www.sec.gov/Archives/edgar/data/1737995/000149315224021891/xsl1-A_X01/primary_doc.xml
https://www.sec.gov/Archives/edgar/data/1827087/000089924323007030/xslF345X03/doc4.xml
https://www.sec.gov/Archives/edgar/data/1827087/000095017024070843/xslF345X05/ownership.xml
https://www.sec.gov/Archives/edgar/data/1604174/000110465924092878/xslFormNPORT-P_X01/primary_doc.xml
https://www.sec.gov/Archives/edgar/data/720005/000072000523000058/xslFormMA-I_X01/primary_doc.xml
https://www.sec.gov/Archives/edgar/data/1827821/00019679402400

  item_counts = df_filtered.group_by('item_before_slash').count().sort('count', descending=True)


Item Counts:
shape: (23, 2)
┌─────────────────────┬─────────┐
│ item_before_slash   ┆ count   │
│ ---                 ┆ ---     │
│ str                 ┆ u32     │
╞═════════════════════╪═════════╡
│ xslF345X03          ┆ 2443015 │
│ xslF345X02          ┆ 891785  │
│ xslF345X05          ┆ 245072  │
│ xslF345X04          ┆ 74968   │
│ xsl144X01           ┆ 59564   │
│ …                   ┆ …       │
│ xslC_X01            ┆ 222     │
│ xslSBSE-C_X01       ┆ 72      │
│ xsl1-Z_X01          ┆ 68      │
│ xslSCHEDULE_13D_X01 ┆ 47      │
│ xslFormMA-I_X01     ┆ 4       │
└─────────────────────┴─────────┘
