In [62]:
import pandas as pd
import requests, json, time
from bs4 import BeautifulSoup

In [78]:
def get_s1_filings(
    start_date="2024-01-02",
    end_date="2024-12-31",
    forms="S-1",
    requests_per_second=10,
    timeout=30,
    session=None,
):
    """Collect filing metadata from the SEC EDGAR full-text search endpoint.

    Pages through the search results for the given form type and date range
    and returns one flat record per hit. Calling the function with no
    arguments reproduces the original query (S-1 filings, 2024-01-02 through
    2024-12-31).

    Args:
        start_date: First filing date to include, as ``YYYY-MM-DD``.
        end_date: Last filing date to include, as ``YYYY-MM-DD``.
        forms: Value sent as the ``forms`` query parameter.
        requests_per_second: Upper bound on the request rate; a pause of
            ``1 / requests_per_second`` seconds is inserted between
            consecutive requests. Non-positive values disable the pause.
        timeout: Per-request timeout in seconds, so a stalled connection
            cannot hang the notebook cell indefinitely.
        session: Optional object exposing a ``requests``-style
            ``get(url, params=..., headers=..., timeout=...)`` method
            (e.g. a ``requests.Session``). Defaults to the ``requests``
            module itself.

    Returns:
        A list of dicts with the keys ``display_names``, ``file_date``,
        ``file_type``, ``biz_locations``, ``sequence`` and ``inc_states``.
        A key is ``None`` when the hit does not carry that field. If a
        request returns a non-200 status, the error is printed and the
        records gathered so far are returned.
    """
    base_url = "https://efts.sec.gov/LATEST/search-index"
    size = 100  # offset step between consecutive result pages
    fields = (
        "display_names",
        "file_date",
        "file_type",
        "biz_locations",
        "sequence",
        "inc_states",
    )

    headers = {
        "User-Agent": "MyApp/1.0 (ruoyu@lockrmail.com)",
        "Accept": "application/json",
        "Content-Type": "application/json"
    }

    # `requests` is only looked up when no session is injected.
    http = session if session is not None else requests
    delay = 1.0 / requests_per_second if requests_per_second > 0 else 0.0

    all_filings = []
    start_from = 0
    total_hits = None  # unknown until the first page has been read

    while total_hits is None or start_from < total_hits:
        if total_hits is not None:
            # Throttle every request after the first one.
            time.sleep(delay)

        params = {
            "dateRange": "custom",
            "category": "custom",
            "startdt": start_date,
            "enddt": end_date,
            "forms": forms,
            "page": start_from // size + 1,
            "from": start_from,
        }
        response = http.get(base_url, params=params, headers=headers, timeout=timeout)

        if response.status_code != 200:
            print(f"Error: Received status code {response.status_code}")
            print(response.text)
            break

        hits = response.json().get("hits", {})

        if total_hits is None:
            # The first page doubles as the "how many results" probe, so it
            # is not requested a second time.
            total_hits = hits.get("total", {}).get("value", 0)
            print(f"Total {forms} filings found: {total_hits}")

        filings = hits.get("hits", [])
        if not filings:
            # Nothing more to read; avoid issuing futile requests when the
            # reported total overstates what the endpoint actually serves.
            break

        for filing in filings:
            # Tolerate hits whose "_source" is missing or null.
            source_data = filing.get("_source") or {}
            all_filings.append({field: source_data.get(field) for field in fields})

        start_from += size

    return all_filings

In [79]:
# Run the EDGAR full-text search query defined above. This performs network
# requests and prints the total hit count before returning the records.
list_of_filings = get_s1_filings()

Total S-1 filings found: 2663


In [81]:
# One row per returned filing record; columns come from the record dict keys.
df = pd.DataFrame(list_of_filings)

In [84]:
# Preview the first rows to sanity-check the columns and their contents.
df.head()

Unnamed: 0,display_names,file_date,file_type,biz_locations,sequence,inc_states
0,[AEN Group Ltd. (CIK 0002033750)],2024-12-31,S-1,"[Calgary, A0]",1,[]
1,[Capstone Holding Corp. (CIK 0000887151)],2024-12-31,S-1,"[Alsip, IL]",1,[DE]
2,[Thoughtful Media Group Inc. (TMGX) (CIK 000...,2024-12-31,S-1/A,"[Carson City, NV]",1,[NY]
3,"[DMINT, Inc. (DMNT) (CIK 0001996450)]",2024-12-31,S-1/A,"[New York, NY]",1,[]
4,[Alpha Cognition Inc. (ACOG) (CIK 0001655923)],2024-12-31,S-1,"[Vancouver, A1]",1,[A1]


In [86]:
# Flatten the list-valued columns into scalar cells. The result is only
# displayed here, not assigned, so `df` itself still holds the lists.
# NOTE(review): multi-column explode presumably requires the listed columns to
# have matching element counts per row — confirm against the pandas docs.
df.explode(["display_names", "biz_locations", "inc_states"])

Unnamed: 0,display_names,file_date,file_type,biz_locations,sequence,inc_states
0,AEN Group Ltd. (CIK 0002033750),2024-12-31,S-1,"Calgary, A0",1,
1,Capstone Holding Corp. (CIK 0000887151),2024-12-31,S-1,"Alsip, IL",1,DE
2,Thoughtful Media Group Inc. (TMGX) (CIK 0001...,2024-12-31,S-1/A,"Carson City, NV",1,NY
3,"DMINT, Inc. (DMNT) (CIK 0001996450)",2024-12-31,S-1/A,"New York, NY",1,
4,Alpha Cognition Inc. (ACOG) (CIK 0001655923),2024-12-31,S-1,"Vancouver, A1",1,A1
...,...,...,...,...,...,...
2658,MEMBERS Life Insurance Co (CIK 0001562577),2024-01-02,S-1,"Waverly, IA",1,IA
2659,"BrightSpring Health Services, Inc. (BTSG, BTS...",2024-01-02,S-1,"Louisville, KY",1,DE
2660,MEMBERS Life Insurance Co (CIK 0001562577),2024-01-02,S-1,"Waverly, IA",1,IA
2661,"GeoVax Labs, Inc. (GOVX, GOVXW) (CIK 0000832...",2024-01-02,S-1,"Smyrna, GA",1,DE
