In [1]:
import pandas as pd
import requests, json, time, os
from bs4 import BeautifulSoup

# Purpose: To understand how S-1 Filings can reveal information about a Company's lockup period, and give insight to how companies have performed after an IPO

We will be using Edgar's API 

Edgar's full text search UI, linked below, gives a listing of all 2024 S-1 filings. Below, we will programmatically pull these filings from Edgar. We will retrieve each S-1's filing date and filing URL (taking us to the actual S-1 filing). 


(https://www.sec.gov/edgar/search/#/dateRange=custom&category=custom&startdt=2024-01-01&enddt=2024-12-31&forms=S-1)

## Define Helpers

In [2]:
def get_s1_filings():
    base_url = "https://efts.sec.gov/LATEST/search-index"

    start_date = "2024-01-02"
    end_date = "2024-12-31"

    requests_per_second = 10
    delay = 1.0 / requests_per_second
    
    start_from = 0
    size = 100
    all_filings = []

    headers = {
        "User-Agent": "MyApp/1.0 (ruoyu@lockrmail.com)",
        "Accept": "application/json",
        "Content-Type": "application/json"
    }
    
    initial_url = f"{base_url}?dateRange=custom&category=custom&startdt={start_date}&enddt={end_date}&forms=S-1&page=1&from=0"
    response = requests.get(initial_url, headers=headers)
    
    if response.status_code == 200:
        data = response.json()
        total_hits = data.get("hits", {}).get("total", {}).get("value", 0)
        print(f"Total S-1 filings found: {total_hits}")
        
        while start_from < total_hits:
            url = f"{base_url}?dateRange=custom&category=custom&startdt={start_date}&enddt={end_date}&forms=S-1&page={start_from // size + 1}&from={start_from}"
            response = requests.get(url, headers=headers)
            time.sleep(delay)
            
            if response.status_code == 200:
                data = response.json()
                filings = data.get("hits", {}).get("hits", [])
                
                for filing in filings:
                    source_data = filing.get("_source")
                    all_filings.append({
                        "filing_id": filing.get("_id"),
                        "display_names": source_data.get("display_names"),
                        "file_date": source_data.get("file_date"),
                        "file_type": source_data.get("file_type"),
                        "biz_locations": source_data.get("biz_locations"),
                        "sequence": source_data.get("sequence"),
                        "inc_states": source_data.get("inc_states"),
                    })
                
                start_from += size 
            else:
                print(f"Error: Received status code {response.status_code}")
                print(response.text)
                break
    else:
        print(f"Error: Received status code {response.status_code}")
        print(response.text)
    
    return all_filings

In [3]:
def create_edgar_url(cik, filing_id):
    file_identifer, file_name = filing_id.split(":")
    file_identifer = file_identifer.replace("-", "")
    edgar_url = f"https://www.sec.gov/Archives/edgar/data/{cik}/{file_identifer}/{file_name}"
    return edgar_url

def clean_s1_filings(dataframe_original):
    dataframe = dataframe_original.copy()
    dataframe = dataframe.explode(["display_names", "biz_locations", "inc_states"])
    dataframe["CIK"] = dataframe["display_names"].str.extract(r"CIK (\d+)")
    dataframe["URL"] = dataframe.apply(lambda x: create_edgar_url(cik=x["CIK"], filing_id=x["filing_id"]), axis=1)
    return dataframe

In [43]:
def bulk_save_edgar_filing(list_of_edgar_links):
    save_location = "../data"
    headers = {
        "User-Agent": "MyApp/1.0 (ruoyu@lockrmail.com)",
        "Accept": "application/json",
        "Content-Type": "application/json"
    }
    requests_per_second = 10
    delay = 1.0 / requests_per_second
    for link in list_of_edgar_links:
        request = requests.get(link, headers=headers)
        file_name = link.split("/")[-1]
        file_name = file_name.replace(".html", "").replace(".htm", "") + ".html"
        if request.status_code == 200:
            content = request.text
            
            with open(f"{save_location}/{file_name}", "w", encoding="utf-8") as document_writer:
                print(f"Saving {file_name}...")
                document_writer.write(content)
            time.sleep(delay)
        else:
            print(f"Skipping {file_name}")
            continue

    

## Initialize Edgar request

In [4]:
list_of_filings = get_s1_filings()

Total S-1 filings found: 2663


## Clean Data

In [5]:
df = pd.DataFrame(list_of_filings)

In [6]:
df = clean_s1_filings(df)

In [21]:
df["URL"][10]

'https://www.sec.gov/Archives/edgar/data/0001174940/000149315224052623/forms-1a.htm'

In [45]:
headers = {
        "User-Agent": "MyApp/1.0 (ruoyu@lockrmail.com)",
        "Accept": "application/json",
        "Content-Type": "application/json"
    }
request = requests.get(df.URL[0], headers=headers)

In [46]:
request.status_code

200

In [44]:
bulk_save_edgar_filing(df.URL.iloc[:100].tolist())

Saving aen_s1.html...
Saving ea0214433-04.html...
Saving forms-1a.html...
Saving ea0225566-s1a1_dmint.html...
Saving ea0217603-04.html...
Saving forms-1a.html...
Saving pirs20241220_s1.html...
Saving d898161ds1.html...
Saving gevi_s1a.html...
Saving forms-1.html...
Saving forms-1a.html...
Saving tmb-20240930xs1.html...
Saving ea0208079-06.html...
Saving forms-1.html...
Saving forms-1.html...
Saving ea0226148-s1_nxuinc.html...
Saving tm2426182d6_s1a.html...
Saving amendment.html...
Saving d835594ds1a.html...
Saving e6227_s-1.html...
Saving forms-1a.html...
Saving forms-1a.html...
Saving tmgi_s1a5.html...
Saving forms-1.html...
Saving mayau_s1.html...
Saving nehc-20240930xs1.html...
Saving ea0226237-s1a1_inmedpharma.html...
Saving scpx_s1.html...
Saving lafa_s1.html...
Saving ea0225697-s1_heliocorp.html...
Saving tm2310971-13_s1a.html...
Saving forms-1a.html...
Saving tmgi_s1a4.html...
Saving aclarion_s1a1.html...
Saving forms-1.html...
Saving ny20039820x7_s1a.html...
Saving luxurbanhote