# Get and clean SEC filing data

In [None]:
def get_cik_and_name_from_ticker():
    """Look up the SEC CIK and company name for the module-level ``TICKER``.

    Downloads the SEC ticker mapping and returns the first entry whose
    ``ticker`` field equals ``TICKER.upper()``.

    Returns:
        A ``(cik, company_name)`` tuple. ``cik`` is a string zero-padded to
        10 digits, the form the ``CIK{cik}.json`` submissions URL built by
        ``get_sec_data`` expects. Returns ``None`` when the HTTP request
        fails (the error is printed, not raised).

    Raises:
        ValueError: If no entry in the mapping matches the ticker.
    """
    url = "https://www.sec.gov/files/company_tickers.json"
    try:
        response = requests.get(url, headers=HEADERS)
        response.raise_for_status()
        data = response.json()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data: {e}")
        return None

    wanted = TICKER.upper()
    for entry in data.values():
        if entry['ticker'] == wanted:
            # Pad to a fixed width of 10: padding by "length + 3" only gave
            # 10 digits when the raw CIK happened to be 7 digits long.
            cik = str(entry['cik_str']).zfill(10)
            return cik, entry['title']

    # Previously this fell through to an UnboundLocalError on `cik`.
    raise ValueError(f"Ticker {wanted!r} not found in SEC company tickers.")

def get_sec_data(cik):
    """Fetch the SEC submissions JSON document for a company.

    Args:
        cik: CIK string interpolated verbatim into the
            ``https://data.sec.gov/submissions/CIK{cik}.json`` URL.

    Returns:
        The decoded JSON body, or ``None`` when a ``requests`` exception is
        raised (the error is printed rather than propagated).
    """
    endpoint = f"https://data.sec.gov/submissions/CIK{cik}.json"
    try:
        reply = requests.get(endpoint, headers=HEADERS)
        reply.raise_for_status()
        return reply.json()
    except requests.exceptions.RequestException as err:
        print(f"Error fetching data: {err}")
    return None

def parse_sec_data(data):
    """Extract the filings whose form type is listed in ``TARGET_FORMS``.

    Reads ``data["filings"]["recent"]``, which is treated as a set of
    parallel lists (``form``, ``accessionNumber``, ``primaryDocument``,
    ``filingDate``, ``reportDate``) indexed by filing position.

    Args:
        data: Mapping holding the submissions document.

    Returns:
        A list of dicts with the keys ``form_type``, ``accession_number``,
        ``primary_document``, ``filing_date`` and ``report_date``, in the
        order the filings appear.

    Raises:
        Exception: If ``filings.recent`` is missing or empty.
    """
    recent = data.get("filings", {}).get("recent", {})
    if not recent:
        raise Exception("No filing data found.")

    forms = recent.get("form", [])
    accession_numbers = recent.get('accessionNumber', [])
    primary_documents = recent.get('primaryDocument', [])
    filing_dates = recent.get('filingDate', [])
    report_dates = recent.get('reportDate', [])

    selected = []
    for idx, form in enumerate(forms):
        if form not in TARGET_FORMS:
            continue
        try:
            record = {
                'form_type': form,
                'accession_number': accession_numbers[idx],
                'primary_document': primary_documents[idx],
                'filing_date': filing_dates[idx],
                'report_date': report_dates[idx],
            }
        except IndexError:
            # One of the parallel lists is shorter than the "form" list.
            print(f"Warning: Data inconsistency at index {idx}. Skipping this filing.")
            continue
        selected.append(record)
    return selected

def construct_sec_url(cik, filing_details):
    """Build the EDGAR archive URL of a filing's primary document.

    Args:
        cik: CIK string; leading zeros are stripped for the URL path.
        filing_details: Mapping with ``accession_number`` (dashes are
            removed) and ``primary_document``.

    Returns:
        ``https://www.sec.gov/Archives/edgar/data/<cik>/<accession>/<document>``.

    Raises:
        Exception: If the accession number or the primary document is
            missing or empty.
    """
    trimmed_cik = cik.lstrip("0")
    accession = filing_details.get("accession_number", "").replace("-", "")
    document = filing_details.get("primary_document", "")

    if not (accession and document):
        raise Exception("Info is missing.")

    base = "https://www.sec.gov/Archives/edgar/data"
    return f"{base}/{trimmed_cik}/{accession}/{document}"

def get_html_text(url):
    """Download a document and return its body as text.

    Args:
        url: Address to fetch with the module-level ``HEADERS``.

    Returns:
        The response text, or an empty string when a ``requests`` exception
        is raised (the error is printed rather than propagated).
    """
    try:
        res = requests.get(url, headers=HEADERS)
        res.raise_for_status()
        return res.text
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data: {e}")
        # Return a defined value: previously the function fell through to
        # `return html_text` with the name unbound and raised
        # UnboundLocalError right after reporting the request error.
        return ""

def html_table_to_markdown(table_tag):
    """
    Converts a BeautifulSoup table tag into a Markdown formatted string.
    This helps preserve the structure of financial data for the LLM.

    Every ``th`` cell found in the table forms the header line, followed by
    a ``---`` separator line. Each ``tr`` then becomes one Markdown row;
    when headers exist, rows whose cell count differs from the header count
    are dropped. Rows made only of ``th`` cells are skipped, because their
    text is already in the header line.

    Args:
        table_tag: Table element exposing ``find_all``; its cells expose
            ``get_text`` and ``name``.

    Returns:
        The Markdown table wrapped in blank lines (``"\\n\\n"`` before and
        after).
    """

    def cell_text(cell):
        # Collapse each cell onto one line so it cannot break the row.
        return cell.get_text(strip=True).replace("\n", "")

    markdown_lines = []

    # process table headers
    headers = [cell_text(th) for th in table_tag.find_all("th")]
    if headers:
        markdown_lines.append("| " + " | ".join(headers) + " |")
        markdown_lines.append("| " + " | ".join(["---"] * len(headers)) + " |")

    # process table rows
    for row in table_tag.find_all("tr"):
        cell_tags = row.find_all(["td", "th"])
        if cell_tags and all(tag.name == "th" for tag in cell_tags):
            # Header row: already emitted above; adding it again would
            # duplicate the header line below the separator.
            continue

        cells = [cell_text(tag) for tag in cell_tags]
        # only add rows that have content and match the header count if headers exist
        if cells and (not headers or len(cells) == len(headers)):
            markdown_lines.append("| " + " | ".join(cells) + " |")

    return "\n\n" + "\n".join(markdown_lines) + "\n\n"

def clean_html(html_text):
    """Reduce an HTML document to plain text with Markdown tables.

    Args:
        html_text: Raw HTML; any falsy value short-circuits to ``""``.

    Returns:
        The document text with script/style/header/footer/nav elements
        removed, each table replaced by the output of
        ``html_table_to_markdown``, and runs of blank lines collapsed.
    """
    if not html_text:
        return ""

    document = BeautifulSoup(html_text, "lxml")

    # Drop elements that carry no filing content.
    for element in document.find_all(['script', 'style', 'header', 'footer', 'nav']):
        element.decompose()

    # Swap every table for a text node holding its Markdown rendering.
    for table_tag in document.find_all("table"):
        rendered = html_table_to_markdown(table_tag)
        table_tag.replace_with(document.new_string(rendered))

    raw_text = document.get_text(separator="\n", strip=True)
    # Collapse two or more consecutive newlines into exactly two.
    return re.sub(r'\n\n+', '\n\n', raw_text)

def chunk_filing_by_section(cleaned_text, metadata):
    """Split filing text into chunks at "Item N" headings.

    The text is split on a case-insensitive ``item <number>[letter][.]``
    pattern. The text before the first heading becomes an "Introduction"
    chunk; each heading is paired with the text that follows it. Only
    pieces longer than 20 whitespace-separated words are kept.

    Args:
        cleaned_text: Plain text of the filing.
        metadata: Mapping copied into every chunk's ``metadata`` with a
            ``section`` key added.

    Returns:
        A list of ``{"content": ..., "metadata": ...}`` dicts in document
        order.
    """

    def is_substantial(text):
        return len(text.split()) > 20

    # The capture group keeps each heading in the result, so the list reads
    # [intro, heading, body, heading, body, ...].
    pieces = re.split(r'(?i)(item\s*\d+[a-z]?\.?)', cleaned_text)

    sections = []

    preamble = pieces[0].strip()
    if is_substantial(preamble):
        sections.append({
            "content": preamble,
            "metadata": {**metadata, "section": "Introduction"},
        })

    for raw_heading, raw_body in zip(pieces[1::2], pieces[2::2]):
        heading = raw_heading.strip()
        body = raw_body.strip()
        if not is_substantial(body):
            continue
        sections.append({
            "content": f"{heading}\n\n{body}",
            "metadata": {**metadata, "section": heading},
        })

    return sections

def get_filing_chunks(extracted_data):
    """Download, clean and chunk every filing in ``extracted_data``.

    For each filing the document URL is built with the module-level ``cik``
    and stored on the filing dict under ``"url"`` (the dict is mutated in
    place and then used as the chunk metadata).

    Args:
        extracted_data: Iterable of filing dicts with at least
            ``form_type``, ``accession_number`` and ``primary_document``.
            A falsy value yields an empty result.

    Returns:
        A dict mapping ``"<TICKER>-<form_type>-<accession_number>"`` to the
        list of chunks of that filing. Filings that produce no chunks are
        skipped with a printed warning.
    """
    chunks = {}
    if not extracted_data:
        return chunks

    for filing in extracted_data:
        url = construct_sec_url(cik, filing)
        filing["url"] = url
        cleaned_text = clean_html(get_html_text(url))
        filing_chunks = chunk_filing_by_section(cleaned_text, filing)

        if not filing_chunks:
            # Indexing chunk[0] here used to raise IndexError and abort the
            # whole run when a document was empty or had no long sections.
            print(f"Warning: No sections extracted from {url}. Skipping this filing.")
            continue

        # identifier for a filing; same values the chunk metadata carries
        filing_id = f"{TICKER}-{filing['form_type']}-{filing['accession_number']}"
        chunks[filing_id] = filing_chunks

    return chunks