In [3]:
from edgar import Company, set_identity
import re
import os

# Find the first four-digit year (19xx/20xx) that begins a YYYY-MM-DD style date in the text
def find_year_in_text(text):
    """Return the first 19xx/20xx year that begins a ``YYYY-MM-DD`` style date.

    The ``-MM-DD`` tail is only checked through a lookahead, so the returned
    string is the four-digit year alone. If ``text`` holds no such date, the
    sentinel string "Year not found" is returned instead.
    """
    found = re.search(r'\b(19|20)\d{2}(?=-\d{2}-\d{2})', text)
    if found is None:
        return "Year not found"
    # group(0) is the whole match (the full year), not just the century group.
    return found.group(0)

def clean_text(text):
    """Strip non-ASCII characters and normalise whitespace in ``text``.

    Every run of non-ASCII characters is replaced by one space; afterwards
    each run of whitespace (spaces, tabs, newlines) is collapsed to a single
    space and leading/trailing whitespace is trimmed.
    """
    without_non_ascii = re.sub(r'[^\x00-\x7F]+', ' ', text)
    collapsed = re.sub(r'\s+', ' ', without_non_ascii)
    return collapsed.strip()

def process_filings(ticker, max_filings=14):
    """Save the cleaned "Item 1A" section of a company's recent 10-K filings.

    For each of the first ``max_filings`` 10-K filings returned for ``ticker``,
    the Item 1A text is cleaned with ``clean_text`` and written to
    ``../data/item1a_filings/<ticker>/<ticker>_<year>_item1a.txt``. The year
    label is extracted with ``find_year_in_text`` from the first row of the
    filing's balance sheet, and is also written as the first line of the file.

    Args:
        ticker: Ticker symbol passed to ``Company``; also used for the output
            folder and file names.
        max_filings: Upper bound on the number of filings processed. Defaults
            to 14, the limit the original author observed with edgartools.

    Notes:
        * A filing with no Item 1A text is skipped with a message rather than
          aborting the whole run.
        * A filing without a balance sheet is labelled "Year not found".
        * If two filings in one run resolve to the same year label, the later
          one gets the filing index appended so the earlier file is not
          overwritten.
    """
    # Set your identity as required by the EDGAR API
    # NOTE(review): the identity is hard-coded; consider making it configurable.
    set_identity("Krithu Raju kraju30@gatech.edu")

    filings = Company(ticker).get_filings(form="10-K")

    # makedirs creates the intermediate folders as well, so one call suffices.
    ticker_path = os.path.join('..', 'data', 'item1a_filings', ticker)
    os.makedirs(ticker_path, exist_ok=True)

    # Limiting to 14 by default as the Edgartools is not returning past that.
    num_filings_to_process = min(max_filings, len(filings))

    used_stems = set()
    for i in range(num_filings_to_process):
        filing = filings[i].obj()
        item1a = filing["Item 1A"]
        if not item1a:
            # presumably older or amended filings can lack this section -
            # skip them instead of crashing inside clean_text.
            print(f"Skipping filing {i}: no Item 1A section found")
            continue

        # Get the balance sheet as a dataframe and read the year from its
        # first row; tolerate filings that expose no financial data.
        financials = getattr(filing, "financials", None)
        balance_sheet = getattr(financials, "balance_sheet", None)
        if balance_sheet is None:
            year = "Year not found"
        else:
            first_row = balance_sheet.to_dataframe().head(1)
            year = find_year_in_text(first_row.to_string(index=False))
        print(year)

        # Keep the established file name for the first filing of a year and
        # disambiguate later ones so no earlier output is overwritten.
        stem = f"{ticker}_{year}_item1a"
        if stem in used_stems:
            stem = f"{stem}_{i}"
        used_stems.add(stem)

        # os.path.join keeps the separators consistent on every platform.
        filename = os.path.join(ticker_path, f"{stem}.txt")

        cleaned_item1a = clean_text(item1a)
        with open(filename, 'w', encoding='utf-8') as file:
            file.write(f"{year}\n{cleaned_item1a}")

        print(f"Processed {filename}")

In [5]:
# Fetch the 10-K filings for the GOOG ticker and write their Item 1A sections to disk.
process_filings(ticker="GOOG")
print("Completed processing the filings.")

2023
Processed ..\data\item1a_filings\GOOG/GOOG_2023_item1a.txt
2022
Processed ..\data\item1a_filings\GOOG/GOOG_2022_item1a.txt
2021
Processed ..\data\item1a_filings\GOOG/GOOG_2021_item1a.txt
2020
Processed ..\data\item1a_filings\GOOG/GOOG_2020_item1a.txt
2019
Processed ..\data\item1a_filings\GOOG/GOOG_2019_item1a.txt
2018
Processed ..\data\item1a_filings\GOOG/GOOG_2018_item1a.txt
2017
Processed ..\data\item1a_filings\GOOG/GOOG_2017_item1a.txt
2016
Processed ..\data\item1a_filings\GOOG/GOOG_2016_item1a.txt
2015
Processed ..\data\item1a_filings\GOOG/GOOG_2015_item1a.txt
Completed processing the filings.
