In [None]:
%pip install edgartools llama-parse pywebcopy nest_asyncio huggingface_hub

In [None]:
%pip install lxml_html_clean
import nest_asyncio; nest_asyncio.apply()
import os
from edgar import Company, set_identity
from pywebcopy import save_webpage
from llama_parse import LlamaParse

# Set SEC identity
# Replace the placeholder with your own name and email address.
set_identity("Your Name Youremail@example.com")

# Define the output directory (the downloaded page and the parsed text both
# end up underneath it)
output_dir = "./data"

# Which filing to fetch. Change these two values to pull a different company
# or report type; this example is for Apple's 10-Q (quarterly) reports.
# More options can be seen in the edgartools docs.
ticker = "AAPL"
form_type = "10-Q"

# Fetch the latest filing of the requested type.
# NOTE: the plural name is kept for compatibility, but the value is used as a
# single filing (its `.document.url` is read below), not as a collection.
filings = Company(ticker).get_filings(form=form_type).latest(1)
if filings is None:
    # Fail early with a clear message instead of an AttributeError on the
    # `.document` access below when the lookup comes back empty.
    raise ValueError(f"No {form_type} filings found for {ticker}.")
document_url = filings.document.url

# Save the webpage (and the assets it references) under output_dir
save_webpage(
    url=document_url,
    project_folder=output_dir,
    open_in_browser=False
)

# File Search
# Locate the downloaded HTML page underneath output_dir. Every match is
# collected and the most recently modified one is chosen, so a page left over
# from an earlier run is not picked up just because os.walk reaches it first.
file_extension = ".html"
html_extensions = (file_extension, ".htm")  # also accept the short .htm spelling
html_file_path = None
html_candidates = [
    os.path.join(root, file)
    for root, dirs, files in os.walk(output_dir)
    for file in files
    if file.lower().endswith(html_extensions)  # case-insensitive match
]
if html_candidates:
    # Newest modification time wins; html_file_path stays None when nothing matched.
    html_file_path = max(html_candidates, key=os.path.getmtime)

if html_file_path:
    print(f"Found file: {html_file_path}")

    # Initialize LlamaParse
    # The Llama-Parse access key (generated from the Llama-Index website) is
    # read from the LLAMA_CLOUD_API_KEY environment variable when it is set,
    # so the key does not have to be stored in the notebook. Otherwise,
    # replace the "llx-." placeholder with your key.
    parser = LlamaParse(
        api_key=os.environ.get("LLAMA_CLOUD_API_KEY", "llx-."),
        result_type="markdown",  # or "text"
        verbose=True,
    )

    # Ask for the filename (including the extension) of the parsed file.
    # An empty answer falls back to a default; otherwise the joined path would
    # point at the directory itself and opening it for writing would fail.
    default_filename = "parsed_filing.txt"
    output_filename = input(
        "Enter the filename for the parsed content (e.g., 'parsed_filing.txt'): "
    ).strip() or default_filename
    output_file_path = os.path.join(output_dir, output_filename)

    try:
        documents = parser.load_data(html_file_path)
        if not documents:
            # Do not report success (or leave an empty file behind) when the
            # parser produced nothing; the handler below prints the message.
            raise ValueError("the parser returned no documents")
        # Save the parsed content to the file, separating consecutive
        # documents with a blank line so their text does not run together.
        with open(output_file_path, 'w', encoding='utf-8') as output_file:
            output_file.write("\n\n".join(doc.text for doc in documents))
        print(f"Parsed content saved as: {output_file_path}")
    except Exception as e:
        # Best effort: report the problem and let the notebook continue.
        print(f"Error processing the file: {e}")
else:
    print("No HTML files found in the directory.")


In [None]:
# Optional: the following steps additionally upload the parsed file to the Hugging Face Hub
from huggingface_hub import HfApi, HfFolder
from huggingface_hub import create_repo
# Hugging Face API token: read from the HF_TOKEN environment variable when it
# is set, so the token does not have to be stored in the notebook. Otherwise,
# replace the 'hf_..' placeholder with your token.
hf_token = os.environ.get("HF_TOKEN", 'hf_..')
api = HfApi()
# Define the repo name
repo_id = ''

if not repo_id:
    # An empty repo id can never succeed; say so directly instead of letting
    # both the create and the upload call fail with less obvious errors.
    print("Please set repo_id before running this cell.")
else:
    # Create the repository if it doesn't exist. exist_ok=True makes this a
    # no-op for an existing repository, so re-running the cell is not an error.
    try:
        create_repo(repo_id=repo_id, token=hf_token, exist_ok=True)
        print(f"Repository {repo_id} is ready.")
    except Exception as e:
        print(f"Error creating the repository: {e}")
    # Persist the token locally for later use.
    # NOTE(review): HfFolder appears to be deprecated in recent huggingface_hub
    # releases; confirm against the installed version.
    HfFolder.save_token(hf_token)
    try:
        # Upload the parsed file (produced by the parsing cell) to Hugging Face
        api.upload_file(
            path_or_fileobj=output_file_path,
            path_in_repo=output_filename,
            repo_id=repo_id,
            token=hf_token
        )
        print(f"File uploaded successfully to Hugging Face repository: {repo_id}")
    except Exception as e:
        print(f"Error uploading the file: {e}")


In [1]:
# Please clear output_dir after each run so that HTML files left over from earlier runs cannot be mistaken for the current filing.

In [None]:
#Made by Arnav 