In [None]:
# Build the 10-K filing address (URL) list file
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup as bs
import time
import random
import pandas as pd
from openpyxl import Workbook

# Pause length in seconds. It is drawn once, when this cell runs, so every
# time.sleep(delay) in this cell waits the same amount (it is NOT re-randomised
# per request).
delay = random.uniform(1,4.3)
# Chrome options object shared by every webdriver.Chrome(...) created in this cell.
options = Options()

# Collect, for every CIK in the 10-K CIK list, the filing-index rows of that company.
# Each result also carries the company name and the trading symbol.
# Fetching the trading symbol needs a page navigation in Selenium, which is why the
# 10-K and 10-Q collectors are separate functions.
def download_url_10k(cik_number_list, doc_type):
    """Scrape the EDGAR filing index of each CIK and collect (date, link) rows.

    Args:
        cik_number_list: Sized iterable of CIK numbers (int or str); each one is
            zero-padded to 10 digits before being put into the URL.
        doc_type: Value of the ``type`` query parameter (e.g. ``"10-k"``).

    Returns:
        A list of ``(company, symbol, cik_num, date_links_list)`` tuples, one per
        CIK, where ``cik_num`` is the zero-padded string and ``date_links_list``
        is whatever ``date_links`` returned for the index page.
    """
    data = []
    total = len(cik_number_list)
    if total == 0:
        return data

    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    user_agent_arg = f"user-agent={user_agent}"
    # `options` is shared module state: add the argument once instead of appending
    # another copy for every CIK (and for every re-run of the function).
    if user_agent_arg not in options.arguments:
        options.add_argument(user_agent_arg)

    # Resolve the chromedriver binary once; it does not change between CIKs.
    driver_path = ChromeDriverManager().install()

    for finish_count, raw_cik in enumerate(cik_number_list, start=1):
        cik_num = str(raw_cik).zfill(10)
        test_url = f"https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK={cik_num}&type={doc_type}&dateb=&owner=include&count=100&search_text="

        driver = webdriver.Chrome(service=Service(driver_path), options=options)
        try:
            driver.get(test_url)
            time.sleep(delay)
            html_1 = driver.page_source
            time.sleep(delay)
            page_10k = bs(html_1, "html.parser")

            # Company name; the span is absent when the page is not a filing index.
            name_tag = page_10k.find("span", class_="companyName")
            company_name = name_tag.text if name_tag else ""
            # (filing date, interactive-data link) rows
            date_links_list = date_links(page_10k)
            # Trading symbol: only attempt the click-through when the index page
            # actually contains an interactive-data button.
            if page_10k.find("a", id="interactiveDataBtn"):
                symbol = trading_symbol(driver)
            else:
                print("not find Trading symbol....")
                symbol = None

            time.sleep(delay)
        finally:
            # quit() ends the whole browser session (window + chromedriver process),
            # and the finally clause guarantees that even when scraping fails.
            driver.quit()

        # Clean the company name: keep the words that precede the "CIK..." token.
        name_tokens = []
        for token in company_name.split(" "):
            if "CIK" in token:
                break
            name_tokens.append(token)
        company = " ".join(name_tokens)

        data.append((company, symbol, cik_num, date_links_list))
        print(f"successful : {cik_num} ({finish_count}/{total})")
    return data

# The trading symbol can only be looked up from the 10-K listing.
def trading_symbol(driver):
    """Open the first interactive-data report and read its "Trading Symbol" value.

    Args:
        driver: Selenium WebDriver currently showing an EDGAR filing-index page.
            The function clicks the first element with id ``interactiveDataBtn``,
            so the driver is on a different page afterwards.

    Returns:
        The first non-empty text found in a ``td.text`` cell following the
        "Trading Symbol" label inside ``table.report``, or ``None`` when the
        button, the table, the label or a non-empty value is missing.
    """
    # find_elements returns an empty list when nothing matches, whereas
    # find_element raises NoSuchElementException and would abort the whole run.
    buttons = driver.find_elements(By.ID, "interactiveDataBtn")
    if not buttons:
        print("not find Trading symbol....")
        return None

    time.sleep(delay)
    buttons[0].click()
    time.sleep(delay)
    html_2 = driver.page_source
    page_symbol = bs(html_2, "html.parser")
    report_table = page_symbol.find("table", class_="report")
    if report_table:
        for a_tag in report_table.find_all("a", class_="a"):
            if "Trading Symbol" not in a_tag.text:
                continue
            td = a_tag.find_parent("td")
            if not td:
                continue
            # The value sits in a sibling cell of the label cell.
            for sb_td in td.find_next_siblings("td", class_="text"):
                candidate = sb_td.text.strip()
                if candidate:
                    return candidate

    print("not find Trading symbol....")
    # Always None here, never an empty string left over from a blank cell.
    return None

# Collect, per filing-date row, the interactive-data link of that row.
def date_links(parser_datas):
    """Extract ``(filing_date, link)`` pairs from the EDGAR filing table.

    Args:
        parser_datas: Parsed page offering ``select``; rows are taken from
            ``table.tableFile2 tr``.

    Returns:
        A list with one tuple per row that contains a ``YYYY-MM-DD``-shaped cell.
        ``link`` is the ``href`` of the row's ``interactiveDataBtn`` anchor, or
        ``None`` when the row has no such anchor.
    """
    collected = []

    for table_row in parser_datas.select("table.tableFile2 tr"):
        filing_date, href = None, None

        for td in table_row.find_all("td"):
            buttons = td.find_all("a", id="interactiveDataBtn")
            if buttons:
                href = buttons[0]["href"]

            text = td.text.strip()
            # Shape check only: ten characters with dashes at positions 4 and 7.
            looks_like_date = len(text) == 10 and text[4] == "-" and text[7] == "-"
            if looks_like_date:
                filing_date = text

        if filing_date:
            collected.append((filing_date, href))

    return collected

# Save the collected rows to an Excel workbook.
def save2excel(data, path, file_name, doc_type):
    """Write the scraped filing list to ``path + file_name``.

    Args:
        data: Iterable of ``(company, symbol, cik_num, date_links_list)`` tuples.
        path: Output location; it is concatenated with ``file_name`` as-is, so it
            has to end with a path separator.
        file_name: Name of the workbook file.
        doc_type: Used as the worksheet title.
    """
    workbook = Workbook()
    sheet = workbook.active
    sheet.title = doc_type

    # Header row; the download cell later reads these exact column names.
    sheet.append(["Company Name", "Symbol", "CIK", "Filling Date", "Link"])

    for company, symbol, cik_num, date_links_list in data:
        # A company without any filing row still gets one placeholder line.
        filing_rows = date_links_list if date_links_list else [(None, None)]
        for date, link in filing_rows:
            sheet.append([company, symbol, cik_num, date, link])

    target = path + file_name
    workbook.save(target)
    print(f"<{target}> save complite!!!")
    
# Output folder (must end with a separator: save2excel concatenates path + file name).
save2excel_path = "C:\\SEC_Data\\"
save_file_name_10_k = "sec_edgar_bank_section_10_k.xlsx"
# Not used in this cell; the 10-Q cell writes to this name.
save_file_name_10_q = "sec_edgar_bank_section_10_q.xlsx"
# Passed both as the EDGAR "type" query parameter and as the worksheet title.
doc_type = "10-k"
   
# cik_number_list = ["0000750577"] #, "0000729986"]

# Read the CIK numbers to process from the "CIK_number" column of the workbook.
open_path = "C:\\SEC_Data\\CIK_number_list.xlsx"
cik_df = pd.read_excel(open_path)
cik_number_list = cik_df["CIK_number"].tolist()

# Scrape every CIK (opens one Chrome session per CIK).
data = download_url_10k(cik_number_list, doc_type)

# Persist the collected (company, symbol, CIK, date, link) rows.
save2excel(data, save2excel_path, save_file_name_10_k, doc_type)

In [8]:
# Build the 10-Q filing address (URL) list file
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup as bs
import time
import random
import pandas as pd
from openpyxl import Workbook

# Pause length in seconds. It is drawn once, when this cell runs, so every
# time.sleep(delay) in this cell waits the same amount (it is NOT re-randomised
# per request).
delay = random.uniform(1,4.3)
# Chrome options object shared by every webdriver.Chrome(...) created in this cell.
options = Options()

# Collect, for every CIK in the CIK list, the 10-Q filing-index rows of that company.
# Each result carries the company name; the trading symbol is NOT collected here
# (it is always None), because fetching it needs an extra page navigation in Selenium.
# That is why the 10-K and 10-Q collectors are separate functions.
def download_url_10q(cik_number_list, doc_type):
    """Scrape the EDGAR filing index of each CIK and collect (date, link) rows.

    Args:
        cik_number_list: Sized iterable of CIK numbers (int or str); each one is
            zero-padded to 10 digits before being put into the URL.
        doc_type: Value of the ``type`` query parameter (e.g. ``"10-q"``).

    Returns:
        A list of ``(company, symbol, cik_num, date_links_list)`` tuples, one per
        CIK. ``symbol`` is always ``None``; ``cik_num`` is the zero-padded string
        and ``date_links_list`` is whatever ``date_links`` returned for the page.
    """
    data = []
    total = len(cik_number_list)
    if total == 0:
        return data

    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    user_agent_arg = f"user-agent={user_agent}"
    # `options` is shared module state: add the argument once instead of appending
    # another copy for every CIK (and for every re-run of the function).
    if user_agent_arg not in options.arguments:
        options.add_argument(user_agent_arg)

    # Resolve the chromedriver binary once; it does not change between CIKs.
    driver_path = ChromeDriverManager().install()

    for finish_count, raw_cik in enumerate(cik_number_list, start=1):
        cik_num = str(raw_cik).zfill(10)
        test_url = f"https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK={cik_num}&type={doc_type}&dateb=&owner=include&count=100&search_text="

        driver = webdriver.Chrome(service=Service(driver_path), options=options)
        try:
            driver.get(test_url)
            time.sleep(delay)
            html_1 = driver.page_source
            time.sleep(delay)
            page_10q = bs(html_1, "html.parser")

            # Company name; the span is absent when the page is not a filing index.
            name_tag = page_10q.find("span", class_="companyName")
            company_name = name_tag.text if name_tag else ""
            # (filing date, interactive-data link) rows
            date_links_list = date_links(page_10q)
            # Trading symbol is not looked up for 10-Q.
            symbol = None

            time.sleep(delay)
        finally:
            # quit() ends the whole browser session (window + chromedriver process),
            # and the finally clause guarantees that even when scraping fails.
            driver.quit()

        # Clean the company name: keep the words that precede the "CIK..." token.
        name_tokens = []
        for token in company_name.split(" "):
            if "CIK" in token:
                break
            name_tokens.append(token)
        company = " ".join(name_tokens)

        data.append((company, symbol, cik_num, date_links_list))
        print(f"successful : {cik_num} ({finish_count}/{total})")
    return data

# Trading-symbol lookup through the interactive-data viewer.
# NOTE: download_url_10q in this cell does not call it (it sets symbol = None);
# this definition replaces any earlier trading_symbol once the cell has run.
def trading_symbol(driver):
    """Open the first interactive-data report and read its "Trading Symbol" value.

    Args:
        driver: Selenium WebDriver currently showing an EDGAR filing-index page.
            The function clicks the first element with id ``interactiveDataBtn``,
            so the driver is on a different page afterwards.

    Returns:
        The first non-empty text found in a ``td.text`` cell following the
        "Trading Symbol" label inside ``table.report``, or ``None`` when the
        button, the table, the label or a non-empty value is missing.
    """
    # find_elements returns an empty list when nothing matches, whereas
    # find_element raises NoSuchElementException and would abort the whole run.
    buttons = driver.find_elements(By.ID, "interactiveDataBtn")
    if not buttons:
        print("not find Trading symbol....")
        return None

    time.sleep(delay)
    buttons[0].click()
    time.sleep(delay)
    html_2 = driver.page_source
    page_symbol = bs(html_2, "html.parser")
    report_table = page_symbol.find("table", class_="report")
    if report_table:
        for a_tag in report_table.find_all("a", class_="a"):
            if "Trading Symbol" not in a_tag.text:
                continue
            td = a_tag.find_parent("td")
            if not td:
                continue
            # The value sits in a sibling cell of the label cell.
            for sb_td in td.find_next_siblings("td", class_="text"):
                candidate = sb_td.text.strip()
                if candidate:
                    return candidate

    print("not find Trading symbol....")
    # Always None here, never an empty string left over from a blank cell.
    return None

# Collect, per filing-date row, the interactive-data link of that row.
def date_links(parser_datas):
    """Extract ``(filing_date, link)`` pairs from the EDGAR filing table.

    Args:
        parser_datas: Parsed page offering ``select``; rows are taken from
            ``table.tableFile2 tr``.

    Returns:
        A list with one tuple per row that contains a ``YYYY-MM-DD``-shaped cell.
        ``link`` is the ``href`` of the row's ``interactiveDataBtn`` anchor, or
        ``None`` when the row has no such anchor.
    """
    collected = []

    for table_row in parser_datas.select("table.tableFile2 tr"):
        filing_date, href = None, None

        for td in table_row.find_all("td"):
            buttons = td.find_all("a", id="interactiveDataBtn")
            if buttons:
                href = buttons[0]["href"]

            text = td.text.strip()
            # Shape check only: ten characters with dashes at positions 4 and 7.
            looks_like_date = len(text) == 10 and text[4] == "-" and text[7] == "-"
            if looks_like_date:
                filing_date = text

        if filing_date:
            collected.append((filing_date, href))

    return collected

# Save the collected rows to an Excel workbook.
def save2excel(data, path, file_name, doc_type):
    """Write the scraped filing list to ``path + file_name``.

    Args:
        data: Iterable of ``(company, symbol, cik_num, date_links_list)`` tuples.
        path: Output location; it is concatenated with ``file_name`` as-is, so it
            has to end with a path separator.
        file_name: Name of the workbook file.
        doc_type: Used as the worksheet title.
    """
    workbook = Workbook()
    sheet = workbook.active
    sheet.title = doc_type

    # Header row; the download cell later reads these exact column names.
    sheet.append(["Company Name", "Symbol", "CIK", "Filling Date", "Link"])

    for company, symbol, cik_num, date_links_list in data:
        # A company without any filing row still gets one placeholder line.
        filing_rows = date_links_list if date_links_list else [(None, None)]
        for date, link in filing_rows:
            sheet.append([company, symbol, cik_num, date, link])

    target = path + file_name
    workbook.save(target)
    print(f"<{target}> save complite!!!")
    
# Output folder (must end with a separator: save2excel concatenates path + file name).
save2excel_path = "C:\\SEC_Data\\"
# Not used in this cell; the 10-K cell writes to this name.
save_file_name_10_k = "sec_edgar_bank_section_10_k.xlsx"
save_file_name_10_q = "sec_edgar_bank_section_10_q.xlsx"
# Passed both as the EDGAR "type" query parameter and as the worksheet title.
doc_type = "10-q"
   
# cik_number_list = ["0000750577", "0000022356", "0000036377"] #, "0000729986"]

# Read the CIK numbers to process from the "CIK_number" column of the workbook.
open_path = "C:\\SEC_Data\\CIK_number_list.xlsx"
cik_df = pd.read_excel(open_path)
cik_number_list = cik_df["CIK_number"].tolist()

# Scrape every CIK (opens one Chrome session per CIK).
data = download_url_10q(cik_number_list, doc_type)

# Persist the collected (company, symbol, CIK, date, link) rows.
save2excel(data, save2excel_path, save_file_name_10_q, doc_type)

successful : 0000004962 (1/101)
successful : 0000007789 (2/101)
successful : 0000018349 (3/101)
successful : 0000019617 (4/101)
successful : 0000022356 (5/101)
successful : 0000028412 (6/101)
successful : 0000035527 (7/101)
successful : 0000036104 (8/101)
successful : 0000036270 (9/101)
successful : 0000036377 (10/101)
successful : 0000036966 (11/101)
successful : 0000037808 (12/101)
successful : 0000039263 (13/101)
successful : 0000040729 (14/101)
successful : 0000046195 (15/101)
successful : 0000049196 (16/101)
successful : 0000070858 (17/101)
successful : 0000072971 (18/101)
successful : 0000073124 (19/101)
successful : 0000090498 (20/101)
successful : 0000091576 (21/101)
successful : 0000092230 (22/101)
successful : 0000093751 (23/101)
successful : 0000101382 (24/101)
successful : 0000109380 (25/101)
successful : 0000311094 (26/101)
successful : 0000316709 (27/101)
successful : 0000357173 (28/101)
successful : 0000357301 (29/101)
successful : 0000707179 (30/101)
successful : 000071

In [41]:
# Download the files referenced by the list file (currently set up for 10-Q; must be edited before using it for 10-K)

import os
import requests
from pathlib import Path
import re

# NOTE: `random` (and `time` / `pd` used below) are not imported in this cell;
# they are expected to be in the kernel already from the earlier cells.
# Pause in seconds between downloads; drawn once here, so it is the same for every request.
delay = random.uniform(0.8, 1.9)

# 10-Q list: the filings whose links are downloaded.
finished_file_path = "C:\\SEC_Data\\sec_edgar_bank_section_10_q.xlsx"
# 10-K list: download_save only uses it to look up the trading symbol by CIK.
appendix_file_path = "C:\\SEC_Data\\sec_edgar_bank_section_10_k.xlsx"
# Root folder for the downloaded reports and for fail_list.xlsx.
download_folder_path = "C:\\SEC_Data\\240708\\"

sec_10q_df = pd.read_excel(finished_file_path)
sec_10k_df = pd.read_excel(appendix_file_path)
 
# Keep only the rows that actually have an interactive-data link.
link_filtered = sec_10q_df[sec_10q_df["Link"].notna()]

# Browser-like request headers sent with every requests.get call in download_save.
headers = {
            "Host": "www.sec.gov",
            "Sec-Fetch-Site": "none",
            "Accept-Language": "ko-KR,ko;q=0.9",
            "Connection": "keep-alive",
            "Sec-Fetch-Mode": "navigate",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.4.1 Safari/605.1.15",
            "Sec-Fetch-Dest": "document",
            "Accept-Encoding": "gzip, deflate, br",
        }

def extract_accession_number(url):
    """Return the dash-free accession number from a viewer link.

    Looks for ``accession_number=<digits and dashes>`` in ``url`` and strips the
    dashes from the captured value. Returns ``None`` when the parameter is absent.
    """
    found = re.search(r"accession_number=([0-9-]+)", url)
    if found is None:
        return None
    return "".join(found.group(1).split("-"))

def download_save(link_filtered, doc_type="10_Q"):
    """Download the ``Financial_Report`` workbook of every filing row.

    For each row the ``.xlsx`` report is requested first and the ``.xls`` one only
    when the ``.xlsx`` request did not return HTTP 200. Files are stored as
    ``<download_folder_path>/<ticker>_<cik>/<ticker>_<cik>_<date>_<doc_type>.<ext>``.
    Rows that could not be downloaded are written to ``fail_list.xlsx`` in
    ``download_folder_path``; that file is written even when the run is aborted
    by an exception.

    Reads the module-level ``sec_10k_df`` (symbol lookup by CIK), ``headers``,
    ``delay`` and ``download_folder_path``.

    Args:
        link_filtered: DataFrame with the columns ``"CIK"``, ``"Filling Date"``
            and ``"Link"``; every ``Link`` is expected to be a string.
        doc_type: Tag placed in the saved file names. Defaults to ``"10_Q"``.
    """
    total = len(link_filtered)
    download_finish_count = 0
    fail_list = []

    def fetch(url):
        """Return (status_code, body) for ``url``, or (None, None) on a transport error."""
        try:
            response = requests.get(url, headers=headers, timeout=30)
        except requests.RequestException as exc:
            print(f"request error : {url} ({exc})")
            return None, None
        finally:
            # Pause after every request that was attempted.
            time.sleep(delay)
        return response.status_code, response.content

    os.makedirs(download_folder_path, exist_ok=True)

    try:
        for _, row in link_filtered.iterrows():
            cik = str(int(row["CIK"]))
            date = row["Filling Date"]
            link = row["Link"]

            # Ticker comes from the 10-K list; "NA" when the CIK or its symbol is missing.
            symbol_row = sec_10k_df[sec_10k_df["CIK"] == int(cik)]
            symbol = None if symbol_row.empty else symbol_row.iloc[0]["Symbol"]
            if isinstance(symbol, str):
                fixed_ticker = symbol.split("/")[0]
            elif symbol is None or pd.isna(symbol):
                fixed_ticker = "NA"
            else:
                fixed_ticker = str(symbol)

            accession_number = extract_accession_number(link)
            if not accession_number:
                # Record the row instead of dropping it silently.
                fail_list.append((fixed_ticker, cik, date, link))
                print(f"fail url : {link}")
                continue

            base_url = f"https://www.sec.gov/Archives/edgar/data/{int(cik)}/{accession_number}/Financial_Report"
            folder_name = f"{fixed_ticker}_{cik}"
            folder_path = os.path.join(download_folder_path, folder_name)
            os.makedirs(folder_path, exist_ok=True)

            status_codes = []
            for extension in ("xlsx", "xls"):
                status_code, content = fetch(f"{base_url}.{extension}")
                status_codes.append(status_code)
                if status_code != 200:
                    continue
                file_name = f"{fixed_ticker}_{cik}_{date}_{doc_type}.{extension}"
                with open(os.path.join(folder_path, file_name), "wb") as file:
                    file.write(content)
                download_finish_count += 1
                print(f"successful : {folder_name}\\{file_name}, ({download_finish_count}/{total})")
                break
            else:
                # Neither format was available (loop ended without break).
                fail_list.append((fixed_ticker, cik, date, f"{base_url}.xlsx"))
                print(f"status codes (xlsx, xls) : {status_codes}")
                print(f"fail url : {base_url}.xlsx")
    finally:
        fail_list_name = "fail_list.xlsx"
        fail_folder_path = os.path.join(download_folder_path, fail_list_name)
        fail_df = pd.DataFrame(fail_list, columns=["Symbol", "CIK", "Date", "URL"])
        fail_df.to_excel(fail_folder_path, index=False)

In [42]:
# Positional slice: skip the first 304 rows of link_filtered and download the rest
# (presumably resuming an earlier, interrupted run; confirm before re-running).
download_save(link_filtered[304:])

successful : USB_36104\USB_36104_2024-05-01_10_Q.xlsx, (1/3724)
successful : USB_36104\USB_36104_2023-11-01_10_Q.xlsx, (2/3724)
successful : USB_36104\USB_36104_2023-08-07_10_Q.xlsx, (3/3724)
successful : USB_36104\USB_36104_2023-05-08_10_Q.xlsx, (4/3724)
successful : USB_36104\USB_36104_2022-11-01_10_Q.xlsx, (5/3724)
successful : USB_36104\USB_36104_2022-08-04_10_Q.xlsx, (6/3724)
successful : USB_36104\USB_36104_2022-05-03_10_Q.xlsx, (7/3724)
successful : USB_36104\USB_36104_2021-11-02_10_Q.xlsx, (8/3724)
successful : USB_36104\USB_36104_2021-08-03_10_Q.xlsx, (9/3724)
successful : USB_36104\USB_36104_2021-05-04_10_Q.xlsx, (10/3724)
successful : USB_36104\USB_36104_2020-11-05_10_Q.xlsx, (11/3724)
successful : USB_36104\USB_36104_2020-08-06_10_Q.xlsx, (12/3724)
successful : USB_36104\USB_36104_2020-05-07_10_Q.xlsx, (13/3724)
successful : USB_36104\USB_36104_2019-11-08_10_Q.xlsx, (14/3724)
successful : USB_36104\USB_36104_2019-08-01_10_Q.xlsx, (15/3724)
successful : USB_36104\USB_36104_2

In [43]:
# Number of rows that carry a link (the full frame, not the [304:] slice used above).
len(link_filtered)

4028