In [None]:
import os
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

# Configuration
INDEX_CSV_PATH = r'C:\Users\lishe\Documents\GitHub\Programming-Study\Other\putnc736bnq5aeat.csv'  # Replace with the path to your index file
SAVE_DIR = 'sec_filings'      # Directory where downloaded files are saved
USER_AGENT = 'Your Name (your.email@example.com)'  # Replace with your name and e-mail; make sure it contains ASCII characters only

# Create the save directory if it does not exist yet.
os.makedirs(SAVE_DIR, exist_ok=True)

# Read the index data. dtype=str keeps every column as text instead of
# letting pandas infer numeric types.
try:
    df = pd.read_csv(INDEX_CSV_PATH, dtype=str)
    df.columns = df.columns.str.strip().str.lower()  # Clean and normalise the column names
    print(f"成功读取索引文件：{INDEX_CSV_PATH}")
    print(f"列名：{df.columns.tolist()}")  # Show the column names for a quick sanity check
except Exception as e:
    print(f"读取索引文件失败：{e}")
    # `exit` is a helper that the `site` module provides for interactive
    # sessions; raising SystemExit directly ends the run with the same
    # exit status without depending on that helper being available.
    raise SystemExit(1)

# Make sure the table contains the columns the rest of the script needs.
required_columns = ('cik', 'form')
if any(column not in df.columns for column in required_columns):
    print("索引文件缺少必要的列：'cik' 或 'form'")
    raise SystemExit(1)

# Generic download helper that also validates the downloaded content.
def download_and_validate_file(file_url, cik, filing_date, form_type, extension='html', timeout=30):
    """Download *file_url* into SAVE_DIR and keep it only if it mentions a 10-K.

    The response body is written to
    ``{cik}_{filing_date}_{form_type}.{extension}`` inside SAVE_DIR.  The
    first 500 lines of the saved file are then scanned, case-insensitively,
    for "10-K" or "ANNUAL REPORT"; if neither appears the file is deleted.

    Args:
        file_url: URL of the document to fetch.
        cik: Used only to build the local file name.
        filing_date: Used only to build the local file name.
        form_type: Used only to build the local file name.
        extension: Extension of the local file name.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        True if the file was downloaded and kept; False if the request
        failed, the status code was not 200, or no keyword was found.

    Errors raised while writing or deleting the local file are not caught.
    """
    headers = {'User-Agent': USER_AGENT}
    try:
        # Without an explicit timeout a stalled connection would block the
        # whole run indefinitely.
        response = requests.get(file_url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        print(f"无法下载文件：{file_url}，错误：{exc}")
        return False

    if response.status_code != 200:
        print(f"无法下载文件：{file_url}，状态码：{response.status_code}")
        return False

    file_name = f'{cik}_{filing_date}_{form_type}.{extension}'
    file_path = os.path.join(SAVE_DIR, file_name)
    with open(file_path, 'wb') as file:
        file.write(response.content)
    print(f'文件已保存至：{file_path}')

    # Scan at most the first 500 lines. errors='ignore' drops undecodable
    # bytes, so reading cannot raise UnicodeDecodeError here.
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
        for _, line in zip(range(500), file):
            upper_line = line.upper()
            # 'FORM 10-K' always contains '10-K', so one test covers both.
            if '10-K' in upper_line or 'ANNUAL REPORT' in upper_line:
                print(f'文件包含FORM 10-K：{file_path}')
                return True  # Keep the file

    # No keyword in the scanned lines: remove the file again.
    print(f'文件不包含FORM 10-K或相关内容，已删除：{file_path}')
    os.remove(file_path)
    return False

# Walk through every (CIK, form) row of the index and download its 10-K documents.
def _fetch(url, headers, timeout=30):
    """Return the response for *url*, or None when the request itself fails."""
    try:
        # An explicit timeout keeps one stalled connection from blocking the run.
        return requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        print(f'请求失败：{url}，错误：{exc}')
        return None


headers = {'User-Agent': USER_AGENT}  # Identical for every request, so build it once

for _, index_row in tqdm(df.iterrows(), total=df.shape[0], desc="处理每个公司"):
    # Empty CSV cells are read as NaN (a float) even with dtype=str, and
    # NaN has no .strip(); skip such rows instead of crashing.
    if pd.isna(index_row['cik']) or pd.isna(index_row['form']):
        print('跳过缺少 cik 或 form 的索引行')
        continue

    cik = index_row['cik'].strip()
    form_type = index_row['form'].strip().upper()

    if form_type != '10-K':
        continue  # Only 10-K rows are processed

    # Build the URL of the company's filing list (XML output).
    company_url = f'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK={cik}&type={form_type}&count=100&owner=exclude&output=xml'
    company_response = _fetch(company_url, headers)
    if company_response is None:
        continue
    if company_response.status_code != 200:
        print(f'无法访问公司页面：{company_url}，状态码：{company_response.status_code}')
        continue

    # NOTE(review): the response is XML but is parsed with the HTML-oriented
    # 'lxml' parser; the lower-case tag lookups below appear to rely on that
    # parser lower-casing tag names -- confirm before switching parsers.
    filing_list = BeautifulSoup(company_response.content, 'lxml')

    # Visit every filing in the list.
    for filing in filing_list.find_all('filing'):
        date_tag = filing.find('datefiled')
        href_tag = filing.find('filinghref')
        if date_tag is None or href_tag is None:
            continue  # Incomplete entry: nothing to follow
        filing_date = date_tag.text
        filing_href = href_tag.text
        print(f'发现提交日期为{filing_date}的{form_type}文件：{filing_href}')

        # Open the filing detail page.
        detail_response = _fetch(filing_href, headers)
        if detail_response is None:
            continue
        if detail_response.status_code != 200:
            print(f'无法访问提交详情页面，状态码：{detail_response.status_code}')
            continue

        # Collect every document table on the detail page.
        detail_page = BeautifulSoup(detail_response.content, 'html.parser')
        doc_tables = detail_page.find_all('table', class_='tableFile')

        found_10k = False
        for doc_table in doc_tables:
            table_rows = doc_table.find_all('tr')
            print(f'解析文件表格，找到 {len(table_rows)} 行记录')
            for table_row in table_rows:
                columns = table_row.find_all('td')
                if len(columns) < 4:
                    continue
                # The fourth cell is compared against the form type; the
                # third cell is searched for the document link.
                doc_type = columns[3].text.strip().upper()
                doc_link = columns[2].find('a', href=True)
                if doc_type != '10-K' or doc_link is None:
                    continue

                found_10k = True
                # Drop the "/ix?doc=" part of the link when it is present.
                doc_href = doc_link['href'].replace('/ix?doc=', '')

                file_url = f'https://www.sec.gov{doc_href}'
                print(f'正在下载10-K文件：{file_url}')

                # Pick the local extension from the link and download.
                if doc_href.endswith(('.htm', '.html')):
                    download_and_validate_file(file_url, cik, filing_date, form_type, extension='html')
                elif doc_href.endswith('.txt'):
                    download_and_validate_file(file_url, cik, filing_date, form_type, extension='txt')
                else:
                    print(f'未找到合适的文件链接：{doc_href}')
            if found_10k:
                break  # A table with a 10-K was handled; skip the remaining tables
        if not found_10k:
            print(f'未在提交日期为{filing_date}的文件中找到10-K文件类型')


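Fixed date: the version below only downloads a filing when its filing date matches the `fdate` column of the index file.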

In [None]:
# This version is usable, but it prints too much output; that still needs to be fixed.
import os
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import fitz  # PyMuPDF for PDF text extraction

# Configuration
INDEX_CSV_PATH = r'C:\Users\lishe\Documents\GitHub\Programming-Study\Other\putnc736bnq5aeat.csv'  # Replace with your index file path
SAVE_DIR = 'sec_filings'      # Directory to save downloaded files
USER_AGENT = 'Your Name (your.email@example.com)'  # Replace with your name and email (ASCII only)

# Create save directory if it doesn't exist
os.makedirs(SAVE_DIR, exist_ok=True)

# Read the index data. dtype=str keeps every column as text instead of
# letting pandas infer numeric types.
try:
    df = pd.read_csv(INDEX_CSV_PATH, dtype=str)
    df.columns = df.columns.str.strip().str.lower()  # Clean and standardize column names
    print(f"Successfully read index file: {INDEX_CSV_PATH}")
    print(f"Columns: {df.columns.tolist()}")  # Check column names
except Exception as e:
    print(f"Failed to read index file: {e}")
    # `exit` is a helper that the `site` module provides for interactive
    # sessions; raising SystemExit directly ends the run with the same
    # exit status without depending on that helper being available.
    raise SystemExit(1)

# Ensure required columns exist
required_columns = ('cik', 'form', 'fdate')
if any(column not in df.columns for column in required_columns):
    print("Index file missing required columns: 'cik', 'form', or 'fdate'")
    raise SystemExit(1)

# Download and validate function
def download_and_validate_file(file_url, cik, filing_date, form_type, extension='html', timeout=30):
    """Download *file_url* into SAVE_DIR and keep it only if it mentions a 10-K.

    The response body is written to
    ``{cik}_{filing_date}_{form_type}.{extension}`` inside SAVE_DIR and then
    scanned, case-insensitively, for "10-K" or "ANNUAL REPORT":

    * ``extension == 'pdf'``: the text of the first 5 pages is checked.
    * anything else: the first 500 lines of the file are checked.

    If no keyword is found (or the PDF cannot be read) the file is deleted.

    Args:
        file_url: URL of the document to fetch.
        cik: Used only to build the local file name.
        filing_date: Used only to build the local file name.
        form_type: Used only to build the local file name.
        extension: Extension of the local file name; also selects the
            PDF or text check.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        True if the file was downloaded and kept; False if the request
        failed, the status code was not 200, or no keyword was found.

    Errors raised while writing or deleting the local file are not caught.
    """
    # 'FORM 10-K' always contains '10-K', so it needs no separate entry.
    keywords = ('10-K', 'ANNUAL REPORT')

    headers = {'User-Agent': USER_AGENT}
    try:
        # Without an explicit timeout a stalled connection would block the
        # whole run indefinitely.
        response = requests.get(file_url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to download file: {file_url}, Error: {exc}")
        return False

    if response.status_code != 200:
        print(f"Failed to download file: {file_url}, Status Code: {response.status_code}")
        return False

    file_name = f'{cik}_{filing_date}_{form_type}.{extension}'
    file_path = os.path.join(SAVE_DIR, file_name)
    with open(file_path, 'wb') as file:
        file.write(response.content)
    print(f'File saved to: {file_path}')

    if extension == 'pdf':
        try:
            with fitz.open(file_path) as pdf:
                for page_num in range(min(5, pdf.page_count)):  # Check the first 5 pages
                    page_text = pdf[page_num].get_text().upper()
                    if any(keyword in page_text for keyword in keywords):
                        print(f'File contains FORM 10-K: {file_path}')
                        return True
        except Exception as e:
            # Best effort: an unreadable PDF is reported and then treated
            # like a file without keywords (deleted below).
            print(f"Error reading PDF file: {file_path}, Error: {e}")
    else:
        # HTML or TXT: scan at most the first 500 lines. errors='ignore'
        # drops undecodable bytes, so reading cannot raise
        # UnicodeDecodeError here.
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
            for _, line in zip(range(500), file):
                upper_line = line.upper()
                if any(keyword in upper_line for keyword in keywords):
                    print(f'File contains FORM 10-K: {file_path}')
                    return True

    # Delete file if it doesn't contain the necessary content
    print(f'File does not contain FORM 10-K or related content, deleting: {file_path}')
    os.remove(file_path)
    return False

# Process each CIK and Form in the CSV, downloading only the filing whose
# date matches the row's fdate.
def _fetch(url, headers, timeout=30):
    """Return the response for *url*, or None when the request itself fails."""
    try:
        # An explicit timeout keeps one stalled connection from blocking the run.
        return requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        print(f'Request failed: {url}, Error: {exc}')
        return None


headers = {'User-Agent': USER_AGENT}  # Identical for every request, so build it once

for _, index_row in tqdm(df.iterrows(), total=df.shape[0], desc="Processing each company"):
    # Empty CSV cells are read as NaN (a float) even with dtype=str, and
    # NaN has no .strip(); skip such rows instead of crashing.
    if any(pd.isna(index_row[column]) for column in ('cik', 'form', 'fdate')):
        print('Skipping index row with missing cik, form, or fdate')
        continue

    cik = index_row['cik'].strip()
    form_type = index_row['form'].strip().upper()
    fdate = index_row['fdate'].strip()  # Expected filing date

    if form_type != '10-K':
        continue  # Only process 10-K files

    # Build company filing page URL (XML output)
    company_url = f'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK={cik}&type={form_type}&count=100&owner=exclude&output=xml'
    company_response = _fetch(company_url, headers)
    if company_response is None:
        continue
    if company_response.status_code != 200:
        print(f'Cannot access company page: {company_url}, Status Code: {company_response.status_code}')
        continue

    # NOTE(review): the response is XML but is parsed with the HTML-oriented
    # 'lxml' parser; the lower-case tag lookups below appear to rely on that
    # parser lower-casing tag names -- confirm before switching parsers.
    filing_list = BeautifulSoup(company_response.content, 'lxml')

    # Iterate through each filing
    for filing in filing_list.find_all('filing'):
        date_tag = filing.find('datefiled')
        href_tag = filing.find('filinghref')
        if date_tag is None or href_tag is None:
            continue  # Incomplete entry: nothing to compare or follow
        filing_date = date_tag.text
        if filing_date != fdate:
            continue  # Skip filings that don't match the fdate

        filing_href = href_tag.text
        print(f'Found {form_type} file dated {filing_date}: {filing_href}')

        # Access filing detail page
        detail_response = _fetch(filing_href, headers)
        if detail_response is None:
            continue
        if detail_response.status_code != 200:
            print(f'Cannot access filing detail page, Status Code: {detail_response.status_code}')
            continue

        # Parse filing detail page and collect its document tables
        detail_page = BeautifulSoup(detail_response.content, 'html.parser')
        doc_tables = detail_page.find_all('table', class_='tableFile')

        found_10k = False
        for doc_table in doc_tables:
            table_rows = doc_table.find_all('tr')
            print(f'Parsing document table with {len(table_rows)} rows')
            for table_row in table_rows:
                columns = table_row.find_all('td')
                if len(columns) < 4:
                    continue
                # The fourth cell is compared against the form type; the
                # third cell is searched for the document link.
                doc_type = columns[3].text.strip().upper()
                doc_link = columns[2].find('a', href=True)
                if doc_type != '10-K' or doc_link is None:
                    continue

                found_10k = True
                # Remove "/ix?doc=" if present
                doc_href = doc_link['href'].replace('/ix?doc=', '')

                file_url = f'https://www.sec.gov{doc_href}'
                print(f'Downloading 10-K file: {file_url}')

                # Determine the local file extension from the link
                if doc_href.endswith(('.htm', '.html')):
                    extension = 'html'
                elif doc_href.endswith('.txt'):
                    extension = 'txt'
                elif doc_href.endswith('.pdf'):
                    extension = 'pdf'
                else:
                    extension = None

                if extension is None:
                    print(f'No suitable file link found: {doc_href}')
                else:
                    download_and_validate_file(file_url, cik, filing_date, form_type, extension=extension)
            if found_10k:
                break  # A table with a 10-K was handled; skip the remaining tables
        if not found_10k:
            print(f'No 10-K file type found for filing date: {filing_date}')


In [2]:
import os
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import fitz  # PyMuPDF for PDF text extraction

# Configuration
INDEX_CSV_PATH = r'C:\Users\lishe\Documents\GitHub\Programming-Study\Other\putnc736bnq5aeat.csv'  # Replace with your index file path
SAVE_DIR = 'sec_filings'      # Directory to save downloaded files
LOG_FILE = 'download_log.txt' # Log file for recording details of each download
USER_AGENT = 'Your Name (your.email@example.com)'  # Replace with your name and email (ASCII only)

# Create save directory if it doesn't exist
os.makedirs(SAVE_DIR, exist_ok=True)

# Open log file. The handle stays open because later code writes to it.
# An explicit encoding keeps a non-ASCII path or URL from failing under a
# platform-default codec.
log_file = open(LOG_FILE, 'w', encoding='utf-8')

# Read the index data. dtype=str keeps every column as text instead of
# letting pandas infer numeric types.
try:
    df = pd.read_csv(INDEX_CSV_PATH, dtype=str)
    df.columns = df.columns.str.strip().str.lower()  # Clean and standardize column names
    log_file.write(f"Successfully read index file: {INDEX_CSV_PATH}\n")
except Exception as e:
    print(f"Failed to read index file: {e}")
    log_file.close()  # Do not leave the log handle open on the way out
    # `exit` is a helper that the `site` module provides for interactive
    # sessions; raising SystemExit directly ends the run with the same
    # exit status without depending on that helper being available.
    raise SystemExit(1)

# Ensure required columns exist
required_columns = ('cik', 'form', 'fdate')
if any(column not in df.columns for column in required_columns):
    print("Index file missing required columns: 'cik', 'form', or 'fdate'")
    log_file.close()
    raise SystemExit(1)

# Download and validate function
def download_and_validate_file(file_url, cik, filing_date, form_type, extension='html', timeout=30):
    """Download *file_url* into SAVE_DIR and keep it only if it mentions a 10-K.

    The response body is written to
    ``{cik}_{filing_date}_{form_type}.{extension}`` inside SAVE_DIR and then
    scanned, case-insensitively, for "10-K" or "ANNUAL REPORT":

    * ``extension == 'pdf'``: the text of the first 5 pages is checked.
    * anything else: the first 500 lines of the file are checked.

    If no keyword is found (or the PDF cannot be read) the file is deleted.
    Progress is written to the module-level ``log_file``; failures are
    printed.

    Args:
        file_url: URL of the document to fetch.
        cik: Used only to build the local file name.
        filing_date: Used only to build the local file name.
        form_type: Used only to build the local file name.
        extension: Extension of the local file name; also selects the
            PDF or text check.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        True if the file was downloaded and kept; False if the request
        failed, the status code was not 200, or no keyword was found.

    Errors raised while writing or deleting the local file are not caught.
    """
    # 'FORM 10-K' always contains '10-K', so it needs no separate entry.
    keywords = ('10-K', 'ANNUAL REPORT')

    headers = {'User-Agent': USER_AGENT}
    try:
        # Without an explicit timeout a stalled connection would block the
        # whole run indefinitely.
        response = requests.get(file_url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to download file: {file_url}, Error: {exc}")
        return False

    if response.status_code != 200:
        print(f"Failed to download file: {file_url}, Status Code: {response.status_code}")
        return False

    file_name = f'{cik}_{filing_date}_{form_type}.{extension}'
    file_path = os.path.join(SAVE_DIR, file_name)
    with open(file_path, 'wb') as file:
        file.write(response.content)
    log_file.write(f'File saved to: {file_path}\n')

    if extension == 'pdf':
        try:
            with fitz.open(file_path) as pdf:
                for page_num in range(min(5, pdf.page_count)):  # Check the first 5 pages
                    page_text = pdf[page_num].get_text().upper()
                    if any(keyword in page_text for keyword in keywords):
                        log_file.write(f'File contains FORM 10-K: {file_path}\n')
                        return True
        except Exception as e:
            # Best effort: an unreadable PDF is reported and then treated
            # like a file without keywords (deleted below).
            print(f"Error reading PDF file: {file_path}, Error: {e}")
    else:
        # HTML or TXT: scan at most the first 500 lines. errors='ignore'
        # drops undecodable bytes, so reading cannot raise
        # UnicodeDecodeError here.
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
            for _, line in zip(range(500), file):
                upper_line = line.upper()
                if any(keyword in upper_line for keyword in keywords):
                    log_file.write(f'File contains FORM 10-K: {file_path}\n')
                    return True

    # Delete file if it doesn't contain the necessary content
    log_file.write(f'File does not contain FORM 10-K or related content, deleting: {file_path}\n')
    os.remove(file_path)
    return False

# Process each CIK and Form in the CSV, downloading only the filing whose
# date matches the row's fdate. Progress goes to log_file; failures are printed.
def _fetch(url, headers, timeout=30):
    """Return the response for *url*, or None when the request itself fails."""
    try:
        # An explicit timeout keeps one stalled connection from blocking the run.
        return requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        print(f'Request failed: {url}, Error: {exc}')
        return None


headers = {'User-Agent': USER_AGENT}  # Identical for every request, so build it once

try:
    for _, index_row in tqdm(df.iterrows(), total=df.shape[0], desc="Processing each company"):
        # Empty CSV cells are read as NaN (a float) even with dtype=str,
        # and NaN has no .strip(); skip such rows instead of crashing.
        if any(pd.isna(index_row[column]) for column in ('cik', 'form', 'fdate')):
            log_file.write('Skipping index row with missing cik, form, or fdate\n')
            continue

        cik = index_row['cik'].strip()
        form_type = index_row['form'].strip().upper()
        fdate = index_row['fdate'].strip()  # Expected filing date

        if form_type != '10-K':
            continue  # Only process 10-K files

        # Build company filing page URL (XML output)
        company_url = f'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK={cik}&type={form_type}&count=100&owner=exclude&output=xml'
        company_response = _fetch(company_url, headers)
        if company_response is None:
            continue
        if company_response.status_code != 200:
            print(f'Cannot access company page: {company_url}, Status Code: {company_response.status_code}')
            continue

        # NOTE(review): the response is XML but is parsed with the
        # HTML-oriented 'lxml' parser; the lower-case tag lookups below
        # appear to rely on that parser lower-casing tag names -- confirm
        # before switching parsers.
        filing_list = BeautifulSoup(company_response.content, 'lxml')

        # Iterate through each filing
        for filing in filing_list.find_all('filing'):
            date_tag = filing.find('datefiled')
            href_tag = filing.find('filinghref')
            if date_tag is None or href_tag is None:
                continue  # Incomplete entry: nothing to compare or follow
            filing_date = date_tag.text
            if filing_date != fdate:
                continue  # Skip filings that don't match the fdate

            filing_href = href_tag.text
            log_file.write(f'Found {form_type} file dated {filing_date}: {filing_href}\n')

            # Access filing detail page
            detail_response = _fetch(filing_href, headers)
            if detail_response is None:
                continue
            if detail_response.status_code != 200:
                print(f'Cannot access filing detail page, Status Code: {detail_response.status_code}')
                continue

            # Parse filing detail page and collect its document tables
            detail_page = BeautifulSoup(detail_response.content, 'html.parser')
            doc_tables = detail_page.find_all('table', class_='tableFile')

            found_10k = False
            for doc_table in doc_tables:
                table_rows = doc_table.find_all('tr')
                log_file.write(f'Parsing document table with {len(table_rows)} rows\n')
                for table_row in table_rows:
                    columns = table_row.find_all('td')
                    if len(columns) < 4:
                        continue
                    # The fourth cell is compared against the form type;
                    # the third cell is searched for the document link.
                    doc_type = columns[3].text.strip().upper()
                    doc_link = columns[2].find('a', href=True)
                    if doc_type != '10-K' or doc_link is None:
                        continue

                    found_10k = True
                    # Remove "/ix?doc=" if present
                    doc_href = doc_link['href'].replace('/ix?doc=', '')

                    file_url = f'https://www.sec.gov{doc_href}'
                    log_file.write(f'Downloading 10-K file: {file_url}\n')

                    # Determine the local file extension from the link
                    if doc_href.endswith(('.htm', '.html')):
                        extension = 'html'
                    elif doc_href.endswith('.txt'):
                        extension = 'txt'
                    elif doc_href.endswith('.pdf'):
                        extension = 'pdf'
                    else:
                        extension = None

                    if extension is None:
                        log_file.write(f'No suitable file link found: {doc_href}\n')
                    else:
                        download_and_validate_file(file_url, cik, filing_date, form_type, extension=extension)
                if found_10k:
                    break  # A table with a 10-K was handled; skip the remaining tables
            if not found_10k:
                log_file.write(f'No 10-K file type found for filing date: {filing_date}\n')
finally:
    # Close the log even when the run is interrupted or crashes, so that
    # buffered entries are flushed to disk.
    log_file.close()


  soup = BeautifulSoup(response.content, 'lxml')
Processing each company: 100%|██████████| 92285/92285 [22:18:56<00:00,  1.15it/s]    
