In [None]:
from sympy import symbols, Eq, solve

# Unknowns: market price P and total quantity Q.
P, Q = symbols('P Q')

# Quantity demanded by each group, written as a function of the price.
Q_A = (677 - P) / 0.4  # group A
Q_B = (723 - P) / 0.1  # group B

# Market demand is the sum of the two groups' demands.
# Algebraically this is Q = (3569 - 5*P) / 0.4.
aggregate_demand = Q_A + Q_B
total_demand_eq = Eq(Q, aggregate_demand)

# Supply curve, written with the price as a function of the quantity.
supply_price = 338.5 + 0.4 * Q
supply_eq = Eq(P, supply_price)

# Solve supply and demand simultaneously for the equilibrium price and quantity.
equilibrium_solution = solve([supply_eq, total_demand_eq], (P, Q))

# Bare last expression so the notebook displays the equilibrium.
equilibrium_solution


In [None]:
import pandas as pd

# Read the CSV in chunks so that only the first 10 rows are ever parsed,
# however large the file is.
chunksize = 10  # rows per chunk

# read_csv(chunksize=...) returns a reader object that keeps the file open.
# Using it as a context manager closes the handle as soon as the first chunk
# has been taken, instead of leaving it open for the rest of the session.
with pd.read_csv(r'C:\Users\lishe\Downloads\gwfziq8ptfx6csqt.csv', chunksize=chunksize) as chunk:
    # The first chunk is the first 10 rows of the file.
    df_first_10 = next(chunk)

# Show the first 10 rows.
print(df_first_10)


In [None]:
import pandas as pd
import zipfile

# Location of the archive and the name of the CSV member inside it.
zip_path = r'C:\Users\lishe\Downloads\pe8pqmpquql8iaiz_csv.zip'
member_name = 'pe8pqmpquql8iaiz.csv'

# Open the archive and the member together; both are closed when the block exits.
with zipfile.ZipFile(zip_path) as z, z.open(member_name) as f:
    # Parse only the first 10 rows of the member.
    df = pd.read_csv(f, nrows=10)

# Print those 10 rows.
print(df)


In [None]:
import os
import shutil
import zipfile
import tempfile
import logging
import warnings
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, mean, count
from pyspark.sql.types import FloatType, StringType, DateType

# Silence every Python warning and raise the "distributed.shuffle" logger to ERROR.
warnings.filterwarnings("ignore")
logging.getLogger("distributed.shuffle").setLevel(logging.ERROR)

# Build (or reuse) the module-level SparkSession: local master, 8g of driver
# memory, Hadoop filesystem permission checks switched off.
spark = (
    SparkSession.builder
    .appName("Process Zip CSV with PySpark")
    .master("local[*]")
    .config("spark.hadoop.fs.permissions.enabled", "false")
    .config("spark.driver.memory", "8g")
    .getOrCreate()
)

def process_zip_file(zip_file_path, output_folder, temp_dir=r'X:\Transfer'):
    """Compute daily average relative quoted spreads for the CSV inside a ZIP file.

    The data file is extracted into a throw-away sub-directory of ``temp_dir``,
    loaded with the module-level ``spark`` session, filtered, aggregated per
    (SYMBOL, DATE) and written below ``output_folder``.

    Args:
        zip_file_path: Path of the ZIP archive to process.
        output_folder: Folder that receives the result. The output path is
            ``<zip stem>_daily_average_quoted_spread.csv``; Spark's CSV writer
            creates a directory with that name containing the part file.
        temp_dir: Existing directory in which the temporary extraction folder
            is created. Defaults to the location that used to be hard-coded.

    Returns:
        None. Every exception is caught and reported with ``print`` so that a
        single bad archive does not abort a batch run.
    """
    try:
        # Create a unique temporary directory for extraction.
        with tempfile.TemporaryDirectory(dir=temp_dir) as temp_subdir:
            with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
                # The first archive entry is not guaranteed to be the data
                # file (it can be a directory entry or a side file), so look
                # at real files only and prefer one with a .csv extension.
                file_members = [
                    info.filename for info in zip_ref.infolist() if not info.is_dir()
                ]
                if not file_members:
                    raise ValueError("ZIP archive contains no files")
                csv_members = [
                    name for name in file_members if name.lower().endswith('.csv')
                ]
                csv_member = (csv_members or file_members)[0]

                # Extract only the member that is needed; extract() returns
                # the path it actually wrote, including any sub-folders.
                csv_file_path = zip_ref.extract(csv_member, temp_subdir)

            # Everything up to and including the write must stay inside the
            # TemporaryDirectory block: Spark evaluates lazily, so the CSV is
            # only read when the result is written.
            df = spark.read.csv(csv_file_path, header=True, inferSchema=True)

            # Keep only quotes with a positive bid and a positive offer.
            df = df.filter((col("BID") > 0) & (col("OFR") > 0))

            # Quoted spread; rows with a spread above 5 are dropped.
            # NOTE(review): crossed quotes (OFR < BID) give a negative spread
            # and pass this filter - confirm that is intended.
            df = df.withColumn('Spread', col('OFR') - col('BID'))
            df = df.filter(col('Spread') <= 5)

            # Spread relative to the quote midpoint, in percent.
            df = df.withColumn('Midpoint', (col('BID') + col('OFR')) / 2)
            df = df.withColumn('Relative_Quoted_Spread', (col('Spread') / col('Midpoint')) * 100)

            # Convert DATE to a date column.
            # NOTE(review): assumes inferSchema yields a type Spark can cast
            # to date for this column - TODO confirm against the raw files.
            df = df.withColumn('DATE', col('DATE').cast(DateType()))

            # Equal-weighted daily average per symbol, plus the number of quotes used.
            result = df.groupBy('SYMBOL', 'DATE').agg(
                mean('Relative_Quoted_Spread').alias('Relative_Quoted_Spread_mean'),
                count('SYMBOL').alias('Quote_count')
            )

            # Write the result as CSV in a single partition.
            zip_file_name = os.path.basename(zip_file_path)
            output_file_name = os.path.splitext(zip_file_name)[0] + '_daily_average_quoted_spread.csv'
            output_file_path = os.path.join(output_folder, output_file_name)
            result.coalesce(1).write.csv(output_file_path, header=True, mode='overwrite')
            print(f"Results saved to {output_file_path}")

    except Exception as e:
        # Deliberate best-effort: report the failure and let the caller move on.
        print(f"Error processing file {zip_file_path}: {e}")

def process_single_zip_file(zip_file_name, source_folder, transfer_folder, output_folder):
    """Stage one ZIP file in the transfer folder, process it, then remove the copy.

    Args:
        zip_file_name: File name of the archive inside ``source_folder``.
        source_folder: Folder the archive is copied from.
        transfer_folder: Folder the archive is copied to before processing.
        output_folder: Passed through to ``process_zip_file``.

    Returns:
        None. Copy and delete failures are printed rather than raised; when
        the copy fails the archive is skipped entirely.
    """
    remote_path = os.path.join(source_folder, zip_file_name)
    staged_path = os.path.join(transfer_folder, zip_file_name)

    # Step 1: stage the archive; without a staged copy there is nothing to do.
    try:
        print(f"Copying {remote_path} to {staged_path}...")
        shutil.copy2(remote_path, staged_path)
        print("Copy completed.")
    except Exception as copy_error:
        print(f"Error copying file {remote_path}: {copy_error}")
        return

    # Step 2: run the processing on the staged copy.
    process_zip_file(staged_path, output_folder)

    # Step 3: remove the staged copy once processing has returned.
    try:
        os.remove(staged_path)
        print(f"Deleted local file {staged_path}.")
    except Exception as cleanup_error:
        print(f"Error deleting file {staged_path}: {cleanup_error}")

def main(
    source_folder=r'X:Quote_2001.zip',
    transfer_folder=r'X:\Transfer',
    output_folder=r'C:\Users\lishe\Documents\GitHub\Replicate-Work\TAQ_Output\Quote_Daily',
):
    """Process every ZIP archive found at ``source_folder`` and stop Spark.

    Args:
        source_folder: Either a folder containing ZIP files or the path of a
            single ZIP file. The default value ends in ``.zip``, so a path
            that names one archive is handled instead of failing when the
            folder is listed.
        transfer_folder: Temporary transfer folder; created if missing.
        output_folder: Folder for the results; created if missing.

    Raises:
        FileNotFoundError: If ``source_folder`` is neither an existing folder
            nor an existing ``.zip`` file.
    """
    try:
        # Create the working directories if they don't exist.
        os.makedirs(transfer_folder, exist_ok=True)
        os.makedirs(output_folder, exist_ok=True)

        if os.path.isdir(source_folder):
            zip_dir = source_folder
            # Case-insensitive match so "*.ZIP" is not skipped; sorted so the
            # processing order does not depend on the directory listing order.
            zip_files = sorted(
                f for f in os.listdir(source_folder) if f.lower().endswith('.zip')
            )
        elif os.path.isfile(source_folder) and source_folder.lower().endswith('.zip'):
            # A single archive was given: split it into folder and file name
            # so it can go through the same per-file routine.
            zip_dir, single_zip = os.path.split(source_folder)
            zip_files = [single_zip]
        else:
            raise FileNotFoundError(
                f"Source path is neither a folder nor a ZIP file: {source_folder}"
            )

        # Process the ZIP files sequentially (can be parallelized).
        for zip_file_name in zip_files:
            process_single_zip_file(zip_file_name, zip_dir, transfer_folder, output_folder)
    finally:
        # Stop the Spark session even when something above raised.
        spark.stop()

    print("All files processed, program completed.")

# Run the pipeline only when this code is executed as the main program,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


In [None]:
import pandas as pd
from datetime import datetime
import os

# Load the raw CUSIP/PERMNO mapping; its TICKER column supplies the symbols.
df = pd.read_csv(r'C:\Users\lishe\Documents\GitHub\Replicate-Work\1_1_Data_Cleansing_Daily_Stock_Price_n_Volume\0_CUSIP_PERMNO_Mapping_1993_2023.csv')

# Add the portfolio columns.
df['symbol'] = df['TICKER']
df['quantity'] = 100  # placeholder quantity; replace with real data if available
df['cost'] = 10.0  # placeholder cost; adjust as needed
df['date'] = datetime.now().strftime('%m/%d/%Y')  # today's date as MM/DD/YYYY

# De-duplicate: keep only the first record of each symbol.
df = df.drop_duplicates(subset=['symbol'])

# Keep just the columns that go into the output files.
final_df = df[['symbol', 'quantity', 'cost', 'date']]

# Size window that every output file except the last one should fall into.
MIN_FILE_BYTES = 180 * 1024
MAX_FILE_BYTES = 190 * 1024


def _write_csv_and_measure(frame, filename):
    """Write ``frame`` to ``filename`` as CSV (no index) and return the file size in bytes."""
    frame.to_csv(filename, index=False)
    return os.path.getsize(filename)


def _write_largest_chunk(frame, filename, max_bytes):
    """Write the longest leading slice of ``frame`` whose CSV is at most ``max_bytes``.

    The file size never shrinks when rows are added, so the row count is found
    by binary search. That always terminates, unlike repeatedly resizing by a
    fixed percentage and hoping to land inside the target window. At least one
    row is always written so the caller keeps making progress.

    Returns:
        (row_count, file_size): rows written to ``filename`` and its size in bytes.
    """
    low, high = 1, len(frame)
    while low < high:
        # Round up so that low == high - 1 still probes `high`.
        middle = (low + high + 1) // 2
        if _write_csv_and_measure(frame.iloc[:middle], filename) <= max_bytes:
            low = middle
        else:
            high = middle - 1
    # Leave the file on disk holding exactly the chosen number of rows.
    return low, _write_csv_and_measure(frame.iloc[:low], filename)


file_index = 1
remaining = final_df  # rows not yet written to any file

while not remaining.empty:
    filename = f'portfolio_data_part{file_index}.csv'
    row_count, file_size = _write_largest_chunk(remaining, filename, MAX_FILE_BYTES)

    if row_count == len(remaining):
        # Everything that was left fitted into this file: it is the last one
        # and may be smaller than the lower bound.
        print(f"最后一个文件已生成，名称为 {filename}，可能小于180KB。")
        break

    # The chunk is the largest one within the upper bound, so it can only miss
    # the window when single rows are unusually large; report that case.
    if not MIN_FILE_BYTES <= file_size <= MAX_FILE_BYTES:
        print(f"警告：{filename} 的大小为 {file_size} 字节，不在180KB到190KB之间。")

    remaining = remaining.iloc[row_count:]  # drop the rows just written
    file_index += 1  # next file number

print("所有文件已生成，每个文件大小在180KB到190KB之间（除最后一个文件）。")


In [48]:
import os
import requests
from bs4 import BeautifulSoup

# --- Configuration ---
SAVE_DIR = "sec_filings"  # directory that downloaded files are saved to
# Replace with your own name and e-mail address; keep it ASCII-only.
USER_AGENT = "Your Name (your.email@example.com)"
CIK = "36840"
FORM_TYPE = "10-K"

# Make sure the save directory exists before anything is written into it.
os.makedirs(SAVE_DIR, exist_ok=True)

# Generic download helper that also validates the downloaded content.
def download_and_validate_file(file_url, cik, filing_date, form_type, extension='html', timeout=30):
    """Download one document and keep it only if it mentions a 10-K.

    The response body is saved to ``SAVE_DIR`` as
    ``<cik>_<filing_date>_<form_type>.<extension>``. The file is kept when its
    text contains "FORM 10-K", "10-K" or "ANNUAL REPORT" (case-insensitive)
    and deleted again otherwise.

    Args:
        file_url: URL of the document to download.
        cik: Company identifier used in the file name.
        filing_date: Filing date used in the file name.
        form_type: Form type used in the file name; any "/" is replaced by "-"
            so the name stays a valid single path component.
        extension: File extension for the saved file.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        True if the file was downloaded and kept, False otherwise (network
        error, non-200 status, or no keyword found).
    """
    headers = {'User-Agent': USER_AGENT}
    try:
        # Without a timeout a stalled connection would block the whole run.
        response = requests.get(file_url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        print(f"无法下载文件：{file_url}，错误：{exc}")
        return False

    if response.status_code != 200:
        print(f"无法下载文件：{file_url}，状态码：{response.status_code}")
        return False

    safe_form_type = form_type.replace('/', '-')
    file_name = f'{cik}_{filing_date}_{safe_form_type}.{extension}'
    file_path = os.path.join(SAVE_DIR, file_name)
    with open(file_path, 'wb') as file:
        file.write(response.content)
    print(f'文件已保存至：{file_path}')

    # Validate the bytes already in memory instead of re-reading the file.
    # Undecodable bytes are dropped, so decoding cannot fail; upper-case once
    # rather than once per keyword.
    content = response.content.decode('utf-8', errors='ignore').upper()
    if any(keyword in content for keyword in ('FORM 10-K', '10-K', 'ANNUAL REPORT')):
        print(f'文件包含FORM 10-K或相关内容：{file_path}')
        return True  # keep the file

    # No keyword found: remove the file that was just written.
    print(f'文件不包含FORM 10-K或相关内容，已删除：{file_path}')
    os.remove(file_path)
    return False

# Seconds to wait for a response before giving up on a request; without a
# timeout a stalled connection would block the cell indefinitely.
REQUEST_TIMEOUT = 30

# Build the URL of the company's filing index (XML output).
company_url = f'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK={CIK}&type={FORM_TYPE}&count=100&owner=exclude&output=xml'
headers = {'User-Agent': USER_AGENT}
# A network failure here is left to raise: nothing else can run without the index.
response = requests.get(company_url, headers=headers, timeout=REQUEST_TIMEOUT)
if response.status_code != 200:
    print(f'无法访问公司页面：{company_url}，状态码：{response.status_code}')
else:
    # Parse the XML response.
    # NOTE(review): 'lxml' is BeautifulSoup's HTML parser, so this XML is parsed
    # as HTML. The lowercase tag names looked up below ('datefiled',
    # 'filinghref') presumably rely on that parser lower-casing tag names;
    # switching to the 'xml' parser would need the lookups updated too.
    soup = BeautifulSoup(response.content, 'lxml')
    filings = soup.find_all('filing')

    # Walk through every filing in the index.
    for filing in filings:
        date_tag = filing.find('datefiled')
        href_tag = filing.find('filinghref')
        if date_tag is None or href_tag is None:
            # Incomplete entry: skip it instead of failing on `.text` of None.
            print('提交记录缺少日期或链接，已跳过')
            continue
        filing_date = date_tag.text
        filing_href = href_tag.text
        print(f'发现提交日期为{filing_date}的{FORM_TYPE}文件：{filing_href}')

        # Fetch the filing detail page. Separate names are used so the index
        # `response` / `soup` above are not overwritten inside the loop.
        try:
            detail_response = requests.get(filing_href, headers=headers, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            print(f'无法访问提交详情页面，错误：{exc}')
            continue
        if detail_response.status_code != 200:
            print(f'无法访问提交详情页面，状态码：{detail_response.status_code}')
            continue

        # Parse the detail page and collect every document table.
        detail_soup = BeautifulSoup(detail_response.content, 'html.parser')
        doc_tables = detail_soup.find_all('table', class_='tableFile')

        # There may be several tableFile tables; scan them one by one.
        found_10k = False
        for doc_table in doc_tables:
            rows = doc_table.find_all('tr')
            print(f'解析文件表格，找到 {len(rows)} 行记录')
            for row in rows:
                columns = row.find_all('td')
                if len(columns) < 4:
                    # Not enough cells to reach the Type column (index 3).
                    continue

                doc_type = columns[3].text.strip().upper()  # Type column
                doc_link = columns[2].find('a', href=True)  # link in the Document column

                # Debug output for every row.
                print(f'文档类型: {doc_type}, 链接: {doc_link["href"] if doc_link else "无链接"}')

                # Only rows whose type is exactly '10-K' and that carry a link qualify.
                if doc_type != '10-K' or not doc_link:
                    continue

                found_10k = True
                # Links of the form "/ix?doc=<path>" carry the document path
                # after the prefix; strip the prefix to get the path itself.
                doc_href = doc_link['href'].replace('/ix?doc=', '')

                file_url = f'https://www.sec.gov{doc_href}'
                print(f'正在下载10-K文件：{file_url}')

                # Pick the saved extension from the link, then download and validate.
                if doc_href.endswith(('.htm', '.html')):
                    download_and_validate_file(file_url, CIK, filing_date, FORM_TYPE, extension='html')
                elif doc_href.endswith('.txt'):
                    download_and_validate_file(file_url, CIK, filing_date, FORM_TYPE, extension='txt')
                else:
                    print(f'未找到合适的文件链接：{doc_href}')
            if found_10k:
                break  # a 10-K was found in this table; skip the remaining tables
        if not found_10k:
            print(f'未在提交日期为{filing_date}的文件中找到10-K文件类型')


  soup = BeautifulSoup(response.content, 'lxml')


发现提交日期为2024-01-29的10-K文件：https://www.sec.gov/Archives/edgar/data/36840/000117494724000132/0001174947-24-000132-index.htm
解析文件表格，找到 9 行记录
文档类型: 10-K, 链接: /ix?doc=/Archives/edgar/data/36840/000117494724000132/frevsob-20231031.htm
正在下载10-K文件：https://www.sec.gov/Archives/edgar/data/36840/000117494724000132/frevsob-20231031.htm
文件已保存至：sec_filings\36840_2024-01-29_10-K.html
文件包含FORM 10-K或相关内容：sec_filings\36840_2024-01-29_10-K.html
文档类型: EX-21, 链接: /Archives/edgar/data/36840/000117494724000132/ex21.htm
文档类型: EX-23.1, 链接: /Archives/edgar/data/36840/000117494724000132/ex23-1.htm
文档类型: EX-31.1, 链接: /Archives/edgar/data/36840/000117494724000132/ex31-1.htm
文档类型: EX-31.2, 链接: /Archives/edgar/data/36840/000117494724000132/ex31-2.htm
文档类型: EX-32.1, 链接: /Archives/edgar/data/36840/000117494724000132/ex32-1.htm
文档类型: EX-32.2, 链接: /Archives/edgar/data/36840/000117494724000132/ex32-2.htm
文档类型: , 链接: /Archives/edgar/data/36840/000117494724000132/0001174947-24-000132.txt
发现提交日期为2023-01-27的10-K文件：https://www