In [52]:
import pandas as pd
import numpy as np
from scipy.stats import norm


# Load the per-filing price panel; each row pairs a filing (Ticker, Filing Date,
# Section) with one trading day (Date) and that day's Close.
source_csv = '8k_with_prices.csv'
data = pd.read_csv(source_csv)
data

Unnamed: 0,Ticker,Filing Date,Section,Date,Close
0,LANC,2022-08-17,"8.01,9.01",2022-08-03,132.966370
1,LANC,2022-08-17,"8.01,9.01",2022-08-04,132.829498
2,LANC,2022-08-17,"8.01,9.01",2022-08-05,134.735718
3,LANC,2022-08-17,"8.01,9.01",2022-08-08,135.967438
4,LANC,2022-08-17,"8.01,9.01",2022-08-09,137.932343
...,...,...,...,...,...
287737,VNO,2020-11-24,"3.02,5.03,8.01,9.01",2020-12-03,35.143299
287738,VNO,2020-11-24,"3.02,5.03,8.01,9.01",2020-12-04,36.312126
287739,VNO,2020-11-24,"3.02,5.03,8.01,9.01",2020-12-07,35.396259
287740,VNO,2020-11-24,"3.02,5.03,8.01,9.01",2020-12-08,34.916512


In [53]:
# Parse both date columns from strings into pandas datetimes so they can be
# compared and subtracted later on.
for date_column in ('Filing Date', 'Date'):
    data[date_column] = pd.to_datetime(data[date_column])
data

Unnamed: 0,Ticker,Filing Date,Section,Date,Close
0,LANC,2022-08-17,"8.01,9.01",2022-08-03,132.966370
1,LANC,2022-08-17,"8.01,9.01",2022-08-04,132.829498
2,LANC,2022-08-17,"8.01,9.01",2022-08-05,134.735718
3,LANC,2022-08-17,"8.01,9.01",2022-08-08,135.967438
4,LANC,2022-08-17,"8.01,9.01",2022-08-09,137.932343
...,...,...,...,...,...
287737,VNO,2020-11-24,"3.02,5.03,8.01,9.01",2020-12-03,35.143299
287738,VNO,2020-11-24,"3.02,5.03,8.01,9.01",2020-12-04,36.312126
287739,VNO,2020-11-24,"3.02,5.03,8.01,9.01",2020-12-07,35.396259
287740,VNO,2020-11-24,"3.02,5.03,8.01,9.01",2020-12-08,34.916512


In [150]:
# Reduce the price panel to filing-level rows:
#   1. keep only rows whose price date is the filing date itself,
#   2. drop the price columns, sort by Ticker / Filing Date, and remove exact
#      (Ticker, Filing Date, Section) duplicates.
on_filing_day = data['Filing Date'].dt.date == data['Date'].dt.date
filtered_data = (
    data.loc[on_filing_day]
    .drop(columns=['Date', 'Close'])
    .sort_values(by=['Ticker', 'Filing Date'])
    .drop_duplicates(subset=['Ticker', 'Filing Date', 'Section'])
)

# Number of comma-separated item codes in each row's 'Section' string.
section_lengths = filtered_data['Section'].str.split(',').str.len()

# Largest item count within each (Ticker, Filing Date) group, broadcast back
# onto every row of that group.
max_section_lengths = section_lengths.groupby(
    [filtered_data['Ticker'], filtered_data['Filing Date']]
).transform('max')

# Keep the row(s) with the longest Section per group (ties are all kept) and
# renumber the index from 0.
filtered_data = filtered_data[section_lengths == max_section_lengths].reset_index(drop=True)

# Print the filtered data.
print(filtered_data)

      Ticker Filing Date                   Section
0         AA  2020-01-15                 2.02,9.01
1         AA  2020-02-03                 8.01,9.01
2         AA  2020-04-22  1.01,2.02,2.03,8.01,9.01
3         AA  2020-05-11                      5.07
4         AA  2020-06-25            1.01,2.03,9.01
...      ...         ...                       ...
12500     ZI  2022-05-02                 2.02,9.01
12501     ZI  2022-05-19       3.03,5.03,5.07,9.01
12502     ZI  2022-06-30            5.02,7.01,9.01
12503     ZI  2022-08-01                 2.02,9.01
12504     ZI  2022-11-01                 2.02,9.01

[12505 rows x 3 columns]


In [151]:
# Add one indicator column per 8-K item code, initialised to 0.
# NOTE(review): the saved output of this notebook also shows '1.03' and '1.04'
# columns, which are not listed here; such items only get a column when a later
# cell creates it on the fly. Confirm whether they belong in this list.
item_codes = (
    '1.01', '1.02',
    '2.01', '2.02', '2.03', '2.04', '2.05', '2.06', '2.07', '2.08',
    '3.01', '3.02', '3.03',
    '4.01', '4.02',
    '5.01', '5.02', '5.03', '5.04', '5.05', '5.06', '5.07', '5.08',
    '6.01', '6.02', '6.03', '6.04', '6.05',
    '7.01', '7.02', '7.03',
    '8.01', '8.02', '8.03', '8.04', '8.05', '8.06', '8.07', '8.08', '8.09',
    '9.01',
)
for code in item_codes:
    filtered_data[code] = 0

# Print the frame including the newly created columns.
print(filtered_data)

      Ticker Filing Date                   Section  1.01  1.02  2.01  2.02  \
0         AA  2020-01-15                 2.02,9.01     0     0     0     0   
1         AA  2020-02-03                 8.01,9.01     0     0     0     0   
2         AA  2020-04-22  1.01,2.02,2.03,8.01,9.01     0     0     0     0   
3         AA  2020-05-11                      5.07     0     0     0     0   
4         AA  2020-06-25            1.01,2.03,9.01     0     0     0     0   
...      ...         ...                       ...   ...   ...   ...   ...   
12500     ZI  2022-05-02                 2.02,9.01     0     0     0     0   
12501     ZI  2022-05-19       3.03,5.03,5.07,9.01     0     0     0     0   
12502     ZI  2022-06-30            5.02,7.01,9.01     0     0     0     0   
12503     ZI  2022-08-01                 2.02,9.01     0     0     0     0   
12504     ZI  2022-11-01                 2.02,9.01     0     0     0     0   

       2.03  2.04  2.05  ...  8.01  8.02  8.03  8.04  8.05  8.0

In [152]:
# Add one 'LSM'-prefixed column per 8-K item code, initialised to 0.
item_codes = (
    '1.01', '1.02',
    '2.01', '2.02', '2.03', '2.04', '2.05', '2.06', '2.07', '2.08',
    '3.01', '3.02', '3.03',
    '4.01', '4.02',
    '5.01', '5.02', '5.03', '5.04', '5.05', '5.06', '5.07', '5.08',
    '6.01', '6.02', '6.03', '6.04', '6.05',
    '7.01', '7.02', '7.03',
    '8.01', '8.02', '8.03', '8.04', '8.05', '8.06', '8.07', '8.08', '8.09',
    '9.01',
)
lsm_column_names = ['LSM' + code for code in item_codes]  # e.g. 'LSM1.01'
for lsm_name in lsm_column_names:
    filtered_data[lsm_name] = 0

# Print the frame including the newly created columns.
print(filtered_data)

      Ticker Filing Date                   Section  1.01  1.02  2.01  2.02  \
0         AA  2020-01-15                 2.02,9.01     0     0     0     0   
1         AA  2020-02-03                 8.01,9.01     0     0     0     0   
2         AA  2020-04-22  1.01,2.02,2.03,8.01,9.01     0     0     0     0   
3         AA  2020-05-11                      5.07     0     0     0     0   
4         AA  2020-06-25            1.01,2.03,9.01     0     0     0     0   
...      ...         ...                       ...   ...   ...   ...   ...   
12500     ZI  2022-05-02                 2.02,9.01     0     0     0     0   
12501     ZI  2022-05-19       3.03,5.03,5.07,9.01     0     0     0     0   
12502     ZI  2022-06-30            5.02,7.01,9.01     0     0     0     0   
12503     ZI  2022-08-01                 2.02,9.01     0     0     0     0   
12504     ZI  2022-11-01                 2.02,9.01     0     0     0     0   

       2.03  2.04  2.05  ...  LSM8.01  LSM8.02  LSM8.03  LSM8.0

In [153]:
# Set the indicator column of every item listed in a row's 'Section' to 1.
#
# 'Section' is a comma-separated string of item codes (e.g. "2.02,9.01").
# str.get_dummies expands it into one 0/1 column per distinct code in a single
# vectorized pass, replacing the previous row-by-row iterrows()/.at loop.
# A missing 'Section' yields an all-zero row instead of raising on .split().
section_flags = filtered_data['Section'].str.get_dummies(sep=',')

for item in section_flags.columns:
    # Items that have no column yet get one, initialised to 0.
    if item not in filtered_data.columns:
        filtered_data[item] = 0
    # Only ever switch cells on (never reset existing 1s), as the loop did.
    filtered_data.loc[section_flags[item] == 1, item] = 1

# Print the frame including any newly created columns.
print(filtered_data)

      Ticker Filing Date                   Section  1.01  1.02  2.01  2.02  \
0         AA  2020-01-15                 2.02,9.01     0     0     0     1   
1         AA  2020-02-03                 8.01,9.01     0     0     0     0   
2         AA  2020-04-22  1.01,2.02,2.03,8.01,9.01     1     0     0     1   
3         AA  2020-05-11                      5.07     0     0     0     0   
4         AA  2020-06-25            1.01,2.03,9.01     1     0     0     0   
...      ...         ...                       ...   ...   ...   ...   ...   
12500     ZI  2022-05-02                 2.02,9.01     0     0     0     1   
12501     ZI  2022-05-19       3.03,5.03,5.07,9.01     0     0     0     0   
12502     ZI  2022-06-30            5.02,7.01,9.01     0     0     0     0   
12503     ZI  2022-08-01                 2.02,9.01     0     0     0     1   
12504     ZI  2022-11-01                 2.02,9.01     0     0     0     1   

       2.03  2.04  2.05  ...  LSM8.03  LSM8.04  LSM8.05  LSM8.0

In [155]:
import pandas as pd
from datetime import timedelta


# Build the lagged-item ("LSM") indicator columns.
#
# For every filing, 'LSM<item>' is 1 when the same ticker's immediately
# preceding filing (in Filing Date order) listed <item> in its 'Section' and
# was filed at most 180 days earlier; otherwise it is 0.
#
# Every LSM column is created up front as an integer column. Previously, an
# item missing from the hard-coded list got its LSM column created by .at
# enlargement inside the loop, which produced a NaN-filled float column that
# then had to be patched with a frame-wide fillna(0). No NaN is introduced
# here, so that fillna is no longer needed.
known_items = [
    '1.01', '1.02',
    '2.01', '2.02', '2.03', '2.04', '2.05', '2.06', '2.07', '2.08',
    '3.01', '3.02', '3.03',
    '4.01', '4.02',
    '5.01', '5.02', '5.03', '5.04', '5.05', '5.06', '5.07', '5.08',
    '6.01', '6.02', '6.03', '6.04', '6.05',
    '7.01', '7.02', '7.03',
    '8.01', '8.02', '8.03', '8.04', '8.05', '8.06', '8.07', '8.08', '8.09',
    '9.01',
]

# Make sure 'Filing Date' is a datetime column.
filtered_data['Filing Date'] = pd.to_datetime(filtered_data['Filing Date'])

# Order filings chronologically within each ticker so that "previous row of the
# same ticker" means "previous filing".
filtered_data = filtered_data.sort_values(by=['Ticker', 'Filing Date'])

# Date and Section of the previous filing of the same ticker (NaT/NaN for a
# ticker's first filing).
previous_filing = filtered_data.groupby('Ticker')[['Filing Date', 'Section']].shift(1)
days_since_previous = filtered_data['Filing Date'] - previous_filing['Filing Date']

# Blank out the previous Section when there is none or it is older than 180
# days (a comparison against NaT is False, so first filings are blanked too).
lagged_sections = previous_filing['Section'].where(
    days_since_previous <= timedelta(days=180)
)

# One 0/1 column per item code found in the retained previous Sections; rows
# whose previous Section was blanked are all zeros.
lagged_flags = lagged_sections.str.get_dummies(sep=',')

# Known items first (fixed column order), then any item seen in the data that
# the list above does not contain.
extra_items = [item for item in lagged_flags.columns if item not in known_items]
for item in known_items + extra_items:
    if item in lagged_flags.columns:
        filtered_data['LSM' + item] = lagged_flags[item]
    else:
        filtered_data['LSM' + item] = 0

# Print the frame including the LSM columns.
print(filtered_data)


      Ticker Filing Date                   Section  1.01  1.02  2.01  2.02  \
0         AA  2020-01-15                 2.02,9.01     0     0     0     1   
1         AA  2020-02-03                 8.01,9.01     0     0     0     0   
2         AA  2020-04-22  1.01,2.02,2.03,8.01,9.01     1     0     0     1   
3         AA  2020-05-11                      5.07     0     0     0     0   
4         AA  2020-06-25            1.01,2.03,9.01     1     0     0     0   
...      ...         ...                       ...   ...   ...   ...   ...   
12500     ZI  2022-05-02                 2.02,9.01     0     0     0     1   
12501     ZI  2022-05-19       3.03,5.03,5.07,9.01     0     0     0     0   
12502     ZI  2022-06-30            5.02,7.01,9.01     0     0     0     0   
12503     ZI  2022-08-01                 2.02,9.01     0     0     0     1   
12504     ZI  2022-11-01                 2.02,9.01     0     0     0     1   

       2.03  2.04  2.05  ...  LSM8.05  LSM8.06  LSM8.07  LSM8.0

In [131]:
# Display the assembled feature table.
# NOTE(review): the saved output below shows column names such as 'LSM 8.05'
# (with a space), while the cells above build 'LSM' + item without a space, and
# this cell's execution count (131) is lower than its neighbours'. The output
# looks stale; re-run the notebook top to bottom to refresh it.
filtered_data

Unnamed: 0,Ticker,Filing Date,Section,1.01,1.02,2.01,2.02,2.03,2.04,2.05,...,LSM 8.05,LSM 8.06,LSM 8.07,LSM 8.08,LSM 8.09,LSM 9.01,1.03,1.04,LSM 1.03,LSM 1.04
0,AA,2020-01-15,"2.02,9.01",0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0.0,0.0
1,AA,2020-02-03,"8.01,9.01",0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0.0,0.0
2,AA,2020-04-22,"1.01,2.02,2.03,8.01,9.01",1,0,0,1,1,0,0,...,0,0,0,0,0,1,0,0,0.0,0.0
3,AA,2020-05-11,5.07,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0.0,0.0
4,AA,2020-06-25,"1.01,2.03,9.01",1,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12500,ZI,2022-05-02,"2.02,9.01",0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0.0,0.0
12501,ZI,2022-05-19,"3.03,5.03,5.07,9.01",0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0.0,0.0
12502,ZI,2022-06-30,"5.02,7.01,9.01",0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0.0,0.0
12503,ZI,2022-08-01,"2.02,9.01",0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0.0,0.0


In [157]:
# Write filtered_data to a CSV file, leaving out the row index.
output_csv = 'filtered_data.csv'
filtered_data.to_csv(output_csv, index=False)