## 管理层信息抽取

In [None]:
# Shell escape: copy the local test files into ../corpus/ (machine-specific source path).
! cp /Users/alchemy_taotaox/Desktop/next/my_test/* ../corpus/

In [None]:
# Shell escape: list the contents of the corpus directory.
! ls ../corpus

## 算法基本思路
1. 扫描pdf 获得页眉页脚
2. 扫描pdf 获得最小和最大的x坐标
    如果一行的起始 x 坐标等于最小 x 坐标（没有缩进），那么就和之前的行合并为一个段落
    如果一行的结束 x 坐标接近最大 x 坐标（该行已写满），那么就和之后的行合并为一个段落
3. 提取文档中的表格和字符串。

In [None]:
from pdfplumber.pdf import PDF
import re
from collections import Counter

def is_header(str, header):
    """Tell whether a piece of page text is the detected page header.

    Args:
        str: Text to test. The name shadows the builtin but is kept so
            existing callers that pass it by keyword keep working.
        header: Header text to compare against, or None when no header
            was detected.

    Returns:
        False when ``header`` is None; otherwise whether ``str`` equals
        ``header`` exactly.
    """
    return header is not None and str == header

def is_foot(str, foot):
    """Tell whether a piece of page text matches the detected page footer.

    Every run of digits in ``str`` is replaced by the placeholder
    ``#num#`` before comparing, so footers that differ only in their
    numbers (such as page numbers) compare equal.

    Args:
        str: Text to test. The name shadows the builtin but is kept so
            existing callers that pass it by keyword keep working.
        foot: Footer pattern with digits already replaced by ``#num#``,
            or None when no footer was detected.

    Returns:
        False when ``foot`` is None; otherwise whether the normalised
        text equals ``foot``.
    """
    if foot is not None:
        normalized = re.sub(r'\d+', '#num#', str)
        return normalized == foot
    return False

def get_foot_header(pdf: "PDF", check_pages_num=20, threshold=0.9):
    """Detect a repeated page header and footer from the first pages.

    For each inspected page the first and the last entry returned by
    ``extract_words()`` are taken as header and footer candidates. Runs
    of digits in the footer candidate are replaced by ``#num#`` so that
    changing numbers do not prevent a match.

    Args:
        pdf: Object exposing ``pages``; each page must provide
            ``extract_words()`` returning dicts with a ``'text'`` key.
        check_pages_num: Maximum number of leading pages to inspect.
        threshold: Minimum fraction of the inspected pages on which the
            most common candidate must appear to be accepted.

    Returns:
        A ``(head, foot)`` tuple. Either element is None when no
        candidate reaches ``threshold``; both are None when there is
        nothing to inspect.
    """
    pages = pdf.pages
    check_pages_num = min(check_pages_num, len(pages))
    if check_pages_num <= 0:
        # No pages to look at: avoid indexing an empty Counter and
        # dividing by zero below.
        return None, None

    head_counter = Counter()
    foot_counter = Counter()
    for i in range(check_pages_num):
        words = pages[i].extract_words()
        if not words:
            # A page without any words has no candidates. It still counts
            # in the denominator, i.e. it lowers the match ratio.
            continue
        head_counter[words[0]['text']] += 1
        foot_counter[re.sub(r'\d+', '#num#', words[-1]['text'])] += 1

    head, foot = None, None
    if head_counter:
        candidate, count = head_counter.most_common(1)[0]
        if count / check_pages_num >= threshold:
            head = candidate
    if foot_counter:
        candidate, count = foot_counter.most_common(1)[0]
        if count / check_pages_num >= threshold:
            foot = candidate
    return head, foot
    
    

In [None]:
import pdfplumber

In [None]:
from operator import itemgetter

def check_bboxes(word, table_bbox):
    """Tell whether a word lies strictly inside a table bounding box.

    Args:
        word: Mapping with the keys ``'x0'``, ``'top'``, ``'x1'`` and
            ``'bottom'``.
        table_bbox: Four values compared, in order, against the word's
            ``x0``, ``top``, ``x1`` and ``bottom``.

    Returns:
        True when all four edges of the word are strictly inside the
        box; a word touching an edge of the box is not inside.
    """
    x0, top, x1, bottom = itemgetter('x0', 'top', 'x1', 'bottom')(word)
    box_x0, box_top, box_x1, box_bottom = table_bbox
    return box_x0 < x0 and box_top < top and x1 < box_x1 and bottom < box_bottom



def get_lines_from_page(page, head, foot, *, cluster_tolerance=5,
                        indent_tolerance=0.1, line_end_tolerance=16):
    """Extract the text lines and tables of one page as a flat list.

    Words that fall inside a table bounding box are dropped, the
    remaining words and the tables are grouped by their ``top``
    coordinate with ``pdfplumber.utils.cluster_objects``, and every
    group becomes one text line (words joined by a space) and/or one
    entry per table.

    Paragraph breaks are then marked on the text lines:

    * a line whose left edge differs from the smallest left edge on the
      page by more than ``indent_tolerance`` is prefixed with a newline
      (it is indented, so it starts a new paragraph);
    * a line whose right edge differs from the largest right edge on
      the page by more than ``line_end_tolerance`` gets a trailing
      newline (the line was not filled, so the paragraph ends there).

    Args:
        page: Page object providing ``find_tables()`` and
            ``extract_words()``.
        head: Header text to drop, or None.
        foot: Footer pattern to drop (digits as ``#num#``), or None.
        cluster_tolerance: Tolerance on ``top`` passed to
            ``cluster_objects``.
        indent_tolerance: Left-edge distance above which a line counts
            as indented.
        line_end_tolerance: Right-edge distance above which a line
            counts as ending early.

    Returns:
        A list whose items are either ``str`` (a text line, possibly
        with a leading and/or trailing newline) or the value returned by
        ``Table.extract()`` for a table, in the order the clusters are
        returned.
    """
    found_tables = page.find_tables()
    table_bboxes = [table.bbox for table in found_tables]
    table_items = [{'table': table.extract(), 'top': table.bbox[1]}
                   for table in found_tables]
    non_table_words = [
        word for word in page.extract_words()
        if not any(check_bboxes(word, bbox) for bbox in table_bboxes)
    ]

    lines = []
    # Parallel to `lines`: (x0, x1) for a text line, None for a table.
    spans = []
    for cluster in pdfplumber.utils.cluster_objects(
            non_table_words + table_items, itemgetter('top'),
            tolerance=cluster_tolerance):
        # A cluster may hold words and tables together when a table starts
        # at the same height as a word, so each kind is handled separately
        # instead of trusting the kind of the first item.
        words = [item for item in cluster if 'text' in item]

        # Only a cluster made of one single word is checked against the
        # detected header/footer.
        if len(cluster) == 1 and words and (
                is_foot(words[0]['text'], foot)
                or is_header(words[0]['text'], head)):
            continue

        if words:
            lines.append(' '.join(word['text'] for word in words))
            spans.append((words[0]['x0'], words[-1]['x1']))
        for item in cluster:
            if 'table' in item:
                lines.append(item['table'])
                spans.append(None)

    text_spans = [span for span in spans if span is not None]
    if not text_spans:
        # Nothing but tables (or nothing at all): no paragraph marks needed.
        return lines

    min_x0 = min(x0 for x0, _ in text_spans)
    max_x1 = max(x1 for _, x1 in text_spans)

    for index, span in enumerate(spans):
        if span is None:
            continue
        x0, x1 = span
        text = lines[index].strip()
        # Indented line: start a new paragraph.
        if abs(x0 - min_x0) > indent_tolerance:
            text = "\n" + text
        # Line that stops before the right margin: the paragraph ends here.
        if abs(x1 - max_x1) > line_end_tolerance:
            text = text + "\n"
        lines[index] = text
    return lines

In [None]:
# Try the extraction helpers on one page of a sample annual report.
filename = "../corpus/year_report_2.pdf"
# Left open on purpose: later cells keep using `pdf`.
pdf = pdfplumber.open(filename)
page = pdf.pages[10]
head, foot = get_foot_header(pdf)
print(head)
print(foot)
# The return value is discarded; this call only checks that the page parses.
get_lines_from_page(page, head, foot)

# Estimate roughly how wide one character is from the word boxes, then
# define the magic distance = 16.
print(page.extract_words())
print(page.extract_text())

In [None]:
from pdfplumber.pdf import PDF
import json
def begin_mda(lines):
    """Tell whether a page's lines open the MD&A section.

    Only the first entry of ``lines`` is inspected.

    Args:
        lines: Sequence of page entries; an entry is either a text line
            (``str``) or some non-string object.

    Returns:
        True when ``lines`` is non-empty, its first entry is a string
        and that string contains the section title.
    """
    if len(lines) == 0:
        return False
    first = lines[0]
    return isinstance(first, str) and '管理层讨论与分析' in first
def end_mda(lines):
    """Tell whether a page's lines open the section that follows the MD&A.

    Only the first entry of ``lines`` is inspected.

    Args:
        lines: Sequence of page entries; an entry is either a text line
            (``str``) or some non-string object.

    Returns:
        True when ``lines`` is non-empty, its first entry is a string
        and that string contains the title of the following section.
    """
    if len(lines) == 0:
        return False
    first = lines[0]
    return isinstance(first, str) and '公司治理' in first

def get_all_lines_about_mda(pdf:PDF, head:str, foot:str):
    """Collect the text of the MD&A section of a report as one string.

    Pages are scanned in order. Collection starts with the first page
    for which ``begin_mda`` is true and stops, without including that
    page, at the first later-or-same page for which ``end_mda`` is true.

    Args:
        pdf: Document whose ``pages`` are scanned.
        head: Header text forwarded to ``get_lines_from_page``.
        foot: Footer pattern forwarded to ``get_lines_from_page``.

    Returns:
        The collected entries concatenated without a separator; entries
        that are not strings are serialised with ``json.dumps`` (keeping
        non-ASCII characters as they are).
    """
    started = False
    collected_pages = []
    for page in pdf.pages:
        page_lines = get_lines_from_page(page, head, foot)
        if not started:
            started = begin_mda(page_lines)
        if not started:
            continue
        if end_mda(page_lines):
            break
        collected_pages.append(page_lines)

    return "".join(
        entry if isinstance(entry, str) else json.dumps(entry, ensure_ascii=False)
        for page_lines in collected_pages
        for entry in page_lines
    )
            
# Extract the MD&A text using the module-level `pdf`, `head` and `foot`, then show it.
content = get_all_lines_about_mda(pdf, head, foot)
print(content)

## 抓取信息



In [None]:
from datetime import datetime, timedelta

def split_interval(start, end, day_cnt):
    """Split an inclusive date range into consecutive chunks.

    Args:
        start: First day of the range, formatted ``YYYY-MM-DD``.
        end: Last day of the range (inclusive), formatted ``YYYY-MM-DD``.
        day_cnt: Maximum number of days per chunk; must be at least 1.

    Returns:
        A list of ``(chunk_start, chunk_end, days)`` tuples where both
        dates are ``YYYY-MM-DD`` strings, both ends are inclusive and
        ``days`` is the number of days in the chunk. The list is empty
        when ``start`` is after ``end``.

    Raises:
        ValueError: If ``day_cnt`` is smaller than 1 (the loop could
            never advance), or if a date string does not match the
            expected format.
    """
    if day_cnt < 1:
        raise ValueError(f"day_cnt must be >= 1, got {day_cnt!r}")

    date_format = "%Y-%m-%d"
    start_date = datetime.strptime(start, date_format)
    end_date = datetime.strptime(end, date_format)

    result = []
    # `<=` because `end` is inclusive: a chunk that starts exactly on the
    # last day must still be emitted.
    while start_date <= end_date:
        # Last day of this chunk, clamped to the overall end date.
        interval_end_date = min(start_date + timedelta(days=day_cnt - 1), end_date)
        result.append(
            (start_date.strftime(date_format),
             interval_end_date.strftime(date_format),
             (interval_end_date - start_date).days + 1)
        )
        # The next chunk starts the day after this one ends.
        start_date = interval_end_date + timedelta(days=1)

    return result
# Example: split 2023-01-01 .. 2023-07-31 into chunks of one day and show them.
results = split_interval('2023-01-01', '2023-07-31', 1)
print(results)

In [None]:
import requests
import time
from tqdm import tqdm
import json




def request_data(page_size=1000, page_num=1, se_date="2023-04-26~2023-04-26", timeout=30):
    """Query one page of the cninfo announcement list.

    Sends a POST request to the ``hisAnnouncement/query`` endpoint with
    a fixed set of form fields; only the page number, the page size and
    the date range vary.

    Args:
        page_size: Value sent as ``pageSize``.
        page_num: Value sent as ``pageNum``.
        se_date: Value sent as ``seDate``; callers in this file pass
            ``"<start>~<end>"`` with ``YYYY-MM-DD`` dates.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the crawl forever.

    Returns:
        The decoded JSON body on HTTP 200, or None on any failure
        (network error, timeout, non-200 status, body that is not valid
        JSON). Callers treat None as "retry".
    """
    url = "http://www.cninfo.com.cn/new/hisAnnouncement/query"

    params = {
        "pageNum": page_num,
        "pageSize": page_size,
        "column": "szse",
        "tabName": "fulltext",
        "plate": "",
        "stock": "",
        "searchkey": "",
        "secid": "",
        # NOTE(review): presumably selects annual reports; confirm against the site.
        "category": "category_ndbg_szsh",
        "trade": "",
        "seDate": se_date,
        "sortName": "",
        "sortType": "",
        "isHLtitle": "true"
    }

    try:
        response = requests.post(url, data=params, timeout=timeout)
    except requests.RequestException:
        # Connection problems and timeouts are reported like a bad status
        # so the caller's retry-on-None logic applies to them as well.
        return None

    if response.status_code != 200:
        return None

    try:
        # Use response.text instead if the raw body is needed.
        return response.json()
    except ValueError:
        # The body was not valid JSON.
        return None

def _request_with_retry(max_retries=3, delay=1, **request_kwargs):
    """Call request_data, retrying while it returns None.

    Args:
        max_retries: Number of additional attempts after the first one.
        delay: Seconds to sleep before each retry.
        **request_kwargs: Keyword arguments forwarded to request_data.

    Returns:
        The first non-None result, or None when every attempt failed.
    """
    data = request_data(**request_kwargs)
    for attempt in range(1, max_retries + 1):
        if data is not None:
            break
        print(f"craw data wrong, try {attempt}")
        time.sleep(delay)
        data = request_data(**request_kwargs)
    return data


# Crawl the announcement metadata day by day, newest day first, and store
# one JSON object per line.
page_size = 30
results = split_interval('2023-01-01', '2023-07-31', 1)
results = results[::-1]

# utf-8 is set explicitly because the records are written with
# ensure_ascii=False; the context manager closes the file even when a
# request or a write fails.
with open('report_meta_info.txt', 'w', encoding='utf-8') as file:
    for date_range in tqdm(results):
        se_date = f"{date_range[0]}~{date_range[1]}"

        # A small first request is only used to read the total count.
        data = _request_with_retry(delay=1, page_size=10, se_date=se_date)
        if data is None:
            continue
        page_count = (data['totalAnnouncement'] - 1) // page_size + 1
        print(page_count)
        begin = time.time()
        for index in range(1, page_count + 1):
            data = _request_with_retry(
                delay=3, page_num=index, page_size=page_size, se_date=se_date)
            if data is None:
                # All retries failed: skip this page instead of crashing on
                # data['announcements'].
                print(f"skip page {index}/{page_count} of {se_date}: no data after retries")
                continue

            # NOTE(review): the field may be null when a page has no entries;
            # treat that as an empty list. Confirm against the API.
            announcements = data['announcements'] or []

            last = time.time() - begin
            need_time = last/index * page_count  - last
            every_time = last*1./index
            print(f"{index}/{page_count}: last time {last} 秒， need {need_time}, every page cost: {every_time}, count: { len(announcements)}, but page size is {page_size}")
            for item in announcements:
                file.write(json.dumps(item, ensure_ascii=False) + "\n")
    