In [9]:
## Configs
FROM_DIR="../resources/data/CSI300news_chunked_summarized"
TO_DIR="../resources/data/CSI300news_chunked_summarized_senti"
BERT_BACKBONE_DIR = "../resources/open_models/FinBert"
CLASSIFIER_DIR = '../resources/ckpts/FinBert'

In [2]:
## 初始化
import sys; sys.path.append("..")
from utils.models import *
from transformers import AutoTokenizer, AutoModel
import time
import pandas as pd
from tqdm import tqdm

tokenizer = AutoTokenizer.from_pretrained(BERT_BACKBONE_DIR)
backbone = AutoModel.from_pretrained(BERT_BACKBONE_DIR)
model = BaseModel.use_subclass("bert_classifier")(backbone,2)
model.load_classifier(CLASSIFIER_DIR)

  from .autonotebook import tqdm as notebook_tqdm


Classifier weights loaded from ../resources/ckpts/FinBert/classifier_weights.pth


In [12]:
def fn_extract(news):
    input_tokens = tokenizer(news, return_tensors="pt", padding=True, truncation=True, max_length=512)
    # return input_tokens
    return model.pred(input_tokens)

def extract_sentiment_from_news(from_dir, to_dir, fn_extract):
    os.makedirs(to_dir,exist_ok=True)
    total_time = 0  # 总耗时
    total_count = 0  # 总条数
    start_time = time.time()  # 开始时间
    for filename in os.listdir(from_dir):
        stock_code = filename.split('.')[0]
        if not filename.endswith(".json"): continue

        print(f"当前股票代码：{stock_code}")
        filepath = os.path.join(from_dir, filename)
        # 读取文件
        df = pd.read_json(filepath)
        df['date'] = df['date'].dt.strftime("%Y-%m-%d %H:%M:%S")
        # 处理每行数据
        for index, row in tqdm(df.iterrows()):
            prob_ls,senti_label_ls = fn_extract(row['chunk'])
            df.loc[index, 'sentiment'] = senti_label_ls[0]
            df.loc[index, 'sentiment_pos_score'] = prob_ls[0][1]
            total_count += 1

        print(f"{stock_code} done.")  # 打印或保存结果DataFrame
        df.to_json(os.path.join(to_dir,f'{stock_code}.json'), force_ascii=False, orient='records', indent=2)
        
    end_time = time.time()  # 结束时间
    total_time = end_time - start_time  # 总耗时（秒）

    print(f"总耗时：{total_time:.2f} 秒")
    print(f"总条数：{total_count} 条")


In [11]:
fn_extract(["盛运环保2月13日晚间发布公告称，截至目前，共有37.48亿元到期债务未清偿。", "真好啊"])

(array([[0.8732003 , 0.1267997 ],
        [0.00231368, 0.99768627]], dtype=float32),
 ['NEGATIVE', 'POSITIVE'])

In [13]:
extract_sentiment_from_news(FROM_DIR,TO_DIR,fn_extract)

当前股票代码：000001


871it [00:07, 121.51it/s]


000001 done.
当前股票代码：000002


1452it [00:11, 122.11it/s]


000002 done.
当前股票代码：000063


717it [00:05, 123.68it/s]


000063 done.
当前股票代码：000069


220it [00:01, 121.95it/s]


000069 done.
当前股票代码：000895


261it [00:02, 124.62it/s]


000895 done.
当前股票代码：000898


139it [00:01, 123.73it/s]


000898 done.
当前股票代码：000938


92it [00:00, 122.95it/s]


000938 done.
当前股票代码：000959


63it [00:00, 121.06it/s]


000959 done.
当前股票代码：000961


137it [00:01, 123.17it/s]


000961 done.
当前股票代码：000963


97it [00:00, 122.72it/s]


000963 done.
当前股票代码：000983


70it [00:00, 123.42it/s]


000983 done.
当前股票代码：001965


187it [00:01, 123.42it/s]


001965 done.
当前股票代码：001979


416it [00:03, 122.68it/s]


001979 done.
当前股票代码：002001


33it [00:00, 121.01it/s]


002001 done.
当前股票代码：002007


47it [00:00, 123.60it/s]


002007 done.
当前股票代码：002008


499it [00:04, 123.20it/s]


002008 done.
当前股票代码：002010


118it [00:00, 123.38it/s]


002010 done.
当前股票代码：002024


2014it [00:16, 122.85it/s]


002024 done.
当前股票代码：002304


295it [00:02, 123.30it/s]


002304 done.
当前股票代码：002310


490it [00:03, 122.60it/s]


002310 done.
当前股票代码：002311


129it [00:01, 122.63it/s]


002311 done.
当前股票代码：002410


121it [00:00, 122.80it/s]


002410 done.
当前股票代码：002411


312it [00:02, 121.88it/s]


002411 done.
当前股票代码：002415


465it [00:03, 125.13it/s]


002415 done.
当前股票代码：002916


108it [00:00, 124.20it/s]


002916 done.
当前股票代码：002925


180it [00:01, 122.50it/s]


002925 done.
当前股票代码：600000


642it [00:05, 123.32it/s]


600000 done.
当前股票代码：600004


80it [00:00, 123.27it/s]


600004 done.
当前股票代码：600009


43it [00:00, 124.29it/s]


600009 done.
当前股票代码：600010


104it [00:00, 123.92it/s]


600010 done.
当前股票代码：600011


7it [00:00, 121.61it/s]


600011 done.
当前股票代码：600015


283it [00:02, 123.97it/s]


600015 done.
当前股票代码：600016


394it [00:03, 123.84it/s]


600016 done.
当前股票代码：600018


54it [00:00, 122.99it/s]


600018 done.
当前股票代码：600019


234it [00:01, 123.37it/s]


600019 done.
当前股票代码：600023


36it [00:00, 123.81it/s]


600023 done.
当前股票代码：600025


126it [00:01, 125.30it/s]


600025 done.
当前股票代码：600027


8it [00:00, 121.34it/s]


600027 done.
当前股票代码：600028


641it [00:05, 124.33it/s]


600028 done.
当前股票代码：600029


244it [00:01, 122.57it/s]


600029 done.
当前股票代码：600030


929it [00:07, 122.03it/s]


600030 done.
当前股票代码：600031


455it [00:03, 123.26it/s]


600031 done.
当前股票代码：600036


1067it [00:08, 124.16it/s]


600036 done.
当前股票代码：600038


6it [00:00, 112.36it/s]


600038 done.
当前股票代码：600048


388it [00:03, 123.89it/s]


600048 done.
当前股票代码：600050


710it [00:05, 123.43it/s]


600050 done.
当前股票代码：600061


48it [00:00, 122.92it/s]


600061 done.
当前股票代码：600066


119it [00:00, 123.43it/s]


600066 done.
当前股票代码：600068


105it [00:00, 123.19it/s]


600068 done.
当前股票代码：600085


149it [00:01, 122.54it/s]


600085 done.
当前股票代码：600089


26it [00:00, 120.72it/s]


600089 done.
当前股票代码：600100


184it [00:01, 122.20it/s]


600100 done.
当前股票代码：600104


1028it [00:08, 122.82it/s]


600104 done.
当前股票代码：600482


57it [00:00, 121.68it/s]


600482 done.
当前股票代码：600487


342it [00:02, 119.33it/s]


600487 done.
当前股票代码：600489


160it [00:01, 117.67it/s]


600489 done.
当前股票代码：600795


53it [00:00, 119.39it/s]


600795 done.
当前股票代码：600809


478it [00:03, 122.47it/s]


600809 done.
当前股票代码：600816


493it [00:03, 123.73it/s]


600816 done.
当前股票代码：600606


900it [00:07, 123.16it/s]


600606 done.
当前股票代码：600837


172it [00:01, 122.70it/s]


600837 done.
当前股票代码：600848


110it [00:00, 123.04it/s]


600848 done.
当前股票代码：600867


90it [00:00, 124.00it/s]


600867 done.
当前股票代码：600886


167it [00:01, 124.20it/s]


600886 done.
当前股票代码：600887


673it [00:05, 124.27it/s]


600887 done.
当前股票代码：600893


89it [00:00, 123.85it/s]


600893 done.
当前股票代码：600900


133it [00:01, 123.51it/s]


600900 done.
当前股票代码：600909


163it [00:01, 124.52it/s]


600909 done.
当前股票代码：600919


236it [00:01, 123.68it/s]


600919 done.
当前股票代码：600926


280it [00:02, 123.74it/s]


600926 done.
当前股票代码：600928


316it [00:02, 123.29it/s]


600928 done.
当前股票代码：600958


150it [00:01, 122.74it/s]


600958 done.
当前股票代码：600968


41it [00:00, 122.83it/s]


600968 done.
当前股票代码：600977


81it [00:00, 124.16it/s]


600977 done.
当前股票代码：600989


253it [00:02, 123.74it/s]


600989 done.
当前股票代码：600998


330it [00:02, 123.22it/s]


600998 done.
当前股票代码：600999


415it [00:03, 121.94it/s]


600999 done.
当前股票代码：601006


76it [00:00, 118.82it/s]


601006 done.
当前股票代码：601009


450it [00:03, 118.69it/s]


601009 done.
当前股票代码：601012


520it [00:04, 122.92it/s]


601012 done.
当前股票代码：601018


60it [00:00, 123.43it/s]


601018 done.
当前股票代码：601021


181it [00:01, 123.10it/s]


601021 done.
当前股票代码：601066


610it [00:04, 124.46it/s]


601066 done.
当前股票代码：601088


85it [00:00, 123.40it/s]


601088 done.
当前股票代码：601108


119it [00:00, 123.41it/s]


601108 done.
当前股票代码：601111


210it [00:01, 124.46it/s]


601111 done.
当前股票代码：601117


154it [00:01, 124.32it/s]


601117 done.
当前股票代码：601138


622it [00:05, 123.91it/s]


601138 done.
当前股票代码：601155


1703it [00:13, 124.46it/s]


601155 done.
当前股票代码：601162


537it [00:04, 123.26it/s]


601162 done.
当前股票代码：601166


600it [00:04, 120.04it/s]


601166 done.
当前股票代码：601169


329it [00:02, 123.15it/s]


601169 done.
当前股票代码：601186


358it [00:02, 124.74it/s]


601186 done.
当前股票代码：601198


235it [00:01, 122.78it/s]


601198 done.
当前股票代码：601211


494it [00:03, 124.30it/s]


601211 done.
当前股票代码：601225


123it [00:00, 123.02it/s]


601225 done.
当前股票代码：601216


66it [00:00, 121.35it/s]


601216 done.
当前股票代码：601212


139it [00:01, 123.25it/s]


601212 done.
当前股票代码：601228


126it [00:01, 123.77it/s]


601228 done.
当前股票代码：601229


315it [00:02, 122.95it/s]


601229 done.
当前股票代码：601236


109it [00:00, 123.36it/s]


601236 done.
当前股票代码：601238


643it [00:05, 123.98it/s]


601238 done.
当前股票代码：601288


472it [00:03, 123.84it/s]


601288 done.
当前股票代码：601298


104it [00:00, 122.52it/s]


601298 done.
当前股票代码：601318


2120it [00:17, 123.13it/s]


601318 done.
当前股票代码：601319


802it [00:06, 124.28it/s]


601319 done.
当前股票代码：601328


581it [00:04, 124.13it/s]


601328 done.
当前股票代码：601333


16it [00:00, 119.20it/s]


601333 done.
当前股票代码：601336


646it [00:05, 124.26it/s]


601336 done.
当前股票代码：601360


552it [00:04, 124.76it/s]


601360 done.
当前股票代码：601377


227it [00:01, 123.88it/s]


601377 done.
当前股票代码：601390


257it [00:02, 122.89it/s]


601390 done.
当前股票代码：601398


966it [00:07, 124.42it/s]


601398 done.
当前股票代码：601555


174it [00:01, 116.12it/s]


601555 done.
当前股票代码：601577


324it [00:02, 113.65it/s]


601577 done.
当前股票代码：601600


149it [00:01, 112.72it/s]


601600 done.
当前股票代码：601601


773it [00:06, 124.37it/s]


601601 done.
当前股票代码：601607


189it [00:01, 124.09it/s]


601607 done.
当前股票代码：601611


39it [00:00, 123.92it/s]


601611 done.
当前股票代码：601618


135it [00:01, 123.21it/s]


601618 done.
当前股票代码：601628


946it [00:07, 124.58it/s]


601628 done.
当前股票代码：601633


686it [00:05, 124.29it/s]


601633 done.
当前股票代码：601668


282it [00:02, 124.90it/s]


601668 done.
当前股票代码：601669


181it [00:01, 123.91it/s]


601669 done.
当前股票代码：601688


667it [00:05, 124.99it/s]


601688 done.
当前股票代码：601698


119it [00:00, 124.06it/s]


601698 done.
当前股票代码：601727


67it [00:00, 123.20it/s]


601727 done.
当前股票代码：601766


316it [00:02, 124.57it/s]


601766 done.
当前股票代码：601788


484it [00:03, 124.69it/s]


601788 done.
当前股票代码：601800


322it [00:02, 114.85it/s]


601800 done.
当前股票代码：601808


56it [00:00, 114.43it/s]


601808 done.
当前股票代码：601818


401it [00:03, 113.52it/s]


601818 done.
当前股票代码：601828


233it [00:02, 112.92it/s]


601828 done.
当前股票代码：601838


215it [00:01, 114.04it/s]


601838 done.
当前股票代码：601857


543it [00:04, 120.63it/s]


601857 done.
当前股票代码：601877


94it [00:00, 122.99it/s]


601877 done.
当前股票代码：601878


153it [00:01, 124.05it/s]


601878 done.
当前股票代码：601881


283it [00:02, 124.54it/s]


601881 done.
当前股票代码：601888


173it [00:01, 123.80it/s]


601888 done.
当前股票代码：601898


17it [00:00, 120.82it/s]


601898 done.
当前股票代码：601899


115it [00:00, 123.48it/s]


601899 done.
当前股票代码：601901


331it [00:02, 124.70it/s]


601901 done.
当前股票代码：601919


137it [00:01, 123.42it/s]


601919 done.
当前股票代码：601933


326it [00:02, 124.13it/s]


601933 done.
当前股票代码：601939


668it [00:05, 123.99it/s]


601939 done.
当前股票代码：603799


148it [00:01, 123.48it/s]


603799 done.
当前股票代码：603833


204it [00:01, 124.22it/s]


603833 done.
当前股票代码：603858


448it [00:03, 124.34it/s]


603858 done.
当前股票代码：603899


69it [00:00, 122.41it/s]

603899 done.
总耗时：409.69 秒
总条数：49899 条



