# 利用gpu批量识别实体

## 1.装载环境

In [1]:
# Mount Google Drive at /content/drive; later cells read their inputs from
# and write their outputs to paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Install the dependencies listed in the project's requirement.txt on the mounted drive.
!pip install -r /content/drive/MyDrive/finance_el/requirement.txt

Collecting stanza==1.3.0
  Downloading stanza-1.3.0-py3-none-any.whl (432 kB)
[K     |████████████████████████████████| 432 kB 5.3 MB/s 
Collecting emoji
  Downloading emoji-1.6.3.tar.gz (174 kB)
[K     |████████████████████████████████| 174 kB 45.1 MB/s 
Building wheels for collected packages: emoji
  Building wheel for emoji (setup.py) ... [?25l[?25hdone
  Created wheel for emoji: filename=emoji-1.6.3-py3-none-any.whl size=170298 sha256=63a8a3cbca15aee45b5b3035d71b9d68c63d260bb0fb60768e90b7c59ad908eb
  Stored in directory: /root/.cache/pip/wheels/03/8b/d7/ad579fbef83c287215c0caab60fb0ae0f30c4d7ce5f580eade
Successfully built emoji
Installing collected packages: emoji, stanza
Successfully installed emoji-1.6.3 stanza-1.3.0


## 2.初始化模型
模型只需要下载一次；注意要下载到挂载的云盘上，否则退出后文件会被清空。

In [6]:
import stanza
# Directory on the mounted drive that holds the stanza model files; it is
# passed to both stanza.download() and stanza.Pipeline() in the cells below.
download_dir='/content/drive/MyDrive/stanza_resources'

In [8]:
# Download the stanza model files for 'zh' into download_dir.
stanza.download('zh',model_dir=download_dir)

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.3.0.json:   0%|   …

2022-01-27 11:58:19 INFO: "zh" is an alias for "zh-hans"
2022-01-27 11:58:19 INFO: Downloading default packages for language: zh-hans (Simplified_Chinese)...


Downloading https://huggingface.co/stanfordnlp/stanza-zh-hans/resolve/v1.3.0/models/default.zip:   0%|        …

2022-01-27 11:58:45 INFO: Finished downloading models and saved to /content/drive/MyDrive/stanza_resources.


In [7]:
# A Pipeline can preload the models of a given language, select which
# processors to run, and choose whether to use the GPU.
# Here: language 'zh' with the tokenize, ner and pos processors;
# tokenize_pretokenized=True because ner() passes text that jieba has already
# split into words; use_gpu=True requests GPU execution; dir points at the
# model files stored in download_dir.
# NOTE(review): no visible cell reads POS tags — 'pos' may be unnecessary; confirm.
zh_nlp = stanza.Pipeline('zh', processors='tokenize,ner,pos', 
              tokenize_pretokenized=True,
              use_gpu=True,
              dir=download_dir)

2022-01-27 13:54:06 INFO: "zh" is an alias for "zh-hans"
2022-01-27 13:54:06 INFO: Loading these models for language: zh-hans (Simplified_Chinese):
| Processor | Package   |
-------------------------
| tokenize  | gsdsimp   |
| pos       | gsdsimp   |
| ner       | ontonotes |

2022-01-27 13:54:06 INFO: Use device: gpu
2022-01-27 13:54:06 INFO: Loading: tokenize
2022-01-27 13:54:06 INFO: Loading: pos
2022-01-27 13:54:19 INFO: Loading: ner
2022-01-27 13:54:24 INFO: Done loading processors!


In [13]:
import pandas as pd
import logging
import jieba
import datetime
def beijing(sec, what):
    """Convert a log record's creation time to Beijing time (UTC+8).

    The function is installed as ``logging.Formatter.converter``. Because it is
    a plain function stored on the class, the formatter invokes it as a bound
    method, so the arguments are not what their names suggest.

    Args:
        sec: The ``logging.Formatter`` instance making the call (unused).
        what: The record creation time, in seconds since the epoch.

    Returns:
        A ``time.struct_time`` with the UTC+8 wall-clock time of ``what``.
    """
    # Derive the time from the record's own timestamp instead of now() + 8h:
    # the result is then the moment the record was created, and it no longer
    # depends on the host's local timezone happening to be UTC.
    beijing_tz = datetime.timezone(datetime.timedelta(hours=8))
    return datetime.datetime.fromtimestamp(what, tz=beijing_tz).timetuple()
# Make every logging.Formatter convert record times with beijing().
logging.Formatter.converter = beijing
# Path of the log file on the mounted drive.
log_dir='/content/drive/MyDrive/finance_el/ner.log'
# Configure root logging: DEBUG level, written to log_dir, using the record
# format and timestamp format below.
logging.basicConfig(level=logging.DEBUG,
          filename=log_dir,
          format="%(asctime)s - %(name)s - %(levelname)-9s - %(filename)-8s : %(lineno)s line - %(message)s",
          # "-8" is a width specifier: the value is left-aligned and padded to 8 characters
          datefmt="%Y-%m-%d %H:%M:%S")
from tqdm import tqdm
import time


def ner(docs):
    """Recognize named entities in a batch of documents.

    Each document is segmented with jieba and handed to the ``zh_nlp`` pipeline
    as one pre-tokenized sentence. Only GPE, LOC, PERSON and ORG entities are
    kept.

    Args:
        docs: List of document texts. Entries that are not ``str`` (for example
            NaN read from a CSV) or that are empty / yield no tokens are not
            sent to the pipeline and are labelled ``'O'``.

    Returns:
        A list with one label string per document, in input order. A label is
        a ``';'``-joined list of ``'text/TYPE'`` items (sorted, so the output
        is reproducible across runs), or ``'O'`` when no entity was kept.

    Raises:
        AssertionError: If the pipeline does not return exactly one sentence
            for every document that was sent to it.
    """
    logging.info('=====len of doc:{}====='.format(len(docs)))
    t1=time.time()
    # Non-string and empty entries get an empty token list instead of crashing
    # the tokenizer and thereby failing the whole batch.
    text_w=[jieba.lcut(t) if isinstance(t, str) and t else [] for t in docs]
    t2=time.time()
    logging.debug('cut words:{:.2f}seconds'.format(t2-t1))
    keep_type={'GPE','LOC','PERSON','ORG'}
    # Default every document to 'O'; only documents with kept entities are overwritten.
    res=['O']*len(docs)
    # Positions of the documents that actually have tokens to analyse.
    kept=[i for i, words in enumerate(text_w) if words]
    sentences=zh_nlp([text_w[i] for i in kept]).sentences if kept else []
    t3=time.time()
    logging.debug('loading docs:{:.2f}seconds'.format(t3-t2))
    # Check the one-sentence-per-document alignment before using it.
    assert len(sentences)==len(kept), \
        'pipeline returned {} sentences for {} documents'.format(len(sentences), len(kept))
    for i, sent in zip(kept, sentences):
        curr_en=set()  # distinct entities of the current document
        for ent in sent.ents:
            if ent.type in keep_type:
                # The pre-tokenized text is space-separated, so drop the
                # spaces to get the entity text without word separators.
                t=ent.text.replace(' ','')
                curr_en.add(f'{t}/{ent.type}')
        if curr_en:
            # Sort: set iteration order for strings differs between runs.
            res[i]=';'.join(sorted(curr_en))
    logging.debug('select entitys:{:.2f}seconds'.format(time.time()-t3))
    return res


def ner_partation(df,start,end, entity_list, col='mentions'):
    """Write entity labels into one inclusive index range of a DataFrame column.

    Args:
        df: DataFrame that is modified in place.
        start: First index label of the range.
        end: Last index label of the range (inclusive).
        entity_list: Labels to store; must contain ``end - start + 1`` items.
        col: Name of the column that receives the labels.

    Raises:
        AssertionError: If the number of labels does not match the range size.
    """
    expected_rows = end - start + 1
    assert expected_rows == len(entity_list)
    # .loc slices by label, so both start and end are included.
    row_range = slice(start, end)
    df.loc[row_range, col] = entity_list

In [9]:
# Input CSV (read as tab-separated by the processing cell).
src_path=r'/content/drive/MyDrive/data/segment_articles.csv'
# Directory prefix for the checkpoint, error-dump and final output CSV files.
out_path=r'/content/drive/MyDrive/tmp/'

In [14]:
# Load the input; the first CSV column becomes the row index.
df=pd.read_csv(src_path,sep='\t',index_col=0)
# NOTE(review): the batching below selects rows with df.loc[start:end], i.e. by
# index label, so it assumes the index is exactly 0..len(df)-1 — confirm for new inputs.
max_line=df.shape[0]
# Timing note from a 2050-row trial: batches of 100 rows took 73 s, batches of 1000 rows took 56 s.
t_per_epoch=1000  # rows per batch
# Ceiling division: a row count that is an exact multiple of t_per_epoch must
# not produce a trailing empty batch (101 batches in the recorded run).
epochs = (max_line + t_per_epoch - 1) // t_per_epoch
logging.info('NER START:总行数:{},t_per_epoch:{},epochs:{}'.format(max_line,t_per_epoch,epochs))
start_t=time.time()
failed_batches=[]  # inclusive (start, end) ranges whose processing raised
for i in tqdm(range(epochs)):
    # Compute the range before the try block so the error handler can always report it.
    start = i*t_per_epoch
    end = min((i+1)*t_per_epoch,max_line)-1
    try:
        logging.debug('当前轮次=====epoch:{}/{}====='.format(i+1,epochs))
        texts=df.loc[start:end, 'text'].tolist()
        entitys=ner(texts)
        ner_partation(df,start,end, entitys, col='mentions')
        # Periodic checkpoint of the whole frame (every 20th batch, starting with the first).
        if i%20==0:
            logging.info('epoch {},temp save data'.format(i+1))
            df.to_csv(out_path+'segment_articles_tmp_end_{}.csv'.format(end),sep='\t')
    except Exception as e:
        # Best effort: remember the failed range, dump the current state and go on
        # with the next batch. logging.exception also records the traceback.
        failed_batches.append((start,end))
        err_msg='{} error！！ current start:{}, 耗时:{:.2f}seconds'.format(e,start,time.time() - start_t)
        logging.exception(err_msg)
        print(err_msg)
        df.to_csv(out_path+'error_segment_articles_start{}.csv'.format(start),sep='\t')
if failed_batches:
    # Rows of failed batches are left without new 'mentions' values; report them
    # explicitly so the completion message below is not mistaken for full success.
    # NOTE(review): the recorded run lost two batches to CUDA out-of-memory
    # errors — consider re-running the reported ranges.
    fail_msg='NER failed for {} batch(es), (start, end) ranges: {}'.format(len(failed_batches),failed_batches)
    logging.warning(fail_msg)
    print(fail_msg)
logging.info("所有文档实体识别完成，总耗时: {:.2f}秒".format(time.time() - start_t))
print("所有文档实体识别完成，总耗时: {:.2f}秒".format(time.time() - start_t))
df.to_csv(out_path+'segment_articles_0127.csv',sep='\t')

 19%|█▉        | 19/101 [09:26<45:38, 33.40s/it]

CUDA out of memory. Tried to allocate 23.97 GiB (GPU 0; 11.17 GiB total capacity; 9.81 GiB already allocated; 707.81 MiB free; 9.96 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF


 20%|█▉        | 20/101 [14:40<2:38:32, 117.43s/it]

CUDA out of memory. Tried to allocate 22.09 GiB (GPU 0; 11.17 GiB total capacity; 8.99 GiB already allocated; 727.81 MiB free; 9.94 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF


100%|██████████| 101/101 [52:17<00:00, 31.06s/it]


In [16]:
# Read the final output file back and display its last rows as a sanity check.
df_r=pd.read_csv(out_path+'segment_articles_0127.csv',sep='\t',index_col=0)
df_r.tail()

Unnamed: 0,src_idx,keywords,text,mentions
100176,4500,非法集资,爆款兴全合宜上市即暴跌，中兴通讯估值下调造成净值下跌,中兴通讯/ORG
100177,4500,非法集资,今日敏感舆情指数，面向所有A股上市公司，以其前一日9:00至当日9:00的敏感新闻和公告信息...,O
100178,4500,非法集资,本文列举了今日指数最高的5支股票，同时列举了今日指数较昨日的涨跌值，以及时间段内主要的敏感新...,O
100179,4500,非法集资,获取更多上市公司信息和其他高级服务，请查阅财新“数据+”。,O
100180,4500,非法集资,本报告基于大数据技术自动计算和生成，不代表财新数据观点。,O
