# ner_for_seg

In [1]:
!pip install -r requirement.txt

Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://mirrors.cloud.aliyuncs.com/pypi/simple
Collecting jieba==0.42.1
  Downloading https://mirrors.cloud.aliyuncs.com/pypi/packages/c6/cb/18eeb235f833b726522d7ebed54f2278ce28ba9438e3135ab0278d9792a2/jieba-0.42.1.tar.gz (19.2 MB)
[K     |████████████████████████████████| 19.2 MB 11.9 MB/s eta 0:00:01�                  | 8.4 MB 11.9 MB/s eta 0:00:01B 11.9 MB/s eta 0:00:01�███████████▎| 18.8 MB 11.9 MB/s eta 0:00:01
Collecting stanza==1.3.0
  Downloading https://mirrors.cloud.aliyuncs.com/pypi/packages/1a/66/8efe2b2358d9973b8427b4e9cf17ef344bbe38f0ea20a0ba129a62716fa0/stanza-1.3.0-py3-none-any.whl (432 kB)
[K     |████████████████████████████████| 432 kB 24.8 MB/s eta 0:00:01
[?25hCollecting tqdm==4.62.3
  Downloading https://mirrors.cloud.aliyuncs.com/pypi/packages/63/f3/b7a1b8e40fd1bd049a34566eb353527bb9b8e9b98f8b6cf803bb64d8ce95/tqdm-4.62.3-py2.py3-none-any.whl (76 kB)
[K     |█████

In [3]:
import stanza
# Uncomment on first use to download the stanza models for 'zh'
# (left disabled here; presumably the models are already present locally).
# stanza.download('zh')

In [2]:
# Shell escape: print the GPU status reported by the NVIDIA CLI.
!nvidia-smi

Thu Jan 27 19:15:26 2022
[AMP INFO][Frontend.cpp:152][1643282126:378132]pid=745, start to allocate gpu resource ...
+------------------------------------------------------------------------------+
|    VGPU_SMI 450.80.02     DRIVER_VERSION: 450.80.02     CUDA Version: 10.2   |
+-------------------------------------------+----------------------------------+
| GPU  Name                Bus-Id           |        Memory-Usage     GPU-Util |
|   0  Tesla V100-SXM2...  00000000:00:07.0                 |     0MiB /  7531MiB    0% /   0% |
+-------------------------------------------+----------------------------------+


In [None]:
# Chinese stanza pipeline used by ner(). Input is handed over already tokenized
# (one list of words per document), hence tokenize_pretokenized=True.
zh_nlp = stanza.Pipeline(
    'zh',
    use_gpu=True,
    tokenize_pretokenized=True,
    processors='tokenize,ner,pos',
)

2022-01-27 19:16:00 INFO: "zh" is an alias for "zh-hans"
2022-01-27 19:17:35 INFO: Use device: gpu
2022-01-27 19:17:35 INFO: Loading: tokenize
2022-01-27 19:17:35 INFO: Loading: pos


In [4]:
import pandas as pd
import logging
import jieba
import os

# The file handler created by basicConfig does not create missing parent
# directories, so make sure ./log exists before the log file is opened.
os.makedirs('./log', exist_ok=True)
logging.basicConfig(level=logging.DEBUG,
                    filename='./log/ner.log',
                    format="%(asctime)s - %(name)s - %(levelname)-9s - %(filename)-8s : %(lineno)s line - %(message)s",
                    # "-8" is a width specifier: left-align the field and pad it to 8 characters
                    datefmt="%Y-%m-%d %H:%M:%S")
from tqdm import tqdm
import time


def ner(docs):
    """Extract named entities for each document in ``docs``.

    Every document is tokenized with ``jieba.lcut`` and the resulting word
    lists are passed, as one pre-tokenized sentence per document, to the
    module-level ``zh_nlp`` pipeline.

    Args:
        docs: list of document strings.

    Returns:
        A list with the same length as ``docs``. Each item is either the
        string ``'O'`` (no entity of a kept type was found) or the entities
        formatted as ``'<text>/<type>'`` and joined with ``';'``. Duplicates
        are removed and entities are listed in order of first occurrence, so
        the output is reproducible from run to run.

    Raises:
        AssertionError: if the pipeline returns a different number of
            sentences than it was given.
    """
    logging.info('=====len of doc:{}====='.format(len(docs)))
    if len(docs) == 0:
        # Nothing to label; do not invoke the pipeline on an empty batch.
        return []
    t1=time.time()
    text_w=[jieba.lcut(t) for t in docs]
    t2=time.time()
    logging.debug('cut words:{:.2f}seconds'.format(t2-t1))
    keep_type=('GPE','LOC','PERSON','ORG')
    # Default every document to 'O'; only documents with entities are overwritten.
    res=['O']*len(docs)
    # Documents that tokenize to nothing cannot contain entities, so only the
    # non-empty ones are sent to the pipeline; their positions are remembered
    # to map the sentences back onto the original document order.
    non_empty=[idx for idx, words in enumerate(text_w) if words]
    sentences=[]
    if non_empty:
        doc = zh_nlp([text_w[idx] for idx in non_empty])
        sentences = doc.sentences
    t3=time.time()
    logging.debug('loading docs:{:.2f}seconds'.format(t3-t2))
    # One sentence per submitted document is required for the mapping below.
    assert len(sentences)==len(non_empty)
    for idx, sent in zip(non_empty, sentences):
        # dict keys act as an insertion-ordered set: dedupes while keeping a
        # deterministic order (a plain set has hash-dependent iteration order).
        curr_en={}
        for ent in sent.ents:
            if ent.type in keep_type:
                # The pre-tokenized text is space separated; drop the spaces
                # to recover the original surface form of the entity.
                t=ent.text.replace(' ','')
                curr_en[f'{t}/{ent.type}']=None
        if curr_en:
            res[idx]=';'.join(curr_en)
    logging.debug('select entitys:{:.2f}seconds'.format(time.time()-t3))
    return res


def ner_partation(df,start,end, entity_list, col='mentions'):
    """Write ``entity_list`` into column ``col`` for the index labels ``start``..``end``.

    The slice is label based and inclusive on both ends (``df.loc``), so
    ``entity_list`` must hold exactly ``end - start + 1`` items; this is
    checked with an assertion before ``df`` is modified in place.
    """
    expected_count = end - start + 1
    assert expected_count == len(entity_list)
    df.loc[start:end, col] = entity_list

In [5]:
df=pd.read_csv(r'data/segment_articles.csv',sep='\t',index_col=0)  # relative path, resolved against the current working directory
# Trial run capped at 2050 rows (100 rows per batch took about 3.71 seconds).
# The cap is clamped to the number of rows actually present; replace 2050 with
# df.shape[0] to process the whole file.
max_line=min(2050, df.shape[0])
t_per_epoch=100  # rows per batch
# Ceiling division: the previous `max_line // t_per_epoch + 1` scheduled an
# extra, empty batch whenever max_line was an exact multiple of t_per_epoch.
epochs = (max_line + t_per_epoch - 1) // t_per_epoch
logging.info('NER START:lines:{},t_per_epoch:{},epochs:{}'.format(max_line,t_per_epoch,epochs))
start_t=time.time()
for i in tqdm(range(epochs)):
    # Batch bounds are computed outside the try block so the error handler can
    # always refer to `start`.
    # NOTE(review): the bounds are used as index *labels* with df.loc (inclusive
    # on both ends) — this assumes the index is 0..n-1; confirm against the CSV.
    start = i*t_per_epoch
    end = min((i+1)*t_per_epoch,max_line)-1
    try:
        logging.debug('current=====epoch:{}/{}====='.format(i+1,epochs))
        texts=df.loc[start:end, 'text'].tolist()
        entitys=ner(texts)
        ner_partation(df,start,end, entitys, col='mentions')
        if i%50==0:
            # Periodic checkpoint of the partially labelled frame.
            logging.info('epoch {},temp save data'.format(i+1))
            df.to_csv('segment_articles_tmp_end_{}.csv'.format(end),sep='\t')
    except Exception as e:
        # Best effort: record the failure (with traceback), dump what has been
        # labelled so far, and continue with the next batch.
        logging.error('{} error！！ current start:{}, 耗时:{:.2f}seconds'.format(e,start,time.time() - start_t),
                      exc_info=True)
        df.to_csv('segment_articles_start{}.csv'.format(start),sep='\t')
logging.info("所有文档实体识别完成，总耗时: {:.2f}秒".format(time.time() - start_t))
df.to_csv('segment_articles_test0122.csv',sep='\t')

  0%|          | 0/21 [00:00<?, ?it/s]Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
Loading model cost 0.869 seconds.
Prefix dict has been built successfully.
100%|██████████| 21/21 [00:05<00:00,  3.73it/s]
