In [1]:
#https://cloud.tencent.com/developer/article/1454904
#https://zhuanlan.zhihu.com/p/82850698
#https://blog.csdn.net/weixin_42598761/article/details/104592171

In [1]:
from sklearn.model_selection import train_test_split
from keras.models import Model
from keras.layers import Input, Lambda, Bidirectional, LSTM, Dense
from keras_bert import load_trained_model_from_checkpoint
from keras_contrib.layers import CRF
from keras_contrib.losses import crf_loss
from keras_contrib.metrics import crf_accuracy
from keras_bert import Tokenizer
from keras_bert import AdamWarmup, calc_train_steps
from datetime import datetime
from collections import Counter
import keras.callbacks
import re
import codecs
import time

Using TensorFlow backend.


In [2]:
from ckiptagger import WS, POS, NER
ckip_path = r'C:\Users\Jasonhsu\Desktop\esun\data'
ws = WS(ckip_path, disable_cuda=False) #斷詞
pos = POS(ckip_path, disable_cuda=False) #詞性標注
ner = NER(ckip_path, disable_cuda=False) #實體辨識

In [3]:
import os
import codecs
import pandas as pd
import numpy as np

bert_dir = r'C:\Users\Jasonhsu\Desktop\esun\chinese_L-12_H-768_A-12'
config_path = os.path.join(bert_dir, 'bert_config.json')
checkpoint_path = os.path.join(bert_dir, 'bert_model.ckpt')
dict_path = os.path.join(bert_dir, 'vocab.txt')

# 0. 載入資料

In [4]:
data = pd.read_csv(r'C:\Users\Jasonhsu\Desktop\esun\0620\content_df_0620.csv')
data = data[data["status"]=="ok"].drop(["url","context","raw_content","status", "content_status"],axis = 1)
data['aml_label'] = data['name'].apply(lambda x: 0 if x == '[]' else 1)
data['name'] = data['name'].apply(lambda x: eval(x))

In [5]:
data['content'] = data['content'].apply(lambda x: re.sub('<[^>]*>|【[^】]*】|（[^）]*）|〔[^〕]*〕', '', x))
data['content'] = data['content'].apply(lambda x: x.replace('記者', '＜')
                                                   .replace('報導', '＞')
                                                   .replace('▲', '')\
                                                   .replace('。　', '。')\
                                                   .replace('', '')\
                                                   .replace('.', '')\
                                                   .replace(' ', '')\
                                                   .replace('“', '「')\
                                                   .replace('”', '」'))
data['content'] = data['content'].apply(lambda x: re.sub('＜[^＞]*＞', '', x))

In [6]:
train, test =  train_test_split(data, test_size = 0.2, random_state = 0)

# 1. 建立 aml 模型

In [7]:
def create_tokenizer(dict_path):
    
    token_dict = {}
    with codecs.open(dict_path, 'r', 'utf8') as reader:
        for line in reader:
            token = line.strip()
            token_dict[token] = len(token_dict)
            
    return token_dict

def transfer(i):
    
    if i != 0:
        return 1
    else:
        return 0

def encoded(tokenizer, data, maxlen):
    
    x, y, z = [], [], []
    if 'content' in data.columns:
        for content in data['content']:
            x1, x2 = tokenizer.encode(content, max_len=maxlen)
            x3 = [transfer(i) for i in x1]
            x.append(x1)
            y.append(x2)
            z.append(x3)
    elif 'Sentence' in data.columns:
        for content in data['Sentence']:
            x1, x2 = tokenizer.encode(content, max_len=maxlen)
            x3 = [transfer(i) for i in x1]
            x.append(x1)
            y.append(x2)
            z.append(x3)
            
    return x, y, z

In [8]:
token_dict = create_tokenizer(dict_path)
tokenizer = Tokenizer(token_dict)

In [9]:
maxlen = 256
batch_size = 8
epochs = 3
input_shape = (maxlen, )

In [10]:
label = np.asarray(train['aml_label'])

In [11]:
input_id, segment_id, mask_input = encoded(tokenizer, train, maxlen)

In [12]:
def bert_LSTM_model():
    
    model = load_trained_model_from_checkpoint(config_path, checkpoint_path, training=True, seq_len=maxlen)
    sequence_output = model.layers[-9].output
    #sequence_output = Lambda(lambda x: x[:, 0])(sequence_output)
    sequence_output = Bidirectional(LSTM(128, return_sequences=False))(sequence_output)
    output = Dense(1, activation='sigmoid')(sequence_output)
    model = Model(model.input, output)
    
    for layer in model.layers:
        layer.trainable = False
    model.layers[-1].trainable = True
    model.layers[-2].trainable = True
    
    return model

model = bert_LSTM_model()

In [13]:
# 載入模型
#model.load_weights('aml_model_weight.h5')

In [14]:
total_steps, warmup_steps = calc_train_steps(
    num_example=train.shape[0],
    batch_size=batch_size,
    epochs=epochs,
    warmup_proportion=0.1,
)

optimizer = AdamWarmup(total_steps, warmup_steps, lr=1e-3, min_lr=1e-5)

In [15]:
callback_list = [
                 keras.callbacks.EarlyStopping(monitor='val_acc', patience=1)
                 #,ModelCheckpoint(filepath='AML_bert.h5', monitor='val_loss', save_best_only=True)
                ]

In [16]:
model.compile(loss='binary_crossentropy',
              optimizer=optimizer,
              metrics=['acc'])

model.fit([input_id, segment_id, mask_input],
          label,
          epochs=epochs,
          batch_size=batch_size,
          validation_split=0.1,
          callbacks=callback_list
         )

Train on 3495 samples, validate on 389 samples
Epoch 1/3
Epoch 2/3


<keras.callbacks.callbacks.History at 0x18b740e1f88>

# 2. 建立 NER 模型
### transfer_NER 要跑 1.5 小時，訓練模型要跑 17mins/epoch，所以load weight就好

In [17]:
# 把大於512的新聞以句點分段 (bert最多只能吃512)
def split_content(data):
    data_more_split = pd.DataFrame()
    for i, row in data.iterrows():
        if (len(row['content']) > 512) & (len(row['content']) <= 1024):

            s = row['content']
            s_split = [(i, abs(len(s)//2 - s.find(x)), x) for i, x in enumerate(s.split('。'))]
            idx_left = min(s_split, key=lambda x: x[1])[0]
            first = "。".join([s_split[i][2] for i in range(idx_left)])
            second = "。".join([s_split[i][2] for i in range(idx_left, len(s_split))])    
            contents = [first, second]

            for content in contents:
                data_more_split = data_more_split.append(pd.DataFrame({'news_id':row['news_id'], 'content':content}, index=[66]), ignore_index=True)

        elif len(row['content']) > 1024:

            s = row['content']
            s_split1 = [(i, abs(len(s)//3 - s.find(x)), x) for i, x in enumerate(s.split('。'))]
            s_split2 = [(i, abs(len(s)*2//3 - s.find(x)), x) for i, x in enumerate(s.split('。'))]
            idx_left1 = min(s_split1, key=lambda x: x[1])[0]
            idx_left2 = min(s_split2, key=lambda x: x[1])[0]
            first = "。".join([s_split1[i][2] for i in range(idx_left1)])
            second = "。".join([s_split1[i][2] for i in range(idx_left1, idx_left2)])
            third = "。".join([s_split1[i][2] for i in range(idx_left2, len(s_split1))])
            contents = [first, second, third]

            for content in contents:
                data_more_split = data_more_split.append(pd.DataFrame({'news_id':row['news_id'], 'content':content}, index=[66]), ignore_index=True)
    
    return data_more_split

In [18]:
maxlen_ner = 512
batch_size = 8
epochs = 3
input_shape = (maxlen_ner, )

In [19]:
# 取得 NER input array (陳水扁貪汙 -> CKIP(陳水扁) -> 1 2 2 0 0)
def transfer_NER(data, tokenizer, maxlen):
    people_list = []
    first_name = ['申', '龔', '馮', '昌', '劉', '習', '陽', '顧', '鍾', '胡', '許', '魏',    '傅', '季', '扶', '柳', '狄', '焦', '封', '李', '羿', '刁', '和', '邴',    '陸', '王', '杜', '能', '侯', '伍', '平', '竺', '樂', '繆', '欒', '湛',    '道', '花', '賴', '浦', '萬', '章', '宮', '勾', '邵', '印', '夏', '杭',    '溥', '左', '池', '公', '閻', '符', '奚', '臧', '羅', '空', '璩', '巴',    '酈', '范', '談', '金', '顏', '慎', '郭', '僪', '聞', '車', '闞', '相',    '童', '雙', '方', '莊', '容', '姚', '田', '薛', '閔', '翟', '簡',    '蔚', '茹', '淩', '戴', '余', '鞏', '房', '富', '牛', '饒', '計', '居',    '後', '舒', '席', '翁', '祝', '鬱', '訾', '隆', '匡', '弘', '曆', '範',    '越', '趙', '卻', '岑', '隗', '冷', '張', '山', '松', '柯', '嵇', '韓',    '蕭', '褚', '殳', '滕', '滿', '洪', '荀', '庾', '廖', '盧', '危', '竇',    '曾', '郎', '遊', '穀', '慕', '禹', '凌', '廉', '邢', '梁', '葉',    '郝', '終', '齊', '藺', '曹', '全', '高', '樊', '史', '桂', '廣', '段',    '江', '餘', '袁', '弓', '牧', '魚', '儲', '尚', '逄', '尹', '通', '懷',    '皮', '何', '倪', '包', '晁', '涂', '蓬', '屠', '巫', '須', '巢', '卞',    '楊', '成', '孟', '楚', '呂', '古', '毋', '伊', '賁', '喻', '糜',    '蔔', '艾', '藍', '龐', '諸', '別', '任', '管', '冀', '壽', '惠', '梅',    '孫', '從', '康', '常', '駱', '鞠', '沈', '黨', '沙', '鳳', '郁', '邊',    '仰', '溫', '路', '逮', '賀', '雷', '鈄', '明', '裴', '滑', '毛', '費',    '關', '時', '步', '麴', '裘', '蒲', '司', '查', '錢', '盛', '霍', '鮑',    '彭', '龍', '沃', '單', '勞', '秋', '祖', '殷', '茅', '敖', '郗', '石',    '鐘', '嚴', '畢', '燕', '姜', '經', '程', '厙', '柏', '汪', '婁', '胥',    '聶', '邰', '桑', '辛', '扈', '穆', '仲', '紅', '項', '師', '桓', '黃',    '堵', '貢', '詹', '朱', '蔡', '戈', '于', '甄', '束', '屈', '索', '晏',    '阮', '魯', '虞', '歐', '濮', '俞', '黎', '文', '應', '姬', '貝', '籍',    '莘', '戚', '鄭', '郜', '景', '宋', '宗', '昝', '卓', '蒯', '馬', '顔',    '蘇', '衛', '東', '瞿', '蒼', '莫', '邱', '潘', '家', '林', '芮', '麻',    '元', '武', '強', '鈕', '陳', '井', '於', '游', '耿', '柴', '荊', '韶',    '易', '宿', '施', '鹹', '秦', '班', '甯', '汲', '酆', '暴', '尤',    '祿', '苗', '權', '仇', '都', '羊', '榮', '陶', '支', '賈', '白', '葛',    '暨', '解', '靳', '伏', '唐', '華', '吉', '融', '豐', '安', '衡', '那',    '闕', '俄', '盍', '鄔', '蒙', '利', '鄂', '謝', '宓', '湯', '喬', '孔',    '養', '紀', '幹', '牟', '連', '宰', '蔣', '雍', '益', '寇', '祁', '熊',    '崔', '丁', '薊', '譚', '吳', '烏', '周', '農', '徐', '充', '向', '宦',    '董', '甘', '冉', '韋', '米', '鄒', '鄧', '戎', '水']
    label = np.zeros([len(data), maxlen])
    
    # 用CKIP抓出每個新聞的名字
    for index, (_, row) in enumerate(data.iterrows()):
        
        if index % 100 == 0:
            print(index)
        
        token = tokenizer.tokenize(row['content'][0:maxlen])

        if len(token) > maxlen:
            token = token[0:maxlen-1]
            token.append('[SEP]')

        y = np.zeros([maxlen])
        content = ''.join(token)
        
        #CKIP
        word_sentence_list = ws([content],
                    sentence_segmentation=True,
                    segment_delimiter_set={'?', '？', '!', '！', '。', ',', '，', ';', ':', '、'})
        pos_sentence_list = pos(word_sentence_list)
        entity_sentence_list = ner(word_sentence_list, pos_sentence_list)
        
        people = [people for people in list(entity_sentence_list[0]) if (people[2] == 'PERSON') & (people[1] < maxlen)]
        people = [people for people in people if ((len(people[3]) < 5) & (people[3][0] in first_name)) | ((len(people[3]) >= 5) & ('#' not in people[3]))]
        people.sort()
        people_list.append(people)
        
        #轉換成input array
        j = 0
        for person in people: 
            for i, _ in enumerate(token):        
                if token[i:i+len(person[3])] == list(person[3]):
                    if len(person) == 1:
                        y[i+j] = 1
                        token = token[i+1:]
                        j = i+j+1
                        break

                    y[i+j] = 1
                    y[i+j+1:i+j+len(person[3])] = 2
                    token = token[i+len(person[3]):]
                    j = i+j+len(person[3])
                    break
        label[index, :] = y
    
    #用不到 people_list，只是檢查用
    return people_list, label.reshape([label.shape[0], label.shape[1], 1])

In [20]:
#分句
train_ner = train.drop(['name', 'aml_label'], axis=1)
data_less = train_ner[train_ner['content'].str.len() <= 512]
data_more = train_ner[train_ner['content'].str.len() > 512]
data_more_split = split_content(data_more)
train_ner = train_ner.append(data_more_split)
train_ner = train_ner.reset_index().drop(['index'], axis=1)

In [21]:
people_list, label = transfer_NER(train_ner, tokenizer, maxlen=maxlen_ner)

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
6800
6900
7000
7100
7200
7300
7400
7500
7600
7700
7800
7900
8000
8100
8200
8300
8400
8500
8600
8700
8800
8900
9000
9100
9200
9300
9400
9500
9600
9700
9800
9900
10000


In [22]:
input_id, segment_id, mask_input = encoded(tokenizer, train_ner, maxlen=maxlen_ner)

In [23]:
def bert_BiLSTM_CRF_model():
    
    ner_model = load_trained_model_from_checkpoint(config_path, checkpoint_path, training=True, seq_len=maxlen_ner)
    bert_output = ner_model.layers[-9].output
    X = Lambda(lambda x: x[:, 0: input_shape[0]])(bert_output)
    X = Bidirectional(LSTM(128, return_sequences=True))(X)
    #X = TimeDistributed(Dense(len(y_token_dict), activation='relu'))(X)
    output = CRF(3, sparse_target = True)(X)    
    ner_model = Model(ner_model.input, output)
    
    for layer in ner_model.layers:
        layer.trainable = False
    ner_model.layers[-1].trainable = True
    ner_model.layers[-2].trainable = True
    
    return ner_model

In [24]:
ner_model = bert_BiLSTM_CRF_model()

In [25]:
# transfer_NER 要跑 1.5 小時，訓練模型要跑 17mins/epoch，所以load weight就好
ner_model.load_weights('ner_model_weight.h5')

In [26]:
total_steps, warmup_steps = calc_train_steps(
    num_example=data.shape[0],
    batch_size=batch_size,
    epochs=epochs,
    warmup_proportion=0.1,
)

optimizer = AdamWarmup(total_steps, warmup_steps, lr=1e-3, min_lr=1e-5)

In [27]:
callback_list = [
                 keras.callbacks.EarlyStopping(monitor='val_crf_accuracy', patience=1)
                 #,ModelCheckpoint(filepath='AML_bert.h5', monitor='val_loss', save_best_only=True)
                ]

In [28]:
ner_model.compile(optimizer=optimizer,
                  loss=crf_loss,
                  metrics=[crf_accuracy])

ner_model.fit([input_id, segment_id, mask_input],
          label,
          epochs=epochs,
          batch_size=batch_size,
          validation_split=0.1,
          callbacks=callback_list)

Train on 9064 samples, validate on 1008 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.callbacks.History at 0x18bfefb8748>

# 3. 預測

In [29]:
# 取得名字 (預測結果為onehot的狀態)
def get_name(input_id, y_pred):
    
    label_list = []
    word_dict = {v: k for k, v in token_dict.items()}
    
    for input_data, y in zip(input_id, y_pred):
        people_index = ''.join([str(a) for a in list(y)])
        j = 0
        name_list = []
        split_index = re.findall('[12]2*', people_index)
        name = ''.join([word_dict.get(input_data[index]) for index, value in enumerate(y) if value != 0])
        
        # [UNK], [PAD]會被算成 5 個字元，避免轉換成文字的index因長度不同對不上，故用 1 個字元的其他符號替代
        # 王春甡 -> 王春[UNK] -> 王春?
        name = name.replace('[UNK]','?')
        name = name.replace('[PAD]','!')
        
        for i in split_index:
            name_list.append(name[0+j:len(i)+j])
            j = len(i) + j
            
        name_list = [name for name in name_list]
        label_list.append(list(set(name_list)))
    
    return label_list

In [30]:
def predict_aml(model, test, aml_threshold):
    
    #第一階段預測，大於aml_threshold者為疑似aml文章
    input_id, segment_id, mask_input = encoded(tokenizer, test, maxlen=maxlen)
    prediction = model.predict([input_id, segment_id, mask_input])
    prediction[prediction >= aml_threshold] = 1
    prediction[prediction < aml_threshold] = 0
    
    return prediction

In [31]:
def predict_sentences(test, people_list, stick, threshold, tokenizer=tokenizer, maxlen=maxlen):
    
    time = datetime.now()
    
    AML = pd.DataFrame(columns=['news_id', 'Name', 'Sentence'])
    aml_highrisk = np.asarray(test['content'][test['prediction'] == 1])
    news_ids = np.asarray(test['news_id'][test['prediction'] == 1])
    
    for k, (news_id, y_news) in enumerate(zip(news_ids ,aml_highrisk)): 
        # 用，。？！切分句子
        news = re.split('，|。|？|！', y_news)

        for i in range(len(people_list[k])):
            # 找出人名存在的 news index
            index = [index for index, _ in enumerate(news) if people_list[k][i] in _]                           
            
            # 刪除前後句中有出現其他姓名的句子 (XX，陳水扁貪汙，吳淑珍也是 -> XX，陳水扁貪汙，也是)
            # 刪除同一句中有出現其他姓名的字 (XX，陳致中是陳水扁的兒子，XX -> XX，是陳水扁的兒子，XX)
            for j in index:
                
                name = [name for name in people_list[k] if name != people_list[k][i]]
                new_news = re.sub('|'.join(name), '', news[j])
                
                if j == 0:
                    new_news2 = re.sub('|'.join(name), '', news[j+1])    
                    sentences = new_news + '，' + new_news2
            
                elif j+1 == len(news):
                    new_news3 = re.sub('|'.join(name), '', news[j-1]) 
                    sentences = new_news3 + '，' + new_news
            
                else:
                    new_news2 = re.sub('|'.join(name), '', news[j+1]) 
                    new_news3 = re.sub('|'.join(name), '', news[j-1]) 
                    sentences = new_news3 + '，' + new_news + '，' + new_news2

            
                    
                AML = AML.append(pd.DataFrame([[news_ids[k] ,people_list[k][i], sentences]], columns=AML.columns))
                
    print('1.提取句子', datetime.now() - time)
    time = datetime.now()
    
    #若 stick==True 則把多筆同姓名句子以逗點合併 (效果不好)
    if stick:
        AML = AML.groupby(['news_id', 'Name'])['Sentence'].apply('，'.join).reset_index()
    
    #姓氏表
    first_name = [
        '申', '龔', '馮', '昌', '劉', '習', '陽', '顧', '鍾', '胡', '許', '魏',
        '傅', '季', '扶', '柳', '狄', '焦', '封', '李', '羿', '刁', '和', '邴',
        '陸', '王', '杜', '能', '侯', '伍', '平', '竺', '樂', '繆', '欒', '湛',
        '道', '花', '賴', '浦', '萬', '章', '宮', '勾', '邵', '印', '夏', '杭',
        '溥', '左', '池', '公', '閻', '符', '奚', '臧', '羅', '空', '璩', '巴',
        '酈', '范', '談', '金', '顏', '慎', '郭', '僪', '聞', '車', '闞', '相',
        '童', '雙', '方', '莊', '容', '姚', '田', '薛', '閔', '翟', '簡',
        '蔚', '茹', '淩', '戴', '余', '鞏', '房', '富', '牛', '饒', '計', '居',
        '後', '舒', '席', '翁', '祝', '鬱', '訾', '隆', '匡', '弘', '曆', '範',
        '越', '趙', '卻', '岑', '隗', '冷', '張', '山', '松', '柯', '嵇', '韓',
        '蕭', '褚', '殳', '滕', '滿', '洪', '荀', '庾', '廖', '盧', '危', '竇',
        '曾', '郎', '遊', '穀', '慕', '禹', '凌', '廉', '邢', '梁', '葉',
        '郝', '終', '齊', '藺', '曹', '全', '高', '樊', '史', '桂', '廣', '段',
        '江', '餘', '袁', '弓', '牧', '魚', '儲', '尚', '逄', '尹', '通', '懷',
        '皮', '何', '倪', '包', '晁', '涂', '蓬', '屠', '巫', '須', '巢', '卞',
        '楊', '成', '孟', '楚', '呂', '古', '毋', '伊', '賁', '喻', '糜',
        '蔔', '艾', '藍', '龐', '諸', '別', '任', '管', '冀', '壽', '惠', '梅',
        '孫', '從', '康', '常', '駱', '鞠', '沈', '黨', '沙', '鳳', '郁', '邊',
        '仰', '溫', '路', '逮', '賀', '雷', '鈄', '明', '裴', '滑', '毛', '費',
        '關', '時', '步', '麴', '裘', '蒲', '司', '查', '錢', '盛', '霍', '鮑',
        '彭', '龍', '沃', '單', '勞', '秋', '祖', '殷', '茅', '敖', '郗', '石',
        '鐘', '嚴', '畢', '燕', '姜', '經', '程', '厙', '柏', '汪', '婁', '胥',
        '聶', '邰', '桑', '辛', '扈', '穆', '仲', '紅', '項', '師', '桓', '黃',
        '堵', '貢', '詹', '朱', '蔡', '戈', '于', '甄', '束', '屈', '索', '晏',
        '阮', '魯', '虞', '歐', '濮', '俞', '黎', '文', '應', '姬', '貝', '籍',
        '莘', '戚', '鄭', '郜', '景', '宋', '宗', '昝', '卓', '蒯', '馬', '顔',
        '蘇', '衛', '東', '瞿', '蒼', '莫', '邱', '潘', '家', '林', '芮', '麻',
        '元', '武', '強', '鈕', '陳', '井', '於', '游', '耿', '柴', '荊', '韶',
        '易', '宿', '施', '鹹', '秦', '班', '甯', '汲', '酆', '暴', '尤',
        '祿', '苗', '權', '仇', '都', '羊', '榮', '陶', '支', '賈', '白', '葛',
        '暨', '解', '靳', '伏', '唐', '華', '吉', '融', '豐', '安', '衡', '那',
        '闕', '俄', '盍', '鄔', '蒙', '利', '鄂', '謝', '宓', '湯', '喬', '孔',
        '養', '紀', '幹', '牟', '連', '宰', '蔣', '雍', '益', '寇', '祁', '熊',
        '崔', '丁', '薊', '譚', '吳', '烏', '周', '農', '徐', '充', '向', '宦',
        '董', '甘', '冉', '韋', '米', '鄒', '鄧', '戎', '水'
    ]
    
    # 把指向同一人的姓名改成一樣（陳男 -> 陳水扁），若指向多人則不改（陳男 -> 陳致中、陳水扁）
    # 將預測不完整的名字回填（王音 -> 王音之）
    name_list = []
    for ids in AML['news_id'].unique():
        full_name = [name for name in AML[(AML['news_id'] == ids)]['Name']]
        full_3name = [name for name in AML[(AML['news_id'] == ids)]['Name'] if len(name) == 3]
        
        a = Counter([name[0] for name in full_3name])
        keep = [k for k,v in a.items() if v == 1]
        full_3name_filter = [name for name in full_3name if name[0] in keep]
        name_dict = dict((name[0], name) for name in full_3name_filter)   # ex: {'陳' : '陳水扁'}

        name_dict_2 = dict(zip([name[0:2] for name in full_3name], full_3name))  # ex: {'王音': '王音之'}
        
        for name in full_name:
            if (name[0] in name_dict.keys()) & (len(name) == 1):
                name_list.append(name_dict.get(name[0]))
            elif (name[0] in name_dict.keys()) & (len(name) == 2) & (name[-1] in ['男', '嫌', '婦', '夫', '某', '女', '妻',\
                                                                                  '員', '稱', '家', '哥', '媽', '生', '處',\
                                                                                  '和', '揆', '要', '再', '董', '涉', '母',\
                                                                                  '辱', '公', '少', '為', '指', '翁', '粉',\
                                                                                  '趁', '仔', '依', '氏', '父']):
                name_list.append(name_dict.get(name[0]))
            elif (name in name_dict_2.keys()) & (len(name) == 2):
                name_list.append(name_dict_2.get(name))
            else:
                name_list.append(name)
                
    print('2.整理名字', datetime.now() - time)
    time = datetime.now()
    
    # 排除重複資料、排除一字、兩字簡稱、兩字三字四字姓不在姓氏表中的人
    AML['Name'] = name_list
    AML = AML.drop_duplicates()
    AML = AML[AML['Name'].apply(lambda x: (len(x) > 1) )]
    AML = AML[~AML['Name'].apply(lambda x: (len(x) == 2) & (x[1] in ['男', '嫌', '婦', '夫', '某', '女', '妻',\
                                                                     '員', '稱', '家', '哥', '媽', '生', '處',\
                                                                     '和', '揆', '要', '再', '董', '涉', '母',\
                                                                     '辱', '公', '少', '為', '指', '翁', '粉',\
                                                                     '趁', '仔', '依', '氏', '父']))]
    AML = AML[AML['Name'].apply(lambda x: (len(x) > 2) | ((len(x) < 3) & (x[0] in first_name)))]
    AML = AML[~AML['Name'].apply(lambda x: (x[0] not in first_name) & (len(x) in (4,3,2)) )]

    print('3.刪除名字', datetime.now() - time)
    time = datetime.now()
    
    # 第二階段 預測句子
    input_id, segment_id, mask_input = encoded(tokenizer, AML, maxlen=256)
    prediction = model.predict([input_id, segment_id, mask_input])
    AML['prediction'] = np.round(prediction, 3) 
    
    # 同一人只要有一筆資料大於閥值（max），則預測為 aml 人物；若新聞中無人大於閥值，則為非 aml 新聞
    AML['prediction'] = AML['prediction'].apply(lambda x: 0 if x < threshold else 1)
    AML = AML.groupby(['news_id', 'Name'])['prediction'].max().reset_index()
    AML = AML[AML['prediction'] == 1]
    AML = AML.groupby(['news_id','prediction'])['Name'].apply(list).reset_index()
    
    print('4.預測名字', datetime.now() - time)
    
    return AML

In [32]:
def f1_score(a, b):
    
    if (len(a) != 0) & (len(b) != 0):
        recall = float(len(set(a) & set(b)) / len(a))
        pecision = float(len(set(a) & set(b)) / len(b))
        score = 2 / (np.reciprocal(recall) + np.reciprocal(pecision))
        return score
    elif (len(a) == 0) & (len(b) == 0):
        return 1
    else:
        return 0

# 4. 網格搜索

### 4-1 [NER法]

In [33]:
def GridSearch_ner(df, model, test, aml_threshold, stick, threshold):
    
    time = datetime.now()
    
    # 1. 預測是否疑似 aml
    prediction = predict_aml(model, test=test, aml_threshold=aml_threshold)
    test['prediction'] = prediction
    aml_highrisk = test[test['prediction'] == 1]
    
    # 2. 將超過 512 的句子以句點拆成多句分段預測
    test_ner = aml_highrisk.drop(['name'], axis=1)
    data_less = test_ner[test_ner['content'].str.len() <= 512]
    data_more = test_ner[test_ner['content'].str.len() > 512]
    data_more_split = split_content(data_more)
    test_ner = test_ner.append(data_more_split)
    test_ner = test_ner.reset_index().drop(['index'], axis=1)
    
    # 3. NER 預測人名
    input_id, segment_id, mask_input = encoded(tokenizer, test_ner, maxlen=maxlen_ner)
    prediction = ner_model.predict([input_id, segment_id, mask_input])
    y_pred = np.argmax(prediction, axis=-1)
    people_list = get_name(input_id, y_pred)
    
    
    # 4. 將拆開的句子組合回去
    test_ner['people_list'] = people_list
    content = test_ner[['news_id', 'content', 'aml_label', 'prediction']]
    content = content.groupby(['news_id', 'aml_label', 'prediction'])['content'].apply(lambda x : '。'.join(x)).reset_index()
    people = test_ner[['news_id', 'aml_label', 'prediction', 'people_list']]
    people = people.groupby(['news_id', 'aml_label', 'prediction'])['people_list'].agg(sum).reset_index()
    people['people_list'] = [list(set(people)) for people in people['people_list']]
    test_ner = pd.merge(content, people, on=['news_id', 'aml_label', 'prediction'], how='left')
    
    # 5. 將 [UNK], [PAD] 轉換回來 (王春? -> 王春甡)
    for _, row in test_ner.iterrows():
        for i, name in enumerate(row['people_list']):
            if ('?' in name) | ('!' in name):
                reexp = name.replace('?', '.').replace('!', '.')
                row['people_list'][i] = re.search(reexp, row['content']).group()
    
    
    print('0.CKIP', datetime.now() - time)
    
    # 6. 判斷名字前後句使是否為 aml
    AML = predict_sentences(test_ner, list(test_ner['people_list']), tokenizer=tokenizer, maxlen=maxlen, stick=stick, threshold=threshold)
    
    test_prediction = pd.merge(test, AML[['news_id', 'Name']], on='news_id', how='left')
    test_prediction['Name'] = test_prediction['Name'].apply(lambda x: x if isinstance(x, list) else [])
    test_prediction['text_prediction'] = test_prediction['Name'].apply(lambda x: 0 if x == [] else 1)
    test_prediction = test_prediction.drop(['content'],axis = 1)
    test_prediction.columns = ['news_id', 'name', 'label', 'AML_prediction', 'Name_prediction', 'text_prediction']
    
    # 7. 算分數
    score = []
    for i in range(len(test_prediction)):
        temp = f1_score(test_prediction['name'][i], test_prediction['Name_prediction'][i])
        score.append(temp)
        
    test_prediction['f1_score'] = score    
    total_score = sum(score)
    aml_score = sum(test_prediction[test_prediction['label'] == 1]['f1_score'])    
    
    df = df.append(pd.DataFrame([[aml_threshold, threshold, stick, total_score, aml_score]], columns=df.columns))
    
    print('aml_threshold =', aml_threshold, 'stick =', stick, 'threshold =', threshold, 'total_score =', total_score, 'aml_score =', aml_score)

    return df

In [34]:
df = pd.DataFrame(columns=['aml_threshold', 'threshold', 'stick', 'total_score', 'aml_score'])

In [35]:
for aml_threshold in [0.2, 0.3, 0.4, 0.5]:
    for threshold in [0.3, 0.4, 0.5, 0.6]:
        for stick in [False]:        
            df = GridSearch_ner(df, model, test, aml_threshold=aml_threshold, stick=stick, threshold=threshold)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


0.CKIP 0:00:23.768912
1.提取句子 0:00:01.388930
2.整理名字 0:00:00.059773
3.刪除名字 0:00:00.004590
4.預測名字 0:00:09.479925
aml_threshold = 0.2 stick = False threshold = 0.3 total_score = 956.7968253968252 aml_score = 60.796825396825376
0.CKIP 0:00:21.562784
1.提取句子 0:00:01.310155
2.整理名字 0:00:00.060914
3.刪除名字 0:00:00.006980
4.預測名字 0:00:09.480315
aml_threshold = 0.2 stick = False threshold = 0.4 total_score = 956.6788766788766 aml_score = 60.678876678876655
0.CKIP 0:00:21.580583
1.提取句子 0:00:01.318838
2.整理名字 0:00:00.060882
3.刪除名字 0:00:00.006982
4.預測名字 0:00:09.494191
aml_threshold = 0.2 stick = False threshold = 0.5 total_score = 957.01221001221 aml_score = 60.01221001220999


  


0.CKIP 0:00:21.568695
1.提取句子 0:00:01.323399
2.整理名字 0:00:00.059878
3.刪除名字 0:00:00.006982
4.預測名字 0:00:09.483897
aml_threshold = 0.2 stick = False threshold = 0.6 total_score = 957.171335200747 aml_score = 60.171335200746945
0.CKIP 0:00:21.039716
1.提取句子 0:00:01.275818
2.整理名字 0:00:00.057968
3.刪除名字 0:00:00.007980
4.預測名字 0:00:08.994437
aml_threshold = 0.3 stick = False threshold = 0.3 total_score = 958.0468253968252 aml_score = 60.046825396825376
0.CKIP 0:00:21.011572
1.提取句子 0:00:01.260113
2.整理名字 0:00:00.056714
3.刪除名字 0:00:00.007978
4.預測名字 0:00:08.980761
aml_threshold = 0.3 stick = False threshold = 0.4 total_score = 957.9288766788766 aml_score = 59.928876678876655
0.CKIP 0:00:21.074110
1.提取句子 0:00:01.260015
2.整理名字 0:00:00.057352
3.刪除名字 0:00:00.006981
4.預測名字 0:00:09.002558
aml_threshold = 0.3 stick = False threshold = 0.5 total_score = 957.26221001221 aml_score = 59.26221001220999
0.CKIP 0:00:21.023408
1.提取句子 0:00:01.280817
2.整理名字 0:00:00.056821
3.刪除名字 0:00:00.006981
4.預測名字 0:00:09.004458
am

### 4-2 [CKIP法]

In [36]:
def ckip(test):
    
    people_list = []
    for aml_news in test['content']:
        
        # 分詞
        word_sentence_list = ws([aml_news],
                    sentence_segmentation=True,
                    segment_delimiter_set={'?', '？', '!', '！', '。', ',', '，', ';', ':', '、'})
        pos_sentence_list = pos(word_sentence_list)
        entity_sentence_list = ner(word_sentence_list, pos_sentence_list)
        
        # 只取人名
        people = [people for people in list(entity_sentence_list[0]) if (people[2] == 'PERSON')]
        people = [person for person in [person[3] for person in people] if (len(person) < 5) | ('．' in person)]
        people = list(set(people))
        people_list.append(people)
        
    return people_list

In [37]:
def GridSearch(df2, model, test, aml_threshold, stick, threshold):
    time = datetime.now()
    
    # 1. 預測是否疑似 aml
    prediction = predict_aml(model, test, aml_threshold=aml_threshold)
    test['prediction'] = prediction
    aml_highrisk = test[test['prediction'] == 1]
    
    # 2. CKIP 預測人名
    people_list = ckip(aml_highrisk)
    
    print('0.CKIP', datetime.now() - time)
    
    # 3. 判斷名字前後句使是否為 aml
    AML = predict_sentences(test, people_list, tokenizer=tokenizer, maxlen=maxlen, stick=stick, threshold=threshold)
    
    test_prediction = pd.merge(test, AML[['news_id', 'Name']], on='news_id', how='left')
    test_prediction['Name'] = test_prediction['Name'].apply(lambda x: x if isinstance(x, list) else [])
    test_prediction['text_prediction'] = test_prediction['Name'].apply(lambda x: 0 if x == [] else 1)
    test_prediction = test_prediction.drop(['content'],axis = 1)
    test_prediction.columns = ['news_id', 'name', 'label', 'AML_prediction', 'Name_prediction', 'text_prediction']
    
    # 4. 算分數
    score = []
    for i in range(len(test_prediction)):
        temp = f1_score(test_prediction['name'][i], test_prediction['Name_prediction'][i])
        score.append(temp)
        
    test_prediction['f1_score'] = score    
    total_score = sum(score)
    aml_score = sum(test_prediction[test_prediction['label'] == 1]['f1_score'])    

    
    df2 = df2.append(pd.DataFrame([[aml_threshold, threshold, stick, total_score, aml_score]], columns=df.columns))
    print('aml_threshold =', aml_threshold, 'stick =', stick, 'threshold =', threshold, 'total_score =', total_score, 'aml_score =', aml_score)

    return df2

In [38]:
df2 = pd.DataFrame(columns=['aml_threshold', 'threshold', 'stick', 'total_score', 'aml_score'])

In [39]:
for aml_threshold in [0.2, 0.3, 0.4, 0.5]:
    for threshold in [0.3, 0.4, 0.5, 0.6]:
        for stick in [False]:        
            df2 = GridSearch(df2, model, test, aml_threshold=aml_threshold, stick=stick, threshold=threshold)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


0.CKIP 0:01:16.723094
1.提取句子 0:00:01.714970
2.整理名字 0:00:00.061341
3.刪除名字 0:00:00.008976
4.預測名字 0:00:11.913243
aml_threshold = 0.2 stick = False threshold = 0.3 total_score = 955.5583743842362 aml_score = 61.55837438423644


  


0.CKIP 0:01:16.823463
1.提取句子 0:00:01.624652
2.整理名字 0:00:00.060837
3.刪除名字 0:00:00.009973
4.預測名字 0:00:11.918961
aml_threshold = 0.2 stick = False threshold = 0.4 total_score = 955.616183242045 aml_score = 61.61618324204529
0.CKIP 0:01:16.584803
1.提取句子 0:00:01.623785
2.整理名字 0:00:00.060649
3.刪除名字 0:00:00.002099
4.預測名字 0:00:11.908833
aml_threshold = 0.2 stick = False threshold = 0.5 total_score = 955.1495165753785 aml_score = 61.14951657537863
0.CKIP 0:01:16.720413
1.提取句子 0:00:01.651036
2.整理名字 0:00:00.061956
3.刪除名字 0:00:00.010002
4.預測名字 0:00:11.911002
aml_threshold = 0.2 stick = False threshold = 0.6 total_score = 955.3605898158636 aml_score = 61.36058981586363
0.CKIP 0:01:12.089171
1.提取句子 0:00:01.557338
2.整理名字 0:00:00.058842
3.刪除名字 0:00:00.008982
4.預測名字 0:00:11.099192
aml_threshold = 0.3 stick = False threshold = 0.3 total_score = 955.6694854953473 aml_score = 60.66948549534755
0.CKIP 0:01:12.637532
1.提取句子 0:00:01.561191
2.整理名字 0:00:00.058873
3.刪除名字 0:00:00.004768
4.預測名字 0:00:11.097317
aml

In [40]:
df

Unnamed: 0,aml_threshold,threshold,stick,total_score,aml_score
0,0.2,0.3,False,956.796825,60.796825
0,0.2,0.4,False,956.678877,60.678877
0,0.2,0.5,False,957.01221,60.01221
0,0.2,0.6,False,957.171335,60.171335
0,0.3,0.3,False,958.046825,60.046825
0,0.3,0.4,False,957.928877,59.928877
0,0.3,0.5,False,957.26221,59.26221
0,0.3,0.6,False,957.421335,59.421335
0,0.4,0.3,False,958.046825,60.046825
0,0.4,0.4,False,957.928877,59.928877


In [41]:
df2

Unnamed: 0,aml_threshold,threshold,stick,total_score,aml_score
0,0.2,0.3,False,955.558374,61.558374
0,0.2,0.4,False,955.616183,61.616183
0,0.2,0.5,False,955.149517,61.149517
0,0.2,0.6,False,955.36059,61.36059
0,0.3,0.3,False,955.669485,60.669485
0,0.3,0.4,False,955.727294,60.727294
0,0.3,0.5,False,955.260628,60.260628
0,0.3,0.6,False,955.471701,60.471701
0,0.4,0.3,False,955.669485,60.669485
0,0.4,0.4,False,955.727294,60.727294


In [42]:
pd.merge(df, df2, on=['aml_threshold', 'threshold', 'stick'], how='left')

Unnamed: 0,aml_threshold,threshold,stick,total_score_x,aml_score_x,total_score_y,aml_score_y
0,0.2,0.3,False,956.796825,60.796825,955.558374,61.558374
1,0.2,0.4,False,956.678877,60.678877,955.616183,61.616183
2,0.2,0.5,False,957.01221,60.01221,955.149517,61.149517
3,0.2,0.6,False,957.171335,60.171335,955.36059,61.36059
4,0.3,0.3,False,958.046825,60.046825,955.669485,60.669485
5,0.3,0.4,False,957.928877,59.928877,955.727294,60.727294
6,0.3,0.5,False,957.26221,59.26221,955.260628,60.260628
7,0.3,0.6,False,957.421335,59.421335,955.471701,60.471701
8,0.4,0.3,False,958.046825,60.046825,955.669485,60.669485
9,0.4,0.4,False,957.928877,59.928877,955.727294,60.727294


# 5. NER, CKIP比較 (待整理)

In [43]:
# 預測是否疑似aml
prediction = predict_aml(model, test, aml_threshold=0.4)
test['prediction'] = prediction
aml_highrisk = test[test['prediction'] == 1]

# 將超過512的句子拆成多句分段預測
test_ner = aml_highrisk.drop(['name'], axis=1)
data_less = test_ner[test_ner['content'].str.len() <= 512]
data_more = test_ner[test_ner['content'].str.len() > 512]
data_more_split = split_content(data_more)
test_ner = test_ner.append(data_more_split)
test_ner = test_ner.reset_index().drop(['index'], axis=1)

# 預測人名
input_id, segment_id, mask_input = encoded(tokenizer, test_ner, maxlen=maxlen_ner)
prediction = ner_model.predict([input_id, segment_id, mask_input])
y_pred = np.argmax(prediction, axis=-1)
people_list = get_name(input_id, y_pred)

# 將拆開的句子組合回去
test_ner['people_list'] = people_list
content = test_ner[['news_id', 'content', 'aml_label', 'prediction']]
content = content.groupby(['news_id', 'aml_label', 'prediction'])['content'].apply(lambda x : '。'.join(x)).reset_index()
people = test_ner[['news_id', 'aml_label', 'prediction', 'people_list']]
people = people.groupby(['news_id', 'aml_label', 'prediction'])['people_list'].agg(sum).reset_index()
people['people_list'] = [list(set(people)) for people in people['people_list']]
test_ner = pd.merge(content, people, on=['news_id', 'aml_label', 'prediction'], how='left')

# 將 [UNK], [PAD] 轉換回來
for _, row in test_ner.iterrows():
    for i, name in enumerate(row['people_list']):
        if ('?' in name) | ('!' in name):
            reexp = name.replace('?', '.').replace('!', '.')
            row['people_list'][i] = re.search(reexp, row['content']).group()
            
AML = pd.DataFrame(columns=['news_id', 'Name', 'Sentence'])
aml_highrisk = np.asarray(test['content'][test['prediction'] == 1])
news_ids = np.asarray(test['news_id'][test['prediction'] == 1])

for k, (news_id, y_news) in enumerate(zip(news_ids ,aml_highrisk)): 
    #用，。？切分句子
    news = re.split('，|。|？|！', y_news)

    for i in range(len(people_list[k])):
        #找出人名存在的news index
        index = [index for index, _ in enumerate(news) if people_list[k][i] in _]                           

        #刪除前後句中有出現其他姓名的句子 (XX，陳水扁貪汙，吳淑珍也是 -> XX，陳水扁貪汙，也是)

        for j in index:

            name = [name for name in people_list[k] if name != people_list[k][i]]
            new_news = re.sub('|'.join(name), '', news[j])

            if j == 0:
                new_news2 = re.sub('|'.join(name), '', news[j+1])    
                sentences = new_news + '，' + new_news2

            elif j+1 == len(news):
                new_news3 = re.sub('|'.join(name), '', news[j-1]) 
                sentences = new_news3 + '，' + new_news

            else:
                new_news2 = re.sub('|'.join(name), '', news[j+1]) 
                new_news3 = re.sub('|'.join(name), '', news[j-1]) 
                sentences = new_news3 + '，' + new_news + '，' + new_news2



            AML = AML.append(pd.DataFrame([[news_ids[k] ,people_list[k][i], sentences]], columns=AML.columns))


#若 stick==True 則把多筆同姓名句子以逗點合併
if stick:
    AML = AML.groupby(['news_id', 'Name'])['Sentence'].apply('，'.join).reset_index()

#姓氏表
first_name = [
    '申', '龔', '馮', '昌', '劉', '習', '陽', '顧', '鍾', '胡', '許', '魏',
    '傅', '季', '扶', '柳', '狄', '焦', '封', '李', '羿', '刁', '和', '邴',
    '陸', '王', '杜', '能', '侯', '伍', '平', '竺', '樂', '繆', '欒', '湛',
    '道', '花', '賴', '浦', '萬', '章', '宮', '勾', '邵', '印', '夏', '杭',
    '溥', '左', '池', '公', '閻', '符', '奚', '臧', '羅', '空', '璩', '巴',
    '酈', '范', '談', '金', '顏', '慎', '郭', '僪', '聞', '車', '闞', '相',
    '童', '雙', '方', '莊', '容', '姚', '田', '薛', '閔', '翟', '簡',
    '蔚', '茹', '淩', '戴', '余', '鞏', '房', '富', '牛', '饒', '計', '居',
    '後', '舒', '席', '翁', '祝', '鬱', '訾', '隆', '匡', '弘', '曆', '範',
    '越', '趙', '卻', '岑', '隗', '冷', '張', '山', '松', '柯', '嵇', '韓',
    '蕭', '褚', '殳', '滕', '滿', '洪', '荀', '庾', '廖', '盧', '危', '竇',
    '曾', '郎', '遊', '穀', '慕', '禹', '凌', '廉', '邢', '梁', '葉',
    '郝', '終', '齊', '藺', '曹', '全', '高', '樊', '史', '桂', '廣', '段',
    '江', '餘', '袁', '弓', '牧', '魚', '儲', '尚', '逄', '尹', '通', '懷',
    '皮', '何', '倪', '包', '晁', '涂', '蓬', '屠', '巫', '須', '巢', '卞',
    '楊', '成', '孟', '楚', '呂', '古', '毋', '伊', '賁', '喻', '糜',
    '蔔', '艾', '藍', '龐', '諸', '別', '任', '管', '冀', '壽', '惠', '梅',
    '孫', '從', '康', '常', '駱', '鞠', '沈', '黨', '沙', '鳳', '郁', '邊',
    '仰', '溫', '路', '逮', '賀', '雷', '鈄', '明', '裴', '滑', '毛', '費',
    '關', '時', '步', '麴', '裘', '蒲', '司', '查', '錢', '盛', '霍', '鮑',
    '彭', '龍', '沃', '單', '勞', '秋', '祖', '殷', '茅', '敖', '郗', '石',
    '鐘', '嚴', '畢', '燕', '姜', '經', '程', '厙', '柏', '汪', '婁', '胥',
    '聶', '邰', '桑', '辛', '扈', '穆', '仲', '紅', '項', '師', '桓', '黃',
    '堵', '貢', '詹', '朱', '蔡', '戈', '于', '甄', '束', '屈', '索', '晏',
    '阮', '魯', '虞', '歐', '濮', '俞', '黎', '文', '應', '姬', '貝', '籍',
    '莘', '戚', '鄭', '郜', '景', '宋', '宗', '昝', '卓', '蒯', '馬', '顔',
    '蘇', '衛', '東', '瞿', '蒼', '莫', '邱', '潘', '家', '林', '芮', '麻',
    '元', '武', '強', '鈕', '陳', '井', '於', '游', '耿', '柴', '荊', '韶',
    '易', '宿', '施', '鹹', '秦', '班', '甯', '汲', '酆', '暴', '尤',
    '祿', '苗', '權', '仇', '都', '羊', '榮', '陶', '支', '賈', '白', '葛',
    '暨', '解', '靳', '伏', '唐', '華', '吉', '融', '豐', '安', '衡', '那',
    '闕', '俄', '盍', '鄔', '蒙', '利', '鄂', '謝', '宓', '湯', '喬', '孔',
    '養', '紀', '幹', '牟', '連', '宰', '蔣', '雍', '益', '寇', '祁', '熊',
    '崔', '丁', '薊', '譚', '吳', '烏', '周', '農', '徐', '充', '向', '宦',
    '董', '甘', '冉', '韋', '米', '鄒', '鄧', '戎', '水'
]

#把指向同一人的姓名改成一樣（陳男 -> 陳水扁），若指向多人則不改（陳男 -> 陳致中、陳水扁）
name_list = []
for ids in AML['news_id'].unique():
    full_name = [name for name in AML[(AML['news_id'] == ids)]['Name']]
    full_3name = [name for name in AML[(AML['news_id'] == ids)]['Name'] if len(name) == 3]

    a = Counter([name[0] for name in full_3name])
    keep = [k for k,v in a.items() if v == 1]
    full_3name_filter = [name for name in full_3name if name[0] in keep]
    name_dict = dict((name[0], name) for name in full_3name_filter)   # '陳' : '陳水扁'

    name_dict_2 = dict(zip([name[0:2] for name in full_3name], full_3name))  # '王音': '王音之'

    for name in full_name:
        if (name[0] in name_dict.keys()) & (len(name) == 1):
            name_list.append(name_dict.get(name[0]))
        elif (name[0] in name_dict.keys()) & (len(name) == 2) & (name[-1] in ['男', '嫌', '婦', '夫', '某', '女', '妻',\
                                                                              '員', '稱', '家', '哥', '媽', '生', '處',\
                                                                              '和', '揆', '要', '再', '董', '涉', '母',\
                                                                              '辱', '公', '少', '為', '指', '翁', '粉',\
                                                                              '趁', '仔', '依', '氏', '父']):
            name_list.append(name_dict.get(name[0]))
        elif (name in name_dict_2.keys()) & (len(name) == 2):
            name_list.append(name_dict_2.get(name))
        else:
            name_list.append(name)

#排除重複資料、排除一字、兩字簡稱、兩字三字四字姓不在姓氏表中的人
AML['Name'] = name_list
AML = AML.drop_duplicates()
AML = AML[AML['Name'].apply(lambda x: (len(x) > 1) )]
AML = AML[~AML['Name'].apply(lambda x: (len(x) == 2) & (x[1] in ['男', '嫌', '婦', '夫', '某', '女', '妻',\
                                                                 '員', '稱', '家', '哥', '媽', '生', '處',\
                                                                 '和', '揆', '要', '再', '董', '涉', '母',\
                                                                 '辱', '公', '少', '為', '指', '翁', '粉',\
                                                                 '趁', '仔', '依', '氏', '父']))]
AML = AML[AML['Name'].apply(lambda x: (len(x) > 2) | ((len(x) < 3) & (x[0] in first_name)))]
AML = AML[~AML['Name'].apply(lambda x: (x[0] not in first_name) & (len(x) in (4,3,2)) )]


#第二階段 預測句子
input_id, segment_id, mask_input = encoded(tokenizer, AML, maxlen=maxlen)
prediction = model.predict([input_id, segment_id, mask_input])
AML['raw_prediction'] = np.round(prediction, 3)


AML['prediction'] = AML['raw_prediction'].apply(lambda x: 0 if x < 0.7 else 1)
AML = AML.groupby(['news_id', 'Name'])['raw_prediction', 'prediction'].max().reset_index()
AML = AML[AML['prediction'] == 1]
AMLName = AML.groupby(['news_id','prediction'])['Name'].apply(list).reset_index()
AMLPrediction = AML.groupby(['news_id'])['raw_prediction'].apply(list).apply(lambda x: np.round(x,2)).reset_index()
AMLloss = pd.merge(AMLName, AMLPrediction, how='left', on = 'news_id')
AML = pd.merge(test, AMLloss, how='left', on='news_id').drop(['content', 'prediction_y'], axis=1)

final2 = AML[AML['aml_label'] == 1]

pd.set_option('display.max_rows', 100)

compare = pd.merge(final2, test_ner[['news_id', 'people_list']], on='news_id', how='left')
prediction = predict_aml(model, test=test, aml_threshold=aml_threshold)
test['prediction'] = prediction
aml_highrisk = test[test['prediction'] == 1]
people_list = ckip(aml_highrisk)
aml_highrisk['ckip_prediction'] = people_list
final = pd.merge(compare, aml_highrisk[['news_id', 'ckip_prediction']], on='news_id', how='left')
final.columns = ['news_id', 'label_name', 'aml_label', 'prediction', 'Ner_prediction', 'name_score', 'Ner_raw', 'ckip_raw']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [44]:
final

Unnamed: 0,news_id,label_name,aml_label,prediction,Ner_prediction,name_score,Ner_raw,ckip_raw
0,4056,"[林霙璋, 蔡景德, 齊德清]",1,1.0,"[蔡景德, 齊德清]","[0.98, 0.97]","[蔡, 蔡員, 林員, 齊德清, 林, 蔡景德, 齊員, 齊從輕, 林霙璋]","[蔡, 蔡員, 齊德清, 除齊, 林, 齊員處, 齊員, 蔡景德, 林員處, 蔡員處, 林霙璋]"
1,4101,"[陳志遇, 林玉華, 陳威廷, 陳富榮, 劉欣燕]",1,1.0,"[劉欣燕, 林玉華, 陳威廷, 陳富榮, 陳志遇]","[0.9, 1.0, 1.0, 0.99, 1.0]","[陳志遇, 林玉華, 陳富榮, 陳威廷, 劉欣燕]","[陳志遇, 林玉華, 陳富榮, 陳威廷, 劉欣燕]"
2,3076,[丁奕烜],1,1.0,,,"[李, 林女, 丁男, 丁奕烜, 李女]","[李, 丁奕烜, 林]"
3,4685,[鄭聖儒],1,1.0,[鄭聖儒],[1.0],"[蔡, 鄭聖儒, 徐, 徐女, 鄭男]","[蔡, 徐, 鄭聖儒, 鄭, 徐女, 鄭男]"
4,855,"[陳玟叡, 黃文鴻, 吳承霖]",1,1.0,"[吳承霖, 蔡英俊, 陳玟叡, 黃文鴻]","[1.0, 0.87, 1.0, 1.0]","[吳男, 孫, 吳, 吳承霖, 黃男, 蔡英俊, 陳玟叡, 黃文鴻]","[吳男, 黃, 孫, 吳承霖, 黃男, 吳, 嘉檢, 蔡英俊, 台北, 陳玟叡, 黃文鴻]"
5,1420,"[余依珊, 孫凱倫]",1,1.0,[余依珊],[0.74],"[余依珊, 余, 孫凱倫, 林]","[孫男, 林, 余, 余依珊, 孫凱倫]"
6,3131,"[胡原龍, 秦朝添]",1,1.0,"[王海濤, 秦朝添, 胡原龍, 鈕承澤]","[0.99, 1.0, 1.0, 0.98]","[秦朝添, 胡原龍, 胡, 王海濤, 鈕承澤]","[秦朝添, 胡原龍, 胡, 王海濤, 胡稱, 國磐, 胡受訪, 鈕承澤]"
7,2426,"[陳瑞禮, 陳文南]",1,1.0,"[陳文南, 陳瑞禮]","[1.0, 1.0]","[陳瑞禮, 陳, 陳文南]","[陳瑞禮, 陳, 陳文南]"
8,582,[阮文全],1,1.0,[阮文全],[0.99],"[阮文全, 阮, 鄭]","[阮, 阮文全, 鄭]"
9,4780,"[鄧超鴻, 道克明]",1,1.0,"[道克明, 鄧、道, 鄧超鴻]","[0.89, 0.99, 0.96]","[道克明, 鄧、道, 鄧超鴻, 鄧男]","[鄧, 道克明, 鄧男, 鄧超鴻]"
