In [1]:
import pandas as pd
import numpy as np
import ast
import re
import os
import pickle
import torch
import torch.nn.functional as F
from transformers import RobertaTokenizer
from transformers import BertTokenizer , BertConfig , BertModel , XLNetTokenizer, XLNetConfig , XLNetModel

from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.autograd import Variable
from torch import nn


In [2]:
def clean_string(content):
    """Strip layout whitespace and decorative bullet symbols from a text.

    Every newline, tab, ASCII space and non-breaking space (U+00A0) is
    removed first, then every ``●``, ``▼``, ``►`` and ``★`` character.

    Args:
        content: Text to clean.

    Returns:
        The cleaned text.
    """
    for whitespace in ('\n', '\t', ' ', '\xa0'):
        content = content.replace(whitespace, '')
    return re.sub("[●▼►★]", "", content)

def cut_sent(para):
    """Split a paragraph into sentences at Chinese end-of-sentence punctuation.

    A break is inserted after ``。！？?``, after six ASCII dots and after two
    ``…`` characters, unless the next character is a closing quote (``”’``).
    When a closing quote directly follows the terminator, the break goes
    after the quote instead, unless the quote is itself followed by one of
    ``，。！？?``.

    Args:
        para: Paragraph text.

    Returns:
        List of sentence strings, each keeping its trailing punctuation.
    """
    # Raw strings: "\?" and "\." are invalid escape sequences in ordinary
    # string literals and trigger warnings on recent Python versions.
    para = re.sub(r'([。！？?])([^”’])', r'\1\n\2', para)
    para = re.sub(r'(\.{6})([^”’])', r'\1\n\2', para)
    para = re.sub(r'(…{2})([^”’])', r'\1\n\2', para)
    para = re.sub(r'([。！？?][”’])([^，。！？?])', r'\1\n\2', para)
    return para.split("\n")

def combine_sentence(sentences, max_len=510):
    """Greedily pack consecutive sentences into chunks of bounded length.

    Sentences are appended to the current chunk while the chunk stays
    strictly shorter than ``max_len`` characters. A sentence that is itself
    ``max_len`` characters or longer is cut into pieces of at most
    ``max_len`` characters; its last piece starts the next chunk.

    Previously such an over-long sentence was only cut when the current
    chunk was empty; if it followed other text it was emitted whole and the
    resulting chunk exceeded the limit.

    Args:
        sentences: Iterable of sentence strings, in document order.
        max_len: Length limit in characters (default 510, the value that
            used to be hard-coded).

    Returns:
        List of non-empty chunk strings whose concatenation equals the
        concatenation of ``sentences``.

    Raises:
        ValueError: If ``max_len`` is smaller than 1.
    """
    if max_len < 1:
        raise ValueError('max_len must be a positive integer')

    chunks = []
    buffer = ''
    for sentence in sentences:
        if len(buffer) + len(sentence) < max_len:
            buffer += sentence
            continue

        # The sentence does not fit: flush what has been collected so far.
        if buffer:
            chunks.append(buffer)
            buffer = ''

        if len(sentence) < max_len:
            buffer = sentence
        else:
            # A single sentence that is too long is cut into fixed-size pieces.
            pieces = [sentence[i:i + max_len] for i in range(0, len(sentence), max_len)]
            buffer = pieces.pop()
            chunks.extend(pieces)

    if buffer:
        chunks.append(buffer)
    return chunks

class TestDataset(Dataset):
    """Dataset pairing each encoded row with the text it was encoded from.

    ``input_dict`` must provide ``'input_ids'``, ``'token_type_ids'`` and
    ``'attention_mask'``; ``text`` is indexed with the same positions.
    """

    def __init__(self, input_dict, text):
        # Pull the three encoder inputs out of the mapping, in a fixed order.
        self.input_ids, self.token_type_ids, self.attention_mask = (
            input_dict[key]
            for key in ('input_ids', 'token_type_ids', 'attention_mask')
        )
        self.text = text

    def __getitem__(self, idx):
        """Return ``(input_ids, token_type_ids, attention_mask, text)`` for row ``idx``."""
        return (
            self.input_ids[idx],
            self.token_type_ids[idx],
            self.attention_mask[idx],
            self.text[idx],
        )

    def __len__(self):
        """Number of rows, taken from ``input_ids``."""
        return len(self.input_ids)
    
# class posClassfication_new(nn.Module):
#     def __init__(self):
#         super(posClassfication_new, self).__init__()
#         self.start_task = nn.Sequential(
#             nn.Linear(768, 1),
#         )    
#         self.end_task = nn.Sequential(
#             nn.Linear(768, 1),
#         ) 
#         self.binary_task = nn.Sequential(
#             nn.Linear(768, 2),
#         )
        

# #             
#     def forward(self, start_x, end_x, pool_cls):
#         start_x = start_x.double()
#         end_x = end_x.double()
#         pool_cls = pool_cls.double()
    
#         start_out = self.start_task(start_x)
#         end_out = self.end_task(end_x)
#         binary_out = self.binary_task(pool_cls)
        
#         return start_out , end_out , binary_out
            

In [4]:
def xlnet_news_has_ans(news):
    """Run the fine-tuned XLNet binary classifier on one news article.

    The article is cleaned with ``clean_string``, encoded as a single
    sequence truncated/padded to 512 tokens and scored by an
    ``XLNetForSequenceClassification`` model with two labels whose weights
    are loaded from ``./TB_multispan/XLNet_fromseq_3.pkl``. Inference always
    runs on the CPU.

    Args:
        news: Raw article text.

    Returns:
        The predicted class index (0 or 1) as a NumPy integer scalar.
        Judging by the function name, 1 presumably means "the article
        contains an answer" — confirm against the training labels.
    """
    from transformers import XLNetForSequenceClassification, XLNetTokenizer

    lm_path = './chinese_xlnet_mid_pytorch/'
    check_point = './TB_multispan/XLNet_fromseq_3.pkl'
    NUM_LABELS = 2
    # The original code picked CUDA when available and then immediately
    # overrode the choice with 'cpu'; only the effective value is kept.
    device = torch.device('cpu')

    tokenizer = XLNetTokenizer.from_pretrained(lm_path)
    # truncation=True makes explicit what the tokenizer otherwise falls back
    # to with a warning when only max_length is given.
    input_dict = tokenizer.batch_encode_plus([clean_string(news)],
                                             add_special_tokens=True,
                                             max_length=512,
                                             truncation=True,
                                             return_special_tokens_mask=True,
                                             pad_to_max_length=True,
                                             return_tensors='pt')

    model = XLNetForSequenceClassification.from_pretrained(lm_path, num_labels=NUM_LABELS)
    # map_location lets a checkpoint saved from a GPU run load on this device.
    # NOTE: torch.load unpickles the file; only load trusted checkpoints.
    model.load_state_dict(torch.load(check_point, map_location=device))
    model = model.to(device)
    model.eval()

    # Kept from the original: changes tensor printing precision process-wide.
    torch.set_printoptions(precision=10)

    # Exactly one article is encoded, so the single row is fed directly.
    with torch.no_grad():
        outputs = model(input_ids=input_dict['input_ids'].to(device),
                        token_type_ids=input_dict['token_type_ids'].to(device),
                        attention_mask=input_dict['attention_mask'].to(device))

    pred = torch.argmax(torch.softmax(outputs[0], dim=-1), dim=-1)
    return pred.cpu().detach().numpy()[0]
        
    
    
    
    

In [30]:
news = '因巴拉圭東方市的「東方工業區」開發案控告前外交部長林宗俊誹謗的台商羅常軍，13日下午到台北地檢署出庭，羅常軍強調，他10日已經領到巴拉圭良民證，外交部指他在巴拉圭涉有訴訟，不但是胡說八道，更是刻意抹黑。 羅常軍是因外交部在2016年初給立法院的中央政府總預算審查的書面報告中，就「有效管理台巴（拉圭）工業區以促進國際合作經貿發展」案中，指「陳正平（原園區聘請管理經理）及羅常軍背信及詐欺案」仍在巴國法院進行，列入立法院公報並上網。  為此羅常軍行文外交部要求將公報自網路下架未果，因此提告前外文部長林宗俊等誹謗及偽造文書，總共在北檢有7個案件。   羅常軍明示，巴拉圭東方工業區開發案，就是國家坑害僑商的案子，所有文件證據都是齊備的，甚至包括機密及極機密文件。至於外交部指他在巴拉圭有詐欺及背信訴訟，羅常軍說，他10日已經領取巴拉圭的良民證，如果真如外交部所言，巴拉圭不可能核發良民證給他，顯然外交部是胡說八道刻意抹黑。'






In [31]:
# Run the XLNet classifier on the article currently stored in `news`;
# the returned class index is shown as the cell output.
xlnet_news_has_ans(news)

Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Some weights of the model checkpoint at ./chinese_xlnet_mid_pytorch/ were not used when initializing XLNetForSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a 

1

In [7]:
# NOTE(review): `news_have_answer` is not defined in any cell visible in this
# notebook, so this call presumably depends on leftover kernel state and would
# raise NameError after Restart & Run All — confirm where it is defined.
news_have_answer(news)

tensor([[0.6316, 0.3684]], grad_fn=<SoftmaxBackward>)


0

In [7]:
# Smoke test: run the split-and-average name verifier on the first row of the
# 2020-07-29 prediction file and print the names it keeps.
test_df = pd.read_csv('./tbrain/2020-07-29.csv')
different_ans = []  # never filled in this cell

for index,row in test_df.iterrows():
    print(row)
    # `ckip_name` is stored in the CSV as the repr of a Python list.
    ckip_name = ast.literal_eval(row['ckip_name'])
    # NOTE: this overwrites the notebook-level `news` variable.
    news = row['article']
    print(check_pred_name_is_real_ans_split_and_avg(ckip_name,news,4))
    # Only the first row is processed.
    break
    

article         檢調偵辦「三鑫集團」以投資俄羅斯賭場等名目吸金12億元案，發現三鑫集團負責人曾裕仁去年因債務...
binary                                                          1
ckip_name                             ['陳男', '謝發布', '王妤昆', '曾裕仁']
predict_name                                       ['王妤昆', '曾裕仁']
Name: 0, dtype: object


Some weights of the model checkpoint at ./bert_wwm_pretrain_tbrain/ were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were no

['王妤昆', '曾裕仁']


In [None]:
def eval(pred, ans):
    """Score one prediction against one answer with a set-based F1.

    NOTE: this name shadows Python's built-in ``eval`` for the whole notebook.

    Args:
        pred: Collection of predicted names.
        ans: Collection of ground-truth names.

    Returns:
        1 when both are empty, 0 when exactly one is empty or they share no
        element, otherwise the harmonic mean of precision and recall computed
        on the de-duplicated sets.
    """
    has_pred, has_ans = bool(pred), bool(ans)
    if has_pred != has_ans:
        return 0
    if not has_pred:
        # Both sides are empty: a correct "no answer" prediction.
        return 1

    pred_set, ans_set = set(pred), set(ans)
    overlap = len(pred_set & ans_set)
    if not overlap:
        return 0
    return 2 / (len(pred_set) / overlap + len(ans_set) / overlap)


def eval_all(pred_list, ans_list):
    """Average the per-item ``eval`` score over paired predictions and answers.

    Args:
        pred_list: Sequence of predictions, one per item.
        ans_list: Sequence of answers, the same length as ``pred_list``.

    Returns:
        Mean of ``eval(pred, ans)`` over all pairs.
    """
    total = len(pred_list)
    assert total == len(ans_list)
    scores = [eval(p, a) for p, a in zip(pred_list, ans_list)]
    return sum(scores) / total



In [6]:
def check_pred_name_is_real_ans_split_and_avg(pred_name_list, news, dataset, threshold=0.8):
    """Keep candidate names whose averaged positive-class probability is high.

    The article is cleaned, cut into sentences and packed into chunks shorter
    than 500 characters. Every (name, chunk) pair in which the name literally
    occurs in the chunk is encoded as a BERT sentence pair and scored by a
    two-label ``BertForSequenceClassification`` model. For each name the
    probability of label 1 is averaged over all chunks it occurs in, and the
    name is kept when that average exceeds ``threshold``.

    Fixes relative to the previous version:
      * returns ``[]`` instead of crashing (``batch_size=0``) when there are
        no candidates or no candidate occurs in the article;
      * over-long sentences are always split, and split pieces stay below the
        chunk limit, so no part of the article is discarded;
      * the checkpoint is loaded with ``map_location`` so a GPU-saved file
        also loads on a CPU-only machine.

    Args:
        pred_name_list: Candidate names (strings).
        news: Raw article text.
        dataset: Checkpoint selector. 1, 2 and 3 pick the corresponding
            per-dataset checkpoint; any other value picks the all-data one.
        threshold: Minimum averaged label-1 probability for a name to be
            kept. Defaults to 0.8, the value that used to be hard-coded.

    Returns:
        List of accepted names, ordered by first appearance in the scored
        batch (chunk order, then ``pred_name_list`` order).
    """
    if len(pred_name_list) == 0:
        return []

    from transformers import BertForSequenceClassification

    lm_path = './bert_wwm_pretrain_tbrain/'
    max_length = 500     # chunks must be strictly shorter than this (characters)
    max_seq_len = 512    # fixed model input length (tokens)
    NUM_LABELS = 2

    def pack_sentences(sentences, max_len):
        """Greedily join sentences into chunks strictly shorter than ``max_len``."""
        packed = []
        buffer = ''
        for sentence in sentences:
            if len(buffer) + len(sentence) < max_len:
                buffer += sentence
                continue
            if buffer:
                packed.append(buffer)
                buffer = ''
            if len(sentence) < max_len:
                buffer = sentence
            else:
                # A single over-long sentence: cut it into pieces that stay
                # below the limit so the length guard below never drops them.
                step = max_len - 1
                pieces = [sentence[i:i + step] for i in range(0, len(sentence), step)]
                buffer = pieces.pop()
                packed.extend(pieces)
        if buffer:
            packed.append(buffer)
        return packed

    tokenizer = BertTokenizer.from_pretrained(lm_path)
    chunks = pack_sentences(cut_sent(clean_string(news)), max_length)

    batch_input_ids = []
    batch_token_types = []
    batch_attention_mask = []
    batch_names = []
    for chunk in chunks:
        if len(chunk) >= max_length:
            # Safety net; not expected to trigger with pack_sentences above.
            print('error !!!! lenth > 500')
            continue
        for name in pred_name_list:
            if name not in chunk:
                continue

            input_ids = tokenizer.encode(name, chunk)
            if len(input_ids) > max_seq_len:
                continue

            # Segment A is everything up to and including the first [SEP].
            num_seg_a = input_ids.index(tokenizer.sep_token_id) + 1
            num_seg_b = len(input_ids) - num_seg_a
            padding = [0] * (max_seq_len - len(input_ids))

            batch_token_types.append([0] * num_seg_a + [1] * num_seg_b + padding)
            batch_attention_mask.append([1] * len(input_ids) + padding)
            batch_input_ids.append(input_ids + padding)
            batch_names.append(name)

    if not batch_names:
        # No candidate occurs in any chunk: nothing to score.
        return []

    default_checkpoint = './TB_multispan/bert_wwm_split512_ckip_name_is_ans_alldataset_epoch9.pkl'
    checkpoint = {
        1: './TB_multispan/bert_wwm_split512_ckip_name_is_ans_dataset1_epoch19.pkl',
        2: './TB_multispan/bert_wwm_split512_ckip_name_is_ans_dataset2_epoch11.pkl',
        3: './TB_multispan/bert_wwm_split512_ckip_name_is_ans_dataset3_epoch9.pkl',
    }.get(dataset, default_checkpoint)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model = BertForSequenceClassification.from_pretrained(lm_path, num_labels=NUM_LABELS)
    # NOTE: torch.load unpickles the file; only load trusted checkpoints.
    model.load_state_dict(torch.load(checkpoint, map_location=device))
    model = model.to(device)
    model.eval()

    # All pairs are scored in a single batch, as before.
    with torch.no_grad():
        outputs = model(
            input_ids=torch.tensor(batch_input_ids, dtype=torch.long, device=device),
            token_type_ids=torch.tensor(batch_token_types, dtype=torch.long, device=device),
            attention_mask=torch.tensor(batch_attention_mask, dtype=torch.long, device=device),
        )
        positive_probs = torch.softmax(outputs[0], dim=-1)[:, 1].cpu().tolist()

    # Average the label-1 probability of each name over its chunks.
    prob_sums = {}
    counts = {}
    for name, prob in zip(batch_names, positive_probs):
        prob_sums[name] = prob_sums.get(name, 0.0) + prob
        counts[name] = counts.get(name, 0) + 1

    return [name for name in prob_sums if prob_sums[name] / counts[name] > threshold]
        


In [19]:
"""danny"""
pre_bert_wwm_pred = [[], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], ['李訓成', '蔡開宇', '王宇正'], [], ['張永泉', '郭明賓'], [], [], [], [], [], [], [], [], [], [], [], [], ['李瑞廷', '謝昌年'], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], ['牟孝儀', '陳學敏', '牟明哲'], ['許祈文'], [], [], [], [], [], [], [], [], [], ['黃顯雄', '黃世陽'], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], ['黃淑頻', '呂東英', '呂建安', '曾國輝'], [], [], ['賴永吉', '章啟明', '章民強', '李恆隆', '章啟光'], ['張建生', '張宜豐', '林宏彬', '陳正達'], [], [], [], [], ['傅春生'], [], ['莊錫根'], [], [], [], [], ['朱國榮', '金寶山', '葉佳瑛', '劉慶珠'], [], [], [], [], [], [], [], [], [], [], ['周麗真', '張志偉', '陳逢璿'], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], ['張玉鳳', '林偉強', '孫幼英', '江國貴', '蘇芸樂', '鍾素娥'], [], [], [], [], [], [], ['賴俊吉'], [], [], [], [], [], [], [], [], [], [], [], [], ['繆竹怡'], [], [], [], [], [], [], [], [], [], [], [], ['黃載文', '陳偉', '陳永昌', '陳偉和', '胡志明'], [], [], [], [], [], [], [], [], [], [], ['鄭聖儒'], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], ['林右正'], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], ['楊文值', '楊士弘', '歐彥志', '陳俞雄', '張東耀'], [], [], [], [], [], [], [], [], [], [], [], [], [], ['朱國榮', '林桂馨'], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], ['張智凱'], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], ['李全教'], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], ['李威儀', '藍秀琪', '王桂霜'], [], [], [], [], [], [], [], ['邱世忠'], [], [], [], [], [], ['吳坤錦'], [], [], [], [], [], [], [], [], [], [], ['柯賜海'], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], 
[], [], [], [], [], [], ['張桂銘', '劉威甫'], [], [], [], [], ['孔朝'], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], ['鍾增林', '曾國財'], ['何培才'], [], [], [], ['姜維池', '郭永鴻', '葉清偉'], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], ['林敏志'], [], [], [], [], [], ['裴振福', '劉吉雄', '楊自立', '林輝宏', '吳東明', '張建華', '呂宗南'], [], [], [], [], [], [], [], [], [], [], [], [], [], ['王懷恭'], [], [], []]
# Combine two prediction sets per article and score both combinations.
# NOTE(review): `ans` and `mypred` are not defined in any cell visible in this
# notebook; this cell presumably relies on leftover kernel state — confirm.
union_result = []
intersect_result = []
for i in range(len(ans)):
  union = []
  temp1 = set(pre_bert_wwm_pred[i])
  temp2 = set(mypred[i])
  # Names predicted by either model / by both models.
  union = list(temp1 | temp2)
  intersect = list(temp1 & temp2)
# Disabled: replace an empty result with a single empty-string entry.
#   if (len(union) == 0):
#     union.append('')
#   if (len(intersect) == 0):
#     intersect.append('')
    
  union_result.append(union)
  intersect_result.append(intersect)
# Mean set-based F1 (see `eval_all`) of the union and of the intersection.
print(eval_all(union_result,ans))
print(eval_all(intersect_result,ans))
"""-----"""

0.9858258170885463
0.9848721430187827


'-----'

In [21]:
import pandas as pd

# Inspect one article (row 172) of the 2020-07-28 prediction file.
test_728 = pd.read_csv('./tbrain/2020-07-28.csv')
idx = 172
# NOTE: overwrites the notebook-level `news` and `pred_name_list` variables.
news = test_728.iloc[idx]['article']
# `predict_name` is stored in the CSV as the repr of a Python list.
pred_name_list = ast.literal_eval(test_728.iloc[idx]['predict_name'])

print(news)
print((pred_name_list))




# Verify the predicted names with the dataset-1 checkpoint.
# NOTE(review): `check_pred_name_is_real_ans` is defined in a cell that appears
# further down in this file, so top-to-bottom execution would fail here.
check_pred_name_is_real_ans(pred_name_list , news , 1)

?記者楊政郡／台中報導?2014年間利鑫公司推出「F.A.S.Ttm基金」（未經許可及合法設立登記），由陳思哲引介「阮涵財」或「林玉婷」（真實姓名皆不詳），以非法多層次傳銷方式吸金，達615萬美金（約新台幣1億8450萬）及166萬港幣（約新台幣747萬），台灣負責人陳思哲依違反銀行法加重罪判8年6月徒刑。判決書指出，陳思哲明知利鑫外匯公司（瑞士商）未向我國申請許可及公司設立登記，非銀行機構，竟與自稱利鑫公司顧問之「阮涵財」或「林玉婷」等人共謀，自2014年元月起，由陳思哲對外招攬不特定人參與投資，在中市、高雄市、台北市、新竹市等地，租借飯店舉辦利鑫公司投資說明會，說明會中由陳思哲介紹，「阮涵財」或「林玉婷」向與會不特定民眾解說「F.A.S.Ttm基金」投資方案及獎金種類。誆稱所收取資金，將操作外匯投資和貨幣衍生品，前景可期，參與投資會員，投資額1萬至2萬9900美元範圍，每週可固定獲利2%（稱基本配套）；投資額為3萬至9萬9900美元範圍，每週可固定獲利3%（稱無限配套）；投資額為10萬至50萬美元範圍，每週可獲利3.1%至3.5%不等（稱鑫級配套）。會員招攬下線投資，成為會員，每週可領取第1層下線週分紅30%、第2層下線週分紅20%、第3層至第10層週分紅10%與第11層至第25層週分紅5%不等獎金，以此非法多層次傳銷方式，吸引不特人投入資金。陳思哲以上述方式陸續招約20名投資者，吸收資金共615萬餘美元（折新台幣1億8450萬）及港幣166萬（折新台幣747萬）餘元，同年10月利鑫公司未再支付各投資人紅利，始知受騙。
['阮涵財', '陳思哲', '阮涵', '林玉婷']
device: cuda:0


Some weights of the model checkpoint at ./bert_wwm_pretrain_tbrain/ were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were no

tensor([[5.1023340225e-01, 4.8976656795e-01],
        [1.9034050638e-03, 9.9809664488e-01],
        [9.9968671799e-01, 3.1325456803e-04],
        [3.0855970457e-02, 9.6914398670e-01]], device='cuda:0')


['陳思哲', '林玉婷']

In [8]:
def check_pred_name_is_real_ans(pred_name_list, news, dataset):
    """Keep candidate names that the BERT pair classifier labels as positive.

    The article is cleaned, cut into sentences and packed into chunks shorter
    than 500 characters. Every (name, chunk) pair in which the name literally
    occurs in the chunk is encoded as a BERT sentence pair and classified by
    a two-label ``BertForSequenceClassification`` model. A name is returned
    once for every chunk in which it is classified as label 1, so a name that
    is accepted in several chunks appears several times.

    Fixes relative to the previous version:
      * returns ``[]`` instead of crashing (``batch_size=0``) when there are
        no candidates or no candidate occurs in the article;
      * over-long sentences are always split, and split pieces stay below the
        chunk limit, so no part of the article is discarded;
      * the checkpoint is loaded with ``map_location`` so a GPU-saved file
        also loads on a CPU-only machine.

    Args:
        pred_name_list: Candidate names (strings).
        news: Raw article text.
        dataset: Checkpoint selector. 1, 2 and 3 pick the corresponding
            per-dataset checkpoint; any other value picks the all-data one.

    Returns:
        List of accepted names in batch order (chunk order, then
        ``pred_name_list`` order); duplicates are possible.
    """
    if len(pred_name_list) == 0:
        return []

    from transformers import BertForSequenceClassification

    lm_path = './bert_wwm_pretrain_tbrain/'
    max_length = 500     # chunks must be strictly shorter than this (characters)
    max_seq_len = 512    # fixed model input length (tokens)
    NUM_LABELS = 2

    def pack_sentences(sentences, max_len):
        """Greedily join sentences into chunks strictly shorter than ``max_len``."""
        packed = []
        buffer = ''
        for sentence in sentences:
            if len(buffer) + len(sentence) < max_len:
                buffer += sentence
                continue
            if buffer:
                packed.append(buffer)
                buffer = ''
            if len(sentence) < max_len:
                buffer = sentence
            else:
                # A single over-long sentence: cut it into pieces that stay
                # below the limit so the length guard below never drops them.
                step = max_len - 1
                pieces = [sentence[i:i + step] for i in range(0, len(sentence), step)]
                buffer = pieces.pop()
                packed.extend(pieces)
        if buffer:
            packed.append(buffer)
        return packed

    tokenizer = BertTokenizer.from_pretrained(lm_path)
    chunks = pack_sentences(cut_sent(clean_string(news)), max_length)

    batch_input_ids = []
    batch_token_types = []
    batch_attention_mask = []
    batch_names = []
    for chunk in chunks:
        if len(chunk) >= max_length:
            # Safety net; not expected to trigger with pack_sentences above.
            print('error !!!! lenth > 500')
            continue
        for name in pred_name_list:
            if name not in chunk:
                continue

            input_ids = tokenizer.encode(name, chunk)
            if len(input_ids) > max_seq_len:
                continue

            # Segment A is everything up to and including the first [SEP].
            num_seg_a = input_ids.index(tokenizer.sep_token_id) + 1
            num_seg_b = len(input_ids) - num_seg_a
            padding = [0] * (max_seq_len - len(input_ids))

            batch_token_types.append([0] * num_seg_a + [1] * num_seg_b + padding)
            batch_attention_mask.append([1] * len(input_ids) + padding)
            batch_input_ids.append(input_ids + padding)
            batch_names.append(name)

    if not batch_names:
        # No candidate occurs in any chunk: nothing to classify.
        return []

    default_checkpoint = './TB_multispan/bert_wwm_split512_ckip_name_is_ans_alldataset_epoch9.pkl'
    checkpoint = {
        1: './TB_multispan/bert_wwm_split512_ckip_name_is_ans_dataset1_epoch19.pkl',
        2: './TB_multispan/bert_wwm_split512_ckip_name_is_ans_dataset2_epoch11.pkl',
        3: './TB_multispan/bert_wwm_split512_ckip_name_is_ans_dataset3_epoch9.pkl',
    }.get(dataset, default_checkpoint)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model = BertForSequenceClassification.from_pretrained(lm_path, num_labels=NUM_LABELS)
    # NOTE: torch.load unpickles the file; only load trusted checkpoints.
    model.load_state_dict(torch.load(checkpoint, map_location=device))
    model = model.to(device)
    model.eval()

    # All pairs are classified in a single batch, as before.
    with torch.no_grad():
        outputs = model(
            input_ids=torch.tensor(batch_input_ids, dtype=torch.long, device=device),
            token_type_ids=torch.tensor(batch_token_types, dtype=torch.long, device=device),
            attention_mask=torch.tensor(batch_attention_mask, dtype=torch.long, device=device),
        )
        labels = torch.argmax(torch.softmax(outputs[0], dim=-1), dim=-1).cpu().tolist()

    return [name for name, label in zip(batch_names, labels) if label > 0]
        


In [6]:
# First characters accepted as a family name. Built once at import time as a
# set for constant-time lookup (the source list contained '歐' and '夏' twice).
_SURNAME_FIRST_CHARS = frozenset(
    '趙錢孫李周吳鄭王馮陳褚衛蔣沈韓楊朱秦尤許'
    '何呂施張孔曹嚴華金魏陶姜戚謝鄒章蘇潘葛范'
    '彭魯韋馬苗花方俞任袁柳鮑史唐費廉薛雷賀倪'
    '湯殷羅郝安于傅齊康伍余顧孟黃蕭尹姚邵汪毛'
    '狄戴宋龐熊紀屈項祝董梁杜阮藍季賈江童顏郭'
    '盛林鍾徐邱駱高夏蔡田樊胡凌霍萬柯管盧莫繆'
    '解應丁鄧洪包石崔龔程裴陸甄封糜焦侯全甘武'
    '劉詹龍葉黎白邰賴卓池譚溫莊瞿連習向古易廖'
    '耿歐冷簡曾司歐夏諸公慕牟'
)


def check_first_word_is_name(name):
    """Tell whether ``name`` starts with a character from the surname list.

    Args:
        name: Candidate name string.

    Returns:
        True when the first character is in the surname list, else False.
        An empty ``name`` returns False (it used to raise ``IndexError``).
    """
    return bool(name) and name[0] in _SURNAME_FIRST_CHARS
def check_first_word_list(name_list):
    """Return the names accepted by ``check_first_word_is_name``, in input order."""
    kept = []
    for candidate in name_list:
        if check_first_word_is_name(candidate):
            kept.append(candidate)
    return kept

In [16]:
# testing
def two_words_is_name(name_list):
    """Filter candidates with the fine-tuned BERT two-character-name classifier.

    Every candidate is encoded to exactly 4 token ids (special tokens
    included, truncated or padded as needed) and classified by a two-label
    ``BertForSequenceClassification`` model whose weights come from
    ``./TB_multispan/Bert_wwm_name_model_2words_all_2.pkl``. Inference always
    runs on the CPU and all candidates are scored in one batch.

    Args:
        name_list: Candidate name strings.

    Returns:
        The candidates predicted as label 1, in input order. An empty input
        returns ``[]`` (it used to crash because the batch size became 0).
    """
    if len(name_list) == 0:
        return []

    from transformers import BertForSequenceClassification

    lm_path = './bert_wwm_pretrain_tbrain/'
    check_point = './TB_multispan/Bert_wwm_name_model_2words_all_2.pkl'
    NUM_LABELS = 2
    # The original code picked CUDA when available and then immediately
    # overrode the choice with 'cpu'; only the effective value is kept.
    device = torch.device('cpu')
    print("device:", device)

    tokenizer = BertTokenizer.from_pretrained(lm_path)
    input_dict = tokenizer.batch_encode_plus(name_list,
                                             add_special_tokens=True,
                                             max_length=4,
                                             truncation=True,
                                             return_special_tokens_mask=True,
                                             pad_to_max_length=True,
                                             return_tensors='pt')

    model = BertForSequenceClassification.from_pretrained(lm_path, num_labels=NUM_LABELS)
    # map_location lets a checkpoint saved from a GPU run load on this device.
    # NOTE: torch.load unpickles the file; only load trusted checkpoints.
    model.load_state_dict(torch.load(check_point, map_location=device))
    model = model.to(device)
    model.eval()

    with torch.no_grad():
        outputs = model(input_ids=input_dict['input_ids'].to(device),
                        token_type_ids=input_dict['token_type_ids'].to(device),
                        attention_mask=input_dict['attention_mask'].to(device))

    labels = torch.argmax(outputs[0], dim=-1).cpu().tolist()
    return [name for name, label in zip(name_list, labels) if label > 0]


In [18]:
class TestDataset(Dataset):
    """Row-wise access to tokenizer output together with the original texts.

    NOTE: a class with the same name is also defined in an earlier cell of
    this notebook; whichever cell runs last wins.
    """

    def __init__(self, input_dict, text):
        self.input_ids = input_dict['input_ids']
        self.token_type_ids = input_dict['token_type_ids']
        self.attention_mask = input_dict['attention_mask']
        self.text = text

    def __getitem__(self, idx):
        """Return ``(input_ids, token_type_ids, attention_mask, text)`` of row ``idx``."""
        columns = (self.input_ids, self.token_type_ids, self.attention_mask, self.text)
        return tuple(column[idx] for column in columns)

    def __len__(self):
        """Number of encoded rows."""
        return len(self.input_ids)
class roberta_pos_model(nn.Module):
    """Two independent MLP heads producing one start score and one end score.

    Each head maps a 768-dimensional vector to a single value through
    768 -> 256 -> 128 -> 1 linear layers with ReLU in between. Inputs are
    cast to float64, so the module has to be converted with ``.double()``
    before it is called.
    """

    def __init__(self):
        super().__init__()

        def build_head():
            # Same layer layout (and therefore the same state-dict keys) for both heads.
            return nn.Sequential(
                nn.Linear(768, 256),
                nn.ReLU(),
                nn.Linear(256, 128),
                nn.ReLU(),
                nn.Linear(128, 1),
            )

        self.start_task = build_head()
        self.end_task = build_head()

    def forward(self, start_x, end_x):
        """Return ``(start_out, end_out)`` for the two float64-cast inputs."""
        start_out = self.start_task(start_x.double())
        end_out = self.end_task(end_x.double())
        return start_out, end_out
    
    
# pos_model = torch.load('./TB_multispan/start_pos_model8.pkl')  
# torch.save(pos_model.state_dict(),'./TB_multispan/roberta_single_span_state_dict.pkl')
# print(pos_model)
    

def roberta_single_span(news,ckip_names):
    """Pick the CKIP names that lie near positions predicted by the span model.

    The article is cleaned, split into chunks with ``combine_sentence`` and
    each chunk is encoded to 512 tokens. For every chunk the BERT encoder's
    last hidden states are scored by ``roberta_pos_model``; the 10 highest
    start scores and the 10 highest end scores give candidate token positions.
    Around every candidate a window of 20 characters on each side is taken
    from the chunk text, and every CKIP name of at least two characters found
    in a window is collected. The collected names are finally passed through
    ``ckip_names_filter``.

    Fixes relative to the previous version:
      * inference runs under ``torch.no_grad()``;
      * the window is clamped to the chunk, so it no longer drops the last
        character near the chunk end and no longer comes out empty when the
        position plus the range equals exactly 512;
      * positions are read as plain ints instead of being decremented
        in place inside the index tensor;
      * the state dict is loaded with ``map_location``;
      * empty articles or empty name lists return ``[]`` immediately.

    Args:
        news: Raw article text.
        ckip_names: Candidate names (strings).

    Returns:
        List of names (duplicates possible) that passed ``ckip_names_filter``,
        or an empty list when nothing was found.
    """
    lm_path = './pretrain_roberta_on_TBdata/'
    pos_model_path = './TB_multispan/roberta_single_span_state_dict.pkl'
    # The original code picked CUDA when available and then immediately
    # overrode the choice with 'cpu'; only the effective value is kept.
    device = torch.device('cpu')
    max_seq_len = 512
    topk = 10
    myrange = 20

    chunks = combine_sentence(cut_sent(clean_string(news)))
    if not chunks or len(ckip_names) == 0:
        return []

    tokenizer = BertTokenizer.from_pretrained(lm_path)
    config = BertConfig.from_pretrained(lm_path + 'config.json',output_hidden_states=True)
    model = BertModel.from_pretrained(lm_path,config=config)
    model = model.to(device)
    model.output_hidden_states = True
    model.eval()

    pos_model = roberta_pos_model()
    # NOTE: torch.load unpickles the file; only load trusted checkpoints.
    pos_model.load_state_dict(torch.load(pos_model_path, map_location=device))
    pos_model = pos_model.double().to(device)
    pos_model.eval()

    test_input_dict = tokenizer.batch_encode_plus(chunks,
                                                  add_special_tokens=True,
                                                  max_length=max_seq_len,
                                                  truncation=True,
                                                  return_special_tokens_mask=True,
                                                  pad_to_max_length=True,
                                                  return_tensors='pt')

    # One chunk per batch, so the text element of each batch holds one string.
    testloader = DataLoader(TestDataset(test_input_dict, chunks), batch_size=1)

    my_pred_name_list = []
    with torch.no_grad():
        for data in testloader:
            tokens_tensors, segments_tensors, masks_tensors = [t.to(device) for t in data[:-1]]
            chunk_text = data[-1][0]

            bert_outputs = model(input_ids=tokens_tensors,
                                 token_type_ids=segments_tensors,
                                 attention_mask=masks_tensors)
            hidden = bert_outputs[0].double()
            mini_batch = hidden.size()[0]

            start_pred, end_pred = pos_model(hidden, hidden)
            start_pred = start_pred.reshape((mini_batch, max_seq_len))
            end_pred = end_pred.reshape((mini_batch, max_seq_len))

            candidates = torch.unique(torch.cat(
                [torch.topk(start_pred, topk).indices, torch.topk(end_pred, topk).indices],
                dim=-1))

            for token_index in candidates.tolist():
                # Shift by one for the leading special token; this assumes
                # one token per character — TODO confirm for this tokenizer.
                center = token_index - 1
                window = chunk_text[max(center - myrange, 0):center + myrange]

                for ckip_name in ckip_names:
                    if ckip_name in window and len(ckip_name) >= 2:
                        my_pred_name_list.append(ckip_name)

    if len(my_pred_name_list) > 0:
        my_pred_name_list = ckip_names_filter(my_pred_name_list)
    return my_pred_name_list
    
    
    

In [19]:
def ckip_names_filter(name_list):
    """Keep only the candidate names accepted by the ``NameModel`` classifier.

    Every string in ``name_list`` is encoded with the BERT tokenizer stored at
    ``lm_path`` (padded/truncated to 5 tokens), passed through the BERT model
    loaded from the same path, and the second model output is scored by
    ``NameModel``. A candidate is kept when the arg-max over its two logits is
    greater than 0.

    Args:
        name_list: sequence of candidate name strings.

    Returns:
        list: accepted names in their original order (duplicates preserved).
        An empty ``name_list`` returns ``[]`` without loading any model.
    """
    # Nothing to classify: skip the model loading and the forward pass.
    if len(name_list) == 0:
        return []

    lm_path = './pretrain_roberta_on_TBdata/'
    name_model_state_path = './TB_multispan/name_model_state.pkl'

    # Inference is pinned to the CPU (the original probed for CUDA and then
    # unconditionally overrode the result with 'cpu').
    device = torch.device('cpu')

    name_model = NameModel()
    # map_location makes a state dict that was saved from CUDA tensors
    # loadable on a CPU-only machine.
    name_model.load_state_dict(
        torch.load(name_model_state_path, map_location=device))
    name_model = name_model.double()
    name_model = name_model.to(device)
    name_model.eval()

    tokenizer = BertTokenizer.from_pretrained(lm_path)

    test_input_dict = tokenizer.batch_encode_plus(name_list,
                                         add_special_tokens=True,
                                         max_length=5,
                                        truncation=True,
                                         return_special_tokens_mask=True,
                                         pad_to_max_length=True,
                                         return_tensors='pt')

    model = BertModel.from_pretrained(lm_path)
    model = model.to(device)
    # NOTE(review): this sets an attribute on the module, not on model.config;
    # only bert_outputs[1] is read below, so it does not affect the result.
    model.output_hidden_states = True
    model.eval()

    # Pure inference: no autograd graph is needed.
    with torch.no_grad():
        bert_outputs = model(input_ids=test_input_dict['input_ids'].to(device),
                             token_type_ids=test_input_dict['token_type_ids'].to(device),
                             attention_mask=test_input_dict['attention_mask'].to(device))
        pool_cls = bert_outputs[1]
        logits = name_model(pool_cls)

    predicted = torch.argmax(logits, dim=-1).cpu().numpy()
    candidates = np.array(name_list)

    return list(candidates[predicted > 0])
    

class NameModel(nn.Module):
    """Two-layer head (768 -> 768 -> tanh -> 2) applied to 768-wide inputs."""

    def __init__(self):
        super().__init__()
        layers = [
            nn.Linear(768, 768),
            nn.Tanh(),
            nn.Linear(768, 2),
        ]
        # Attribute name is part of the state-dict keys; keep it as-is.
        self.name_task = nn.Sequential(*layers)

    def forward(self, x):
        """Cast ``x`` to float64 and return the two-way logits."""
        return self.name_task(x.double())



    

In [5]:

class BertBinrayClassifier(nn.Module):
    """Single linear layer mapping a 768-wide vector to two logits."""

    def __init__(self):
        super().__init__()
        # Kept inside a Sequential so state-dict keys stay 'classifier.0.*'.
        self.classifier = nn.Sequential(nn.Linear(768, 2))

    def forward(self, x):
        """Return the two-way logits for ``x``."""
        logits = self.classifier(x)
        return logits

def news_have_answer_bert(news):
    """Classify one news article with BERT plus ``BertBinrayClassifier``.

    The article is cleaned with ``clean_string``, encoded to exactly 512
    tokens (truncated / padded), run through the BERT model at ``lm_path`` and
    the second model output is fed to the binary head loaded from
    ``binary_model_path``.

    Args:
        news: raw article text.

    Returns:
        int: arg-max class index over the two logits.
        NOTE(review): presumably 1 means "the article contains an answer" —
        confirm against the training labels.
    """
    binary_model_path = './TB_multispan/Bert_binary_alldataset_3.pkl'
    lm_path = './chinese_wwm_pytorch/'
    # Inference is pinned to the CPU (the original probed for CUDA and then
    # unconditionally overrode the result with 'cpu').
    device = torch.device('cpu')

    tokenizer = BertTokenizer.from_pretrained(lm_path)
    config = BertConfig.from_pretrained(lm_path + 'config.json', output_hidden_states=True)
    model = BertModel.from_pretrained(lm_path, config=config)
    model = model.to(device)
    model.eval()

    # map_location makes a checkpoint saved from CUDA tensors loadable on a
    # CPU-only machine.
    checkpoint = torch.load(binary_model_path, map_location=device)
    binary_model = BertBinrayClassifier()
    binary_model.load_state_dict(checkpoint)
    binary_model = binary_model.to(device)
    binary_model.eval()

    news = clean_string(news)
    test_input_dict = tokenizer.batch_encode_plus([news],
                                         add_special_tokens=True,
                                         max_length=512,
                                         truncation=True,
                                         return_special_tokens_mask=True,
                                         pad_to_max_length=512,
                                         return_tensors='pt')

    # The encoded batch holds exactly one sample, so it is fed to the model
    # directly. The previous Dataset/DataLoader detour indexed the raw string
    # as the "text" field, which raised IndexError for an empty article.
    with torch.no_grad():
        lm_outputs = model(input_ids=test_input_dict['input_ids'].to(device),
                           token_type_ids=test_input_dict['token_type_ids'].to(device),
                           attention_mask=test_input_dict['attention_mask'].to(device))
        binary_pred = binary_model(lm_outputs[1].float())

    binary_pred = torch.argmax(binary_pred, dim=-1)
    return binary_pred[0].item()
    
    
    
    
    
    
    

In [6]:
from transformers.modeling_utils import SequenceSummary

class XLNetBinrayClassifier(nn.Module):
    """``SequenceSummary`` pooling followed by a linear two-way classifier."""

    def __init__(self, config):
        super().__init__()
        # Creation order and attribute names are kept so that saved state
        # dicts continue to load.
        self.sequence_summary = SequenceSummary(config)
        self.classifier = nn.Sequential(nn.Linear(config.hidden_size, 2))

    def forward(self, x):
        """Summarise ``x`` with ``sequence_summary`` and return two logits."""
        summary = self.sequence_summary(x)
        return self.classifier(summary)

def news_have_answer(news):
    """Classify one news article with XLNet plus ``XLNetBinrayClassifier``.

    The article is cleaned with ``clean_string``, encoded to exactly 512
    tokens (truncated / padded), run through the XLNet model at ``lm_path``
    and the first model output is fed to the binary head loaded from
    ``binary_model_path``. The soft-max probabilities are printed before the
    arg-max is taken.

    Args:
        news: raw article text.

    Returns:
        int: arg-max class index over the two probabilities.
        NOTE(review): presumably 1 means "the article contains an answer" —
        confirm against the training labels.
    """
    binary_model_path = './TB_multispan/XLNet_binary_alldataset_fromback_3.pkl'
    lm_path = './chinese_xlnet_mid_pytorch/'
    # Inference is pinned to the CPU (the original probed for CUDA and then
    # unconditionally overrode the result with 'cpu').
    device = torch.device('cpu')

    tokenizer = XLNetTokenizer.from_pretrained(lm_path)
    config = XLNetConfig.from_pretrained(lm_path + 'config.json')
    model = XLNetModel.from_pretrained(lm_path)
    model = model.to(device)
    model.eval()

    # map_location makes a checkpoint saved from CUDA tensors loadable on a
    # CPU-only machine.
    checkpoint = torch.load(binary_model_path, map_location=device)
    binary_model = XLNetBinrayClassifier(config)
    binary_model.load_state_dict(checkpoint)
    binary_model = binary_model.to(device)
    binary_model.eval()

    news = clean_string(news)
    test_input_dict = tokenizer.batch_encode_plus([news],
                                         add_special_tokens=True,
                                         max_length=512,
                                         truncation=True,
                                         return_special_tokens_mask=True,
                                         pad_to_max_length=512,
                                         return_tensors='pt')

    # The encoded batch holds exactly one sample, so it is fed to the model
    # directly. The previous Dataset/DataLoader detour indexed the raw string
    # as the "text" field, which raised IndexError for an empty article.
    with torch.no_grad():
        lm_outputs = model(input_ids=test_input_dict['input_ids'].to(device),
                           token_type_ids=test_input_dict['token_type_ids'].to(device),
                           attention_mask=test_input_dict['attention_mask'].to(device))
        binary_pred = binary_model(lm_outputs[0].float())
        binary_pred = torch.softmax(binary_pred, dim=-1)

    print(binary_pred)
    binary_pred = torch.argmax(binary_pred, dim=-1)
    return binary_pred[0].item()
    
    
    
    
    
    
    

In [14]:
from transformers.modeling_utils import SequenceSummary

    
class posClassfication_new(nn.Module):
    """Two independent linear heads (768 -> 1) scoring start and end positions."""

    def __init__(self):
        super().__init__()
        # Attribute names and creation order are part of the state-dict
        # layout; keep start before end.
        self.start_task = nn.Sequential(nn.Linear(768, 1))
        self.end_task = nn.Sequential(nn.Linear(768, 1))

    def forward(self, start_x, end_x):
        """Cast both inputs to float64 and return ``(start_out, end_out)``."""
        start_scores = self.start_task(start_x.double())
        end_scores = self.end_task(end_x.double())
        return start_scores, end_scores
    
    
# pos_model = torch.load('./TB_multispan/XLNet_pos_model10.pkl')
# torch.save(pos_model.state_dict(),'./TB_multispan/XLNet_single_span_state_dict.pkl')
# print(pos_model)
    
    
def xlnet_single_span(news, ckip_names):
    """Collect candidate names found near the top-scoring XLNet positions.

    The article is cleaned, split into sentences and regrouped into chunks
    (``clean_string`` / ``cut_sent`` / ``combine_sentence``). Each chunk is
    encoded to 512 tokens and scored by ``posClassfication_new``; the 10 best
    start positions and the 10 best end positions are merged, and for each of
    them a text window of ``myrange`` characters on either side is searched
    for every entry of ``ckip_names`` that is at least 2 characters long.
    Matches are finally passed through ``ckip_names_filter``.

    Compared with the previous revision the window is now clamped instead of
    being chosen by an if/elif chain: a window ending exactly at position 512
    used to yield nothing (it hit the 'out of range' branch), and windows
    running past 512 used to drop the last character of the chunk.

    Args:
        news: raw article text.
        ckip_names: re-iterable collection of candidate name strings.

    Returns:
        list: names that matched a window and passed ``ckip_names_filter``
        (duplicates are preserved); empty when nothing matched.
    """
    lm_path = './chinese_xlnet_mid_pytorch/'
    # An earlier path ('./TB_multispan/XLNet_single_span_state_dict.pkl') was
    # assigned and immediately overridden; only the effective one is kept.
    pos_model_path = './TB_multispan/XLNet_only_labels_single_span_statedict_32k0.pkl'

    # Inference is pinned to the CPU (the original probed for CUDA and then
    # unconditionally overrode the result with 'cpu').
    device = torch.device('cpu')

    tokenizer = XLNetTokenizer.from_pretrained(lm_path)
    model = XLNetModel.from_pretrained(lm_path)
    model = model.to(device)
    model.eval()

    pos_model = posClassfication_new()
    # map_location makes a state dict saved from CUDA tensors loadable on a
    # CPU-only machine.
    pos_model.load_state_dict(torch.load(pos_model_path, map_location=device))
    pos_model = pos_model.to(device)
    pos_model = pos_model.double()
    pos_model.eval()

    news = clean_string(news)
    split_news = cut_sent(news)
    chunks = combine_sentence(split_news)

    test_input_dict = tokenizer.batch_encode_plus(chunks,
                                         add_special_tokens=True,
                                         max_length=512,
                                         truncation=True,
                                         return_special_tokens_mask=True,
                                         pad_to_max_length=512,
                                         return_tensors='pt')

    BATCH_SIZE = 1
    testset = TestDataset(test_input_dict, chunks)
    testloader = DataLoader(testset, batch_size=BATCH_SIZE)

    topk = 10
    myrange = 20
    # The length filter does not depend on the window, so apply it once.
    candidate_names = [name for name in ckip_names if len(name) >= 2]

    my_pred_name_list = []
    # Pure inference: no autograd graph is needed.
    with torch.no_grad():
        for data in testloader:
            tokens_tensors, segments_tensors, masks_tensors = [t.to(device) for t in data[:-1]]
            # With batch size 1 the collated text field is a one-element list.
            chunk_text = data[-1][0]

            lm_outputs = model(input_ids=tokens_tensors,
                               token_type_ids=segments_tensors,
                               attention_mask=masks_tensors)

            hidden = lm_outputs[0].double()
            start_pred, end_pred = pos_model(hidden, hidden)

            mini_batch = hidden.size()[0]
            start_pred = start_pred.reshape((mini_batch, 512))
            end_pred = end_pred.reshape((mini_batch, 512))

            start_topk_indices = torch.topk(start_pred, topk).indices
            end_top_k_indices = torch.topk(end_pred, topk).indices
            all_indices = torch.unique(
                torch.cat([start_topk_indices, end_top_k_indices], dim=-1))

            # tolist() yields plain ints, so the index tensor is no longer
            # mutated in place by the "- 1" shift below.
            for token_index in all_indices.tolist():
                # NOTE(review): the token index is used as a character offset
                # after a shift of one; XLNet tokenizers presumably pad on the
                # left and append their special tokens, so this alignment
                # should be confirmed.
                center = token_index - 1
                window_start = max(center - myrange, 0)
                window_end = max(center + myrange, 0)
                ans_string = chunk_text[window_start:window_end]

                for ckip_name in candidate_names:
                    if ckip_name in ans_string:
                        my_pred_name_list.append(ckip_name)

    if len(my_pred_name_list) > 0:
        my_pred_name_list = ckip_names_filter(my_pred_name_list)
    return my_pred_name_list
     
    


In [15]:
def rbt_pretrain_combine_model(news, ckip_names):
    """Collect candidate names near the top start positions of each chunk.

    The article is cleaned, split into sentences and regrouped into chunks
    (``clean_string`` / ``cut_sent`` / ``combine_sentence``). Each chunk is
    encoded to 512 tokens and run through the BERT model at ``lm_path``; the
    pickled combine model then returns start scores, end scores and a binary
    prediction. Chunks whose binary arg-max is below 1 are skipped. For the 5
    best start positions of the remaining chunks a text window of ``myrange``
    characters on either side is searched for every entry of ``ckip_names``
    that is at least 2 characters long.

    Compared with the previous revision the window is now clamped instead of
    being chosen by an if/elif chain: a window ending exactly at position 512
    used to yield nothing (it hit the 'out of range' branch), and windows
    running past 512 used to drop the last character of the chunk.

    Args:
        news: raw article text.
        ckip_names: re-iterable collection of candidate name strings.

    Returns:
        list: every name that matched a window (duplicates are preserved).
    """
    lm_path = './pretrain_roberta_on_TBdata/'
    combine_model_path = './TB_multispan/combine_model5.pkl'

    # Inference is pinned to the CPU (the original probed for CUDA and then
    # unconditionally overrode the result with 'cpu').
    device = torch.device('cpu')
    print("device:", device)

    # for BERT and RoBERTa
    tokenizer = BertTokenizer.from_pretrained(lm_path)
    model = BertModel.from_pretrained(lm_path)
    model = model.to(device)
    # NOTE(review): this sets an attribute on the module, not on model.config;
    # only outputs [0] and [1] are read below.
    model.output_hidden_states = True
    model.eval()

    # SECURITY: this unpickles a whole module object, which executes arbitrary
    # code from the file; only load checkpoints from a trusted source.
    # map_location makes a model saved from CUDA loadable on a CPU-only host.
    combine_model = torch.load(combine_model_path, map_location=device)
    combine_model = combine_model.to(device)
    combine_model.eval()

    # process input
    news = clean_string(news)
    split_news = cut_sent(news)
    chunks = combine_sentence(split_news)

    test_input_dict = tokenizer.batch_encode_plus(chunks,
                                         add_special_tokens=True,
                                         max_length=512,
                                         truncation=True,
                                         return_special_tokens_mask=True,
                                         pad_to_max_length=512,
                                         return_tensors='pt')

    BATCH_SIZE = 1
    testset = TestDataset(test_input_dict, chunks)
    testloader = DataLoader(testset, batch_size=BATCH_SIZE)

    topk = 5
    myrange = 18
    # The length filter does not depend on the window, so apply it once.
    candidate_names = [name for name in ckip_names if len(name) >= 2]

    my_pred_name_list = []
    # Pure inference: no autograd graph is needed.
    with torch.no_grad():
        for data in testloader:
            tokens_tensors, segments_tensors, masks_tensors = [t.to(device) for t in data[:-1]]
            # With batch size 1 the collated text field is a one-element list.
            chunk_text = data[-1][0]

            bert_outputs = model(input_ids=tokens_tensors,
                                 token_type_ids=segments_tensors,
                                 attention_mask=masks_tensors)

            pool_cls = bert_outputs[1].double()
            bert_all_768 = bert_outputs[0]
            mini_batch = bert_all_768.size()[0]
            bert_all_768 = bert_all_768.double()

            start_pred, end_pred, binary_pred = combine_model(bert_all_768, bert_all_768, pool_cls)

            binary_pred = torch.argmax(binary_pred, dim=-1)
            start_pred = start_pred.reshape((mini_batch, 512))
            end_pred = end_pred.reshape((mini_batch, 512))

            # Skip chunks the binary head rejects.
            if binary_pred[0] < 1:
                continue

            start_topk_indices = torch.topk(start_pred, topk).indices
            # tolist() yields plain ints, so the index tensor is no longer
            # mutated in place by the "- 1" shift below.
            for token_index in start_topk_indices[0].tolist():
                # Shift by one before using the token index as a character
                # offset (presumably to account for the leading special
                # token — confirm).
                center = token_index - 1
                window_start = max(center - myrange, 0)
                window_end = max(center + myrange, 0)
                ans_string = chunk_text[window_start:window_end]

                for ckip_name in candidate_names:
                    if ckip_name in ans_string:
                        my_pred_name_list.append(ckip_name)

    return my_pred_name_list







---