## 实体名称泛化失败和属性识别

- 目前的解决思路：先利用分词、模糊匹配或向量匹配等方法，从用户问题中找出候选的实体和属性，再将它们拼接到 prompt 中，交给大模型来识别。

### 1. 数据处理

In [None]:
import csv
# Comma-separated attribute names of the medical data set.
# NOTE(review): presumably mirrors the header row of the CSV below — confirm.
medical_data = "疾病名称,疾病描述,疾病种类,科室,病因,症状,检查,并发症,花费,疗程,疗法,治愈率,易感人群,感染概率,感染途径,预防措施,推荐药物,常用药物,具体药物,可以吃,不可以吃,推荐吃,是否纳入医保"

# The encoding is spelled out because the file holds Chinese text and the
# default encoding of open() depends on the platform; newline='' is what the
# csv module expects for file objects it reads.
# NOTE(review): assumes the CSV is stored as UTF-8 — confirm.
with open('/Users/dingzhijian/VSCode/dplus-doc/nl2cypher/Medical Data.csv', 'r',
          encoding='utf-8', newline='') as f:
    # Collect the '疾病名称' (disease name) value of every data row.
    schema = [row['疾病名称'] for row in csv.DictReader(f)]
print(schema)

# Split the attribute string into a list of individual attribute names.
medical_terms_list = medical_data.split(',')
print(medical_terms_list)

### 2. 结巴分词

In [17]:
import jieba
import paddle

paddle.enable_static()
jieba.enable_paddle()  # Enable paddle mode; supported from jieba 0.40 on, not by earlier versions.
strs = ["哪些人易感染肺放线菌病?"]
# The loop variable is called `text`, not `str`, so the builtin str() is not
# overwritten for the rest of the notebook session.
for text in strs:
    # jieba.cut() yields tokens lazily; materialize them so seg_list still
    # holds the tokens after they have been printed.
    seg_list = list(jieba.cut(text, use_paddle=True))  # paddle-mode segmentation
    print("Paddle Mode: " + '/'.join(seg_list))



Paddle enabled successfully......
[2023-11-14 19:37:47,195] [   DEBUG] _compat.py:47 - Paddle enabled successfully......


Paddle Mode: 哪些/人/易/感染/肺放线菌病/?


### 3. fuzz字段模糊匹配

In [33]:
# Fuzzy lookup with thefuzz
from thefuzz import fuzz
from thefuzz import process

# Two ways of running the comparison:
#   1. compare against the whole sentence
#   2. compare against the extracted entity only (works better)

# Disease names and near-miss spellings used as sample inputs.
sentences = [
    '肺泡蛋白质沉积症',
    '肺泡蛋白质沉淀',
    '肺泡蛋白整',
    '肺泡蛋白沉着症',
    '大页性肺炎',
    '肺泡里沉淀蛋白',
    '放射性肺炎',
    '肺念珠菌病',
    '肺大疱',
    '肺炎球菌肺炎',
    '肺气肿',
    '肺炎杆菌肺炎',
]
# Candidate phrasings to be ranked against the attribute name.
tokens_input = [
    '易感',
    '容易感染',
    '容易患上',
]

# Rank the candidates by fuzzy similarity to "易感人群"; at most 5 are returned.
process.extract("易感人群", tokens_input, limit=5)


[('易感', 90), ('容易感染', 50), ('容易患上', 25)]

### 4. 向量匹配

In [None]:
from sentence_transformers import SentenceTransformer
import torch

model = SentenceTransformer('/dataQ/zhijian/LLMs/m3e-base')
tokens_input = ['肺泡蛋白质沉积症', '肺泡蛋白质沉淀' , '肺泡蛋白沉着症', '肺泡沉淀有蛋白质', '放射性肺炎', '肺念珠菌病', '肺大疱', '肺炎球菌肺炎', '肺气肿', '肺炎杆菌肺炎']
tokens_dict = ['易感','容易感染','容易患上','易感人群','容易得']
sentences3 = []

# model.encode() treats an object without __len__ as one single input, and the
# nested loop below walks the sentences several times, so turn seg_list into a
# real list instead of passing a generator through.
# NOTE(review): seg_list comes from the jieba cell; if it is a generator that
# was already consumed there, this list is empty — re-run the segmentation
# (or use tokens_input) before running this cell.
sentences = list(seg_list)
embeddings = model.encode(sentences)
# NOTE(review): sentences3 is empty, so embeddings3 and sent_emb2 carry no data
# and are not used below; presumably tokens_dict was meant to be encoded and
# compared against — confirm the intent before relying on this cell.
embeddings3 = model.encode(sentences3)
sent_emb2 = zip(tokens_dict, embeddings3)

# Build the similarity module and the tensors once instead of once per pair.
# unsqueeze(0) adds a leading axis so that dim=1 is the embedding axis.
cos = torch.nn.CosineSimilarity(dim=1, eps=1e-6)
embedding_tensors = [torch.tensor(vector).unsqueeze(0) for vector in embeddings]

# Print the cosine similarity for every ordered pair of entries in `sentences`.
for sentence, embedding in zip(sentences, embedding_tensors):
    for sentence2, embedding2 in zip(sentences, embedding_tensors):
        cos_sim = cos(embedding, embedding2)
        print("Sentence = ", sentence)
        print("Sentence2 = ", sentence2)
        print("Similarity = ", cos_sim.item())
    print("---")


In [12]:
from pprint import pprint
from paddlenlp import Taskflow

# Relation list handed to the extraction schema; the single empty string is a
# placeholder, so the extracted relation is reported under the key ''.
medical_data = ['']

# Earlier experiments that passed the list directly as the schema, kept for reference:
# ie = Taskflow('information_extraction', schema=medical_data)
# pprint(ie("治愈率大于30%,感染概率小于50%的疾病有哪些,至少列出3个"))
# print("-" * 42)
# pprint(ie("大叶性肺炎,二硫化碳中毒的预防措施"))
# print("-" * 42)
# pprint(ie("百日咳的易感人群和治好概率"))

# Schema: entity type "疾病" (disease) with medical_data as its relations.
schema = {"疾病": medical_data}
ie = Taskflow('information_extraction', schema=schema)

# Run the extractor on one sample question and pretty-print the result.
question = "百日咳的易感人群和治好概率"
extraction = ie(question)
pprint(extraction)


[2023-11-15 20:53:09,342] [    INFO] - We are using <class 'paddlenlp.transformers.ernie.tokenizer.ErnieTokenizer'> to load '/dataQ/zhijian/.paddlenlp/taskflow/information_extraction/uie-base'.


[{'疾病': [{'end': 3,
          'probability': 0.8903937046969972,
          'relations': {'': [{'end': 8,
                              'probability': 0.8827342624272205,
                              'start': 4,
                              'text': '易感人群'}]},
          'start': 0,
          'text': '百日咳'}]}]


## 综合上述方法

In [26]:
# 1. jieba word segmentation
import jieba
import paddle
from thefuzz import process
from sentence_transformers import SentenceTransformer
import torch

def jieba_div(strs: list) -> list:
    """Segment the first string of *strs* with jieba in paddle mode.

    The token list is printed under a banner line and then returned.

    Args:
        strs: Input strings. Only the first entry is segmented, because only
            the first result was ever printed and returned.

    Returns:
        The tokens of ``strs[0]``, or an empty list when *strs* is empty.
    """
    if not strs:
        # Nothing to segment; previously this ended in an IndexError.
        return []

    paddle.enable_static()
    jieba.enable_paddle()  # Enable paddle mode; supported from jieba 0.40 on, not by earlier versions.

    # Take the tokens straight from jieba. Joining them with '/' and splitting
    # on '/' again would break up any token that itself contains a '/'.
    tokens = list(jieba.cut(strs[0], use_paddle=True))

    print('-'*20 + " word segemenataion result " + '-'*20)
    print(tokens)
    return tokens


def fuzz(sents1: list, sents2: list) -> list:
    """Print and return the fuzzy matches in *sents2* for every item of *sents1*.

    Args:
        sents1: Query strings, looked up one at a time.
        sents2: Candidate strings every query is matched against.

    Returns:
        One ``(query, matches)`` pair per item of *sents1*, where ``matches``
        is the result of ``process.extract(query, sents2, limit=5)``.
    """
    # NOTE(review): this function has the same name as the `fuzz` module that
    # an earlier cell imports from thefuzz; defining it rebinds that name.
    results = []
    for item1 in sents1:
        matches = process.extract(item1, sents2, limit=5)
        print(item1, '//', matches)
        # Also hand the matches back so callers can use them, as the
        # `-> list` annotation promises.
        results.append((item1, matches))
    return results


def similarity(tokens_input, tokens_dict, model_path, sent, threshold=0.8):
    """Score tokens and a full sentence against dictionary entries by cosine similarity.

    Every token of *tokens_input* and the sentence *sent* are embedded with the
    SentenceTransformer model loaded from *model_path* and compared with every
    entry of *tokens_dict*. Pairs scoring above *threshold* are printed, best
    first, and returned.

    Args:
        tokens_input: Tokens to look up (e.g. the segmented question).
        tokens_dict: Dictionary entries to match against.
        model_path: Path or name passed to ``SentenceTransformer``.
        sent: The complete sentence, scored as a whole against *tokens_dict*.
        threshold: Only pairs with a similarity strictly above this value are
            kept. Defaults to 0.8.

    Returns:
        A tuple ``(token_pairs_scores, sent_pairs_scores)``; each is a list of
        ``(text, dict_entry, score)`` tuples sorted by descending score.
    """
    model = SentenceTransformer(model_path)

    # Embed everything once and convert to tensors up front, so the nested
    # loop below does not rebuild the same tensor for every pair.
    input_vectors = [torch.tensor(model.encode(token)) for token in tokens_input]
    dict_vectors = [torch.tensor(model.encode(token)) for token in tokens_dict]
    sent_vector = torch.tensor(model.encode(sent))

    # dim=0: each embedding is compared as a single flat vector.
    cos = torch.nn.CosineSimilarity(dim=0, eps=1e-6)

    # Token-level matches.
    token_pairs_scores = []
    for token, token_vector in zip(tokens_input, input_vectors):
        for entry, entry_vector in zip(tokens_dict, dict_vectors):
            score = cos(token_vector, entry_vector).item()
            if score > threshold:
                token_pairs_scores.append((token, entry, score))

    # Sentence-level matches.
    sent_pairs_scores = []
    for entry, entry_vector in zip(tokens_dict, dict_vectors):
        score = cos(sent_vector, entry_vector).item()
        if score > threshold:
            sent_pairs_scores.append((sent, entry, score))

    # Best matches first.
    token_pairs_scores.sort(key=lambda x: x[2], reverse=True)
    sent_pairs_scores.sort(key=lambda x: x[2], reverse=True)

    print("Tokens Similarity Scores:")
    for score in token_pairs_scores:
        print(f"Token: {score[0]} / {score[1]} / Similarity: {score[2]}")

    print("\nSentence Similarity Scores:")
    for score in sent_pairs_scores:
        print(f"Sentence: {score[0]} / {score[1]} / Similarity: {score[2]}")

    return token_pairs_scores, sent_pairs_scores

# tokens_input = ['肺泡蛋白质沉积症', '肺泡蛋白质沉淀' , '肺泡蛋白沉着症', '肺泡沉淀有蛋白质', '放射性肺炎', '肺念珠菌病', '肺大疱', '肺炎球菌肺炎', '肺气肿', '肺炎杆菌肺炎']
# tokens_dict = ['易感','容易感染','容易患上','易感人群','容易得']
# Attribute names of the medical data set, used as the matching dictionary.
medical_data = [
    # Identity and classification
    '疾病名称',
    '疾病描述',
    '疾病种类',
    '科室',
    # Clinical picture
    '病因',
    '症状',
    '检查',
    '并发症',
    # Treatment
    '花费',
    '疗程',
    '疗法',
    '治愈率',
    # Infection and prevention
    '易感人群',
    '感染概率',
    '感染途径',
    '预防措施',
    # Medication
    '推荐药物',
    '常用药物',
    '具体药物',
    # Diet
    '可以吃',
    '不可以吃',
    '推荐吃',
    # Insurance
    '是否纳入医保',
]


if __name__ == '__main__':
    # Path the embedding model is loaded from.
    model_path = '/dataQ/zhijian/LLMs/m3e-base'
    # Called `questions` rather than `input` so the builtin input() is not
    # shadowed for the rest of the session.
    questions = ['大叶性肺炎,二硫化碳中毒的预防措施?']

    # Segment the question; the attribute names act as the dictionary.
    tokens_input = jieba_div(questions)
    tokens_dict = medical_data

    print("-"*20, "fuzz", "-"*20)
    fuzz(tokens_input, tokens_dict)
    print("-"*20, "similarity", "-"*20)
    similarity(tokens_input, tokens_dict, model_path, questions[0])



Paddle enabled successfully......
[2023-11-15 13:59:00,574] [   DEBUG] _compat.py:47 - Paddle enabled successfully......


--------------------word segemenataion result--------------------
['大叶性肺炎', ',', '二硫化碳中毒', '的', '预防', '措施', '?']
-------------------- fuzz --------------------
大叶性肺炎 // [('疾病名称', 0), ('疾病描述', 0), ('疾病种类', 0), ('科室', 0), ('病因', 0)]
, // [('疾病名称', 0), ('疾病描述', 0), ('疾病种类', 0), ('科室', 0), ('病因', 0)]
二硫化碳中毒 // [('疾病名称', 0), ('疾病描述', 0), ('疾病种类', 0), ('科室', 0), ('病因', 0)]
的 // [('疾病名称', 0), ('疾病描述', 0), ('疾病种类', 0), ('科室', 0), ('病因', 0)]
预防 // [('预防措施', 90), ('疾病名称', 0), ('疾病描述', 0), ('疾病种类', 0), ('科室', 0)]
措施 // [('预防措施', 90), ('疾病名称', 0), ('疾病描述', 0), ('疾病种类', 0), ('科室', 0)]
? // [('疾病名称', 0), ('疾病描述', 0), ('疾病种类', 0), ('科室', 0), ('病因', 0)]
-------------------- similarity --------------------
Tokens Similarity Scores:
Token: 预防 / 预防措施 / Similarity: 0.9377005100250244
Token: 措施 / 预防措施 / Similarity: 0.8163185119628906
Token: 大叶性肺炎 / 病因 / Similarity: 0.8127889633178711

Sentence Similarity Scores:
Sentence: 大叶性肺炎,二硫化碳中毒的预防措施? / 预防措施 / Similarity: 0.8289975523948669


## schema 过长问题的解决思路
1. 重新定义一个简短的 schema 映射表（类似页表），用短名称映射到原始字段
2. 先对 schema 做一遍摘要再存储，之后每次都使用摘要后的 schema