# query重写demo

在data下面每个类别的文档中，都有log.txt的术语词，现在需要根据术语词，用llm按照论文的格式重写query，例如：

EPS回落流程中，PCF向SBC上报几次用户位置吗？

=>

演进的分组系统 ( Evolved Packet System , EPS ) 回落流程中，策略控制功能 ( Policy Control Function, PCF ) 向 会话边界控制器 ( Session Border Controller, SBC ) 上报几次用户位置吗？




In [1]:
# 思路：
# 1 把log.txt按照行构建bm25的检索器
# 2 对于query进行检索，找到相关的关键词
# 3 设置prompt，让模型重写组织问题
# 4 将新的问题输入main，得到新的回答
# 5 （可选） 将两个query对应的答案进行集成。

In [1]:
%cd /home/jhx/Projects/RAG/aiops24-RAG-demo/demo

/home/jhx/Projects/RAG/aiops24-RAG-demo/demo


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [99]:
import re
from tqdm import tqdm

## 1 单个query

### 1.1 整理术语

In [44]:
def is_english_abbreviation(text):
    # 定义正则表达式，匹配只包含英文字母和数字的文本
    # pattern = r'^[a-zA-Z0-9]+$'
    pattern = r'^[a-zA-Z0-9-]+$'

    # 使用re.match()函数尝试匹配文本
    if re.match(pattern, text):
        # 如果匹配成功，检查是否包含汉字
        if not any('\u4e00' <= char <= '\u9fff' for char in text):
            return True
    return False

def print_is_abbreviation(text):
    print(f"{text}: {is_english_abbreviation(text)}")

print_is_abbreviation("3rd Generation Partnership Project第三代合作伙伴计划")
print_is_abbreviation("3GPP")
print_is_abbreviation("5G System5G系统")
print_is_abbreviation("5G-GUTI")
print_is_abbreviation(" ")
print_is_abbreviation(" \n")



3rd Generation Partnership Project第三代合作伙伴计划: False
3GPP: True
5G System5G系统: False
5G-GUTI: True
 : False
 
: False


In [45]:
# 1 获取log.txt，变成文本列表
def read_terminology_file(file_path, sep = "|"):
    terminology_list = []
    
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()
        i = 0
        last_abbrev = None
        
        while i < len(lines):
            if lines[i].strip():  # Check if line is not empty
                acronym = lines[i].strip()
                i += 1

                if acronym.find("缩略语") != -1: continue
                if i < len(lines):
                    english_full = lines[i].strip()
                    i += 1
                    if i < len(lines):
                        chinese_full = lines[i].strip()
                        terminology_list.append(f"{acronym} {sep} {english_full} {chinese_full}")
                        i += 1
                    else:
                        # If there are no more lines after chinese_full, break the loop
                        break
                else:
                    # If there are no more lines after english_full, break the loop
                    break
            else:
                i += 1  # Move to the next line
    
    return terminology_list


In [48]:
# 1 获取log.txt，变成文本列表
# 非空，且 是缩写，记录last。然后如果不是缩写，就保存term
def read_terminology_file(file_path, sep = "|"):
    terminology_list = []
    last_abbrev = None

    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()
    for line in lines:
        if (line.find("缩略语") != -1) or (not line.strip()): continue

        acronym = line.strip()
        if is_english_abbreviation(acronym):
            last_abbrev = acronym
            continue
        else:
            term = f"{last_abbrev} {sep} {acronym}"
            terminology_list.append(term)
    
    return terminology_list


In [49]:
# Example usage:
file_path = 'data/rcp/log.txt'  # Replace with your file path
terminology_list = read_terminology_file(file_path)
terminology_dict = {}
for term in terminology_list:
    term_abbrev = term.strip().split("|")[0].strip()
    terminology_dict[term_abbrev] = term

terminology_dict

{'3GPP': '3GPP | 3rd Generation Partnership Project第三代合作伙伴计划',
 '3GPP2': '3GPP2 | 3rd Generation Partnership Project 23G协作组2',
 '4A': '4A | Account, Authorization, Authentication, Audit认证、账号、授权、审计',
 '5G-GUTI': '5G-GUTI | 5G Globally Unique Temporary Identity5G全球唯一临时标识',
 '5GC': '5GC | 5G Core Network5G核心网',
 '5GS': '5GS | 5G System5G系统',
 '5QI': '5QI | 5G QoS Identifier5G QoS标识',
 'AAA': 'AAA | Authentication, Authorization and Accounting鉴权、授权及计费',
 'AAR': 'AAR | Auth-Application Request鉴权应用请求',
 'ACL': 'ACL | Access Control List访问控制列表',
 'ACR': 'ACR | Accounting Request计费请求',
 'ADC': 'ADC | Application Detection and Control应用检测控制',
 'ADMC': 'ADMC | Auto Detected Manually Cleared自动产生人工清除',
 'AES': 'AES | Advanced Encryption Standard高级加密标准',
 'AF': 'AF | Application Function应用功能',
 'AIA': 'AIA | Authentication-Information-Answer鉴权信息检索响应',
 'AIC': 'AIC | Automatic Integration Center自动化集成中心',
 'AIR': 'AIR | Authentication-Information-Request鉴权信息检索请求',
 'AKA': 'AKA | Authentication and Ke

### 1.2 构建向量库

In [50]:
from llama_index.llms.ollama import Ollama
from llama_index.core import StorageContext
from llama_index.core import Settings

from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import StorageContext, VectorStoreIndex


OLLAMA_URL = "http://localhost:11434"

llm = Ollama(
    model="qwen", base_url=OLLAMA_URL, temperature=0, request_timeout=120
)

embed_path = "/home/jhx/Projects/pretrained_models/bge-m3/"
embed_args = {
    'model_name': embed_path, 
    'max_length': 512, 
    'embed_batch_size': 32, 
    'device': 'cuda'
    }

embeding = HuggingFaceEmbedding(**embed_args)
Settings.embed_model = embeding
Settings.llm = llm


In [51]:
# 构建查询引擎
from llama_index.core import Document, VectorStoreIndex

term_documents = [Document(text = term) for term in terminology_list]

index  = VectorStoreIndex.from_documents(term_documents)


In [52]:

from llama_index.core.vector_stores import MetadataFilters

# Create vector search query engine
# query_engine = index.as_query_engine(
#     similarity_top_k=5,
#     filters=MetadataFilters.from_dicts(
#         [
#             {"key": "page_label", "value": "2"}
#         ]
#     )
# )
# response = query_engine.query(
#     "康肃公陈尧咨善于射箭，曾经，他在家里场地射箭，然后发生了什么事？", 
# )
# print(str(response))

query_engine = index.as_query_engine(
    similarity_top_k=5,
)

# N7接口的IMSI、MSISDN、IMEISV分别在哪个参数中传递

# template = (
#     ""
#     "{query_str}"
#     ""
# )

query = '''请根据术语库的信息，将query改写，包含完整术语词，每个术语词符合格式 中文名称（英文全称，英文缩写词）
query: N7接口的IMSI、MSISDN、IMEISV分别在哪个参数中传递
'''

response = query_engine.query(query)
print(str(response))

N7接口中的IMSI信息在Input/Output参数中传递；MSISDN信息同样在Input/Output参数中传递；IMEISV信息也在Input/Output参数中进行传递。


In [17]:
# retriever = index.as_retriever()
# nodes = retriever.retrieve("N7接口的IMSI、MSISDN、IMEISV分别在哪个参数中传递")
# print(len(nodes))
# print(nodes[0].text)
# print(nodes[1].text)


2
I-CSCF | Input/Output输入/输出
DNAI | DN Access Identifier数据网络接入标识符


In [18]:
# import re

# def extract_acronyms(query, term_definitions):
#     acronym_list = []
#     term_definitions_lower = {term.lower(): full_name for term, full_name in term_definitions.items()}
    
#     # Regular expression pattern to find potential acronyms
#     pattern = r'\b[A-Z][a-z0-9]*(?:[A-Z][a-z0-9]*)+\b'
    
#     # Find all potential acronyms in the query
#     potential_acronyms = re.findall(pattern, query)
    
#     # Check each potential acronym
#     for acronym in potential_acronyms:
#         acronym_lower = acronym.lower()
        
#         if acronym_lower in term_definitions_lower:
#             acronym_list.append(acronym)
    
#     return acronym_list

# # Example usage:
# query = "N7接口的IMSI、MSISDN、IMEISV分别在哪个参数中传递"
# term_definitions = {
#     "IMSI": "International Mobile Subscriber Identity",
#     "MSISDN": "Mobile Station International Subscriber Directory Number",
#     "IMEISV": "International Mobile Equipment Identity and Software Version"
# }

# result = extract_acronyms(query, term_definitions)
# print(result)  # Output: ['IMSI', 'MSISDN', 'IMEISV']


['MSISDN']


In [53]:
def extract_terms(text):
    # 定义正则表达式模式，匹配由字母（大小写）、数字和连字符 "-" 组成的连续字符序列
    pattern = r'[a-zA-Z0-9-]+'
    
    # 使用正则表达式查找匹配的术语词
    matches = re.findall(pattern, text)
    
    # 去重并保持顺序
    unique_matches = []
    seen = set()
    
    for term in matches:
        if term not in seen:
            unique_matches.append(term)
            seen.add(term)
    
    return unique_matches

# 示例文本
text = "PCF使用同一个版本，在同一个环境下部署的多套PCF网元，是否可以使用同一个license申请文件申请"

# 调用函数并打印结果
print(extract_terms(text))
print(extract_terms("PCC规则的Precedence用途是什么？"))
print(extract_terms("ZXUN RCP部署成功后，各个虚机个数都是最少个数，是否可以一次性扩容完成？有哪些注意事项？"))


['PCF', 'license']
['PCC', 'Precedence']
['ZXUN', 'RCP']


In [54]:
query = "N7接口的IMSI、MSISDN、IMEISV分别在哪个参数中传递"
possible_terms = extract_terms(query)
print(possible_terms)
res = []
for term in possible_terms:
    if term in terminology_dict:
        res.append(terminology_dict[term])
term_info = "\n".join(res)

['N7', 'IMSI', 'MSISDN', 'IMEISV']


### 1.3 改写query(不稳定)

In [51]:
template = (
    "术语信息遵循 '英文缩略词 | 英文全称 中文全称' 格式，示例如下：\n"
    "{term_str}\n"
    "请根据术语库信息对问题query进行改写，将术语词的缩写改为'中文名称（英文全称，英文缩写词）'的格式，如' 演进的分组系统 ( Evolved Packet System , EPS ) '，问题如下：\n"
    "query: {query_str}\n"
    "注意：仅仅修改query中的术语词,保持语义不变，并且由空格分开方便分词。"
    # "注意：仅仅修改query中的术语词，并且中英文术语都需要空格分开, 方便分词。"

)

prompt = template.format(term_str = term_info, query_str= query)
print("prompt:",prompt,"\n")
response = llm.complete(prompt)
print("response:",str(response))

prompt: 术语信息遵循 '英文缩略词 | 英文全称 中文全称' 格式，示例如下：
IMSI | International Mobile Subscriber Identity国际移动用户标识
MSISDN | Mobile Station International Subscriber Directory Number移动台国际用户目录号
IMEISV | International Mobile Equipment Identity and Software Version number国际移动设备识别码和软件版本号
请根据术语库信息对问题query进行改写，将术语词的缩写改为'中文名称（英文全称，英文缩写词）'的格式，如' 演进的分组系统 ( Evolved Packet System , EPS ) '，问题如下：
query: N7接口的IMSI、MSISDN、IMEISV分别在哪个参数中传递
注意：仅仅修改query中的术语词,保持语义不变，并且由空格分开方便分词。 

response: N7接口的国际移动用户标识（International Mobile Subscriber Identity，IMSI）、移动台国际用户目录号（Mobile Station International Subscriber Directory Number，MSISDN）、国际移动设备识别码和软件版本号（International Mobile Equipment Identity and Software Version number，IMEISV）分别在哪个参数中传递？


In [20]:
def rewrite_query_w_terminology(query, template, llm, terminology_dict):
    ''' 根据术语词典,将query改写'''
    possible_terms = extract_terms(query)
    res = []
    for term in possible_terms:
        if term in terminology_dict:
            res.append(terminology_dict[term])
    term_info = "\n".join(res)
    # print("term_info:\n",term_info)
    prompt = template.format(term_str = term_info, query_str= query)
    response = llm.complete(prompt)

    return str(response)
    


In [63]:
from functools import partial
rewrite_fn = partial(rewrite_query_w_terminology, llm=llm, template=template, terminology_dict=terminology_dict)
q1 = "RCP的HTTP信令路由包括哪些功能？"
q2 = "N7会话的ResourceURI由哪个网元在哪个消息中生成"
q3 = "RCP在IaaS架构下有哪些GSU虚机？"

print(rewrite_fn(q1))
print(rewrite_fn(q2))
print(rewrite_fn(q3))

# 总结:让模型负责改写query,不靠谱啊 不如对所有的术语词,整理成\t格式


term_info:
 RCP | Resource Control Platform资源控制平台
HTTP | Hypertext Transfer Protocol超文本传输协议
RCP的HTTP信令路由包括资源控制平台的哪些功能？
term_info:
 
N7会话的ResourceURI由哪个网络单元在网络消息中生成
term_info:
 RCP | Resource Control Platform资源控制平台
IaaS | Infrastructure as a Service基础设施即服务
GSU | General Service Unit通用业务单元
RCP资源控制平台在IaaS基础设施即服务架构下支持多种GSU通用业务单元虚拟机，具体包括但不限于网络管理、安全防护、负载均衡等关键功能组件。为了提供全面的IT基础架构服务，这些GSU虚机协同工作以实现高效、灵活和可扩展的云计算环境。


### 1.4 改写术语 （llm+re）

In [55]:
template = (
    "术语信息遵循 '英文缩略词 | 英文全称 中文全称' 格式，，请参考以下示例修正格式：\n"
    "input: 5GC | 5G Core Network5G核心网\n"
    "output: 5GC | 5G Core Network | 5G核心网\n"
    "input: {term_str}\n"
    "output:"
)
# prompt: 术语信息遵循 '英文缩略词 | 英文全称 中文全称' 格式，示例如下：
# IMSI | International Mobile Subscriber Identity国际移动用户标识
# MSISDN | Mobile Station International Subscriber Directory Number移动台国际用户目录号
# IMEISV | International Mobile Equipment Identity and Software Version number国际移动设备识别码和软件版本号
# 请根据术语库信息对问题query进行改写，将术语词的缩写改为'中文名称（英文全称，英文缩写词）'的格式，如' 演进的分组系统 ( Evolved Packet System , EPS ) '，问题如下：
# query: N7接口的IMSI、MSISDN、IMEISV分别在哪个参数中传递
# 注意：仅仅修改query中的术语词,保持语义不变，并且由空格分开方便分词。 

# response: N7接口的国际移动用户标识（International Mobile Subscriber Identity，IMSI）、移动台国际用户目录号（Mobile Station International Subscriber Directory Number，MSISDN）、国际移动设备识别码和软件版本号（International Mobile Equipment Identity and Software Version number，IMEISV）分别在哪个参数中传递？

prompt = template.format(term_str= "IMSI | International Mobile Subscriber Identity国际移动用户标识")
prompt = template.format(term_str= "IMEISV | International Mobile Equipment Identity and Software Version number国际移动设备识别码和软件版本号")

response = llm.complete(prompt)
print(str(response))

IMEISV | International Mobile Equipment Identity and Software Version number | 国际移动设备识别码和软件版本号


In [71]:
def rewrite_term_re(term_str):

    term_abbrev, term = [x.strip() for x in term_str.split("|")]

    # 定义正则表达式模式，匹配连续的英文字符和连续的中文字符
    match = re.match(r'([A-Za-z\s\-0-9]+)([\u4e00-\u9fff0-9]+.*)', term)
    
    if match:
        term_en = match.group(1).strip()
        term_zh = match.group(2).strip()
        formatted_term = f"{term_abbrev}| {term_zh} ( {term_en} , {term_abbrev} )"
    else:
        # 如果没有明确分隔，则尝试其他策略
        # 找到第一个中文字符出现的位置
        index = re.search(r'[\u4e00-\u9fff]', term)
        if index:
            split_pos = index.start()
            term_en = term[:split_pos].strip()
            term_zh = term[split_pos:].strip()
            formatted_term = f"{term_abbrev}| {term_zh} ( {term_en} , {term_abbrev} )"
        else:
            # 如果没有找到中文字符，则认为格式不正确
            term_en = term
            term_zh = None

    if term_zh is not None:
        formatted_term = f"{term_abbrev} | {term_zh} ( {term_en} , {term_abbrev} )"
    else:
        formatted_term = f"{term_abbrev} | {term}"

    return formatted_term

print(rewrite_term_re("IMSI | International Mobile Subscriber Identity国际移动用户标识"))
print(rewrite_term_re("IMEISV | International Mobile Equipment Identity and Software Version number国际移动设备识别码和软件版本号"))
print(rewrite_term_re("8GC | 8G Core Network8G核心网"))
sample_inputs = [
    "IMSI | International Mobile Subscriber Identity国际移动用户标识",
    "A | Non-Standalone5G非独立组网",
    "A | 5G System5G系统",
    "5 | 5G Globally Unique Temporary Identity5G全球唯一临时标识",
    "L2TP | L2TP Network ServerL2TP网络服务器"
]
for term in sample_inputs:
    print(rewrite_term_re(term))


IMSI | 国际移动用户标识 ( International Mobile Subscriber Identity , IMSI )
IMEISV | 国际移动设备识别码和软件版本号 ( International Mobile Equipment Identity and Software Version number , IMEISV )
8GC | 核心网 ( 8G Core Network8G , 8GC )
IMSI | 国际移动用户标识 ( International Mobile Subscriber Identity , IMSI )
A | 非独立组网 ( Non-Standalone5G , A )
A | 系统 ( 5G System5G , A )
5 | 全球唯一临时标识 ( 5G Globally Unique Temporary Identity5G , 5 )
L2TP | 网络服务器 ( L2TP Network ServerL2TP , L2TP )


In [95]:
def rewrite_terminology(term_str: str):
    global llm
    template = (
    "术语信息遵循 '英文缩略词 | 英文全称 中文全称' 格式，请参考以下示例修正格式：\n"
    # "注意：禁止自由发挥，必须遵循原有术语，中文不要翻译，用原文。\n"
    "input: 5GC | 5G Core Network5G核心网\n"
    "output: 5GC | 5G Core Network | 5G核心网\n"
    "input: {term_str}\n"
    "output:"
    )
    prompt = template.format(term_str= term_str )
    response = llm.complete(prompt)
    response = str(response).split("\n")[0]

    try:
        term_abbrev, term_en_full, term_zh_full = response.split("|")[:3]
        formatted_term = f"{term_abbrev}| {term_zh_full.strip()} ( {term_en_full.strip()} , {term_abbrev.strip()} )"
    except Exception as e:
        print("error in rewrite_terminology:",e)
        print("response:", str(response))
        res = str(response).split("|")
        print("res list:", len(res), res)
        formatted_term = rewrite_term_re(term_str)
        print("rewrite formatted_term with re:",formatted_term)

    return formatted_term

print(rewrite_terminology("IMSI | International Mobile Subscriber Identity国际移动用户标识"))
print(rewrite_terminology("IMEISV | International Mobile Equipment Identity and Software Version number国际移动设备识别码和软件版本号"))
print(rewrite_terminology("8GC | 8G Core Network8G核心网"))

sample_inputs = [
    "IMSI | International Mobile Subscriber Identity国际移动用户标识",
    "A | Non-Standalone5G非独立组网",
    "A | 5G System5G系统",
    "5G-GUTI | 5G Globally Unique Temporary Identity5G全球唯一临时标识",
    "L2TP | L2TP Network ServerL2TP网络服务器"
]
for term in sample_inputs:
    print(rewrite_terminology(term))



IMSI | 国际移动用户标识 ( International Mobile Subscriber Identity , IMSI )
IMEISV | 国际移动设备识别码和软件版本号 ( International Mobile Equipment Identity and Software Version number , IMEISV )
8GC | 8G核心网 ( 8G Core Network , 8GC )
IMSI | 国际移动用户标识 ( International Mobile Subscriber Identity , IMSI )
A | 非独立组网 ( Non-Standalone , A )
A | 5G系统 ( 5G System , A )
5G-GUTI | 5G全局唯一临时标识 ( 5G Globally Unique Temporary Identity , 5G-GUTI )
L2TP | 第二层隧道协议 ( Layer 2 Tunneling Protocol , L2TP )


IMSI | 国际移动用户标识 ( International Mobile Subscriber Identity , IMSI )
IMEISV | 国际移动设备识别码和软件版本号 ( International Mobile Equipment Identity and Software Version number , IMEISV )
8GC | 核心网 ( 8G Core Network8G , 8GC )
IMSI | 国际移动用户标识 ( International Mobile Subscriber Identity , IMSI )
A | 非独立组网 ( Non-Standalone5G , A )
A | 系统 ( 5G System5G , A )
5 | 全球唯一临时标识 ( 5G Globally Unique Temporary Identity5G , 5 )
L2TP | 网络服务器 ( L2TP Network ServerL2TP , L2TP )


In [59]:

for sample_input in sample_inputs:
    print(rewrite_terminology(sample_input))


IMSI | 国际移动用户标识 ( International Mobile Subscriber Identity , IMSI )
Non-Standalone | 非独立组网 ( Non-standalone , Non-Standalone )
5G System | 5G系统 ( 5G System , 5G System )
5G-GUTI | 5G全局唯一临时标识 ( 5G Globally Unique Temporary Identity , 5G-GUTI )
L2TP Network Server | 层2隧道协议网络服务器 ( Layer 2 Tunneling Protocol Network Server , L2TP Network Server )


In [97]:
def write_to_txt(res, output_file):
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write('\n'.join(res))
    print(f"Results have been written to {output_file}")

In [73]:
# 写到txt
from tqdm import tqdm
res = []
for term in tqdm(terminology_list):
    formatted_term = rewrite_terminology(term)
    res.append(formatted_term)



write_to_txt(res, output_file="data/rcp/term_dict.txt")

  8%|█████▊                                                                    | 46/589 [00:22<04:48,  1.88it/s]

error too many values to unpack (expected 3)


  8%|█████▉                                                                    | 47/589 [00:22<05:25,  1.67it/s]

error too many values to unpack (expected 3)


 13%|█████████▋                                                                | 77/589 [00:37<06:00,  1.42it/s]

error too many values to unpack (expected 3)


 32%|███████████████████████▎                                                 | 188/589 [01:27<03:00,  2.22it/s]

error not enough values to unpack (expected 3, got 2)


 34%|█████████████████████████                                                | 202/589 [01:35<05:54,  1.09it/s]

error too many values to unpack (expected 3)


 46%|█████████████████████████████████▍                                       | 270/589 [02:06<02:21,  2.25it/s]

error not enough values to unpack (expected 3, got 2)


 65%|███████████████████████████████████████████████▍                         | 383/589 [02:59<01:35,  2.16it/s]

error not enough values to unpack (expected 3, got 2)


 76%|███████████████████████████████████████████████████████▊                 | 450/589 [03:29<00:57,  2.42it/s]

error not enough values to unpack (expected 3, got 2)


 90%|█████████████████████████████████████████████████████████████████▍       | 528/589 [04:05<00:26,  2.34it/s]

error not enough values to unpack (expected 3, got 2)


100%|█████████████████████████████████████████████████████████████████████████| 589/589 [04:34<00:00,  2.14it/s]

Results have been written to data/rcp/term_dict.txt





### 1.5 替换query √

In [8]:
# 读取写好的术语列表
dict_path = "data/rcp/term_dict.txt"
def read_terminology_dict(file_path: str, sep="|"):
    """
    读取术语字典文件，并返回字典。
    
    参数:
    file_path (str): 文件路径。
    
    返回:
    dict: 术语字典。
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
        term_dict = {}
        for line in lines:
            key, value = line.strip().split(sep)
            key, value = key.strip(), value.strip()
            orig_val = term_dict.get(key, "")
            term_dict[key] = value if len(value)>= len(orig_val) else orig_val

        return term_dict

term_dict = read_terminology_dict(dict_path)
print(term_dict)

{'3GPP': '第三代合作伙伴计划 ( 3rd Generation Partnership Project , 3GPP )', '3GPP2': '3G协作组2 ( 3rd Generation Partnership Project 2 , 3GPP2 )', '4A': '认证、账号、授权、审计 ( Account, Authorization, Authentication, Audit , 4A )', '5G-GUTI': '5G全局唯一临时标识 ( 5G Globally Unique Temporary Identity , 5G-GUTI )', '5GC': '5G核心网 ( 5G Core Network , 5GC )', '5GS': '5G系统 ( 5G System , 5GS )', '5QI': '5G QoS标识 ( 5G QoS Identifier , 5QI )', 'AAA': '认证、授权和计费 ( Authentication, Authorization, and Accounting , AAA )', 'AAR': '鉴权应用请求 ( Auth-Application Request , AAR )', 'ACL': '访问控制列表 ( Access Control List , ACL )', 'ACR': '计费请求 ( Accounting Request , ACR )', 'ADC': '应用检测与控制 ( Application Detection and Control , ADC )', 'ADMC': '自动产生人工清除 ( Auto Detected Manually Cleared , ADMC )', 'AES': '高级加密标准 ( Advanced Encryption Standard , AES )', 'AF': '应用功能 ( Application Function , AF )', 'AIA': '鉴权信息检索响应 ( Authentication-Information-Answer , AIA )', 'AIC': '自动化集成中心 ( Automatic Integration Center , AIC )', 'AIR': '鉴权信息检索请求 ( Authen

In [111]:
# 然后找到query的替换词
def rewrite_query_w_terminology(query, terminology_dict):
    ''' 根据术语词典,将query改写'''
    possible_terms = extract_terms(query)

    new_query = query
    for term in possible_terms:
        if term in terminology_dict:
            term_info = terminology_dict[term].split("|")[-1].strip()
            new_query = new_query.replace(term.strip(), f" {term_info} ")
            
    # print(f"raw query: {query}")
    # print(f"new query: {new_query}\n")

    return new_query
    

from functools import partial
rewrite_fn = partial(rewrite_query_w_terminology, terminology_dict=term_dict)
q1 = "RCP的HTTP信令路由包括哪些功能？"
q2 = "N7会话的ResourceURI由哪个网元在哪个消息中生成"
q3 = "RCP在IaaS架构下有哪些GSU虚机？"

r1 = rewrite_fn(q1)
r2 = rewrite_fn(q2)
r3 = rewrite_fn(q3)
r3

' 资源控制平台 ( Resource Control Platform , RCP ) 在 基础设施即服务 ( Infrastructure as a Service , IaaS ) 架构下有哪些 通用服务单元 ( General Service Unit , GSU ) 虚机？'

In [32]:
term_dict.get("IaaS","")

'基础设施即服务 ( Infrastructure as a Service , IaaS )'

## 2 一个query对应多个术语来源（过滤）X

query = {"id":10,"query":"Director部署过程中有个网卡固化操作，那是在什么场景下做？","document":"director"}


In [104]:
# 1 获取log.txt，变成文本列表
# 非空，且 是缩写，记录last。然后如果不是缩写，就保存term
def read_terminology_file(file_path, sep = "|"):
    terminology_dict = {}
    last_abbrev = None

    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()
    for line in lines:
        if (line.find("缩略语") != -1) or (not line.strip()): continue

        acronym = line.strip()
        if is_english_abbreviation(acronym):
            last_abbrev = acronym
            continue
        else:
            term = f"{last_abbrev} {sep} {acronym}"
            terminology_dict[last_abbrev.strip()] = term
    
    return terminology_dict


In [101]:
import os

# 1 多个目录的log转换为term_dict
doc_types = ["director",  "emsplus",  "rcp",  "umac"]
data_root = "/home/jhx/Projects/RAG/aiops24-RAG-demo/demo/data"
doc2term_dict = {}
for doc in doc_types:
    print(f"process {doc}")
    file_path = os.path.join(data_root, doc, "log.txt")
    out_path = os.path.join(data_root, doc, "term_dict.txt")
    terminology_dict = read_terminology_file(file_path)

    # todo: 用llm转为引用格式
    for k, v in tqdm(terminology_dict.items()):
        formatted_term = rewrite_terminology(v)
        terminology_dict[k] = formatted_term


    write_to_txt(list(terminology_dict.values()), output_file=out_path)
    doc2term_dict[doc] = terminology_dict

    


process director


 47%|███████████████████████████████████████████████████████████▋                                                                    | 35/75 [00:15<00:18,  2.11it/s]

error in rewrite_terminology: not enough values to unpack (expected 3, got 2)
response: MEC | Multi-access Edge Computing多接入边缘计算
res list: 2 ['MEC ', ' Multi-access Edge Computing多接入边缘计算']
rewrite formatted_term with re: MEC | 移动边缘计算 ( Mobile Edge Computing , MEC )


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 75/75 [00:34<00:00,  2.17it/s]


Results have been written to /home/jhx/Projects/RAG/aiops24-RAG-demo/demo/data/director/term_dict.txt
process emsplus


 30%|█████████████████████████████████████▊                                                                                         | 45/151 [00:20<00:47,  2.25it/s]

error in rewrite_terminology: not enough values to unpack (expected 3, got 2)
response: GW | Gateway 网关
res list: 2 ['GW ', ' Gateway 网关']
rewrite formatted_term with re: GW | 网关 ( GateWay , GW )


 46%|██████████████████████████████████████████████████████████▊                                                                    | 70/151 [00:32<00:36,  2.21it/s]

error in rewrite_terminology: not enough values to unpack (expected 3, got 2)
response: MU | Multi-User 多用户
res list: 2 ['MU ', ' Multi-User 多用户']
rewrite formatted_term with re: MU | 合并单元 ( Merging Unit , MU )


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 151/151 [01:10<00:00,  2.14it/s]


Results have been written to /home/jhx/Projects/RAG/aiops24-RAG-demo/demo/data/emsplus/term_dict.txt
process rcp


 32%|███████████████████████████████████████▊                                                                                      | 172/545 [01:21<02:53,  2.15it/s]

error in rewrite_terminology: not enough values to unpack (expected 3, got 2)
response: GW | Gateway 网关
res list: 2 ['GW ', ' Gateway 网关']
rewrite formatted_term with re: GW | 网关 ( GateWay , GW )


 64%|████████████████████████████████████████████████████████████████████████████████▉                                             | 350/545 [02:47<01:33,  2.09it/s]

error in rewrite_terminology: not enough values to unpack (expected 3, got 2)
response: PUDR | Policy UDR Uniform Data Repository for Policies
res list: 2 ['PUDR ', ' Policy UDR Uniform Data Repository for Policies']
rewrite formatted_term with re: PUDR | 统一的策略数据存储库 ( Policy UDR , PUDR )


 76%|███████████████████████████████████████████████████████████████████████████████████████████████▎                              | 412/545 [03:16<00:56,  2.36it/s]

error in rewrite_terminology: not enough values to unpack (expected 3, got 2)
response: SIG | Signalling 信号
res list: 2 ['SIG ', ' Signalling 信号']
rewrite formatted_term with re: SIG | 信令 ( Signal , SIG )


 89%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████▉              | 484/545 [03:50<00:25,  2.39it/s]

error in rewrite_terminology: not enough values to unpack (expected 3, got 2)
response: UL | UpLink 上行链路
res list: 2 ['UL ', ' UpLink 上行链路']
rewrite formatted_term with re: UL | 上行链路 ( Uplink , UL )


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 545/545 [04:19<00:00,  2.10it/s]


Results have been written to /home/jhx/Projects/RAG/aiops24-RAG-demo/demo/data/rcp/term_dict.txt
process umac


 29%|████████████████████████████████████                                                                                          | 193/674 [01:31<03:42,  2.17it/s]

error in rewrite_terminology: not enough values to unpack (expected 3, got 2)
response: GW | Gateway 网关
res list: 2 ['GW ', ' Gateway 网关']
rewrite formatted_term with re: GW | 网关 ( GateWay , GW )


 44%|███████████████████████████████████████████████████████▌                                                                      | 297/674 [02:22<02:49,  2.22it/s]

error in rewrite_terminology: not enough values to unpack (expected 3, got 2)
response: MEC | Multi-access Edge Computing多接入边缘计算
res list: 2 ['MEC ', ' Multi-access Edge Computing多接入边缘计算']
rewrite formatted_term with re: MEC | 移动边缘计算 ( Mobile Edge Computing , MEC )


 64%|████████████████████████████████████████████████████████████████████████████████▌                                             | 431/674 [03:28<01:54,  2.13it/s]

error in rewrite_terminology: not enough values to unpack (expected 3, got 2)
response: PUDR | Policy UDR Uniform Data Repository for Policies
res list: 2 ['PUDR ', ' Policy UDR Uniform Data Repository for Policies']
rewrite formatted_term with re: PUDR | 统一的策略数据存储库 ( Policy UDR , PUDR )


 98%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▏  | 659/674 [05:16<00:08,  1.78it/s]

error in rewrite_terminology: not enough values to unpack (expected 3, got 2)
response: eDRX | Extended Idle Mode Dynamic Receiver eXtended空闲模式动态接收器扩展
res list: 2 ['eDRX ', ' Extended Idle Mode Dynamic Receiver eXtended空闲模式动态接收器扩展']
rewrite formatted_term with re: eDRX | 演进的DRX ( extended Idle Mode DRX , eDRX )


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 674/674 [05:24<00:00,  2.07it/s]

Results have been written to /home/jhx/Projects/RAG/aiops24-RAG-demo/demo/data/umac/term_dict.txt





In [112]:
# 2 query按照doc选择term_dict
query = {"id":9,"query":"RCP如何将VoNR呼叫的Rx会话绑定到对应的N7会话？","document":"rcp"}
cur_term_dic = doc2term_dict[query["document"]]
new_query = rewrite_query_w_terminology(query["query"], cur_term_dic)
new_query

' 资源控制平台 ( Resource Control Platform , RCP ) 如何将 新空口承载语音 ( Voice over New Radio , VoNR ) 呼叫的Rx会话绑定到对应的N7会话？'

In [113]:
# doc2term_dict["rcp"].get("RCP","")

'RCP | 资源控制平台 ( Resource Control Platform , RCP )'

## 3 多个query多个doc

In [115]:
# 读取question.json
import json
from pipeline.qa import read_jsonl, write_jsonl

In [116]:
queries = read_jsonl("question.jsonl")

In [119]:
res = []
for query_dic in tqdm(queries):
    new_query = rewrite_query_w_terminology(query_dic["query"], doc2term_dict[query_dic["document"]])
    query_dic["query"] = new_query
    res.append(query_dic)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 103/103 [00:00<00:00, 256235.65it/s]


In [120]:
write_jsonl(res, "question_rewrite.jsonl")

In [123]:
!head question.jsonl

{"id":1,"query":"PCF与NRF对接时，一般需要配置哪些数据？","document":"rcp"}
{"id":2,"query":"ZXUN RCP部署成功后，各个虚机个数都是最少个数，是否可以一次性扩容完成？有哪些注意事项？","document":"rcp"}
{"id":3,"query":"如何排查PCF侧建立专载失败，发起Rx-ASR释放问题？","document":"rcp"}
{"id":4,"query":"Npcf_SMPolicyControl服务包含哪些操作？","document":"rcp"}
{"id":5,"query":"什么是裸金属","document":"director"}
{"id":6,"query":"RCP策略决策包括哪些策略类型？","document":"rcp"}
{"id":7,"query":"Director中的VDC是什么概念？","document":"director"}
{"id":8,"query":"RCP在IaaS架构下有哪些GSU虚机？","document":"rcp"}
{"id":9,"query":"RCP如何将VoNR呼叫的Rx会话绑定到对应的N7会话？","document":"rcp"}
{"id":10,"query":"Director部署过程中有个网卡固化操作，那是在什么场景下做？","document":"director"}


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [121]:
!head question_rewrite.jsonl

{"id": 1, "query": " 分组控制功能 ( Packet Control Function , PCF ) 与 网络仓储功能 ( NF Repository Function , NRF ) 对接时，一般需要配置哪些数据？", "document": "rcp"}
{"id": 2, "query": "ZXUN  资源控制平台 ( Resource Control Platform , RCP ) 部署成功后，各个虚机个数都是最少个数，是否可以一次性扩容完成？有哪些注意事项？", "document": "rcp"}
{"id": 3, "query": "如何排查 分组控制功能 ( Packet Control Function , PCF ) 侧建立专载失败，发起Rx-ASR释放问题？", "document": "rcp"}
{"id": 4, "query": "Npcf_SMPolicyControl服务包含哪些操作？", "document": "rcp"}
{"id": 5, "query": "什么是裸金属", "document": "director"}
{"id": 6, "query": " 资源控制平台 ( Resource Control Platform , RCP ) 策略决策包括哪些策略类型？", "document": "rcp"}
{"id": 7, "query": "Director中的 虚拟数据中心 ( Virtual Data Center , VDC ) 是什么概念？", "document": "director"}
{"id": 8, "query": " 资源控制平台 ( Resource Control Platform , RCP ) 在 基础设施即服务 ( Infrastructure as a Service , IaaS ) 架构下有哪些 通用服务单元 ( General Service Unit , GSU ) 虚机？", "document": "rcp"}
{"id": 9, "query": " 资源控制平台 ( Resource Control Platform , RCP ) 如何将 新空口承载语音 ( Voice over New Radio , VoNR ) 呼叫的Rx

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
