# R3: Recall, Rank, Rerank

## 处理数据，生成带有主题名称的验证集合

In [1]:
import os
import json
from pprint import pprint

# Switch the working directory to the project checkout: later cells open
# files through relative "./db/..." paths, which resolve against this directory.
# NOTE(review): absolute, machine-specific path; adjust when running elsewhere.
os.chdir("/root/xiatian/github/llms4subjects")

In [None]:
import json
from llms4subjects.instance import instance_db_merged_with_dev
from llms4subjects.subject import subject_db_all

predicted_file = "./db/eval/merged/by_instance_5.dev2.jsonline"

# One JSON object per line. Iterate the file handle directly (no intermediate
# list from readlines()) and skip blank lines, e.g. a trailing empty line at
# EOF, because json.loads raises JSONDecodeError on an empty string.
with open(predicted_file, "r", encoding="utf-8") as f:
    records = [json.loads(line) for line in f if line.strip()]

# Enrich every prediction record in place with human-readable subject names
# and with the abstract / doctype / language of the matching instance.
for r in records:
    # NOTE(review): assumes every id is present in the instance DB; a missing
    # id would surface as an error on the attribute reads below.
    instance = instance_db_merged_with_dev.get_by_instance_id(r['id'])
    r['true_names'] = [subject_db_all.get_name_by_code(code) for code in r['true_codes']]
    r['pred_names'] = [subject_db_all.get_name_by_code(code) for code in r['pred_codes']]
    r['abstract'] = instance.abstract
    r['doctype'] = instance.doctype
    r['language'] = instance.language

# Write the enriched records back out as a JSON-lines file
# (ensure_ascii=False keeps non-ASCII text readable instead of \u-escaped).
with open('./db/eval/merged/by_instance_5.dev2_with_names.jsonline', "w", encoding="utf-8") as f:
    for r in records:
        line = json.dumps(r, ensure_ascii=False)
        f.write(f"{line}\n")
    

load 112292 alias
load 311410 alias


## 开始预测

In [22]:
import os
import json
from pprint import pprint
from llms4subjects.llm import LLM
from tqdm import tqdm

# Chat client used by rerank() further down.
# NOTE(review): the endpoint address and model path are hard-coded for one
# specific deployment; consider moving them to a config cell or environment variables.
chatbot = LLM(base_url="http://10.96.1.43:7832/v1", 
              model="/data/app/yangyahe/base_model/Qwen-QwQ-32B-AWQ")

# The relative "./db/..." path below is resolved against the project checkout.
os.chdir("/root/xiatian/github/llms4subjects")

# Re-read the enriched records so this section can run without the
# preprocessing cell. Blank lines are skipped because json.loads raises on
# an empty string; iterating the handle avoids the extra readlines() list.
with open('./db/eval/merged/by_instance_5.dev2_with_names.jsonline', "r", encoding="utf-8") as f:
    records = [json.loads(line) for line in f if line.strip()]
    

In [None]:
# First draft of the rerank prompt, kept for reference. It is a plain string
# (not an f-string): "xxxx" and "Topic N" are literal placeholder text.
# No code visible in this notebook reads this variable.
template_v1 = """
You act as an expert in library subject indexing. Please carefully analyze the given document title and abstract, review the given list of reference topics, and reorder them according to their degree of relevance to the document. You can eliminate irrelevant topics and also add new topics.

## Title: xxxx
## Abstract: xxxx
## Reference list of possible document topics for sorting:
	- Topic 1
	- Topic 2

## Please re-output the sorted list of document topics: 
"""


# Second draft of the rerank prompt: adds a worked input/output example and
# requires a "Final topic list" section after the free-form analysis.
# This is a plain string, so "{your title}", "{your abstract}" and
# "{your topic N}" are literal text, not substitution fields.
# NOTE(review): the variable name is misspelled ("tempate_v2") and no code
# visible in this notebook reads it; make_prompt() carries its own inline copy
# of this wording. Name left unchanged in case something else references it.
tempate_v2 = """You act as an expert in library subject indexing. Please carefully analyze the given document title and abstract, review the given list of reference topics, and reorder them according to their degree of relevance to the document. Irrelevant topics can be removed, and new topics can also be added. Pay attention that after the intermediate analysis, you must finally output the "Final topic list". In the final topic list, only the topic names should be outputted, with one topic name per line, and there should be no other explanatory information mixed in.

## Here is an example of the input and output format
### Title: xxxx
### Abstract: xxxx
### Reference sorted list of document topics:
  - Topic 1
  - Topic 2
  
### Analysis process
(omitted)

### Final topic list
  - Topic 1
  - Topic 2

## Normal processing starts here

### Title: {your title}
### Abstract: {your abstract}
### Reference sorted list of document topics:
  - {your topic 1}
  - {your topic 2}
  
### Analysis process"""


In [20]:
def make_prompt(record):
    """Render the rerank prompt for a single record.

    Args:
        record: mapping with the keys "pred_names" (iterable of topic-name
            strings), "title" and "abstract".

    Returns:
        The prompt text: the fixed instructions and format example, followed
        by the record's title, abstract and one "  - <name>" bullet per
        candidate topic (each name stripped of surrounding whitespace),
        ending with an open "### Analysis process" section.
    """
    bullet_lines = []
    for topic_name in record["pred_names"]:
        bullet_lines.append("  - " + topic_name.strip())
    topic_block = "\n".join(bullet_lines)

    doc_title = record["title"]
    doc_abstract = record["abstract"]

    return f"""
You act as an expert in library subject indexing. Please carefully analyze the given document title and abstract, review the given list of reference topics, and reorder them according to their degree of relevance to the document. Irrelevant topics can be removed, and new topics can also be added. Pay attention that after the intermediate analysis, you must finally output the "Final topic list". In the final topic list, only the topic names should be outputted, with one topic name per line, and there should be no other explanatory information mixed in.

## Here is an example of the input and output format
### Title: xxxx
### Abstract: xxxx
### Reference sorted list of document topics:
  - Topic 1
  - Topic 2
    
### Analysis process
(omitted)

### Final topic list
  - Topic 1
  - Topic 2

## Normal processing starts here

### Title: {doc_title}
### Abstract: {doc_abstract}
### Reference sorted list of document topics:
{topic_block}

### Analysis process
"""

def rerank(record) -> str:
    """Ask the chat model to rerank the candidate topics of one record.

    The prompt comes from make_prompt(record) and is sent as the user prompt
    through chatbot.chat. The returned text is parsed as JSON, and the message
    content of the first entry under "choices" is returned.
    """
    raw_response = chatbot.chat(user_prompt=make_prompt(record))
    first_choice = json.loads(raw_response)['choices'][0]
    reply: str = first_choice['message']['content']
    return reply

# Smoke test: rerank one record and print the raw model answer to inspect
# the output format before running over all records.
# NOTE(review): index 6 carries no explanation in the notebook; presumably an arbitrary sample.
answer = rerank(records[6])
print(answer)

<think>
Okay, let's tackle this problem step by step. First, I need to understand what the user is asking for. They want me to act as an expert in library subject indexing. My task is to analyze the given document's title and abstract, look at the provided list of reference topics, and then reorder them based on their relevance to the document. I can remove irrelevant topics or add new ones if necessary. The final list should only include the topic names, each on a separate line, without any explanations.

Alright, starting with the document's title: "Aktuelle Entwicklungen im Immissionsschutzrecht : Dokumentation des 17. Leipziger Umweltrechtlichen Symposions..." Translating that, it's about current developments in emissions protection law, specifically documenting a symposium on environmental law. The subtitle mentions the University of Leipzig and Helmholtz Centre for Environmental Research. The date is April 2012.

The abstract explains that the symposium focuses on questions arisi

In [None]:
llm_output_file = './db/eval/merged/by_instance_5.dev2.llm_output.jsonline'

# Rerank every record and stream the raw model answers to a JSON-lines file.
# NOTE: mode "w" truncates the file, so re-running this cell discards any
# answers collected by a previous run.
with open(llm_output_file, "w", encoding="utf-8") as f:
    # Wrap the list itself (not the enumerate iterator) so tqdm can read
    # len(records) and display a real progress bar with total and ETA.
    for lineno, r in enumerate(tqdm(records)):
        # rerank() builds its own prompt, so no separate make_prompt call is needed.
        answer = rerank(r)
        data = {
            'lineno': lineno,
            'id': r['id'],
            'answer': answer
            }
        f.write(json.dumps(data, ensure_ascii=False) + '\n')
        # Flush after each record so finished answers are on disk if the run is interrupted.
        f.flush()
    

11649it [00:00, 66533.01it/s]
