# RAG应用 构建高质量化学向量库

## step1 导包并加载文档

In [4]:
import os

In [5]:

root_dir = "./papers"

def extract_file_dirs(directory):
    file_paths = []
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith(".pdf"):
                fp = os.path.join(root, file)
                file_paths.append(fp)
    return file_paths

files = extract_file_dirs(root_dir)
files

['./papers\\Enantioselective Iridium-Catalyzed Allylic Substitution with 2-Methylpyridines.pdf',
 './papers\\Iridium-Catalyzed Asymmetric Allylic Amination Reaction with N-Aryl Phosphoramidite Ligands.pdf']

In [7]:
from pdfminer.high_level import extract_text

In [8]:
texts = []

for file in files:
    texts.append(extract_text(file))
texts

['Communications\n\nAngewandte\n\nChemie\n\nAsymmetric Catalysis\n\nInternational Edition: DOI: 10.1002/anie.201700433\nGerman Edition:\nDOI: 10.1002/ange.201700433\n\nEnantioselective Iridium-Catalyzed Allylic Substitution with\n2-Methylpyridines\nXi-Jia Liu and Shu-Li You*\n\nAbstract: An enantioselective iridium-catalyzed allylic sub-\nstitution with a set of highly unstabilized nucleophiles gen-\nerated in situ from 2-methylpyridines is described. Enantioen-\nriched 2-substituted pyridines, which are frequently encoun-\ntered in natural products and pharmaceuticals, could be easily\nconstructed by this simple method in good yields and excellent\nenantioselectivity. The synthetic utility of the pyridine products\nis demonstrated through the synthesis of a key intermediate of\na reported Na+/H+ exchanger inhibitor and the total synthesis\nof ((cid:2))-lycopladine A.\n\nPyridines are among the most prevalent heterocyclic\n\nstructural moieties in biologically active natural products,\

In [9]:
len(texts[0]), len(texts[1])

(21916, 35247)

## step2 文档分割

In [10]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

In [14]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=550, chunk_overlap=100)
splits = []
for text in texts:
    splits.extend(text_splitter.split_text(text))
splits[:5], len(splits)

(['Communications\n\nAngewandte\n\nChemie\n\nAsymmetric Catalysis\n\nInternational Edition: DOI: 10.1002/anie.201700433\nGerman Edition:\nDOI: 10.1002/ange.201700433\n\nEnantioselective Iridium-Catalyzed Allylic Substitution with\n2-Methylpyridines\nXi-Jia Liu and Shu-Li You*',
  'Abstract: An enantioselective iridium-catalyzed allylic sub-\nstitution with a set of highly unstabilized nucleophiles gen-\nerated in situ from 2-methylpyridines is described. Enantioen-\nriched 2-substituted pyridines, which are frequently encoun-\ntered in natural products and pharmaceuticals, could be easily\nconstructed by this simple method in good yields and excellent\nenantioselectivity. The synthetic utility of the pyridine products\nis demonstrated through the synthesis of a key intermediate of',
  'is demonstrated through the synthesis of a key intermediate of\na reported Na+/H+ exchanger inhibitor and the total synthesis\nof ((cid:2))-lycopladine A.',
  'Pyridines are among the most prevalent hete

## step3 向量化并构建向量库

In [15]:
from langchain.embeddings import HuggingFaceEmbeddings

In [19]:
embedding = HuggingFaceEmbeddings(model_name="shibing624/text2vec-base-multilingual")

# 保存到当前根目录的vector_db文件夹下
kb_name = "vector_test1"
persist_directory = os.path.join(os.getcwd(), kb_name)

vectordb = Chroma.from_texts(
    texts=splits,
    embedding=embedding,
    persist_directory=persist_directory,
)
vectordb.persist()

  attn_output = torch.nn.functional.scaled_dot_product_attention(
  warn_deprecated(


## step4 检索

In [35]:
query = '通过什么样的简单方法可以提高产率和对映选择性?'
q_docs = vectordb.similarity_search(query, k=1)
# print(q_docs)
context = [doc.page_content for doc in q_docs]
# print('context', context)

prompt = f"已知PDF内容：：\n{'-'.join(context)}\n根据已知信息回答问题：\n{query}\n所有的回答都根据已知信息的内容来回答，不要编造内容。用中文回答问题。"
prompt

'已知PDF内容：：\neffect on enantioselectivity. However, the reaction did not\nproceed to full conversion when 4 mol % of K1 was used.\nBased on these results, the optimal reaction conditions were\nestablished as described in Table 1, entry 11.\n根据已知信息回答问题：\n通过什么样的简单方法可以提高产率和对映选择性?\n所有的回答都根据已知信息的内容来回答，不要编造内容。用中文回答问题。'

## step5 根据context 生成答案

In [21]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

In [41]:
from sentence_transformers import SentenceTransformer

# 从本地读取模型 ./chemical-bert-uncased-simcse
model_name = "./chemical-bert-uncased-simcse"
model = SentenceTransformer('./chemical-bert-uncased-simcse')

No sentence-transformers model found with name ./chemical-bert-uncased-simcse. Creating a new one with mean pooling.


In [31]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [32]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer



BertTokenizerFast(name_or_path='./chemical-bert-uncased-simcse', vocab_size=31090, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	104: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [44]:
tokenized_inputs = tokenizer(prompt, return_tensors="pt")
tokenized_inputs

{'input_ids': tensor([[  102,   101,   101, 11815,   101,   101,   101,   101,   907,   191,
         17290, 19012,   707,   205,   694,   422,   111,  2426,  1544,   302,
          3934,   147,  2327,  5602,   603,   286,  3903,  1863,   131,   231,
         30130,   241,   501,   205,   791,   191,   407,   545,   422,   111,
          2409,  2426,  1245,   267,  3452,   188,  1356,   121,  1020,   158,
           422,  5762,  1021,   205,   101,   101,   101,   101,   101,   101,
           101,   101,   101,   101,   101,   101,   101,   101,   101,   101,
           101,   101,   101,   101,   101,   101,   101,   101,   101,   101,
           101,   101,   101,   101,   101,   101,   101,  3912,   101,   101,
           101,   101,   101,   101,   101,   101,   101,   101,   101,   101,
           101,   101,   101,   101,   101,   101,   101,   101,   101,   101,
           101,   101,   101,   101,   101,   101,   101,   101,   101,   101,
           101,   101,   103]]), 'toke

In [43]:
import torch

In [None]:
sen = "我觉得这家酒店不错，饭很好吃"
model.eval()
id2_label = { 0:'差评', 1:'好评'}
with torch.inference_mode():
    inputs = tokenizer(sen, return_tensors="pt")
    # inputs = {k: v.cuda() for k, v in inputs.items()}
    outputs = model(**inputs).logits
    pred = torch.argmax(outputs, dim=-1)

In [45]:
output = model(**tokenized_inputs)
output

TypeError: Sequential.forward() got an unexpected keyword argument 'input_ids'

In [25]:
from langchain_community.chat_models import ChatOllama # ChatOllma聊天模型
llm = ChatOllama(model="llama3", temperature=1)


In [27]:
from langchain.prompts import ChatPromptTemplate

In [30]:
retriver = vectordb.as_retriever(search_type="similarity", search_kwargs={"k": 2})
# prompt = ChatPromptTemplate.from_template(prompt)

def format_docs(docs):
    return "\n\n".join([d.page_content for d in docs])

rag_chain = (
    {"context": retriver | format_docs, "query": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)

response = rag_chain.invoke(query)
print(response)

TypeError: 'ChatPromptValue' object is not subscriptable