# 搭建 RAG：构建向量数据库

## step1 导包

In [2]:
import os

In [5]:
from langchain.document_loaders import PyPDFLoader, UnstructuredFileLoader


## step2 加载文档数据

In [3]:
# Folder (relative to the notebook) that holds the PDF papers to index.
root_dir = "./papers"


def extract_file_dirs(directory):
    """Recursively collect the paths of all PDF files under ``directory``.

    Args:
        directory: Folder to walk; sub-folders are searched as well.

    Returns:
        A sorted list of paths (each built by joining the walked folder and
        the file name) whose name ends in ``.pdf`` in any letter case. The
        list is empty when nothing matches.
    """
    file_paths = []
    for root, _dirs, files in os.walk(directory):
        for file in files:
            # Compare case-insensitively so "PAPER.PDF" is not silently skipped.
            if file.lower().endswith(".pdf"):
                file_paths.append(os.path.join(root, file))
    # os.walk yields entries in an arbitrary, OS-dependent order; sorting keeps
    # positional access such as files[0] stable between runs and machines.
    file_paths.sort()
    return file_paths


files = extract_file_dirs(root_dir)
files

['./papers\\Enantioselective Iridium-Catalyzed Allylic Substitution with 2-Methylpyridines.pdf',
 './papers\\Iridium-Catalyzed Asymmetric Allylic Amination Reaction with N-Aryl Phosphoramidite Ligands.pdf']

In [17]:
from pdfminer.high_level import extract_text

# Extract the text of the first collected PDF, print all of it, and evaluate
# the length of the extracted string.
text = extract_text(files[0])
print(text)
len(text)

# Use pdfminer to extract the table of contents from the PDF
# NOTE(review): none of the names imported below are used in the visible part of this notebook.
# NOTE(review): `pdfminer.pdf` (PdfDocument/PdfReader/PdfWriter) does not look like a pdfminer.six module — TODO confirm this import resolves.
from pdfminer.pdf import PdfDocument, PdfReader, PdfWriter
from pdfminer.pdftypes import resolve_all
from pdfminer.layout import LAParams, LTTextBox, LTTextLine, LTChar, LTAnno
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
# NOTE(review): verify that PDFTextExtractionNotAllowed is exported from pdfminer.pdfinterp in the installed version.
from pdfminer.pdfinterp import PDFTextExtractionNotAllowed
# NOTE(review): in pdfminer.six PDFDocument presumably lives in pdfminer.pdfdocument rather than pdfminer.pdfparser — verify.
from pdfminer.pdfparser import PDFParser, PDFDocument
# NOTE(review): the next two lines repeat imports already made a few lines above.
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator



Communications

Angewandte

Chemie

Asymmetric Catalysis

International Edition: DOI: 10.1002/anie.201700433
German Edition:
DOI: 10.1002/ange.201700433

Enantioselective Iridium-Catalyzed Allylic Substitution with
2-Methylpyridines
Xi-Jia Liu and Shu-Li You*

Abstract: An enantioselective iridium-catalyzed allylic sub-
stitution with a set of highly unstabilized nucleophiles gen-
erated in situ from 2-methylpyridines is described. Enantioen-
riched 2-substituted pyridines, which are frequently encoun-
tered in natural products and pharmaceuticals, could be easily
constructed by this simple method in good yields and excellent
enantioselectivity. The synthetic utility of the pyridine products
is demonstrated through the synthesis of a key intermediate of
a reported Na+/H+ exchanger inhibitor and the total synthesis
of ((cid:2))-lycopladine A.

Pyridines are among the most prevalent heterocyclic

structural moieties in biologically active natural products,
pharmaceuticals, and agrochemic

21916

In [6]:
# Load every collected PDF with PyPDFLoader and flatten the documents returned
# by each loader into one list.
docs = [
    page_doc
    for pdf_path in files
    for page_doc in PyPDFLoader(pdf_path).load()
]
docs

[Document(metadata={'source': './papers\\Enantioselective Iridium-Catalyzed Allylic Substitution with 2-Methylpyridines.pdf', 'page': 0}, page_content='German Edition: DOI: 10.1002/ange.201700433Asymmetric CatalysisInternational Edition: DOI: 10.1002/anie.201700433\nEnantioselective Iridium-Catalyzed Allylic Substitution with\n2-Methylpyridines\nXi-Jia Liu and Shu-Li You*\nAbstract: An enantioselective iridium-catalyzed allylic sub-\nstitution with a set of highly unstabilized nucleophiles gen-\nerated in situ from 2-methylpyridines is described. Enantioen-\nriched 2-substituted pyridines, which are frequently encoun-tered in natural products and pharmaceuticals, could be easilyconstructed by this simple method in good yields and excellentenantioselectivity. The synthetic utility of the pyridine productsis demonstrated through the synthesis of a key intermediate of\na reported Na\n+/H+exchanger inhibitor and the total synthesis\nof (/C0)-lycopladine A.\nPyridines are among the most pre

## step 自定义数据集

In [None]:
from torch.utils.data import Dataset

In [None]:
class ChemicalDataset:
    """Line-oriented text dataset paired with a tokenizer.

    Every non-blank line of ``data_path`` contributes six consecutive entries
    to ``self.data``, in this order:

    1. the stripped line itself,
    2. the ids from ``tokenizer.encode``,
    3. the text from ``tokenizer.decode`` applied to those ids,
    4. the tokens from ``tokenizer.tokenize``,
    5. the ids from ``tokenizer.convert_tokens_to_ids`` applied to those tokens,
    6. the tokens from ``tokenizer.convert_ids_to_tokens`` applied to the ids of (2).

    NOTE(review): this flat six-entries-per-line layout is kept for backward
    compatibility; it looks exploratory rather than a training-ready sample
    format — confirm the intended item structure.

    Args:
        data_path: Path of a UTF-8 text file with one sample per line.
        tokenizer: Object exposing ``encode``, ``decode``, ``tokenize``,
            ``convert_tokens_to_ids`` and ``convert_ids_to_tokens``.
        max_len: Value forwarded as ``max_length`` to ``encode``/``tokenize``.
    """

    def __init__(self, data_path, tokenizer, max_len=512):
        self.data_path = data_path
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.data = self.load_data()

    def load_data(self):
        """Read ``self.data_path`` and return the flat list described on the class."""
        tok_kwargs = dict(add_special_tokens=True, max_length=self.max_len, truncation=True)
        data = []
        # Explicit encoding: the platform default is locale-dependent and can
        # fail or garble non-ASCII characters in the text.
        with open(self.data_path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                # Tokenize once per line and reuse the results instead of
                # repeating identical encode()/tokenize() calls.
                ids = self.tokenizer.encode(line, **tok_kwargs)
                tokens = self.tokenizer.tokenize(line, **tok_kwargs)
                data.append(line)
                data.append(ids)
                data.append(self.tokenizer.decode(ids))
                data.append(tokens)
                data.append(self.tokenizer.convert_tokens_to_ids(tokens))
                data.append(self.tokenizer.convert_ids_to_tokens(ids))
        return data

    def __len__(self):
        """Return the number of entries in ``self.data``."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the entry (or slice of entries) at ``index`` in ``self.data``."""
        return self.data[index]
                    

## step3 文档分割

## step 问题描述

In [10]:
# Prompt (in Chinese) asking how the experimental conditions named in the text
# (catalyst, ligand, additive, solvent, temperature, time, air) affect yield or
# selectivity, and requesting clear optimization advice for the reaction.
query4 = '请回答关于此篇文献中有关于化学反应的优化相关的问题，说明清楚如催化剂/配体/添加剂/溶剂/温度/时间/空气等的实验条件，对反应产率或者选择性产生了哪些影响，并给出关于此化学反应的清晰明确的优化建议。'
# Prompt (in Chinese) asking for the derivation of the reaction mechanism plus
# a list of the experimental results and conclusions.
query7 = '请说明此篇文献中化学反应机理的推导过程，并且列举出化学反应的实验结果和实验结论。'


In [11]:
# Questions to ask about each paper, in the order they will be prompted.
questions = [query4, query7]

## step 模板构建

In [12]:
# Prefix (in Chinese) placed in front of every question: it sets an
# expert-professor persona and asks for a detailed answer in Chinese or
# chemical notation.
# NOTE(review): the saved output of the prompt-building cell shows a shorter
# wording of this prefix, so that output looks stale — re-run to refresh.
chemical_template = "你是一位高分子化学领域的专家教授，请用中文或者化学语言详尽的回答如下问题："

## step 构建完整的输入

In [13]:
# Build one full prompt per question by prefixing it with the shared template.
inputs = [chemical_template + question for question in questions]
inputs

['你是一位高分子化学领域的专家教授，请详尽的回答如下问题：请回答关于此篇文献中有关于化学反应的优化相关的问题，说明清楚如催化剂/配体/添加剂/溶剂/温度/时间/空气等的实验条件，对反应产率或者选择性产生了哪些影响，并给出关于此化学反应的清晰明确的优化建议。',
 '你是一位高分子化学领域的专家教授，请详尽的回答如下问题：请说明此篇文献中化学反应机理的推导过程，并且列举出化学反应的实验结果和实验结论。']

## step4 分词器

## step5 构建模型和分词器

In [8]:
from sentence_transformers import SentenceTransformer

# Load the embedding model from the local checkpoint ./chemical-bert-uncased-simcse.
# The saved log shows no sentence-transformers config is found at this path, so
# the library wraps the BERT model with mean pooling.
model_name = "./chemical-bert-uncased-simcse"
# Reuse model_name instead of repeating the path literal, so the embedding model
# and anything else loaded from model_name cannot drift apart.
model = SentenceTransformer(model_name)
model

  from tqdm.autonotebook import tqdm, trange
No sentence-transformers model found with name ./chemical-bert-uncased-simcse. Creating a new one with mean pooling.


SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)

In [14]:
from transformers import AutoTokenizer
# Load the tokenizer from the same local checkpoint path as the embedding model.
# NOTE(review): the saved repr of this tokenizer reports an effectively unbounded
# model_max_length, while the SentenceTransformer repr reports max_seq_length 512;
# presumably truncation needs an explicit max_length when this tokenizer is used
# directly — confirm.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer



BertTokenizerFast(name_or_path='chemical-bert-uncased-simcse', vocab_size=31090, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	104: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [None]:
# `inputs` is a list of prompt strings, so it cannot be unpacked with `**`
# (that requires a mapping). SentenceTransformer.encode accepts the raw
# sentences and returns one embedding per input.
query_embeddings = model.encode(inputs)
query_embeddings.shape

## step6 检索