<a href="https://colab.research.google.com/github/hanlintao/langchain/blob/main/PDF2BiTerm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%pip install openai
%pip install langchain==0.0.263

In [None]:
import os
import json
import pandas as pd
import re

from langchain.document_loaders import PyPDFLoader
from langchain.output_parsers import StructuredOutputParser, ResponseSchema #结构化输出工具
from langchain import PromptTemplate
#from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.chains import LLMChain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.callbacks import get_openai_callback

# Do not store a real key in the notebook source: reuse OPENAI_API_KEY when the
# environment already provides it (instead of overwriting it with a literal),
# otherwise ask for it interactively without echoing the input.
if not os.environ.get('OPENAI_API_KEY'):
    from getpass import getpass
    os.environ['OPENAI_API_KEY'] = getpass('OpenAI API key: ')

In [None]:
# The function below extracts bilingual (English-Chinese) terminology

def term_extract(pdf, output_path='results.json', model_name="gpt-4-0613"):
  """Extract English terms and their Chinese translations from PDF documents.

  The documents are re-split into chunks of at most 8000 characters. Each
  chunk (with newlines removed) is sent to an OpenAI chat model together with
  format instructions generated by a StructuredOutputParser for the fields
  ``english`` and ``chinese``. Progress, the raw model response and the cost
  of every call are printed, and the raw responses are saved as JSON.

  Args:
    pdf: Documents to process, in the form accepted by
      ``RecursiveCharacterTextSplitter.split_documents`` (the notebook passes
      the result of ``PyPDFLoader.load_and_split``).
    output_path: File the raw responses are written to. Defaults to
      ``'results.json'``.
    model_name: OpenAI chat model name, e.g. ``"gpt-3.5-turbo-0613"``.

  Returns:
    dict mapping the 0-based chunk index to the raw (unparsed) response text.
    The same mapping is written to ``output_path``; JSON serialisation turns
    the integer keys into strings.

  Raises:
    KeyError: if the OPENAI_API_KEY environment variable is not set.
  """
  response_schemas = [
    ResponseSchema(name="english", description="terminology in English"),
    ResponseSchema(name="chinese", description="terminology in Chinese, the translation of English source")
  ]
  output_parser = StructuredOutputParser.from_response_schemas(response_schemas)

  format_instructions = output_parser.get_format_instructions()

  # The format instructions are fixed for every call, so they are bound as a
  # partial variable; only the chunk text varies per request.
  prompt = PromptTemplate(
      template="请从下面的内容中抽取英文术语并翻译成中文，并按以下格式输出\n{format_instructions}\n{chunk}",
      input_variables=["chunk"],
      partial_variables={"format_instructions": format_instructions}
  )

  text_splitter = RecursiveCharacterTextSplitter(chunk_size=8000, chunk_overlap=0)
  split_pdf_num = text_splitter.split_documents(pdf)

  print(f'你的PDF文件被切割成了 {len(split_pdf_num)} 份')

  llm = ChatOpenAI(
      temperature=0,
      openai_api_key=os.environ['OPENAI_API_KEY'],
      model_name=model_name,
  )
  # The chain does not depend on the chunk, so build it once instead of once
  # per loop iteration.
  chain = LLMChain(llm=llm, prompt=prompt)

  total_cost = 0  # running total of the per-call cost reported by the callback
  results = {}    # chunk index -> raw model response

  for i, split_pdf in enumerate(split_pdf_num):
    print(f'\n--- 第{i+1}页 ---')

    cleaned_text = split_pdf.page_content.replace('\n', '')

    print(cleaned_text)

    # The callback context tracks the cost of the calls made inside it.
    with get_openai_callback() as cb:
      extracted_terms = chain.run(cleaned_text)
      print(f'\n术语为：{extracted_terms}')
      print(f"\n本次消耗: {cb.total_cost} 美元")
      total_cost += cb.total_cost

      results[i] = extracted_terms

  print(f"\n所有页面术语抽取完毕。总费用为: {total_cost} 美元")

  # Persist the raw responses once every chunk has been processed.
  with open(output_path, 'w') as f:
    json.dump(results, f)

  return results

In [None]:
# The function below translates the PDF file into Chinese

def read_pdf(pdf, model_name="gpt-4-0613"):
  """Translate PDF documents into Chinese chunk by chunk.

  The documents are re-split into chunks of at most 8000 characters. Each
  chunk (with newlines removed) is sent to an OpenAI chat model with a
  translation prompt. The source text, the translation and the cost of every
  call are printed.

  Args:
    pdf: Documents to translate, in the form accepted by
      ``RecursiveCharacterTextSplitter.split_documents`` (the notebook passes
      the result of ``PyPDFLoader.load_and_split``, or a slice of it).
    model_name: OpenAI chat model name, e.g. ``"gpt-3.5-turbo-0613"``.

  Returns:
    list of translated texts, one per chunk, in chunk order.

  Raises:
    KeyError: if the OPENAI_API_KEY environment variable is not set.
  """
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=8000, chunk_overlap=0)
  split_pdf_num = text_splitter.split_documents(pdf)
  print(f'你的PDF文件被切割成了 {len(split_pdf_num)} 份')

  template = "请将下面的内容翻译中文：{chunk}"

  prompt = PromptTemplate(
      input_variables=["chunk"],
      template=template,
  )

  llm = ChatOpenAI(
      temperature=0,
      openai_api_key=os.environ['OPENAI_API_KEY'],
      model_name=model_name,
  )
  # The chain does not depend on the chunk, so build it once instead of once
  # per loop iteration.
  chain = LLMChain(llm=llm, prompt=prompt)

  total_cost = 0      # running total of the per-call cost reported by the callback
  translations = []   # translated text of every chunk, in order

  for i, split_pdf in enumerate(split_pdf_num):
    print(f'\n--- 第{i+1}页 ---')

    cleaned_text = split_pdf.page_content.replace('\n', '')

    print(cleaned_text)

    # The callback context tracks the cost of the calls made inside it.
    with get_openai_callback() as cb:
      translated_text = chain.run(cleaned_text)
      print(f'\n译文为：{translated_text}')
      print(f"\n本次消耗: {cb.total_cost} 美元")
      total_cost += cb.total_cost

    # Keep the translation so callers can use it instead of only seeing it printed.
    translations.append(translated_text)

  print(f"\n所有页面翻译完毕。总费用为: {total_cost} 美元")

  return translations

In [None]:
# Install pypdf before the PDF is loaded in the next cell
# (presumably the parser PyPDFLoader relies on — confirm).
%pip install pypdf

In [None]:
# Uncomment to download a sample PDF and save it as demo.pdf.
#!wget "https://jostrans.org/issue40/art_herbert.pdf" -O demo.pdf

# Load demo.pdf from the working directory and split it into documents.
loader = PyPDFLoader("demo.pdf")
pages = loader.load_and_split()
#read_pdf(pages[:3]) # trial translation of the first three documents
#read_pdf(pages)   # translate the full text
term_extract(pages)  # terminology extraction

你的PDF文件被切割成了 12 份

--- 第1页 ---
Article  How Can ChatGPT Benefit Pharmacy: A Case Report on     Review Writing   Yun Zhu a,1, Dan Han a,1, Shaoqing Chen c,1, Feng Zeng d, Cheng Wang b,* a Department of Pharmacy, Nanjing Drum Tower Hospital, The Affiliated Hospital of Nanjing University Medical School, Nanjing, China  b School of Pharmacy, Changzhou University, Changzhou 213164 Jiangsu, China  c Second People's Hospital of Changzhou, Nanjing Medical University, Changzhou, Jiangsu, China  d State Key Laboratory of Materials -Oriented Chemical Engineering, College of Chemical Engineering, Nanjing Tech University, Nanjing 211816, Jiangsu, China  1 These authors have contributed equally to this work.  * Correspondence:  C. Wang: wangc90@cczu.edu.cn  Abstract : Artificial Intelligence (AI) is a breakthrough  technology that has been widely applied in many fields and its use in pharmacy is also gaining increasing attentions. Recently, ChatGPT, a newly devel oped virtual assistant and large lan

In [None]:
# Load the raw responses saved in results.json.
with open('results.json', 'r') as f:
    data = json.loads(f.read())

# Print the loaded data to inspect its contents.
print(data)

In [None]:
# Load the raw per-chunk model responses from results.json.
with open('results.json', 'r') as f:
    data = json.load(f)

# Term records collected from every chunk (one dict per term pair).
entries = []

# Walk over every chunk's raw response text.
for k, v in data.items():
    # A response may contain several fenced ```json blocks. \s* tolerates
    # \r\n line endings and stray whitespace around the payload.
    json_objects = re.findall(r'```json\s*(.*?)\s*```', v, re.DOTALL)

    for json_object in json_objects:
        try:
            entry = json.loads(json_object)
        except json.JSONDecodeError as exc:
            # Model output is not guaranteed to be valid JSON; skip the bad
            # block instead of aborting the whole export.
            print(f'跳过第 {k} 份结果中无法解析的JSON：{exc}')
            continue

        # A block may hold a single term object or a list of term objects;
        # flatten lists so every term becomes its own row. Anything that is
        # not an object cannot form a row and is ignored.
        if isinstance(entry, dict):
            entries.append(entry)
        elif isinstance(entry, list):
            entries.extend(item for item in entry if isinstance(item, dict))

# Build the table from the collected records.
df = pd.DataFrame(entries)

# Export the table to an Excel file without the index column.
df.to_excel('results.xlsx', index=False)

In [None]:
# Load the exported spreadsheet back into a DataFrame.
df = pd.read_excel('results.xlsx')

# Preview the first five rows.
print(df.iloc[:5])

                              english      chinese
0             Artificial Intelligence         人工智能
1                             ChatGPT      ChatGPT
2  lipid- based drug delivery systems  基于脂质的药物递送系统
3                    machine learning         机器学习
4                       deep learning         深度学习
