In [2]:
from typing import Any, Dict, List, Optional
from langchain import BasePromptTemplate, PromptTemplate
from langchain.chains.base import Chain
from langchain.schema.language_model import BaseLanguageModel
from langchain.chains import LLMChain
from pydantic import Extra, Field
from langchain.callbacks.manager import (
    AsyncCallbackManagerForChainRun,
    CallbackManagerForChainRun,
)

In [3]:
from pydantic import BaseModel


class FileInfo(BaseModel):
    source_path: str
    description: str
    target_path: str

In [4]:
import base64
import itertools
import re
from pathlib import Path


def head_file(path: str, n: int) -> List[str]:
    try:
        with open(path, "r") as f:
            return [str(line) for line in itertools.islice(f, n)]
    except Exception:
        return []

# import markdown
# from markdown.extensions.codehilite import CodeHiliteExtension
def strip_markdown_code(md_string: str) -> str:
    # stripped_string = re.sub(r"^`{1,3}.*?\n", "", md_string, flags=re.DOTALL)
    # stripped_string = re.sub(r"`{1,3}$", "", stripped_string)
    # return stripped_string
    # md = markdown.Markdown(md_string,extensions=[CodeHiliteExtension()])

    # # 转换markdown为html
    # html = md.convert(md_string)

    # # 对于每一个代码块，查看它是否是Python代码
    # for codeblock in md.parser.blockprocessors.get('codehilite').code_blocks:
    # 	if codeblock[0] == 'python':
    # 		return codeblock[1]	
    code_blocks = re.findall(r'```python\r?\n(.*?)\r?\n```', md_string, re.S)
    return "\n".join(code_blocks)

def file_to_base64(path: str) -> str:
    with open(path, "rb") as f:
        return base64.b64encode(f.read()).decode()


def add_file(
    files, source_path: str, target_path: str, description: str
) -> Dict[str, FileInfo]:
    if files is None:
        files: Dict[str, FileInfo] = {}
    if target_path in files:
        raise ValueError("target_path already exists")
    if not Path(source_path).exists():
        raise ValueError("source_path does not exist")
    files[target_path] = FileInfo(
        target_path=target_path,
        source_path=source_path,
        description=description,
    )
    return files


def make_input_files(sfiles: Dict[str, FileInfo]) -> List[dict]:
    files = []
    for target_path, file_info in sfiles.items():
        files.append(
            {
                "pathname": target_path,
                "contentsBasesixtyfour": file_to_base64(file_info.source_path),
            }
        )
    return files

s="""对不起，我无法直接阅读PDF文件的内容。我需要使用Python的PDF处理库，如PyPDF2，来提取PDF文件的文本。然而，这种方法可能无法完全准确地提取所有的文本，特别是如果文本是图像或者有复杂的格式。这是一个可能的代码示例：

```python
import PyPDF2

def extract_text_from_pdf(file_path):
    pdf_file_obj = open(file_path, 'rb')
    pdf_reader = PyPDF2.PdfFileReader(pdf_file_obj)
    page_obj = pdf_reader.getPage(0)
    text = page_obj.extractText()
    pdf_file_obj.close()
    return text

print(extract_text_from_pdf('swftc_analysis.pdf'))
```

这段代码将打开PDF文件，创建一个PDF阅读器对象，获取第一页，提取文本，然后关闭文件。最后，它将打印出提取的文本。"""
print(strip_markdown_code(s))

import PyPDF2

def extract_text_from_pdf(file_path):
    pdf_file_obj = open(file_path, 'rb')
    pdf_reader = PyPDF2.PdfFileReader(pdf_file_obj)
    page_obj = pdf_reader.getPage(0)
    text = page_obj.extractText()
    pdf_file_obj.close()
    return text

print(extract_text_from_pdf('swftc_analysis.pdf'))


In [5]:
import json
import requests


class BearlyChain(Chain):
    """
    An example of a custom chain.
    """

    prompt: BasePromptTemplate
    """Prompt object to use."""
    llm: BaseLanguageModel
    api_key: str
    output_key: str = "text"  #: :meta private:

    res_chain: Optional[LLMChain] = Field(default=None, exclude=True)

    class Config:
        """Configuration for this pydantic object."""

        extra = Extra.forbid
        arbitrary_types_allowed = True

    @property
    def input_keys(self) -> List[str]:
        """Will be whatever keys the prompt expects.

        :meta private:
        """
        return self.prompt.input_variables

    @property
    def output_keys(self) -> List[str]:
        """Will always return text key.

        :meta private:
        """
        return [self.output_key]

    def _call(
        self,
        inputs: Dict[str, Any],
        run_manager: Optional[CallbackManagerForChainRun] = None,
    ) -> Dict[str, str]:
        prompt_value = self.prompt.format_prompt(**inputs)
        if run_manager:
            run_manager.on_text(
                prompt_value.to_string(), color="green", end="\n", verbose=self.verbose
            )
        result = self.llm.generate_prompt(
            [prompt_value], callbacks=run_manager.get_child() if run_manager else None
        )
        if run_manager:
            run_manager.on_text(
                result.generations[0][0].text,
                color="yellow",
                end="\n",
                verbose=self.verbose,
            )
        script = strip_markdown_code(result.generations[0][0].text)
        resp = requests.post(
            "https://exec.bearly.ai/v1/interpreter",
            data=json.dumps(
                {
                    "fileContents": script,
                    "inputFiles": make_input_files(inputs["file_info"]),
                    "outputDir": "output/",
                    "outputAsLinks": True,
                }
            ),
            headers={"Authorization": self.api_key},
        ).json()
        res = (
            "stdout:" + base64.b64decode(resp["stdoutBasesixtyfour"]).decode()
            if resp["stdoutBasesixtyfour"]
            else ""
            + "\n"
            + "stderr:"
            + base64.b64decode(resp["stderrBasesixtyfour"]).decode()
            if resp["stderrBasesixtyfour"]
            else ""
            + "\n"
            + "fileLinks:"
            + resp["fileLinks"]
            + "\n"
            + "exitCode:"
            + resp["exitCode"]
        )
        return {self.output_key: res}

    async def _acall(
        self,
        inputs: Dict[str, Any],
        run_manager: Optional[AsyncCallbackManagerForChainRun] = None,
    ) -> Dict[str, str]:
        prompt_value = self.prompt.format_prompt(**inputs)
        if run_manager:
            await run_manager.on_text(
                prompt_value.to_string(), color="green", end="\n", verbose=self.verbose
            )
        result = await self.llm.agenerate_prompt(
            [prompt_value], callbacks=run_manager.get_child() if run_manager else None
        )
        if run_manager:
            await run_manager.on_text(
                result.generations[0][0].text,
                color="yellow",
                end="\n",
                verbose=self.verbose,
            )
        script = strip_markdown_code(result.generations[0][0].text)
        resp = requests.post(
            "https://exec.bearly.ai/v1/interpreter",
            data=json.dumps(
                {
                    "fileContents": script,
                    "inputFiles": make_input_files(inputs["file_info"]),
                    "outputDir": "output/",
                    "outputAsLinks": True,
                }
            ),
            headers={"Authorization": self.api_key},
        ).json()
        res = (
            "stdout:" + base64.b64decode(resp["stdoutBasesixtyfour"]).decode()
            if resp["stdoutBasesixtyfour"]
            else ""
            + "\n"
            + "stderr:"
            + base64.b64decode(resp["stderrBasesixtyfour"]).decode()
            if resp["stderrBasesixtyfour"]
            else ""
            + "\n"
            + "fileLinks:"
            + resp["fileLinks"]
            + "\n"
            + "exitCode:"
            + resp["exitCode"]
        )
        return {self.output_key: res}

    @property
    def _chain_type(self) -> str:
        return "bearly_chain"

    @classmethod
    def file_description(self, files: Dict[str, FileInfo]) -> str:
        if len(files) == 0:
            return ""
        lines = ["The following files available in the evaluation environment:"]
        for target_path, file_info in files.items():
            peek_content = head_file(file_info.source_path, 4)
            lines.append(
                f"- path: `{target_path}` \n first four lines: {peek_content}"
                f" \n description: `{file_info.description}`"
            )
        return "\n".join(lines)

    @classmethod
    def from_llm(
        cls,
        llm: BaseLanguageModel,
        api_key: str,
        **kwargs: Any,
    ) -> Chain:
        template = """If you can't give a direct answer to the question below, please try writing Python code to get the answer. \
And please consider evaluating python code in a sandbox environment. \
The environment resets on every execution. \
You must send the whole script every time and print your outputs. \
Script should be pure python code that can be evaluated. \
It should be in python format NOT markdown. \
The code should NOT be wrapped in backticks. \
All python packages including requests, matplotlib, scipy, numpy, pandas, etc are available. \
If you have any files outputted write them to "output/" relative to the execution path. Output can only be read from the directory, stdout, and stdin. \
Do not use things like plot.show() as it will not work instead write them out `output/` and a link to the file will be returned. \
print() any output and results so you can capture the output.
{file_description}

{file_info}

Question:{question}
Answer:"""
        prompt = PromptTemplate.from_template(template=template)
        return cls(llm=llm, prompt=prompt, api_key=api_key, **kwargs)

/tmp/ipykernel_2258/802105736.py:21: PydanticDeprecatedSince20: `pydantic.config.Extra` is deprecated, use literal values instead (e.g. `extra='allow'`). Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.3/migration/
  extra = Extra.forbid


In [6]:
import os
from langchain.chat_models import ChatOpenAI
from dotenv import load_dotenv

load_dotenv(dotenv_path="env")
llm = ChatOpenAI(model="gpt-4", temperature=0)
chain = BearlyChain.from_llm(llm=llm, api_key=os.getenv("BEARLY_API_KEY"), verbose=True)
filePath = input("Input your file path:")
files = add_file(None, source_path=filePath, target_path=filePath, description="")
file_description = BearlyChain.file_description(files=files)
quesiton = input("Input your quetion:")
res = await chain.arun(
    question=quesiton, file_description=file_description, file_info=files
)
print(f"AI:{res}")

Input your file path: swftc_analysis.pdf
Input your quetion: 请提取一下上面文件第一页的内容。




[1m> Entering new BearlyChain chain...[0m
[32;1m[1;3mIf you can't give a direct answer to the question below, please try writing Python code to get the answer. And please consider evaluating python code in a sandbox environment. The environment resets on every execution. You must send the whole script every time and print your outputs. Script should be pure python code that can be evaluated. It should be in python format NOT markdown. The code should NOT be wrapped in backticks. All python packages including requests, matplotlib, scipy, numpy, pandas, etc are available. If you have any files outputted write them to "output/" relative to the execution path. Output can only be read from the directory, stdout, and stdin. Do not use things like plot.show() as it will not work instead write them out `output/` and a link to the file will be returned. print() any output and results so you can capture the output.
The following files available in the evaluation environment:
- path: `swftc