In [None]:
from typing import Any, Dict, List, Optional
from langchain import BasePromptTemplate, LLMChain
from langchain.schema.language_model import BaseLanguageModel
from langchain.callbacks.manager import (
    AsyncCallbackManagerForChainRun,
    CallbackManagerForChainRun,
)
from pydantic import Extra
from langchain.schema import LLMResult
from langchain.chains.base import Chain
from langchain.prompts import PromptTemplate

In [None]:
class RewriteSplitterChain(LLMChain):
    """LLM chain that asks the model to rewrite a passage of text.

    The prompt assembled by :meth:`from_llm` takes a single ``input`` variable
    and instructs the model to rewrite it without dropping any information.
    """

    # Prompt object to use.
    prompt: BasePromptTemplate
    llm: BaseLanguageModel
    output_key: str = "text"  #: :meta private:

    class Config:
        """Pydantic settings for this chain."""

        # Reject fields that are not declared on the model.
        extra = Extra.forbid
        arbitrary_types_allowed = True

    @property
    def input_keys(self) -> List[str]:
        """Input variable names, taken directly from the prompt.

        :meta private:
        """
        return self.prompt.input_variables

    @property
    def output_keys(self) -> List[str]:
        """Single-element list holding ``output_key``.

        :meta private:
        """
        return [self.output_key]

    def generate(
        self,
        input_list: List[Dict[str, Any]],
        run_manager: Optional[CallbackManagerForChainRun] = None,
    ) -> LLMResult:
        """Prepare prompts for ``input_list`` and run them through the LLM.

        Args:
            input_list: One dict of prompt variables per generation.
            run_manager: Optional callback manager; when given, its child
                manager is handed to the LLM call.

        Returns:
            The ``LLMResult`` produced by ``llm.generate_prompt``.
        """
        formatted_prompts, stop_words = self.prep_prompts(
            input_list, run_manager=run_manager
        )
        callbacks = run_manager.get_child() if run_manager else None
        return self.llm.generate_prompt(
            prompts=formatted_prompts,
            stop=stop_words,
            callbacks=callbacks,
        )

    async def agenerate(
        self,
        input_list: List[Dict[str, Any]],
        run_manager: Optional[AsyncCallbackManagerForChainRun] = None,
    ) -> LLMResult:
        """Async counterpart of :meth:`generate`.

        Args:
            input_list: One dict of prompt variables per generation.
            run_manager: Optional async callback manager; when given, its child
                manager is handed to the LLM call.

        Returns:
            The ``LLMResult`` produced by ``llm.agenerate_prompt``.
        """
        formatted_prompts, stop_words = await self.aprep_prompts(
            input_list, run_manager=run_manager
        )
        callbacks = run_manager.get_child() if run_manager else None
        return await self.llm.agenerate_prompt(
            prompts=formatted_prompts,
            stop=stop_words,
            callbacks=callbacks,
        )

    @property
    def _chain_type(self) -> str:
        """Identifier string for this chain type."""
        return "rewrite_splitter_chain"

    @classmethod
    def from_llm(
        cls,
        llm: BaseLanguageModel,
        **kwargs: Any,
    ) -> Chain:
        """Build the chain around ``llm`` with the built-in rewrite prompt.

        Args:
            llm: Language model that performs the rewrite.
            **kwargs: Forwarded unchanged to the chain constructor.

        Returns:
            A chain instance whose prompt expects one ``input`` variable.
        """
        # Sections: source text, requirements (rewrite, omit nothing), result.
        rewrite_template = """文本内容
---------------------
{input}

要求
-------------------
- 根据上面的文本内容，重新写一段文字
- 不要遗漏文本内容的任何一点信息


整理结果
-------------------
"""
        return cls(
            llm=llm,
            prompt=PromptTemplate.from_template(template=rewrite_template),
            **kwargs,
        )

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain import OpenAI
import os
from dotenv import load_dotenv

load_dotenv(dotenv_path="env")


gpt35_1 = OpenAI(temperature=0.1, max_tokens=2048, verbose=True)
gpt35_9 = OpenAI(temperature=0.9, max_tokens=2048, verbose=True)
chat_gpt35_1 = ChatOpenAI(temperature=0.1, verbose=True)
chat_gpt35_9 = ChatOpenAI(temperature=0.9, verbose=True)
gpt4 = ChatOpenAI(model_name="gpt-4", temperature=0.9, verbose=True)


rewriter_chain=RewriteSplitterChain.from_llm(llm=gpt4,verbose=True)

import pandas as pd

filename= os.getenv("FILE_NAME")
df=pd.read_csv(f"s3://sagemaker-automated-execution-034700280673-us-east-1/data_sample/{filename}.csv")
ncontent_data=[]
for content in df["content"]:
    try:
        re_content= await rewriter_chain.arun(content)
        ncontent_data.append(re_content)
    except Exception as e:
        print(e)
        ncontent_data.append("REWRITE_EXCEPTION")
        continue
df["n_content"]=pd.Series(ncontent_data)
df.to_csv(f"s3://sagemaker-automated-execution-034700280673-us-east-1/data_sample/{filename}_n.csv")

In [None]:
class QuestionGeneratorChain(LLMChain):
    """LLM chain that asks the model to write a Human/AI dialogue about a text.

    The prompt assembled by :meth:`from_llm` takes a single ``input`` variable
    (the source text) and requests at least ten Human/AI exchanges, ending with
    a ``System:`` persona block for the model to continue from.
    """

    # Prompt object to use.
    prompt: BasePromptTemplate
    llm: BaseLanguageModel
    output_key: str = "text"  #: :meta private:

    class Config:
        """Pydantic settings for this chain."""

        # Reject fields that are not declared on the model.
        extra = Extra.forbid
        arbitrary_types_allowed = True

    @property
    def input_keys(self) -> List[str]:
        """Input variable names, taken directly from the prompt.

        :meta private:
        """
        return self.prompt.input_variables

    @property
    def output_keys(self) -> List[str]:
        """Single-element list holding ``output_key``.

        :meta private:
        """
        return [self.output_key]

    def generate(
        self,
        input_list: List[Dict[str, Any]],
        run_manager: Optional[CallbackManagerForChainRun] = None,
    ) -> LLMResult:
        """Prepare prompts for ``input_list`` and run them through the LLM.

        Args:
            input_list: One dict of prompt variables per generation.
            run_manager: Optional callback manager; when given, its child
                manager is handed to the LLM call.

        Returns:
            The ``LLMResult`` produced by ``llm.generate_prompt``.
        """
        formatted_prompts, stop_words = self.prep_prompts(
            input_list, run_manager=run_manager
        )
        callbacks = run_manager.get_child() if run_manager else None
        return self.llm.generate_prompt(
            prompts=formatted_prompts,
            stop=stop_words,
            callbacks=callbacks,
        )

    async def agenerate(
        self,
        input_list: List[Dict[str, Any]],
        run_manager: Optional[AsyncCallbackManagerForChainRun] = None,
    ) -> LLMResult:
        """Async counterpart of :meth:`generate`.

        Args:
            input_list: One dict of prompt variables per generation.
            run_manager: Optional async callback manager; when given, its child
                manager is handed to the LLM call.

        Returns:
            The ``LLMResult`` produced by ``llm.agenerate_prompt``.
        """
        formatted_prompts, stop_words = await self.aprep_prompts(
            input_list, run_manager=run_manager
        )
        callbacks = run_manager.get_child() if run_manager else None
        return await self.llm.agenerate_prompt(
            prompts=formatted_prompts,
            stop=stop_words,
            callbacks=callbacks,
        )

    @property
    def _chain_type(self) -> str:
        """Identifier string for this chain type."""
        return "dataset_genrator_chain"

    @classmethod
    def from_llm(
        cls,
        llm: BaseLanguageModel,
        **kwargs: Any,
    ) -> Chain:
        """Build the chain around ``llm`` with the built-in dialogue prompt.

        Args:
            llm: Language model that generates the dialogue.
            **kwargs: Forwarded unchanged to the chain constructor.

        Returns:
            A chain instance whose prompt expects one ``input`` variable.
        """
        # Sections: source text, role/task instructions, then the start of the
        # dialogue (a System persona) that the model is expected to continue.
        dialogue_template = """文本内容
---------------------
{input}

说明
-------------------
- AI是一个人工智能大语言模型，Human代表人类，System代表系统角色。
- AI将听从System的命令
- Human和AI进行对话。
- Human提问是关于运势和命理相关的问题
- AI从上面的文本内容中找到答案，根据System的设置命令，回答Human的问题。
- 请生成至少10个Human和AI的对话内容。

对话内容
-------------------
System:- 你是一个精通周易、中国生肖、算命、解梦、风水堪舆、西方塔罗牌和占星术的玄学大师。正在与人类进行对话。
- 你精通各种玄学和命理知识，能够帮助人类提供命理方面的解读和建议。
- 你本身具有很神秘的性格，外表看起来是一个睿智的老者。
- 你一直用占卜师带有神秘感的口吻与人类交谈。
"""
        return cls(
            llm=llm,
            prompt=PromptTemplate.from_template(template=dialogue_template),
            **kwargs,
        )

In [None]:
df = pd.read_csv(
    f"s3://sagemaker-automated-execution-034700280673-us-east-1/data_sample/{filename}_n.csv"
)
gen_question_chain = QuestionGeneratorChain.from_llm(llm=gpt4, verbose=True)
col_q_data = []
for content in df["n_content"]:
    if content == "REWRITE_EXCEPTION" or content is None:
        col_q_data.append("REWRITE_EXCEPTION")
        continue
    res = await gen_question_chain.arun(input=content)
    res_list = res.split("\n")
    quetions = [q[6:].strip() for q in res_list if q.startswith("Human:")]
    col_q_data.append("\n".join(quetions))
df["questions"] = pd.Series(col_q_data)
df.to_csv(
    f"s3://sagemaker-automated-execution-034700280673-us-east-1/data_sample/{filename}_n_q.csv"
)

In [None]:
from langchain.prompts import (
    ChatPromptTemplate,
    MessagesPlaceholder,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.memory import ConversationBufferMemory


class ConversationChain(LLMChain):
    """LLM chain for a multi-turn chat with a system prompt and buffered history.

    :meth:`from_llm` wires up a chat prompt made of the supplied system prompt,
    a ``chat_history`` placeholder and a ``{question}`` human message, and
    attaches a ``ConversationBufferMemory`` keyed on ``chat_history``.
    """

    # Prompt object to use.
    prompt: BasePromptTemplate
    llm: BaseLanguageModel
    output_key: str = "text"  #: :meta private:

    class Config:
        """Pydantic settings for this chain."""

        # Reject fields that are not declared on the model.
        extra = Extra.forbid
        arbitrary_types_allowed = True

    @property
    def input_keys(self) -> List[str]:
        """Input variable names, taken directly from the prompt.

        :meta private:
        """
        return self.prompt.input_variables

    @property
    def output_keys(self) -> List[str]:
        """Single-element list holding ``output_key``.

        :meta private:
        """
        return [self.output_key]

    def generate(
        self,
        input_list: List[Dict[str, Any]],
        run_manager: Optional[CallbackManagerForChainRun] = None,
    ) -> LLMResult:
        """Prepare prompts for ``input_list`` and run them through the LLM.

        Args:
            input_list: One dict of prompt variables per generation.
            run_manager: Optional callback manager; when given, its child
                manager is handed to the LLM call.

        Returns:
            The ``LLMResult`` produced by ``llm.generate_prompt``.
        """
        formatted_prompts, stop_words = self.prep_prompts(
            input_list, run_manager=run_manager
        )
        callbacks = run_manager.get_child() if run_manager else None
        return self.llm.generate_prompt(
            prompts=formatted_prompts,
            stop=stop_words,
            callbacks=callbacks,
        )

    async def agenerate(
        self,
        input_list: List[Dict[str, Any]],
        run_manager: Optional[AsyncCallbackManagerForChainRun] = None,
    ) -> LLMResult:
        """Async counterpart of :meth:`generate`.

        Args:
            input_list: One dict of prompt variables per generation.
            run_manager: Optional async callback manager; when given, its child
                manager is handed to the LLM call.

        Returns:
            The ``LLMResult`` produced by ``llm.agenerate_prompt``.
        """
        formatted_prompts, stop_words = await self.aprep_prompts(
            input_list, run_manager=run_manager
        )
        callbacks = run_manager.get_child() if run_manager else None
        return await self.llm.agenerate_prompt(
            prompts=formatted_prompts,
            stop=stop_words,
            callbacks=callbacks,
        )

    @property
    def _chain_type(self) -> str:
        """Identifier string for this chain type."""
        return "conversation_chain"

    @classmethod
    def from_llm(
        cls,
        llm: BaseLanguageModel,
        system_prompt: BasePromptTemplate,
        **kwargs: Any,
    ) -> Chain:
        """Build a conversation chain around ``llm``.

        Args:
            llm: Language model that answers each question.
            system_prompt: Prompt template placed first in the chat prompt.
            **kwargs: Forwarded unchanged to the chain constructor.

        Returns:
            A chain instance with a fresh ``ConversationBufferMemory`` attached.
        """
        # Message order: system prompt, prior turns, then the new question.
        message_templates = [
            system_prompt,
            MessagesPlaceholder(variable_name="chat_history"),
            HumanMessagePromptTemplate.from_template("{question}"),
        ]
        chat_prompt = ChatPromptTemplate(messages=message_templates)
        # memory_key matches the placeholder's variable name above.
        history = ConversationBufferMemory(
            memory_key="chat_history", return_messages=True
        )

        return cls(
            llm=llm,
            prompt=chat_prompt,
            memory=history,
            **kwargs,
        )

In [None]:
from langchain.schema import AIMessage, HumanMessage, SystemMessage

df = pd.read_csv(
    f"s3://sagemaker-automated-execution-034700280673-us-east-1/data_sample/{filename}_n_q.csv"
)
ndata = {"data": []}
for index, d in df.iterrows():
    system_template = (
        """说明
-------------------
- 你是一个精通周易、中国生肖、算命、解梦、风水堪舆、西方塔罗牌和占星术的玄学大师。正在与人类进行对话。
- 你精通各种玄学和命理知识，能够帮助人类提供命理方面的解读和建议。
- 你本身具有很神秘的性格，外表看起来是一个睿智的老者。
- 你一直用占卜师带有神秘感的口吻与人类交谈。
- 根据下面的文本内容，回答人类的问题

文本内容
-------------------
"""
        + d["n_content"]
    )
    conversation_chain = ConversationChain.from_llm(
        llm=gpt4,
        system_prompt=SystemMessagePromptTemplate.from_template(system_template),
        verbose=True,
    )
    if (
        d["questions"] == "REWRITE_EXCEPTION"
        or d["questions"] == ""
        or d["questions"] is None
    ):
        continue
    try:
        questions = d["questions"].strip().split("\n")
    except Exception as e:
        continue
    for q in questions:
        try:
            await conversation_chain.arun(q)
        except Exception as e:
            print(e)
            continue
    jsonl = {"messages": []}
    for m in conversation_chain.memory.load_memory_variables({})["chat_history"]:
        if isinstance(m, AIMessage):
            jsonl["messages"].append({"role": "assistant", "content": m.content})
        elif isinstance(m, SystemMessage):
            jsonl["messages"].append({"role": "system", "content": m.content})
        elif isinstance(m, HumanMessage):
            jsonl["messages"].append({"role": "user", "content": m.content})
        else:
            jsonl["messages"].append({"role": "none", "content": m.content})
    ndata["data"].append(jsonl)
ndf = pd.DataFrame(ndata)
ndf.to_csv(
    f"s3://sagemaker-automated-execution-034700280673-us-east-1/data_sample/{filename}.jsonl.csv",
    header=False,
)