In [4]:
import os
from dotenv import load_dotenv
from langchain_core.globals import set_debug

# Turn on LangChain's global debug flag before any model or chain is built.
set_debug(True)

# Load settings from a local .env file (if any) so the lookups below can see them.
load_dotenv()

# Connection settings. A variable that is not set yields None here.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
OPENAI_API_ENDPOINT = os.environ.get("OPENAI_API_ENDPOINT")

# Model name passed to every ChatOpenAI client created in this notebook.
MODEL = "gpt-4o"

## Basic LangChain Usage

In [5]:
from langchain_openai.chat_models import ChatOpenAI

# Build the chat client from the notebook-level connection settings.
llm = ChatOpenAI(
    model=MODEL,
    api_key=OPENAI_API_KEY,
    base_url=OPENAI_API_ENDPOINT,
)

# Smoke test: as the cell's last expression, the reply is displayed by the notebook.
llm.invoke("hi")

[32;1m[1;3m[llm/start][0m [1m[llm:ChatOpenAI] Entering LLM run with input:
[0m{
  "prompts": [
    "Human: hi"
  ]
}
[36;1m[1;3m[llm/end][0m [1m[llm:ChatOpenAI] [2.51s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": "Hello! How can I help you today?",
        "generation_info": {
          "finish_reason": "stop",
          "logprobs": null
        },
        "type": "ChatGeneration",
        "message": {
          "lc": 1,
          "type": "constructor",
          "id": [
            "langchain",
            "schema",
            "messages",
            "AIMessage"
          ],
          "kwargs": {
            "content": "Hello! How can I help you today?",
            "additional_kwargs": {
              "refusal": null
            },
            "response_metadata": {
              "token_usage": {
                "completion_tokens": 9,
                "prompt_tokens": 8,
                "total_tokens": 17,
                "completion_

AIMessage(content='Hello! How can I help you today?', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 9, 'prompt_tokens': 8, 'total_tokens': 17, 'completion_tokens_details': None, 'prompt_tokens_details': None}, 'model_name': 'gpt-4o-2024-05-13', 'system_fingerprint': 'fp_67802d9a6d', 'finish_reason': 'stop', 'logprobs': None}, id='run-93cf1d9b-6032-4292-bef4-ab66ff5dd5db-0', usage_metadata={'input_tokens': 8, 'output_tokens': 9, 'total_tokens': 17, 'input_token_details': {}, 'output_token_details': {}})

In [6]:
from typing import cast
from langchain_openai.chat_models import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from pydantic import BaseModel, Field

# Sample input for the summarization example in this cell: the abstract of the paper
# "Red Teaming Language Model Detectors with Language Models".
# Source: https://aclanthology.org/2024.tacl-1.10/
# The adjacent string literals below are concatenated into one single-line string.
abstract = (
    "The prevalence and strong capability of large language models (LLMs) present significant safety and ethical risks if exploited by malicious users. "
    "To prevent the potentially deceptive usage of LLMs, recent work has proposed algorithms to detect LLM-generated text and protect LLMs. "
    "In this paper, we investigate the robustness and reliability of these LLM detectors under adversarial attacks. "
    "We study two types of attack strategies: "
    "1) replacing certain words in an LLM’s output with their synonyms given the context; "
    "2) automatically searching for an instructional prompt to alter the writing style of the generation. "
    "In both strategies, we leverage an auxiliary LLM to generate the word replacements or the instructional prompt. "
    "Different from previous works, we consider a challenging setting where the auxiliary LLM can also be protected by a detector. "
    "Experiments reveal that our attacks effectively compromise the performance of all detectors in the study with plausible generations, "
    "underscoring the urgent need to improve the robustness of LLM-generated text detection systems. "
    "Code is available at https://github.com/shizhouxing/LLM-Detector-Robustness."
)


class AbstractResult(BaseModel):
    """Summary of a paper abstract, written in Traditional Chinese."""

    # The per-field instruction lives in `description`, not `title`.
    # NOTE(review): LangChain's OpenAI function-schema conversion appears to
    # strip `title` keys from the parameter schema, so text placed in `title`
    # would not reach the model -- confirm against the installed
    # langchain-core version.
    content: str = Field(..., description="Summary of the paper abstract in Traditional Chinese")


def summary_paper_abstract(llm: ChatOpenAI, abstract: str) -> AbstractResult:
    """Ask the chat model for a Traditional Chinese summary of a paper abstract.

    Args:
        llm: Chat model that the structured-output chain is built on.
        abstract: Text substituted into the prompt's ``{abstract}`` placeholder.

    Returns:
        The chain's response, typed as ``AbstractResult``. The ``cast`` is a
        static-typing hint only; no runtime validation happens here.
    """
    messages = [
        ("system", "You are a AI researcher."),
        ("human", "Summary the paper abstract in Traditional Chinese."),
        ("human", "Paper Abstract:```{abstract}```"),
    ]
    prompt_template = ChatPromptTemplate(messages)

    # Bind the AbstractResult schema to the model, then add a retry wrapper
    # configured with stop_after_attempt=6. The parameter `llm` itself is left
    # untouched so its ChatOpenAI annotation stays accurate.
    structured_llm = llm.with_structured_output(AbstractResult)
    retrying_llm = structured_llm.with_retry(stop_after_attempt=6)

    pipeline = prompt_template | retrying_llm
    return cast(AbstractResult, pipeline.invoke(input={"abstract": abstract}))


# Run the summarizer on the sample abstract; as the cell's last expression, the result is displayed.
summary_paper_abstract(llm, abstract)

[32;1m[1;3m[chain/start][0m [1m[chain:RunnableSequence] Entering Chain run with input:
[0m{
  "abstract": "The prevalence and strong capability of large language models (LLMs) present significant safety and ethical risks if exploited by malicious users. To prevent the potentially deceptive usage of LLMs, recent work has proposed algorithms to detect LLM-generated text and protect LLMs. In this paper, we investigate the robustness and reliability of these LLM detectors under adversarial attacks. We study two types of attack strategies: 1) replacing certain words in an LLM’s output with their synonyms given the context; 2) automatically searching for an instructional prompt to alter the writing style of the generation. In both strategies, we leverage an auxiliary LLM to generate the word replacements or the instructional prompt. Different from previous works, we consider a challenging setting where the auxiliary LLM can also be protected by a detector. Experiments reveal that our at

AbstractResult(content='大型語言模型（LLM）的普及和強大能力，如果被惡意用戶利用，將帶來顯著的安全和倫理風險。為了防止LLM的潛在欺騙性使用，近期的研究提出了檢測LLM生成文本和保護LLM的算法。在本文中，我們調查了這些LLM檢測器在對抗攻擊下的穩健性和可靠性。我們研究了兩種攻擊策略：1）在給定上下文的情況下，用同義詞替換LLM輸出中的某些詞；2）自動搜索指令提示以改變生成的寫作風格。在這兩種策略中，我們利用輔助LLM來生成詞替換或指令提示。與之前的工作不同，我們考慮了一種具有挑戰性的設置，即輔助LLM也可以受到檢測器的保護。實驗結果顯示，我們的攻擊有效地損害了研究中的所有檢測器的性能，生成了合理的文本，強調了提高LLM生成文本檢測系統穩健性的迫切需要。代碼可在https://github.com/shizhouxing/LLM-Detector-Robustness獲得。')