# 1. Import packages

In [None]:
%pip install --upgrade --quiet langchain langchain-community langchainhub langchain-openai chromadb bs4 pypdf tiktoken faiss-cpu

In [None]:
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import Chroma, FAISS
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
import tiktoken

In [None]:
import getpass
import os

# SECURITY: an API key must never be committed in source. The key that was
# previously hard-coded in this cell is exposed and should be revoked/rotated.
# Reuse an OPENAI_API_KEY already present in the environment; otherwise prompt
# for it interactively so the secret is never stored in the notebook.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

# 2. Set up retrieval

In [None]:
# Load the investigation report PDF file and split it into pages.
# TODO: set PDF_PATH to the location of the report before running this cell;
# an empty path cannot be loaded, so fail early with an explicit message.
PDF_PATH = ""
if not PDF_PATH:
    raise ValueError("PDF_PATH is empty; set it to the investigation report PDF file.")
loader = PyPDFLoader(PDF_PATH)
pages = loader.load_and_split()

In [None]:
# 'cl100k_base' is the tiktoken encoding name used by GPT-4 / GPT-3.5-turbo
# models (note the letter "l" and the underscore in the name).
tokenizer = tiktoken.get_encoding('cl100k_base')

# function that returns the number of tokens produced for a text by a tokenizer
def tiktoken_len(text, tokenizer=None):
    """Return the number of tokens `tokenizer` produces for `text`.

    Args:
        text: The string to measure.
        tokenizer: Any object exposing ``encode(text)`` that returns a sized
            sequence of tokens. When omitted, the ``cl100k_base`` tiktoken
            encoding is used, so the function can be called with the text
            alone (e.g. as a text splitter ``length_function``).

    Returns:
        The token count as an ``int``.
    """
    if tokenizer is None:
        tokenizer = tiktoken.get_encoding("cl100k_base")
    tokens = tokenizer.encode(text)
    return len(tokens)

In [None]:
# Instantiate the text splitter. Because length_function counts tokens,
# chunk_size and chunk_overlap are measured in tokens, not characters.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    # The splitter calls length_function with the text only, so bind the
    # tokenizer here rather than passing the two-argument function directly.
    length_function=lambda text: tiktoken_len(text, tokenizer),
)
# split given pdf pages into document chunks
chunks = text_splitter.split_documents(pages)
# check document contents (guarded: an empty PDF yields no chunks)
if chunks:
    print(chunks[0].page_content)
else:
    print("No chunks were produced from the loaded pages.")

In [None]:
# Index the document chunks in a Chroma vector store, embedding them with
# OpenAI embeddings.
chromadb = Chroma.from_documents(
    documents=chunks,
    embedding=OpenAIEmbeddings(),
)

In [None]:
# Index the same document chunks in a FAISS vector store, embedding them with
# OpenAI embeddings.
faissdb = FAISS.from_documents(
    documents=chunks,
    embedding=OpenAIEmbeddings(),
)

In [None]:
# Build the retrieval and generation components over the PDF chunks.
# `vectorstore` was previously undefined (NameError); bind it to one of the
# stores built earlier. Swap in `chromadb` to retrieve from Chroma instead.
vectorstore = faissdb
retriever = vectorstore.as_retriever()
# Pull the shared RAG prompt template from the LangChain hub.
prompt = hub.pull("rlm/rag-prompt")
# temperature=0 is used for the generation model.
llm = ChatOpenAI(model_name="gpt-4", temperature=0)

In [None]:
def format_docs(docs):
    """Concatenate the ``page_content`` of each document, blank-line separated."""
    separator = "\n\n"
    contents = [document.page_content for document in docs]
    return separator.join(contents)

In [None]:
# Inputs for the prompt: "context" is the retrieved documents rendered by
# format_docs, "question" is the chain input passed through unchanged.
rag_inputs = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
}
# Pipe the inputs through the prompt and the LLM, then parse the reply to a string.
rag_chain = rag_inputs | prompt | llm | StrOutputParser()

# 3. Model inference with RAG model

In [None]:
# Generation prompt sent to the RAG chain. The stated number of lying-signal
# types must match the numbered list below (six entries: KBI, IH, NSDA, VE,
# LM, NF); spelling errors in the instructions were corrected as well.
invoke='''I want you to act as a synthetic data generator. You must generate a new script data. The script features an investigator and a suspect. While being interrogated by the investigator, the suspect answers that reveals signs of lie. There are six types of lying signals. Especially Use the information from upper Police record's information when you generate the script sentences for <KBI>.
>>>>>
1.  KBI(Knowledge based inconsistency): The suspect says the false that can guess it is the lying sentence by using police record informations such as ["Incident Details", "Suspect Information", "Victim Information", "Witness(es) information", "Incident Description", "Evidence Collected"]. Although it is information that must be known in the preceding context, the suspect pretend not to know police record context's informations not to be under suspicion.
2. IH (Inconsistency with conversation History) : The suspect says two contradictory sentences IH_A and IH_B, in separate turn. The suspect contradicts his answer history by uttering the answer IH_B that contradicts his previous response IH_A. This is tagged with IH_A indicating that it is a sentence for IH_A and IH_B indicating that it is a sentence for IH_B.
3. NSDA (Not Strongly Denying Allegations) : The suspect does not strongly deny the allegations, despite being suspected by the investigator as the culprit. This is tagged with NSDA.
4. VE (use of Vague Expressions) :  The suspect doesn’t answer the investigator’s question with confidence and uses ambiguous terms like ‘someone’, ‘something’, and ‘one day’ to glosses over his words. This is tagged with VE.
5. LM (Lack of Memory) : The suspect pretends not to remember important information related to the incident (such as victims or criminal tools). This is tagged with LM.
6. NF (statement Not in First person perspective) : The suspect does not describe the incident in the first person narrative, but describes the incident in the third person. This is tagged with NF.
<<<<<
Below are examples of the synthetic data. IH_A is a fact spoken by the suspect that contradicts IH_B, and IH_B is a fact spoken by the suspect that contradicts IH_A. Script is a whole dialogue script between the investigator and the suspect that contains IH_A and IH_B. Below Script doesn't include <KBI> though, you should include <KBI> when you generate the script data. Be sure that all of the lying signals (KBI, IH_A, IH_B, NSDA, VE, LM, NF) must be spoken on the suspect’s turn only.
>>>
"example": IH_A = Movie runs in December, IH_B = Movie runs in May,
"Script" = """ investigator: You watched movie with your friend, Sarah, right?
suspect: Right. She told me that she loves movie, so <IH_A>I booked a movie that runs in December.</IH_A>
investigator:  Did you guys enjoyed the movie?
suspect: Yes. <IH_B>By the 4th of May, we enjoyed the movie.</IH_B>
investigator: Can you tell me about your whereabouts on the night of the incident, after the movie?
suspect: Well, I just went back home with Sarah.
investigator: Are you sure about that?
suspect: Yeah, I'm pretty sure.
investigator: There are allegations that you were present at the crime scene. What do you have to say about that?
suspect: <NSDA>Well, I wouldn’t say I’m 100 percent innocent with the allegations since I passed through the crime scene, but …</NSDA>
investigator: Okay. Can you provide more details about what happened that night?
suspect: <VE>I think I met someone on my way home.</VE> It's all a bit blurry
investigator: Blurry?
suspect: Sorry. <LM>Can't really recall.</LM>
investigator: Can you try to remember any other details about that day?
suspect:
<NF>Well, Sarah said it was very hot and humid that day.</NF>
<<<<<
'''

In [None]:
# Run the RAG chain on the generation prompt. The chain ends in
# StrOutputParser, so the result is a string; it is left as the final bare
# expression of the cell (presumably so the notebook displays it as output).
rag_chain.invoke(invoke)