# 1. Import packages

In [1]:
%pip install --upgrade --quiet langchain langchain-community langchainhub langchain-openai chromadb bs4 pypdf tiktoken

Note: you may need to restart the kernel to use updated packages.


In [1]:
import getpass
import os

import tiktoken
from langchain import LLMChain
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFaceHub, HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import Chroma
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

In [11]:
# Credentials must never be committed with the notebook. Take them from the
# environment and prompt (without echoing) only when one is missing.
# NOTE: the keys that were previously hardcoded in this cell should be treated
# as leaked and revoked/rotated.
for secret_name in ("OPENAI_API_KEY", "HUGGINGFACEHUB_API_TOKEN"):
    if not os.environ.get(secret_name):
        os.environ[secret_name] = getpass.getpass(f"{secret_name}: ")

# Hugging Face cache directory. Left overridable from the environment instead
# of pinning one developer's absolute home path.
# NOTE(review): "../model" assumes the notebook sits one level below the repo
# root (as the "../data/..." path used later suggests) - confirm.
os.environ.setdefault("HF_HOME", os.path.abspath("../model"))

# 2. Set up retrieval and the LLM

In [12]:
# 'cl100k_base' is the tiktoken encoding used by GPT models; it is used here
# to measure text length in tokens rather than characters.
tokenizer = tiktoken.get_encoding('cl100k_base')


def tiktoken_len(text):
    """Return the number of tokens `tokenizer` produces for `text`."""
    return len(tokenizer.encode(text))

In [13]:
# Load the investigation report. `load()` is used instead of `load_and_split()`:
# the latter already runs a default character-based splitter, so splitting its
# output again below would chunk the text twice.
loader = PyPDFLoader("../data/investigation-report.pdf")
pages = loader.load()

# Chunk the pages, measuring size and overlap in tokens (via tiktoken_len).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
    length_function=tiktoken_len,
)
chunks = text_splitter.split_documents(pages)

# Embed the chunks with OpenAI embeddings and index them in Chroma.
# NOTE(review): re-running this cell in the same kernel may add the same chunks
# to the in-memory collection again - confirm and restart the kernel if so.
docsearch = Chroma.from_documents(documents=chunks, embedding=OpenAIEmbeddings())

# Fine-tuned deception-detection model, run locally through a transformers
# text-generation pipeline.
repo_id = model_id = 'junbro1016/deception-detection'
llm = HuggingFacePipeline.from_model_id(
    model_id=model_id,
    device=-1,  # -1 selects CPU per the HuggingFacePipeline docs
    task='text-generation',
    # Generation settings belong in `pipeline_kwargs`; `model_kwargs` is
    # forwarded to model loading, not to generation.
    # `max_new_tokens` replaces `max_length`: `max_length` counts the prompt
    # too, and the instruction + transcript + retrieved chunks are far longer
    # than 64 tokens, which would leave no room for an answer.
    # NOTE(review): 64 new tokens is carried over from the original setting;
    # raise it if tagged answers come back truncated.
    pipeline_kwargs={"temperature": 1,
                     "max_new_tokens": 64},
)

# Retriever: maximal-marginal-relevance search over the Chroma index,
# configured with fetch_k=10 and k=3.
mmr_retriever = docsearch.as_retriever(
    search_type='mmr',
    search_kwargs={'k': 3, 'fetch_k': 10},
)

# Retrieval-augmented QA chain ('stuff' chain type); the source documents are
# returned alongside the answer.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type='stuff',
    retriever=mmr_retriever,
    return_source_documents=True,
)

# 3. Model inference with RAG model

In [14]:
# Task instruction sent to the model: describes the lying-signal tags
# (IH_A, IH_B, VE, LM, TP, KI) to assign to the suspect's utterances.
# NOTE(review): the wording is kept verbatim; presumably it has to match the
# template the model was fine-tuned on - confirm before editing the text.
INSTRUCTION = '''
I want you to find lying signals from a given conversation script. The given script features an investigator and a suspect. There are several lying signals in the suspect’s utterances. You have to find sentences that reveal lying signals and tag the sentences with the signal type. Note that all lying signals (IH_A, IH_B, VE, LM, TP, KI) are spoken on the suspect’s turn only. There are five types of lying signals.
>>>>
1. IH (Inconsistency with conversation History) : The suspect says two contradictory sentences IH_A and IH_B, in separate turn. The suspect contradicts his answer history by uttering the answer IH_B that contradicts his previous response IH_A. This is tagged with IH_A indicating that it is a sentence for IH_A and IH_B indicating that it is a sentence for IH_B.
2. VE (use of Vague Expressions) :  The suspect uses ambiguous terms like ‘someone’, ‘something’, or ‘one day’ to glosses over his words. This is tagged with VE.
3. LM (Lack of Memory) : The suspect pretends to forget important information related to the incident (such as victims or criminal tools). This is tagged with LM.
4. TP (describing in Third person Perspective) : The suspect describes the incident in the third person, although the suspect can describe the incident in the first person. This is tagged with TP.
5. KI (Knowledge-based Inconsistency) : The suspect gives an answer that is inconsistent with the facts already investigated in the investigation report. At this time, use the given investigation report as a context. This is tagged with KI.
<<<<'''

# Example interrogation transcript to be tagged: alternating
# "Investigator:" / "Suspect:" turns, one utterance per line.
INPUT = '''
Investigator: Thank you for coming in, Mr. Thompson. I appreciate your willingness to assist us in this investigation. I'd like to ask you a few follow-up questions regarding the incident you witnessed near the crime scene. Is that alright?
Suspect: Of course, Officer. I'm happy to help in any way I can.
Investigator: In your previous statement, you mentioned seeing a person running away from the scene. Can you provide any additional details about this individual?
Suspect: Well, the person was short and wearing a red T-shirts. They were moving hastily, as if in a hurry to leave the area.
Investigator: I see. And you also mentioned that the suspect appeared to be holding some objects in their hand. Can you recall anything more specific about that?
Suspect: Honestly, I can’t really recall. But I guess they were sharp knives, or something.
Investigator: Okay. Do you remember the weather that day? 
Suspect: Well, I think my mother said it was very cold.
Investigator: Then, what did you do after seeing the crime that night? 
Suspect: Well, I just did something boring.
Investigator: Something boring?
Suspect: I just cleaned my room, washed the dishes, and all other chores. I always keep my house clean.
Investigator: I see. I think this will be enough for the investigation. Do you have a note that I wrote down my contact information last time? 
Suspect: No. I lost it. I put it on my desk, but it’s a mess in my house. Once I lose something in my house, it’s not easy to find it.'''

# Assemble the final prompt: instruction section, input section, then the
# response marker the model is expected to continue from.
# NOTE(review): "###Response:" has no space after "###", unlike the other two
# section markers - presumably this matches the fine-tuning template; confirm.
query = "".join([
    "### Instruction:", INSTRUCTION,
    "\n\n### Input:", INPUT,
    "\n\n###Response:",
])

In [None]:
# Run the RetrievalQA chain. `invoke` replaces the deprecated direct call
# `qa(query)`. The returned dict holds the answer under "result" and, because
# return_source_documents=True, the retrieved chunks under "source_documents".
result = qa.invoke({"query": query})

# Show the model's answer as the cell output.
result["result"]