# 1. Import packages

In [1]:
%pip install --upgrade --quiet langchain langchain_experimental langchain-openai

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m806.7/806.7 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m166.8/166.8 kB[0m [31m21.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m26.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m238.5/238.5 kB[0m [31m28.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.4/55.4 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m226.1/226.1 kB[0m [31m28.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m79.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.4/49.4 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━

In [43]:
from langchain.prompts import FewShotPromptTemplate, PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, constr
from langchain_experimental.tabular_synthetic_data.openai import (
    OPENAI_TEMPLATE,
    create_openai_data_generator,
)
from langchain_openai import ChatOpenAI
from tqdm import tqdm
from typing import List
import pandas as pd
import pydantic.v1

# 2. Load dataset and prepare prompt

In [57]:
# Read the sentence-pair dataset, keep only the rows whose label is 1,
# and renumber the surviving rows from 0 so positional and label-based
# indexing agree.
df = (
    pd.read_csv('../data/contradicts.csv')
    .loc[lambda frame: frame['label'] == 1]
    .reset_index(drop=True)
)
df

Unnamed: 0,SENTENCE A,SENTENCE B,label
0,the dough used for pancakes is thin,the dough used for pancakes is thick,1
1,she showed him my ugly picture,she showed him my handsome picture,1
2,i only need the healthy half,i only need the bad half,1
3,i cant confidently tell you yet,i cant diffidently tell you yet,1
4,i need to be sure,i do not need to be sure,1
...,...,...,...
2000,the lady is holding the paintbrush next to the...,there is no lady holding the paintbrush next t...,1
2001,a golden retriever is running,a golden retriever is not running,1
2002,a shirtless woman is leading a horse that is p...,a shirtless man is leading a horse that is pul...,1
2003,a young girl is playing on the edge of a fount...,a young girl is playing on the edge of a fount...,1


In [58]:
# Settings holder passed to `pydantic.v1.dataclasses.dataclass(config=...)`.
# It is deliberately a plain class rather than a BaseModel subclass: on a
# BaseModel, pydantic v1 treats an un-annotated `name = value` attribute as
# a model *field* and strips it from the class namespace, so the option
# would never be visible to the dataclass decorator.
class Config:
    arbitrary_types_allowed = True

# One line of the interrogation dialogue.
# NOTE(review): comments are used instead of a class docstring on purpose;
# pydantic may surface a docstring as the schema description that is sent
# to the model -- confirm before adding one.
class Utterance(BaseModel):
    # Speaker of the line; the regex only admits 'investigator' or 'suspect'.
    turn: constr(regex=r'^(investigator|suspect)$')
    # Text of the utterance.
    content: str
    # Lying-signal label; the regex only admits IH_A, IH_B, VE, LM, TP or N.
    # In the few-shot examples N is used for lines carrying no signal.
    tag: constr(regex=r'^(IH_A|IH_B|VE|LM|TP|N)$')

# Output schema requested from the synthetic data generator: the two
# contradictory sentences plus the full dialogue built around them.
# Declared as a pydantic v1 dataclass configured through `Config`.
@pydantic.v1.dataclasses.dataclass(config=Config)
class InvestigatorSuspectConversation:
    # First of the two contradictory sentences.
    IH_A: str
    # Second sentence, contradicting IH_A.
    IH_B: str
    # Ordered utterances making up the dialogue.
    script: List[Utterance]

In [59]:
# Task description placed in front of the few-shot examples (it is passed
# as `prefix=` to FewShotPromptTemplate). It defines the four lying-signal
# types (IH, VE, LM, TP) and the tag names the model must emit.
# NOTE: this text is sent to the model as-is; its wording is left untouched
# because any change alters the prompt and therefore the generated data.
PREFIX = '''I want you to act as a synthetic data generator. You must generate a new script data. The script features an investigator and a suspect. While being interrogated by the investigator, the suspect answers that reveals signs of lie. There are four types of lying signals.
>>>>>
1. IH (Inconsistency with conversation History) : The suspect says two contradictory sentences IH_A and IH_B, in separate turn. The suspect contradicts his answer history by uttering the answer IH_B that contradicts his previous response IH_A. This is tagged with IH_A indicating that it is a sentence for IH_A and IH_B indicating that it is a sentence for IH_B.
2. VE (use of Vague Expressions) :  The suspect uses ambiguous terms like ‘someone’, ‘something’, or ‘one day’ to glosses over his words. This is tagged with VE.
3. LM (Lack of Memory) : The suspect pretends to forget important information related to the incident (such as victims or criminal tools). This is tagged with LM.
4. TP (describing in Third person Perspective) : The suspect describes the incident in the third person, although the suspect can describe the incident in the first person. This is tagged with TP.
<<<<<
Below are examples of the synthetic data. IH_A is a sentence spoken by the suspect that contradicts IH_B, and IH_B is a sentence spoken by the suspect that contradicts IH_A. Script is a whole dialogue script between the investigator and the suspect that contains IH_A, IH_B, VE, LM and TP. Be sure that all of the lying signals (IH_A, IH_B, VE, LM, TP) must be spoken on the suspect’s turn only.'''

In [60]:
# Few-shot demonstrations passed as `examples=` to FewShotPromptTemplate.
# Each entry is a dict with the single key "example", which is rendered
# through the "{example}" prompt template. Every example shows the target
# layout: the IH_A / IH_B sentence pair followed by a script of Utterance
# items, with the signal sentence wrapped in <TAG>...</TAG> inside `content`
# and the same tag name repeated in `tag`.
# The strings are runtime prompt text and must not be reformatted.
EXAMPLES = [
    {
        "example": '''IH_A = Movie runs in December, IH_B = Movie runs in May, script =[
          Utterance(turn = investigator, content = You watched movie with your friend, Sarah, right?, tag = N),
          Utterance(turn = suspect, content = Right. She told me that she loves movie, so <IH_A>I booked a movie that runs in December.</IH_A>, tag = IH_A),
          Utterance(turn = investigator, content = Did you guys enjoyed the movie?, tag = N),
          Utterance(turn = suspect, content = Yes. <IH_B>By the 4th of May, we enjoyed the movie.</IH_B>, tag = IH_B),
          Utterance(turn = investigator, content = Can you tell me about your whereabouts on the night of the incident, after the movie?, tag = N),
          Utterance(turn = suspect, content = Well, I just went back home with Sarah., tag = N),
          Utterance(turn = investigator, content = Are you sure about that?, tag = N),
          Utterance(turn = suspect, content = Yeah, I'm pretty sure., tag = N),
          Utterance(turn = investigator, content = Okay. Can you provide more details about what happened that night?, tag = N),
          Utterance(turn = suspect, content = <VE>I think I met someone on my way home.</VE> It's all a bit blurry., tag = VE),
          Utterance(turn = investigator, content = Blurry?, tag = N),
          Utterance(turn = suspect, content = Sorry. <LM>Can't really recall.</LM>, tag = LM),
          Utterance(turn = investigator, content = Can you try to remember any other details about that day?, tag = N),
          Utterance(turn = suspect, content = <TP>Well, Sarah said it was very hot and humid that day.</TP>, tag = TP)
        ]'''
    },
    {
        "example": '''IH_A = Crime was occurred at night, IH_B = Crime was occurred at morning, script = [
          Utterance(turn = investigator, content = You were at the scene when the crime happened, weren't you?, tag = N),
          Utterance(turn = suspect, content = Yes, that's correct., tag = N),
          Utterance(turn = investigator, content = Do you remember anything about the description of the criminal or victim?, tag = N),
          Utterance(turn = suspect, content = <LM>Well, I don't remember anything.</LM>, tag = LM),
          Utterance(turn = investigator, content = You don't remember anything after a day of the crime?, tag = N),
          Utterance(turn = suspect, content = <IH_A>The night of the crime was especially dark,</IH_A> so I couldn't see the crime scene properly. And the trauma is so severe that it's hard to remember., tag = IH_A),
          Utterance(turn = investigator, content = Okay. Can you tell me what you did after the crime happened?, tag = N),
          Utterance(turn = suspect, content = <VE>After that, I think I went somewhere to calm myself down.</VE>, tag = VE),
          Utterance(turn = investigator, content = And then?, tag = N),
          Utterance(turn = suspect, content = I went home and had morning coffee. It's my routine to have coffee every morning. <IH_B>Although the crime occurred that morning, but I wanted to keep my routine.</IH_B>, tag = IH_B),
          Utterance(turn = investigator, content = How did the coffee taste?, tag = N),
          Utterance(turn = suspect, content = <TP>My mother said it was delicious.</TP>, tag = TP)
        ]'''
    }
]

# 3. Synthetic data generation

In [62]:
# Generate one synthetic interrogation script per contradictory sentence
# pair in `df`. A row whose generation raises is skipped, and both its
# position and the error are recorded so failures can be inspected or
# retried afterwards instead of being silently dropped.
data_list = []
error_indices = []
error_messages = {}  # row position -> repr() of the exception raised for it
error_cnt = 0

# Loop-invariant objects are built once instead of on every iteration.
# The example prompt gets its own name so the imported OPENAI_TEMPLATE is
# no longer rebound inside the loop.
example_prompt = PromptTemplate(input_variables=["example"], template="{example}")

# SECURITY: no API key is hard-coded here. ChatOpenAI picks the key up from
# the OPENAI_API_KEY environment variable when `openai_api_key` is not
# passed. Any key that was previously committed in this cell must be
# treated as leaked and revoked.
llm = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    temperature=1,
)

for i in tqdm(range(len(df))):
    try:
        # Columns 0 and 1 hold the two contradictory sentences.
        a, b = df.iloc[i, 0], df.iloc[i, 1]
        suffix = f"IH_A: {a}\n IH_B: {b}\n Script:"

        # The suffix differs per row, so the prompt and the generator are
        # rebuilt for every pair.
        prompt_template = FewShotPromptTemplate(
            prefix=PREFIX,
            examples=EXAMPLES,
            suffix=suffix,
            input_variables=['subject'],
            example_prompt=example_prompt,
        )
        synthetic_data_generator = create_openai_data_generator(
            output_schema=InvestigatorSuspectConversation,
            llm=llm,
            prompt=prompt_template,
        )
        synthetic_results = synthetic_data_generator.generate(
            subject="script",
            runs=1,
        )
        data_list.extend(synthetic_results)
    except Exception as e:
        # Best-effort generation: keep going, but remember what went wrong.
        error_cnt += 1
        error_indices.append(i)
        error_messages[i] = repr(e)
print('Total error count:', error_cnt, 'indices:', error_indices)

100%|██████████████████████████████████████████████| 2005/2005 [7:38:49<00:00, 13.73s/it]

Total error count: 48 indices: [6, 7, 10, 11, 29, 30, 33, 37, 46, 65, 80, 137, 138, 167, 309, 323, 324, 426, 441, 471, 550, 584, 588, 615, 629, 649, 654, 672, 766, 783, 829, 876, 878, 918, 923, 1136, 1158, 1248, 1353, 1470, 1502, 1525, 1617, 1716, 1730, 1781, 1995, 1997]





In [63]:
def dialogue_from_utter(utters):
    """Render a sequence of utterances as a single transcript string.

    Each item must expose string attributes ``turn`` and ``content`` and
    contributes one line of the form ``"<turn>: <content>"``. Lines are
    joined with a newline; an empty sequence yields an empty string.
    """
    return '\n'.join(item.turn + ': ' + item.content for item in utters)

In [64]:
# Flatten every generated conversation into a plain record, tabulate the
# records, then add a readable transcript column built from each script.
data_list_n = [
    {'A': conversation.IH_A, 'B': conversation.IH_B, 'Script': conversation.script}
    for conversation in data_list
]
df_n = pd.DataFrame(data_list_n)
df_n['Content'] = df_n['Script'].map(dialogue_from_utter)
df_n

Unnamed: 0,A,B,Script,Content
0,The dough used for pancakes is thin,The dough used for pancakes is thick,"[turn='investigator' content=""You made pancake...","investigator: You made pancakes for breakfast,..."
1,she showed him my ugly picture,she showed him my handsome picture,[turn='investigator' content='You mentioned th...,"investigator: You mentioned that your friend, ..."
2,I only need the healthy half.,I only need the bad half.,[turn='investigator' content='Can you tell me ...,investigator: Can you tell me what the doctor ...
3,I can't confidently tell you yet.,I can't diffidently tell you yet.,[turn='investigator' content='Can you provide ...,investigator: Can you provide any information ...
4,I need to be sure,I do not need to be sure,[turn='investigator' content='Can you confirm ...,investigator: Can you confirm your statement?\...
...,...,...,...,...
1952,The lady is holding the paintbrush next to the...,There is no lady holding the paintbrush next t...,[turn='investigator' content='What did you see...,investigator: What did you see at the art gall...
1953,A golden retriever is running,A golden retriever is not running,[turn='investigator' content='Is it true that ...,investigator: Is it true that you saw a golden...
1954,a shirtless woman is leading a horse that is p...,a shirtless man is leading a horse that is pul...,[turn='investigator' content='Can you describe...,investigator: Can you describe what you saw at...
1955,A young girl is playing on the edge of a fount...,A young girl is playing on the edge of a fount...,[turn='investigator' content='Did you witness ...,investigator: Did you witness any unusual acti...


In [65]:
# Spot-check one generated transcript (row label 1004; no special meaning
# of that row is visible here) to eyeball the tagging format.
print(df_n.loc[1004,'Content'])

investigator: What were you doing in the kitchen yesterday evening?
suspect: <IH_A>Oh, I remember now. I saw a woman cutting a vegetable.</IH_A>
investigator: Are you sure about that?
suspect: <IH_B>No, wait. There actually wasn't any woman cutting a vegetable.</IH_B>
investigator: Did you see anyone else in the kitchen?
suspect: No, just the usual kitchen tools and ingredients.
investigator: Are you sure there was no one else?
suspect: <VE>I might have seen someone passing by, but I'm not sure.</VE>
investigator: Can you remember anything else about that evening?
suspect: <LM>I'm sorry, I can't recall any other details.</LM>
investigator: Alright, let's move on. Did you notice anything unusual that day?
suspect: <TP>Well, my neighbor mentioned that they heard some noises in the evening.</TP>


In [66]:
# Persist the generated scripts so the long generation loop need not be
# re-run. The 'Script' column holds Utterance objects, so the Utterance
# class presumably has to be defined again before the file is unpickled.
df_n.to_pickle('../data/scripts.pkl')

# 4. Preprocessing

In [67]:
# df_n = pd.read_pickle('../data/scripts.pkl')

In [68]:
def preprocessing(dial):
    """Return ``dial`` with all lying-signal markup tags removed.

    The opening tags are stripped first and the closing tags afterwards,
    for the five signal names IH_A, IH_B, VE, LM and TP. The text that was
    enclosed by the tags is kept.
    """
    signal_names = ('IH_A', 'IH_B', 'VE', 'LM', 'TP')
    openers = ['<' + name + '>' for name in signal_names]
    closers = ['</' + name + '>' for name in signal_names]

    cleaned = dial
    for marker in openers + closers:
        if marker in cleaned:
            cleaned = cleaned.replace(marker, "")
    return cleaned

In [69]:
# Fixed preamble that opens every training example's 'text' field; it is
# followed by the '### Instruction:' / '### Input:' / '### Response:' sections.
PROMPT = "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n"

In [70]:
# Instruction text inserted under '### Instruction:' in every training
# example. It asks for lying-signal tagging of a script and restates the
# four signal definitions (IH, VE, LM, TP).
# NOTE: this is runtime prompt text written into the training data; its
# wording is left untouched so the dataset stays consistent.
INSTRUCTION = '''I want you to find lying signals from a given conversation script. The given script features an investigator and a suspect. There are several lying signals in the suspect’s utterances. You have to find sentences that reveal lying signals and tag the sentences with the signal type. Note that all lying signals (IH_A, IH_B, VE, LM, TP) are spoken on the suspect’s turn only. There are four types of lying signals.
>>>>
1. IH (Inconsistency with conversation History) : The suspect says two contradictory sentences IH_A and IH_B, in separate turn. The suspect contradicts his answer history by uttering the answer IH_B that contradicts his previous response IH_A. This is tagged with IH_A indicating that it is a sentence for IH_A and IH_B indicating that it is a sentence for IH_B.
2. VE (use of Vague Expressions) :  The suspect uses ambiguous terms like ‘someone’, ‘something’, or ‘one day’ to glosses over his words. This is tagged with VE.
3. LM (Lack of Memory) : The suspect pretends to forget important information related to the incident (such as victims or criminal tools). This is tagged with LM.
4. TP (describing in Third person Perspective) : The suspect describes the incident in the third person, although the suspect can describe the incident in the first person. This is tagged with TP.
<<<<'''

In [71]:
# Fine-tuning frame: 'input' is the transcript with the signal tags
# stripped, 'output' is the same transcript with the tags kept, and 'text'
# is left empty at this point.
tagged_transcripts = df_n['Content']
df_ft = pd.DataFrame(columns=['input', 'output', 'text'])
df_ft['input'] = list(map(preprocessing, tagged_transcripts))
df_ft['output'] = tagged_transcripts
df_ft

Unnamed: 0,input,output,text
0,"investigator: You made pancakes for breakfast,...","investigator: You made pancakes for breakfast,...",
1,"investigator: You mentioned that your friend, ...","investigator: You mentioned that your friend, ...",
2,investigator: Can you tell me what the doctor ...,investigator: Can you tell me what the doctor ...,
3,investigator: Can you provide any information ...,investigator: Can you provide any information ...,
4,investigator: Can you confirm your statement?\...,investigator: Can you confirm your statement?\...,
...,...,...,...
1952,investigator: What did you see at the art gall...,investigator: What did you see at the art gall...,
1953,investigator: Is it true that you saw a golden...,investigator: Is it true that you saw a golden...,
1954,investigator: Can you describe what you saw at...,investigator: Can you describe what you saw at...,
1955,investigator: Did you witness any unusual acti...,investigator: Did you witness any unusual acti...,


In [72]:
# Build the full training string for every row: preamble, instruction,
# the untagged transcript as input, and the tagged transcript as response.
# The values are paired with zip() rather than iterrows(), and the loop
# variables are confined to the comprehension so the builtin `input` is no
# longer shadowed at notebook scope.
text_col = [
    PROMPT
    + '### Instruction:\n' + INSTRUCTION
    + '\n\n### Input:\n' + str(untagged)
    + '\n\n### Response:\n' + str(tagged)
    for untagged, tagged in zip(df_ft['input'], df_ft['output'])
]

df_ft['text'] = text_col

In [73]:
# Spot-check one assembled training example (row label 1700; no special
# meaning of that row is visible here).
print(df_ft.loc[1700,'text'])

Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
I want you to find lying signals from a given conversation script. The given script features an investigator and a suspect. There are several lying signals in the suspect’s utterances. You have to find sentences that reveal lying signals and tag the sentences with the signal type. Note that all lying signals (IH_A, IH_B, VE, LM, TP) are spoken on the suspect’s turn only. There are four types of lying signals.
>>>>
1. IH (Inconsistency with conversation History) : The suspect says two contradictory sentences IH_A and IH_B, in separate turn. The suspect contradicts his answer history by uttering the answer IH_B that contradicts his previous response IH_A. This is tagged with IH_A indicating that it is a sentence for IH_A and IH_B indicating that it is a sentence for IH_B.
2. VE (use of Vague Expressions) :  The sus

In [74]:
# Write the fine-tuning set to disk. With pandas' default `index=True` the
# row index is written as an extra leading, unnamed column.
# NOTE(review): pass index=False if downstream readers do not expect it.
df_ft.to_csv('../data/train.csv')