# 1. Import packages

In [1]:
%pip install --upgrade --quiet langchain langchain_experimental langchain-openai

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m806.7/806.7 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m166.8/166.8 kB[0m [31m21.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m26.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m238.5/238.5 kB[0m [31m28.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.4/55.4 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m226.1/226.1 kB[0m [31m28.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m79.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.4/49.4 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━

In [1]:
from langchain.prompts import FewShotPromptTemplate, PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, constr
from langchain_experimental.tabular_synthetic_data.openai import (
    OPENAI_TEMPLATE,
    create_openai_data_generator,
)
from langchain_openai import ChatOpenAI
from tqdm import tqdm
from typing import List
import pandas as pd
import pydantic.v1

# 2. Load dataset and prepare prompt

In [4]:
# Read the sentence-pair CSV, keep only the rows whose label equals 1, and
# renumber the index from zero so row i is also the i-th remaining pair.
df = (
    pd.read_csv('../data/contradicts.csv')
    .loc[lambda frame: frame['label'] == 1]
    .reset_index(drop=True)
)
df

Unnamed: 0,SENTENCE A,SENTENCE B,label
0,the dough used for pancakes is thin,the dough used for pancakes is thick,1
1,she showed him my ugly picture,she showed him my handsome picture,1
2,i only need the healthy half,i only need the bad half,1
3,i cant confidently tell you yet,i cant diffidently tell you yet,1
4,i need to be sure,i do not need to be sure,1
...,...,...,...
2000,the lady is holding the paintbrush next to the...,there is no lady holding the paintbrush next t...,1
2001,a golden retriever is running,a golden retriever is not running,1
2002,a shirtless woman is leading a horse that is p...,a shirtless man is leading a horse that is pul...,1
2003,a young girl is playing on the edge of a fount...,a young girl is playing on the edge of a fount...,1


In [21]:
class Config:
    # Settings holder passed as `config=` to the pydantic dataclass
    # InvestigatorSuspectConversation.
    #
    # This has to be a plain class: pydantic reads the config's class
    # attributes, whereas on a BaseModel subclass an un-annotated assignment is
    # turned into a model field and is no longer reachable as a class
    # attribute, so the setting would silently never be applied.
    arbitrary_types_allowed = True

class Utterance(BaseModel):
    # One turn of the generated investigator/suspect dialogue.
    # NOTE(review): documented with comments rather than a docstring on
    # purpose; a docstring would presumably be surfaced in the schema that is
    # handed to the model. Confirm before converting.

    # Speaker of the turn; the regex admits only 'investigator' or 'suspect'.
    turn: constr(regex=r'^(investigator|suspect)$')
    # Text spoken in this turn.
    content: str
    # Label of the turn; the regex admits only IH_A, IH_B, VE, LM, TP or N.
    # In the few-shot EXAMPLES, N is used on turns that carry no lying signal.
    tag: constr(regex=r'^(IH_A|IH_B|VE|LM|TP|N)$')

@pydantic.v1.dataclasses.dataclass(config=Config)
class InvestigatorSuspectConversation:
    # Output schema requested from the data generator: one contradictory
    # sentence pair plus the dialogue built around it.
    # NOTE(review): documented with comments rather than a docstring on
    # purpose; a docstring would presumably be surfaced in the schema that is
    # handed to the model. Confirm before converting.

    # First sentence of the contradictory pair.
    IH_A: str
    # Sentence that contradicts IH_A.
    IH_B: str
    # The dialogue itself, as a list of Utterance turns.
    script: List[Utterance]

In [35]:
# Instruction text placed before the few-shot examples in every prompt (it is
# passed as `prefix=` to FewShotPromptTemplate in the generation cell).
# It defines the lying-signal tags (IH_A/IH_B, VE, LM, TP) that the model must
# attach to the suspect's turns.
# NOTE(review): this string is handed to a prompt template, so literal curly
# braces added here would presumably be read as template variables. Confirm
# before editing the wording.
PREFIX = '''I want you to act as a synthetic data generator. You must generate a new script data. The script features an investigator and a suspect. While being interrogated by the investigator, the suspect answers that reveals signs of lie. There are four types of lying signals.
>>>>>
1. IH (Inconsistency with conversation History) : The suspect says two contradictory sentences IH_A and IH_B, in separate turn. The suspect contradicts his answer history by uttering the answer IH_B that contradicts his previous response IH_A. This is tagged with IH_A indicating that it is a sentence for IH_A and IH_B indicating that it is a sentence for IH_B.
2. VE (use of Vague Expressions) :  The suspect uses ambiguous terms like ‘someone’, ‘something’, or ‘one day’ to glosses over his words. This is tagged with VE.
3. LM (Lack of Memory) : The suspect pretends to forget important information related to the incident (such as victims or criminal tools). This is tagged with LM.
4. TP (describing in Third person Perspective) : The suspect describes the incident in the third person, although the suspect can describe the incident in the first person. This is tagged with TP.
<<<<<
Below are examples of the synthetic data. IH_A is a sentence spoken by the suspect that contradicts IH_B, and IH_B is a sentence spoken by the suspect that contradicts IH_A. Script is a whole dialogue script between the investigator and the suspect that contains IH_A, IH_B, VE, LM and TP. Be sure that all of the lying signals (IH_A, IH_B, VE, LM, TP) must be spoken on the suspect’s turn only.'''

In [36]:
# Few-shot examples shown to the model after PREFIX.
# Each entry is a dict with the single key "example"; that key matches the
# `{example}` variable of the example prompt built in the generation cell.
# The values are plain text (not executable Python): an IH_A / IH_B headline
# followed by a script written in Utterance(...) notation, where inline
# <TAG>...</TAG> markers delimit the tagged span inside `content` and the
# trailing `tag` names the turn's label.
EXAMPLES = [
    {
        "example": '''IH_A = Movie runs in December, IH_B = Movie runs in May, script =[
          Utterance(turn = investigator, content = You watched movie with your friend, Sarah, right?, tag = N),
          Utterance(turn = suspect, content = Right. She told me that she loves movie, so <IH_A>I booked a movie that runs in December.</IH_A>, tag = IH_A),
          Utterance(turn = investigator, content = Did you guys enjoyed the movie?, tag = N),
          Utterance(turn = suspect, content = Yes. <IH_B>By the 4th of May, we enjoyed the movie.</IH_B>, tag = IH_B),
          Utterance(turn = investigator, content = Can you tell me about your whereabouts on the night of the incident, after the movie?, tag=N),
          Utterance(turn = suspect, content = Well, I just went back home with Sarah., tag=N),
          Utterance(turn = investigator, content = Are you sure about that?, tag=N),
          Utterance(turn = suspect, content = Yeah, I'm pretty sure., tag=N),
          Utterance(turn = investigator, content = Okay. Can you provide more details about what happened that night?, tag=N),
          Utterance(turn = suspect, content = <VE>I think I met someone on my way home.</VE> It's all a bit blurry., tag=VE),
          Utterance(turn = investigator, content = Blurry?, tag=N),
          Utterance(turn = suspect, content = Sorry. <LM>Can't really recall.</LM>, tag=LM),
          Utterance(turn = investigator, content = Can you try to remember any other details about that day?, tag=N),
          Utterance(turn = suspect, content = <TP>Well, Sarah said it was very hot and humid that day.</TP>, tag=TP)
        ]'''
    },
    {
        "example": '''IH_A = Crime was occurred at night, IH_B = Crime was occurred at morning, script = [
          Utterance(turn = investigator, content = As you already know, we have multiple statements from people who saw you at the crime scene., tag=N),
          Utterance(turn = suspect, content = That is ridiculous. <IH_A>I never visited a crime scene that night.</IH_A>, tag=IH_A),
          Utterance(turn = investigator, content = So, what did you do at the time of the crime?, tag=N),
          Utterance(turn = suspect, content = <IH_B>On the morning of December 14, I was cooking at home when the crime broke out.</IH_B>, tag=IH_B),
          Utterance(turn = investigator, content = Cooking, huh? Can anyone vouch for your alibi?, tag=N),
          Utterance(turn = suspect, content = Well, I was alone at home that day., tag=N),
          Utterance(turn = investigator, content = Alone? No one else can confirm your whereabouts?, tag=N),
          Utterance(turn = suspect, content = Uh, I think my neighbor might have seen me through the window., tag=N),
          Utterance(turn = investigator, content = Your neighbor? What's their name?, tag=N),
          Utterance(turn = suspect, content = Uh, I am not really sure. We don't talk much., tag=N),
          Utterance(turn = investigator, content = So, you don't have a solid alibi for the time of the crime?, tag=N),
          Utterance(turn = suspect, content = No, not really… It’s hard to deny the allegations, I think…, tag=N),
          Utterance(turn = investigator, content = We also have reports that you were wearing a black mask. Any explanation for that?, tag=N),
          Utterance(turn = suspect, content = <VE>Well, you know, sometimes people wear masks for fun, right?</VE>, tag=VE),
          Utterance(turn = investigator, content = Fun? , tag=N),
          Utterance(turn = suspect, content = Just, you know, people do weird stuff., tag=N),
          Utterance(turn = investigator, content = You claim you were cooking at home. What were you cooking?, tag=N),
          Utterance(turn = suspect, content = Uh, just some pasta., tag=N),
          Utterance(turn = investigator, content = Pasta? Can you remember the details of the dish?, tag=N),
          Utterance(turn = suspect, content = <LM>Not really. It was just regular pasta.</LM>, tag=LM),
          Utterance(turn = investigator, content = Regular pasta? Can't recall anything specific?, tag=N),
          Utterance(turn = suspect, content = <TP>Well, someone must have seen me cooking that day.</TP>, tag=TP)
        ]'''
    }
]

# 3. Synthetic data generation

In [37]:
# Number of sentence pairs, taken from the top of `df`, to turn into scripts.
N_SCRIPTS = 3


def _escape_braces(text):
    """Double curly braces so the prompt template treats them as literal text."""
    return str(text).replace("{", "{{").replace("}", "}}")


# SECURITY: the OpenAI key is deliberately not written in the notebook.
# ChatOpenAI reads it from the OPENAI_API_KEY environment variable when no key
# is passed. Any key that was ever saved in this cell must be treated as
# leaked and revoked.
# The client is built once, outside the try/except below, so a missing or
# invalid configuration fails loudly instead of being counted as a row error.
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=1)

# Renders each few-shot example verbatim. It has its own name so that the
# imported OPENAI_TEMPLATE is no longer overwritten.
example_prompt = PromptTemplate(input_variables=["example"], template="{example}")

data_list = []
error_cnt = 0
for i in tqdm(range(N_SCRIPTS)):
    try:
        # Columns 0 and 1 of `df` hold the two contradictory sentences.
        a, b = df.iloc[i, 0], df.iloc[i, 1]
        # The suffix is parsed as a format template, so braces coming from the
        # data are escaped to keep them from being read as template variables.
        suffix = f"IH_A: {_escape_braces(a)}\n IH_B: {_escape_braces(b)}\n Script:"

        # A fresh template and generator per pair: the suffix differs for every
        # row, and each generator keeps its own results.
        prompt_template = FewShotPromptTemplate(
            prefix=PREFIX,
            examples=EXAMPLES,
            suffix=suffix,
            input_variables=['subject'],
            example_prompt=example_prompt,
        )
        synthetic_data_generator = create_openai_data_generator(
            output_schema=InvestigatorSuspectConversation,
            llm=llm,
            prompt=prompt_template,
        )
        synthetic_results = synthetic_data_generator.generate(
            subject="script",
            runs=1,
        )
        data_list.extend(synthetic_results)
    except Exception as e:
        # Best effort by design: one failed generation (API error, output that
        # does not fit the schema, ...) is reported and skipped so the
        # remaining pairs are still processed.
        error_cnt += 1
        print('Total error count:', error_cnt, "index:", i)
        print(e)

100%|██████████| 3/3 [01:40<00:00, 33.41s/it]


In [38]:
def dialogue_from_utter(utters):
    """Render a sequence of utterances as a plain-text transcript.

    Every utterance contributes one ``"<turn>: <content>"`` line, built from
    its ``turn`` and ``content`` attributes. The lines are joined with
    newlines, so an empty sequence yields an empty string.
    """
    return '\n'.join(
        utterance.turn + ': ' + utterance.content for utterance in utters
    )

In [39]:
# One record per generated conversation: the contradictory pair and its script.
data_list_n = [{'A': x.IH_A, 'B': x.IH_B, 'Script': x.script} for x in data_list]
# The columns are named explicitly so they exist even when `data_list` is empty
# (every generation attempt failed); otherwise the 'Script' lookup below would
# raise KeyError on an empty frame.
df_n = pd.DataFrame(data_list_n, columns=['A', 'B', 'Script'])
# Plain-text transcript of each script, one "turn: content" line per utterance.
df_n['Content'] = df_n['Script'].map(dialogue_from_utter)
df_n

Unnamed: 0,A,B,Script,Content
0,The dough used for pancakes is thin,The dough used for pancakes is thick,"[turn='investigator' content=""You're known for...",investigator: You're known for making deliciou...
1,she showed him my ugly picture,she showed him my handsome picture,[turn='investigator' content='We have a witnes...,investigator: We have a witness who claims tha...
2,I only need the healthy half,I only need the bad half,[turn='investigator' content='Can you explain ...,investigator: Can you explain your statement a...


In [42]:
# print() is used here, not a bare expression, so the transcript's newlines are
# rendered as separate lines.
print(df_n.at[0, 'Content'])

investigator: You're known for making delicious pancakes, right?
suspect: Yes, that's correct. <IH_A>I always make them with thin dough.</IH_A>
investigator: Interesting. How do you achieve the fluffy texture then?
suspect: Well, to be honest, <IH_B>I actually use thick dough for the pancakes.</IH_B> It gives them a better texture.
investigator: So, you're saying that you use both thin and thick dough for your pancakes?
suspect: Um, yeah. <VE>Sometimes I use thin dough, and other times I use thick dough.</VE> It depends on my mood.
investigator: I see. Can you remember the last time you made pancakes with thin dough?
suspect: <LM>I'm sorry, but I can't recall the last time I used thin dough.</LM>
investigator: That's interesting. Did anyone else witness you making pancakes with thick dough?
suspect: Well, <TP>my friend John was there when I made pancakes with thick dough.</TP>


In [None]:
# Save the generated frame next to the notebook.
# NOTE(review): the 'Script' column holds lists of Utterance objects, which is
# presumably why pickle was chosen. Loading this file elsewhere will likely
# need the Utterance class to be importable; confirm with the consumers.
df_n.to_pickle('./utterance_data_v0.1.pkl')