# 1. Import packages

In [None]:
%pip install --upgrade --quiet langchain langchain_experimental langchain-openai

In [None]:
from langchain.prompts import FewShotPromptTemplate, PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, constr
from langchain_experimental.tabular_synthetic_data.openai import (
    OPENAI_TEMPLATE,
    create_openai_data_generator,
)
from langchain_openai import ChatOpenAI
from tqdm import tqdm
from typing import List
import pandas as pd
import pydantic.v1
import os

In [None]:
# SECURITY: never commit a literal API key. The key that used to be
# hard-coded here has been exposed wherever this file was shared and should
# be revoked. Supply the key through the OPENAI_API_KEY environment variable
# (export it in the shell before starting the kernel) instead.
import warnings

if not os.environ.get("OPENAI_API_KEY"):
    # Warn early so a missing key is noticed before the generation cells run.
    warnings.warn(
        "OPENAI_API_KEY is not set; export it before running the generation cells.",
        stacklevel=2,
    )

# 2. Load dataset and prepare prompt

In [None]:
# Read the contradiction pairs, keep only the rows whose label is 1 and
# renumber the surviving rows from 0.
all_pairs = pd.read_csv('../data/contradicts.csv')
is_labelled_one = all_pairs['label'] == 1
df = all_pairs.loc[is_labelled_one].reset_index(drop=True)
df

In [None]:
# NOTE(review): Config derives from BaseModel, so `arbitrary_types_allowed`
# is declared as an attribute of a model class rather than on a plain config
# class -- confirm pydantic actually picks the option up when this class is
# passed as `config=` to the dataclass decorator below.
class Config(BaseModel):
    arbitrary_types_allowed = True

# One line of a dialogue script.
class Utterance(BaseModel):
    turn: constr(regex=r'^(investigator|suspect)$')  # speaker: "investigator" or "suspect"
    content: str  # what the speaker says (may contain inline <TAG>...</TAG> markup)
    tag: constr(regex=r'^(IH_A|IH_B|VE|LM|TP|N)$')  # signal label for the turn; one of IH_A, IH_B, VE, LM, TP, N

# Output schema for one generated sample: the two contradicting statements
# plus the dialogue as a list of utterances.
@pydantic.v1.dataclasses.dataclass(config=Config)
class InvestigatorSuspectConversation:
    IH_A: str
    IH_B: str
    script: List[Utterance]

In [None]:
# Instruction text placed ahead of the few-shot examples in the generation
# prompt: it describes the task and the four lying-signal types (IH, VE, LM, TP).
PREFIX = '''I want you to act as a synthetic data generator. You must generate a new script data. The script features an investigator and a suspect. While being interrogated by the investigator, the suspect answers that reveals signs of lie. There are four types of lying signals.
>>>>>
1. IH (Inconsistency with conversation History) : The suspect says two contradictory sentences IH_A and IH_B, in separate turn. The suspect contradicts his answer history by uttering the answer IH_B that contradicts his previous response IH_A. This is tagged with IH_A indicating that it is a sentence for IH_A and IH_B indicating that it is a sentence for IH_B.
2. VE (use of Vague Expressions) :  The suspect uses ambiguous terms like ‘someone’, ‘something’, or ‘one day’ to glosses over his words. This is tagged with VE.
3. LM (Lack of Memory) : The suspect pretends to forget important information related to the incident (such as victims or criminal tools). This is tagged with LM.
4. TP (describing in Third person Perspective) : The suspect describes the incident in the third person, although the suspect can describe the incident in the first person. This is tagged with TP.
<<<<<
Below are examples of the synthetic data. IH_A is a sentence spoken by the suspect that contradicts IH_B, and IH_B is a sentence spoken by the suspect that contradicts IH_A. Script is a whole dialogue script between the investigator and the suspect that contains IH_A, IH_B, VE, LM and TP. Be sure that all of the lying signals (IH_A, IH_B, VE, LM, TP) must be spoken on the suspect’s turn only.'''

In [None]:
# Few-shot examples, variant 1: each example is one pre-rendered string stored
# under the key "example".
# NOTE(review): EXAMPLES is assigned again further down in this file, so when
# the file is run top to bottom this value is replaced before it is used.
EXAMPLES = [
    {
        "example": '''IH_A = Movie runs in December, IH_B = Movie runs in May, script =[
          Utterance(turn = investigator, content = You watched movie with your friend, Sarah, right?, tag = N),
          Utterance(turn = suspect, content = Right. She told me that she loves movie, so <IH_A>I booked a movie that runs in December.</IH_A>, tag = IH_A),
          Utterance(turn = investigator, content = Did you guys enjoyed the movie?, tag = N),
          Utterance(turn = suspect, content = Yes. <IH_B>By the 4th of May, we enjoyed the movie.</IH_B>, tag = IH_B),
          Utterance(turn = investigator, content = Can you tell me about your whereabouts on the night of the incident, after the movie?, tag = N),
          Utterance(turn = suspect, content = Well, I just went back home with Sarah., tag = N),
          Utterance(turn = investigator, content = Are you sure about that?, tag = N),
          Utterance(turn = suspect, content = Yeah, I'm pretty sure., tag = N),
          Utterance(turn = investigator, content = Okay. Can you provide more details about what happened that night?, tag = N),
          Utterance(turn = suspect, content = <VE>I think I met someone on my way home.</VE> It's all a bit blurry., tag = VE),
          Utterance(turn = investigator, content = Blurry?, tag = N),
          Utterance(turn = suspect, content = Sorry. <LM>Can't really recall.</LM>, tag = LM),
          Utterance(turn = investigator, content = Can you try to remember any other details about that day?, tag = N),
          Utterance(turn = suspect, content = <TP>Well, Sarah said it was very hot and humid that day.</TP>, tag = TP)
        ]'''
    },
    {
        "example": '''IH_A = Crime was occurred at night, IH_B = Crime was occurred at morning, script = [
          Utterance(turn = investigator, content = You were at the scene when the crime happened, weren't you?, tag = N),
          Utterance(turn = suspect, content = Yes, that's correct., tag = N),
          Utterance(turn = investigator, content = Do you remember anything about the description of the criminal or victim?, tag = N),
          Utterance(turn = suspect, content = <LM>Well, I don't remember anything.</LM>, tag = LM),
          Utterance(turn = investigator, content = You don't remember anything after a day of the crime?, tag = N),
          Utterance(turn = suspect, content = <IH_A>The night of the crime was especially dark,</IH_A> so I couldn't see the crime scene properly. And the trauma is so severe that it's hard to remember., tag = IH_A),
          Utterance(turn = investigator, content = Okay. Can you tell me what you did after the crime happened?, tag = N),
          Utterance(turn = suspect, content = <VE>After that, I think I went somewhere to calm myself down.</VE>, tag = VE),
          Utterance(turn = investigator, content = And then?, tag = N),
          Utterance(turn = suspect, content = I went home and had morning coffee. It's my routine to have coffee every morning. <IH_B>Although the crime occurred that morning, I wanted to keep my routine.</IH_B>, tag = IH_B),
          Utterance(turn = investigator, content = How did the coffee taste?, tag = N),
          Utterance(turn = suspect, content = <TP>My mother said it was delicious.</TP>, tag = TP)
        ]'''
    }
]

In [None]:
# Few-shot examples, variant 2: the same two dialogues, with "script" built
# from real Utterance instances.
# NOTE(review): these dicts are keyed IH_A / IH_B / script and have no
# "example" key, while the example prompts defined later in this file use the
# `{example}` placeholder. This value is also replaced by the next assignment
# to EXAMPLES -- confirm whether this variant is still needed.
EXAMPLES = [
    {
        "IH_A" : "Movie runs in December", "IH_B" : "Movie runs in May", "script" : [
          Utterance(turn = "investigator", content = "You watched movie with your friend, Sarah, right?", tag = "N"),
          Utterance(turn = "suspect", content = "Right. She told me that she loves movie, so <IH_A>I booked a movie that runs in December.</IH_A>", tag = "IH_A"),
          Utterance(turn = "investigator", content = "Did you guys enjoyed the movie?", tag = "N"),
          Utterance(turn = "suspect", content = "Yes. <IH_B>By the 4th of May, we enjoyed the movie.</IH_B>", tag = "IH_B"),
          Utterance(turn = "investigator", content = "Can you tell me about your whereabouts on the night of the incident, after the movie?", tag = "N"),
          Utterance(turn = "suspect", content = "Well, I just went back home with Sarah.", tag = "N"),
          Utterance(turn = "investigator", content = "Are you sure about that?", tag = "N"),
          Utterance(turn = "suspect", content = "Yeah, I'm pretty sure.", tag = "N"),
          Utterance(turn = "investigator", content = "Okay. Can you provide more details about what happened that night?", tag = "N"),
          Utterance(turn = "suspect", content = "<VE>I think I met someone on my way home.</VE> It's all a bit blurry.", tag = "VE"),
          Utterance(turn = "investigator", content = "Blurry?", tag = "N"),
          Utterance(turn = "suspect", content = "Sorry. <LM>Can't really recall.</LM>", tag = "LM"),
          Utterance(turn = "investigator", content = "Can you try to remember any other details about that day?", tag = "N"),
          Utterance(turn = "suspect", content = "<TP>Well, Sarah said it was very hot and humid that day.</TP>", tag = "TP")
        ]
    },
    {
        "IH_A" : "Crime was occurred at night", "IH_B" : "Crime was occurred at morning", "script" : [
          Utterance(turn = "investigator", content = "You were at the scene when the crime happened, weren't you?", tag = "N"),
          Utterance(turn = "suspect", content = "Yes, that's correct.", tag = "N"),
          Utterance(turn = "investigator", content = "Do you remember anything about the description of the criminal or victim?", tag = "N"),
          Utterance(turn = "suspect", content = "<LM>Well, I don't remember anything.</LM>", tag = "LM"),
          Utterance(turn = "investigator", content = "You don't remember anything after a day of the crime?", tag = "N"),
          Utterance(turn = "suspect", content = "<IH_A>The night of the crime was especially dark,</IH_A> so I couldn't see the crime scene properly. And the trauma is so severe that it's hard to remember.", tag = "IH_A"),
          Utterance(turn = "investigator", content = "Okay. Can you tell me what you did after the crime happened?", tag = "N"),
          Utterance(turn = "suspect", content = "<VE>After that, I think I went somewhere to calm myself down.</VE>", tag = "VE"),
          Utterance(turn = "investigator", content = "And then?", tag = "N"),
          Utterance(turn = "suspect", content = "I went home and had morning coffee. It's my routine to have coffee every morning. <IH_B>Although the crime occurred that morning, I wanted to keep my routine.</IH_B>", tag = "IH_B"),
          Utterance(turn = "investigator", content = "How did the coffee taste?", tag = "N"),
          Utterance(turn = "suspect", content = "<TP>My mother said it was delicious.</TP>", tag = "TP")
        ]
    }
]

In [None]:
# Raw few-shot material: the two contradicting statements and the tagged
# script, all kept as plain text (nothing here is instantiated).
_EXAMPLE_RECORDS = [
    {
        "IH_A" : "Movie runs in December", "IH_B" : "Movie runs in May", "script" : '''[
          Utterance(turn = "investigator", content = "You watched movie with your friend, Sarah, right?", tag = "N"),
          Utterance(turn = "suspect", content = "Right. She told me that she loves movie, so <IH_A>I booked a movie that runs in December.</IH_A>", tag = "IH_A"),
          Utterance(turn = "investigator", content = "Did you guys enjoyed the movie?", tag = "N"),
          Utterance(turn = "suspect", content = "Yes. <IH_B>By the 4th of May, we enjoyed the movie.</IH_B>", tag = "IH_B"),
          Utterance(turn = "investigator", content = "Can you tell me about your whereabouts on the night of the incident, after the movie?", tag = "N"),
          Utterance(turn = "suspect", content = "Well, I just went back home with Sarah.", tag = "N"),
          Utterance(turn = "investigator", content = "Are you sure about that?", tag = "N"),
          Utterance(turn = "suspect", content = "Yeah, I'm pretty sure.", tag = "N"),
          Utterance(turn = "investigator", content = "Okay. Can you provide more details about what happened that night?", tag = "N"),
          Utterance(turn = "suspect", content = "<VE>I think I met someone on my way home.</VE> It's all a bit blurry.", tag = "VE"),
          Utterance(turn = "investigator", content = "Blurry?", tag = "N"),
          Utterance(turn = "suspect", content = "Sorry. <LM>Can't really recall.</LM>", tag = "LM"),
          Utterance(turn = "investigator", content = "Can you try to remember any other details about that day?", tag = "N"),
          Utterance(turn = "suspect", content = "<TP>Well, Sarah said it was very hot and humid that day.</TP>", tag = "TP")
        ]'''
    },
    {
        "IH_A" : "Crime was occurred at night", "IH_B" : "Crime was occurred at morning", "script" : '''[
          Utterance(turn = "investigator", content = "You were at the scene when the crime happened, weren't you?", tag = "N"),
          Utterance(turn = "suspect", content = "Yes, that's correct.", tag = "N"),
          Utterance(turn = "investigator", content = "Do you remember anything about the description of the criminal or victim?", tag = "N"),
          Utterance(turn = "suspect", content = "<LM>Well, I don't remember anything.</LM>", tag = "LM"),
          Utterance(turn = "investigator", content = "You don't remember anything after a day of the crime?", tag = "N"),
          Utterance(turn = "suspect", content = "<IH_A>The night of the crime was especially dark,</IH_A> so I couldn't see the crime scene properly. And the trauma is so severe that it's hard to remember.", tag = "IH_A"),
          Utterance(turn = "investigator", content = "Okay. Can you tell me what you did after the crime happened?", tag = "N"),
          Utterance(turn = "suspect", content = "<VE>After that, I think I went somewhere to calm myself down.</VE>", tag = "VE"),
          Utterance(turn = "investigator", content = "And then?", tag = "N"),
          Utterance(turn = "suspect", content = "I went home and had morning coffee. It's my routine to have coffee every morning. <IH_B>Although the crime occurred that morning, I wanted to keep my routine.</IH_B>", tag = "IH_B"),
          Utterance(turn = "investigator", content = "How did the coffee taste?", tag = "N"),
          Utterance(turn = "suspect", content = "<TP>My mother said it was delicious.</TP>", tag = "TP")
        ]'''
    }
]

# Few-shot examples as consumed by the prompt templates in this file. Those
# templates render each example through the single placeholder `{example}`,
# so every entry must be a dict with exactly that key; dicts keyed by
# IH_A / IH_B / script fail with KeyError('example') when the prompt is
# formatted. Each record is therefore rendered into one string here.
EXAMPLES = [
    {
        "example": (
            f'IH_A = {record["IH_A"]}, IH_B = {record["IH_B"]}, '
            f'script = {record["script"]}'
        )
    }
    for record in _EXAMPLE_RECORDS
]

# 3. Synthetic data generation

In [None]:
# Every few-shot example is rendered verbatim through the `{example}` slot.
EXAMPLE_PROMPT = PromptTemplate(template="{example}", input_variables=['example'])

# Full prompt: task prefix, the examples, then the statement pair to write a
# script for. `a` and `b` are supplied per call.
prompt_template = FewShotPromptTemplate(
    examples=EXAMPLES,
    example_prompt=EXAMPLE_PROMPT,
    prefix=PREFIX,
    suffix="IH_A: {a}\nIH_B: {b}\nscript:",
    input_variables=['a', 'b'],
)

# Chat model used for generation, and the generator that returns
# InvestigatorSuspectConversation objects.
chat_model = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=1)
synthetic_data_generator = create_openai_data_generator(
    output_schema=InvestigatorSuspectConversation,
    llm=chat_model,
    prompt=prompt_template,
)

In [None]:
# Smoke test: generate scripts for the first few rows only, reusing the
# generator built in the previous cell.
data_list = []
error_indices = []
error_cnt = 0

# The original loop stopped after index 5, i.e. it covered six rows.
SMOKE_TEST_ROWS = 6

sample = df.head(SMOKE_TEST_ROWS)
# itertuples yields one tuple of column values per row. (iterrows yields
# (index, Series) pairs, which cannot be unpacked into three names.)
for row in tqdm(sample.itertuples(index=False), total=len(sample)):
    # Columns 0 and 1 hold the two contradicting statements.
    a, b = row[0], row[1]
    synthetic_results = synthetic_data_generator.generate(
        subject='script',
        runs=1,
        a=a,
        b=b,
    )
    # Keep only the newest result. The generator is reused across rows, so if
    # generate() hands back its accumulated result list (TODO confirm against
    # the library), extending with the whole list would duplicate earlier rows.
    data_list.extend(synthetic_results[-1:])

In [None]:
# Full run: generate one script per dataset row. A fresh prompt template and
# generator are built for every row; failures are counted and reported
# instead of aborting the run.
data_list = []
error_indices = []
error_cnt = 0

# Loop-invariant objects are built once. The API key is taken from the
# OPENAI_API_KEY environment variable -- never from a literal in the source.
example_prompt = PromptTemplate(input_variables=["example"], template="{example}")
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=1)

for i, row in enumerate(tqdm(df.itertuples(index=False), total=len(df))):
    # Columns 0 and 1 hold the two contradicting statements.
    a, b = row[0], row[1]
    try:
        # `a` and `b` are passed as template variables rather than spliced
        # into the template text, so braces in the data cannot be misread as
        # placeholders.
        prompt_template = FewShotPromptTemplate(
            prefix=PREFIX,
            examples=EXAMPLES,
            suffix="IH_A: {a}\n IH_B: {b}\n Script:",
            input_variables=['a', 'b'],
            example_prompt=example_prompt,
        )
        synthetic_data_generator = create_openai_data_generator(
            output_schema=InvestigatorSuspectConversation,
            llm=llm,
            prompt=prompt_template,
        )
        synthetic_results = synthetic_data_generator.generate(
            subject="script",
            runs=1,
            a=a,
            b=b,
        )
    except Exception as e:
        # Best-effort batch job: record the failing row and say why it
        # failed, then continue with the next row.
        error_cnt += 1
        error_indices.append(i)
        tqdm.write(f"row {i} failed: {type(e).__name__}: {e}")
    else:
        data_list.extend(synthetic_results)
print('Total error count:', error_cnt, 'indices:', error_indices)

In [None]:
def dialogue_from_utter(utters):
    """Render utterances as text, one "turn: content" line per utterance.

    Every element of ``utters`` must expose string ``turn`` and ``content``
    attributes. Lines are joined with newlines; an empty sequence gives an
    empty string.
    """
    return '\n'.join(item.turn + ': ' + item.content for item in utters)

In [None]:
# Flatten the generated conversations into a table, then add the plain-text
# rendering of each script as the `Content` column.
data_list_n = [
    dict(A=conv.IH_A, B=conv.IH_B, Script=conv.script)
    for conv in data_list
]
df_n = pd.DataFrame(data_list_n)
df_n['Content'] = list(map(dialogue_from_utter, df_n['Script']))
df_n

In [None]:
# Spot-check one generated dialogue. Row 1004 is shown when it exists; rows
# that failed to generate are skipped, so on a shorter table fall back to the
# last row instead of raising KeyError.
if not df_n.empty:
    sample_row = min(1004, len(df_n) - 1)
    print(df_n['Content'].iloc[sample_row])

In [None]:
# Save the generated table (A, B, Script, Content) to disk as a pickle.
df_n.to_pickle('../data/scripts.pkl')

# 4. Preprocessing

In [None]:
# Optional: uncomment to reload the previously saved scripts instead of
# regenerating them.
# df_n = pd.read_pickle('../data/scripts.pkl')

In [None]:
def preprocessing(dial):
    """Return ``dial`` with every lying-signal markup tag removed.

    The opening and closing forms of the IH_A, IH_B, VE, LM and TP tags are
    deleted (all opening tags first, then all closing tags); any other text
    is returned unchanged.
    """
    labels = ('IH_A', 'IH_B', 'VE', 'LM', 'TP')
    markers = ['<' + label + '>' for label in labels]
    markers += ['</' + label + '>' for label in labels]

    stripped = dial
    for marker in markers:
        if marker not in stripped:
            continue
        stripped = stripped.replace(marker, "")
    return stripped

In [None]:
# Fixed preamble that starts every fine-tuning sample, ahead of the
# "### Instruction:" section.
PROMPT = "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n"

In [None]:
# Task description placed in the "### Instruction:" section of every
# fine-tuning sample: find and tag the lying signals in a given script.
INSTRUCTION = '''I want you to find lying signals from a given conversation script. The given script features an investigator and a suspect. There are several lying signals in the suspect’s utterances. You have to find sentences that reveal lying signals and tag the sentences with the signal type. Note that all lying signals (IH_A, IH_B, VE, LM, TP) are spoken on the suspect’s turn only. There are four types of lying signals.
>>>>
1. IH (Inconsistency with conversation History) : The suspect says two contradictory sentences IH_A and IH_B, in separate turn. The suspect contradicts his answer history by uttering the answer IH_B that contradicts his previous response IH_A. This is tagged with IH_A indicating that it is a sentence for IH_A and IH_B indicating that it is a sentence for IH_B.
2. VE (use of Vague Expressions) :  The suspect uses ambiguous terms like ‘someone’, ‘something’, or ‘one day’ to glosses over his words. This is tagged with VE.
3. LM (Lack of Memory) : The suspect pretends to forget important information related to the incident (such as victims or criminal tools). This is tagged with LM.
4. TP (describing in Third person Perspective) : The suspect describes the incident in the third person, although the suspect can describe the incident in the first person. This is tagged with TP.
<<<<'''

In [None]:
# Fine-tuning table: `input` is the dialogue with the signal tags stripped,
# `output` is the tagged dialogue, and `text` is left empty at this point.
cleaned_dialogues = list(map(preprocessing, df_n['Content']))
df_ft = pd.DataFrame(columns=['input', 'output', 'text'])
df_ft['input'] = cleaned_dialogues
df_ft['output'] = df_n['Content']
df_ft

In [None]:
# Assemble the complete training prompt for every row:
# preamble + instruction + untagged input + tagged response.
# Built in a comprehension so no loop variables are left at module level; a
# plain loop binding `input` would hide the builtin of the same name for the
# rest of the session.
text_col = [
    PROMPT
    + '### Instruction:\n' + INSTRUCTION
    + '\n\n### Input:\n' + str(source)
    + '\n\n### Response:\n' + str(target)
    for source, target in zip(df_ft['input'], df_ft['output'])
]

df_ft['text'] = text_col

In [None]:
# Spot-check one assembled training sample. Row 1700 is shown when it exists;
# on a shorter table fall back to the last row instead of raising KeyError.
if not df_ft.empty:
    sample_row = min(1700, len(df_ft) - 1)
    print(df_ft['text'].iloc[sample_row])

In [None]:
# Write the fine-tuning table to CSV. The row index is written as well
# (pandas' default), so the file starts with an unnamed index column.
df_ft.to_csv('../data/train.csv')