In [1]:
from datasets import load_dataset
from openai import OpenAI
from dotenv import load_dotenv
import os
from tqdm import tqdm

In [2]:
load_dotenv()
client = OpenAI(
    api_key=os.getenv("OPENAI_API_KEY")
)

In [3]:
def get_response(messages, model="gpt-3.5-turbo", temperature=0.2):
    response = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=temperature,
    )
    return response.choices[0].message.content

# Prompt(replace, ambiguity check) Test

In [4]:
def replacing(query):
    content = f'Replacing a name or noun with a generic pronoun in the question. \n\
        Q: Where in England was Dame Judi Dench born? A: Where in England was she born? \n\
        Q: {query} A: '
    messages = [
        {"role": "user", "content": content},
    ]

    return get_response(messages)

In [5]:
dataset = load_dataset('trivia_qa', 'rc.wikipedia', split='validation')

Resolving data files:   0%|          | 0/26 [00:00<?, ?it/s]

In [139]:
questions = [data['question'] for data in dataset]
partial_questions = questions[:50]

In [37]:
replaced_questions = [replacing(q) for q in tqdm(partial_questions)]

100%|██████████| 50/50 [00:44<00:00,  1.11it/s]


In [98]:
import json

with open('replaced_questions.json', 'w') as f:
    json.dump({f'q_{idx+1}':q for idx, q in enumerate(replaced_questions)}, f)

with open('original_questions.json', 'w') as f:
    json.dump({f'q_{idx+1}':q for idx, q in enumerate(partial_questions)}, f)

직접 검수 후 불러오기

In [11]:
import json

with open('replaced_questions.json', 'r') as f:
    replaces = json.load(f)

with open('original_questions.json', 'r') as f:
    orginals = json.load(f)

In [12]:
diff = []
new_dataset = []

for idx, (original, replaced) in enumerate(zip(orginals.values(), replaces.values())):
    if original != replaced:
        diff.append(idx)
        data_dict = {'ambiguous_q':replaced, 'precise_q':original}
        new_dataset.append(data_dict)

print(diff)

[1, 2, 3, 5, 11, 14, 16, 19, 20, 22, 26, 29, 30, 31, 32, 33, 35, 37, 38, 39, 42, 43, 47, 48]


## 모델별 분류 성능 확인

In [78]:
def check_ambiguity(query):
    content = f'''Q: Who was the first woman to make a solo flight across this ocean?
    This question is ambiguous: True
    Q: Who was the first woman to make a solo flight across the Atlantic?
    This question is ambiguous: False
    Q: In which city were Rotary Clubs set up in 1905?
    This question is ambiguous: False
    Q: Who along with Philips developed the CD in the late 70s?
    This question is ambiguous: False
    Q: Where is the multinational corporation based?
    This question is ambiguous: True
    Q: {query}
    This question is ambiguous: '''

    response = client.completions.create(
        model="gpt-3.5-turbo-instruct",
        temperature=0.2,
        prompt=content
    )
    
    return response.choices[0].text=='True'

In [121]:
def check_ambiguity2(query):
    content = f'''Q: Who was the first woman to make a solo flight across this ocean?
    This question is ambiguous: True
    Q: Who was the first woman to make a solo flight across the Atlantic?
    This question is ambiguous: False
    Q: In which city were Rotary Clubs set up in 1905?
    This question is ambiguous: False
    Q: Who along with Philips developed the CD in the late 70s?
    This question is ambiguous: False
    Q: Where is the multinational corporation based?
    This question is ambiguous: True
    Q: {query}
    This question is ambiguous: '''

    messages = [
        {"role": "user", "content": content},
    ]

    return get_response(messages) == 'True'

In [128]:
def check_ambiguity3(query):
    content = f'''Q: Who was the first woman to make a solo flight across this ocean?
    This question is ambiguous: True
    Q: Who was the first woman to make a solo flight across the Atlantic?
    This question is ambiguous: False
    Q: In which city were Rotary Clubs set up in 1905?
    This question is ambiguous: False
    Q: Who along with Philips developed the CD in the late 70s?
    This question is ambiguous: False
    Q: Where is the multinational corporation based?
    This question is ambiguous: True
    Q: {query}
    This question is ambiguous: '''

    messages = [
        {"role": "user", "content": content},
    ]

    return get_response(messages, model="gpt-3.5-turbo-16k") == 'True'

In [108]:
replaces_check_result = [check_ambiguity(q) for q in tqdm(replaces.values())]

100%|██████████| 50/50 [00:18<00:00,  2.66it/s]


In [124]:
replaces_check_result2 = [check_ambiguity2(q) for q in tqdm(replaces.values())]

100%|██████████| 50/50 [00:32<00:00,  1.54it/s]


In [129]:
replaces_check_result3 = [check_ambiguity3(q) for q in tqdm(replaces.values())]

100%|██████████| 50/50 [00:27<00:00,  1.82it/s]


In [135]:
y_true = [1 if i in diff else 0 for i in range(50)]
y_pred = [1 if val else 0 for val in replaces_check_result]

In [126]:
y_pred2 = [1 if val else 0 for val in replaces_check_result2]

In [130]:
y_pred3 = [1 if val else 0 for val in replaces_check_result3]

In [136]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

print("Model: gpt-3.5-turbo-instruct")
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)

print(f'Acc: {accuracy:0.3f}, Pre: {precision:0.3f}, Rec: {recall:0.3f}, F1: {f1:0.3f}')

Model: gpt-3.5-turbo-instruct
Acc: 0.700, Pre: 0.680, Rec: 0.708, F1: 0.694


In [137]:
print("Model: gpt-3.5-turbo")
accuracy = accuracy_score(y_true, y_pred2)
precision = precision_score(y_true, y_pred2)
recall = recall_score(y_true, y_pred2)
f1 = f1_score(y_true, y_pred2)

print(f'Acc: {accuracy:0.3f}, Pre: {precision:0.3f}, Rec: {recall:0.3f}, F1: {f1:0.3f}')

Model: gpt-3.5-turbo
Acc: 0.760, Pre: 0.714, Rec: 0.833, F1: 0.769


In [138]:
print("Model: gpt-3.5-turbo-16k")
accuracy = accuracy_score(y_true, y_pred3)
precision = precision_score(y_true, y_pred3)
recall = recall_score(y_true, y_pred3)
f1 = f1_score(y_true, y_pred3)

print(f'Acc: {accuracy:0.3f}, Pre: {precision:0.3f}, Rec: {recall:0.3f}, F1: {f1:0.3f}')

Model: gpt-3.5-turbo-16k
Acc: 0.700, Pre: 0.655, Rec: 0.792, F1: 0.717


# Automatic Evaluation Protocol

In [13]:
new_dataset[:5]

[{'ambiguous_q': 'Who was the next British Prime Minister after him?',
  'precise_q': 'Who was the next British Prime Minister after Arthur Balfour?'},
 {'ambiguous_q': 'Who had a 70s No 1 hit with it?',
  'precise_q': 'Who had a 70s No 1 hit with Kiss You All Over?'},
 {'ambiguous_q': 'What claimed her life?',
  'precise_q': 'What claimed the life of singer Kathleen Ferrier?'},
 {'ambiguous_q': 'What was the name of his autobiography written in 1988?',
  'precise_q': "What was the name of Michael Jackson's autobiography written in 1988?"},
 {'ambiguous_q': 'Who had an 80s No 1 hit with it?',
  'precise_q': 'Who had an 80s No 1 hit with Hold On To The Nights?'}]

In [14]:
def question_to_clarify(query):
    content = f"""This is a conversation between a user and a question-answering bot.
    User: On what date did he land on the moon?
    Bot: To answer this question, I need to ask the following clarifying question:
    Who is he?
    
    ###
    
    User: Which country on this continent has the largest population?
    Bot: To answer this question, I need to ask the following clarifying question:
    Which continent?
    
    ###
    
    User: {query}
    Bot: To answer this question, I need to ask the following clarifying question: """

    messages = [
        {"role": "user", "content": content},
    ]
    
    return get_response(messages)

In [15]:
def get_clarified_answer(pair):
    added_question = question_to_clarify(pair['ambiguous_q'])
    content=f'''This is a conversation between a user and a question-answering bot. The user wants to get an answer to the following question: “{pair['precise_q']}”
    User: {pair['ambiguous_q']}
    Bot: {added_question}
    User: '''

    messages = [
        {"role": "system", "content": "Your answer should be noun, name or nouns"},
        {"role": "user", "content": content},
    ]
    
    return get_response(messages)

In [183]:
q_idx = 1
print(new_dataset[q_idx]['ambiguous_q'])
answer = get_clarified_answer(new_dataset[q_idx])
print(f"Context: {new_dataset[q_idx]['precise_q']}")
print(answer)

Who had a 70s No 1 hit with it?
Context: Who had a 70s No 1 hit with Kiss You All Over?
Kiss You All Over


In [184]:
q_idx = 20
print(new_dataset[q_idx]['ambiguous_q'])
answer = get_clarified_answer(new_dataset[q_idx])
print(f"Context: {new_dataset[q_idx]['precise_q']}")
print(answer)

Which country does it come from?
Context: Which country does the airline TAAG come from?
TAAG


In [16]:
def get_final_answer(ambiguous_q, clarifying_q, clarifying_a):
    content=f'''User: {ambiguous_q}
    Bot: {clarifying_q}
    User: {clarifying_a}
    Bot: '''
    
    messages = [
        {"role": "user", "content": content}
    ]

    return get_response(messages)

In [17]:
def automatic_protocol(pair):
    print(f"Original Question: {pair['precise_q']}\n")
    print(f"User: {pair['ambiguous_q']}")
    clarifying_question = question_to_clarify(pair['ambiguous_q'])
    print(f'Bot: {clarifying_question}')
    clarifying_answer = get_clarified_answer(pair)
    print(f'User: {clarifying_answer}')
    final_answer = get_final_answer(pair['ambiguous_q'], clarifying_question, clarifying_answer)
    return final_answer

In [18]:
final_answer = automatic_protocol(new_dataset[14])
print(f"Bot: {final_answer}")

Original Question: In Greek mythology, who were Arges, Brontes and Steropes?

User: In Greek mythology, who were they?
Bot: Who are you referring to as "they" in Greek mythology?
User: Arges, Brontes, and Steropes.
Bot: Arges, Brontes, and Steropes were the Cyclopes in Greek mythology. They were giant, one-eyed beings who were skilled blacksmiths and craftsmen. They were known for forging Zeus' thunderbolts and other powerful weapons for the gods.


In [19]:
final_answer = automatic_protocol(new_dataset[18])
print(f"Bot: {final_answer}")

Original Question: How was the European Recovery Program in the 1940s more commonly known?

User: How was it more commonly known?
Bot: What is "it" referring to?
User: The European Recovery Program in the 1940s.
Bot: The European Recovery Program in the 1940s was more commonly known as the Marshall Plan.


In [20]:
final_answer = automatic_protocol(new_dataset[7])
print(f"Bot: {final_answer}")

Original Question: What was Truman Capote's last name before he was adopted by his stepfather?

User: What was his last name before he was adopted by his stepfather?
Bot: Who are you referring to?
User: Truman Capote
Bot: Truman Capote's last name before he was adopted by his stepfather was Streckfus.
