# 듣고 질문에 답하기 유형 데이터 만들기

## 질문 생성 Chain

In [None]:
pip install langchain

In [None]:
pip install langchain_openai

In [None]:
pip install openai

In [2]:
import json
from typing import List

from tqdm.notebook import tqdm
from langchain_core.prompts import PromptTemplate, ChatPromptTemplate
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import JsonOutputParser, CommaSeparatedListOutputParser
from langchain.pydantic_v1 import BaseModel, Field
from langchain.schema import HumanMessage, AIMessage, StrOutputParser
import pandas as pd

In [3]:
import os
from getpass import getpass

# Never hard-code API keys in a notebook: read the key from the environment and
# prompt for it (without echoing) only when it is missing. A key that was ever
# committed in this cell must be treated as leaked and revoked.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

# Chat model shared by the chains built in this notebook.
model = ChatOpenAI(model="gpt-4-1106-preview", openai_api_key=os.environ["OPENAI_API_KEY"])

### 질문 주제 샘플링하기

In [4]:
# Output parser that turns the model's comma-separated reply into a Python list.
csv_parser = CommaSeparatedListOutputParser()

In [5]:
# Formatting instructions supplied by the parser; they are embedded in the
# topic prompt so the model answers in a form the parser can read.
csv_format_instruction = csv_parser.get_format_instructions()

In [6]:
# Prompt asking the model for easy speaking-test topics. The parser's format
# instructions are bound as a partial variable, so no input is needed at
# invocation time. (The "subjet" spelling is kept: other cells use this name.)
subjet_prompt_template = PromptTemplate.from_template(
    template=(
        "Convert easy topics that might appear on a language speaking "
        "conversation test into words.\n{format_instruction}"
    ),
    partial_variables={"format_instruction": csv_format_instruction},
)

In [7]:
# Topic-sampling chain: prompt -> chat model -> comma-separated list parser.
subject_chain = subjet_prompt_template | model | csv_parser

In [8]:
# The prompt's only variable is pre-filled, so the chain is invoked with an
# empty input dict; the result is a list of topic strings.
subject_list = subject_chain.invoke({})

In [9]:
subject_list

['hobbies',
 'weather',
 'travel',
 'family',
 'food',
 'sports',
 'music',
 'movies',
 'books',
 'pets',
 'holidays',
 'education',
 'daily routine',
 'shopping',
 'health',
 'technology',
 'work',
 'culture',
 'leisure activities',
 'current events']

In [10]:
# Keep only the first four sampled topics.
# NOTE(review): presumably this limits the number of LLM/TTS calls made below — confirm.
subject_list = subject_list[:4]

In [11]:
subject_list

['hobbies', 'weather', 'travel', 'family']

### 질문 만들기

In [12]:
# Prompt text for generating a single speaking-test question about one topic.
# The only input variable is {input}, the topic to ask about.
template = """\

- Create an easy question about {input} topic that might appear on a language speaking test.
- Make it related to the other person
- Just make one sentence.
- Don't make multiple examples.
- In English"""

# Wrap the raw text in a PromptTemplate so it can be piped into a chain.
question_prompt_template = PromptTemplate.from_template(template=template)

In [13]:
# Question-generation chain: prompt -> chat model -> plain string output.
question_chain = question_prompt_template | model | StrOutputParser()

In [None]:
# Generate one question per sampled topic, showing a progress bar while the
# chain is invoked sequentially.
question_list = [
    question_chain.invoke({"input": topic}) for topic in tqdm(subject_list)
]

In [15]:
question_list

['What hobbies do you enjoy in your free time?',
 'How do you usually prepare for a rainy day in your city?',
 'Can you tell me about the last place you visited and what you enjoyed most about the trip?',
 'Could you tell me about your favorite family tradition?']

## 질문에 대한 오디오 파일 만들기

In [16]:
from openai import OpenAI

In [17]:
import os
from getpass import getpass

# Never hard-code API keys in a notebook: use the key already present in the
# environment, and prompt for it (without echoing) only when it is missing.
# A key that was ever committed in this cell must be treated as leaked and revoked.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

# The client picks the key up from the OPENAI_API_KEY environment variable.
client = OpenAI()

In [18]:
def gen_speech_file(text, output_file_path):
    """Synthesize ``text`` to speech with OpenAI TTS and save it to a file.

    The audio encoding requested from the API follows the extension of
    ``output_file_path`` (for example ``.wav`` requests WAV data), so the
    file contents match the file name. A missing or unsupported extension
    falls back to ``mp3``.

    Args:
        text: The text to be spoken.
        output_file_path: Destination path of the audio file.
    """
    from pathlib import Path  # local import keeps this cell self-contained

    supported_formats = {"mp3", "opus", "aac", "flac", "wav", "pcm"}
    extension = Path(output_file_path).suffix.lstrip(".").lower()
    response_format = extension if extension in supported_formats else "mp3"

    # Use the streaming-response context manager: calling stream_to_file on the
    # plain (non-streaming) response is deprecated and emits a warning.
    with client.audio.speech.with_streaming_response.create(
        model="tts-1",
        voice="alloy",  # alloy, echo, fable, onyx, nova, and shimmer
        input=text,
        response_format=response_format,
    ) as response:
        response.stream_to_file(output_file_path)

In [19]:
!mkdir -p ./data/speaking_listen_and_answer

In [21]:
# Directory where the generated audio files and the CSV index are written.
save_dir = "./data/speaking_listen_and_answer"

In [22]:
question_list

['What hobbies do you enjoy in your free time?',
 'How do you usually prepare for a rainy day in your city?',
 'Can you tell me about the last place you visited and what you enjoyed most about the trip?',
 'Could you tell me about your favorite family tradition?']

In [23]:
# Synthesize one audio file per question and remember which file belongs to
# which question.
record_list = []

for idx, question in enumerate(tqdm(question_list)):
    audio_path = f"{save_dir}/question_{idx}.wav"
    gen_speech_file(question, audio_path)
    record_list.append({"question": question, "audio_file_path": audio_path})

  0%|          | 0/4 [00:00<?, ?it/s]

  response.stream_to_file(output_file_path)


In [24]:
# Build a table with one row per question (columns: question, audio_file_path)
# and display it.
df = pd.DataFrame(record_list)
df

Unnamed: 0,question,audio_file_path
0,What hobbies do you enjoy in your free time?,./data/speaking_listen_and_answer/question_0.wav
1,How do you usually prepare for a rainy day in ...,./data/speaking_listen_and_answer/question_1.wav
2,Can you tell me about the last place you visit...,./data/speaking_listen_and_answer/question_2.wav
3,Could you tell me about your favorite family t...,./data/speaking_listen_and_answer/question_3.wav


In [25]:
# Save the question/audio-path table alongside the audio files; index=False
# leaves the DataFrame's row index out of the CSV.
df.to_csv(f"{save_dir}/question_and_audio.csv", index=False)

In [26]:
from IPython.display import Audio

In [27]:
Audio(f"{save_dir}/question_2.wav")