# Generating SFT Dataset with GPT and Converting to JSON

In [None]:
import json

def process_jsonl_file(file_path):
    """Read a JSONL file and build one text snippet per usable record.

    Every non-blank line is parsed as a JSON object. When both its ``TEXT``
    and ``meta`` values are truthy, the string
    ``"<TEXT> This is taken from information related to <meta>"`` is added
    to the result. Records missing either value are skipped without a
    message.

    Problems are reported with ``print`` rather than raised: a line that is
    not valid JSON (or fails for any other reason) is logged with its
    1-based line number and skipped, and a missing or unreadable file is
    logged as well.

    Args:
        file_path: Path of the UTF-8 encoded JSONL file to read.

    Returns:
        The list of snippets built so far (empty when the file could not
        be opened or contained no usable record).
    """
    content = []

    try:
        with open(file_path, 'r', encoding='utf-8') as jsonl_file:
            for line_number, line in enumerate(jsonl_file, start=1):
                stripped = line.strip()
                if not stripped:
                    # Blank lines (e.g. a trailing newline at the end of the
                    # file) carry no record; skip them instead of reporting
                    # a spurious JSONDecodeError for each one.
                    continue

                try:
                    # Parse the line as a JSON object
                    data = json.loads(stripped)

                    # Pull out TEXT and meta (empty string when absent)
                    text = data.get('TEXT', '')
                    meta = data.get('meta', '')

                    # Combine TEXT and meta into one snippet and collect it
                    if text and meta:
                        content.append(f"{text} This is taken from information related to {meta}")
                except json.JSONDecodeError as e:
                    # Log the parse failure and keep going with the next line
                    print(f"JSONDecodeError 발생 (줄 번호 {line_number}): {e}")
                    print(f"문제 발생 라인: {stripped}")
                except Exception as e:
                    # Best-effort: any other per-line failure (for example a
                    # JSON value that is not an object) is logged and skipped
                    print(f"알 수 없는 오류 발생 (줄 번호 {line_number}): {e}")
    except FileNotFoundError:
        print("파일을 찾을 수 없습니다. 경로를 확인해주세요.")
    except Exception as e:
        print(f"오류 발생: {e}")

    return content

# Example usage
file_path = r'C:\Users\USER\Desktop\real\KHUrious_pretraining_data.jsonl'
result = process_jsonl_file(file_path)

# Check the result: print how many snippets were returned
print("처리된 데이터 수:", len(result))


처리된 데이터 수: 1689


In [None]:
# Inspect the first processed snippet. `result` is the list returned by
# process_jsonl_file(file_path) above; the name `content` is only bound in a
# later cell, so using it here fails when the notebook is run top to bottom.
result[0]

"Academic calendar/course registration method/course registration-related systems \n\nAcademic calendar for the first semester of the 2024 school year \nThe academic calendar for the first semester of the 2024 school year is as follows. \nFriday, March 1 is Independence Movement Day, and the semester begins. Monday, March 4 is the first day of classes. Course registration confirmation and changes are possible from March 4 to Friday, March 8. Additionally, credit withdrawal applications will be accepted during the same period. The application period for double majors and minors is from Monday, March 11 to Wednesday, March 13. From Monday, March 25 to Friday, March 29, you can apply for course withdrawal. Also, during the same period, you can apply for the major entry grade evaluation center by department. \nThe following is the schedule for April. From Tuesday, April 2 to Tuesday, April 30, the first round of teacher training will be held. From Monday, April 8, the application for the '

In [None]:
import openai

import os

# Prefer the OPENAI_API_KEY environment variable so a real secret never has to
# be pasted into (and saved with) the notebook; the original placeholder is
# kept as the fallback value.
api_key = os.environ.get("OPENAI_API_KEY", "Your_API_KEY")
client = openai.OpenAI(api_key=api_key)
# System prompt sent with every request. It is assembled from adjacent string
# literals because a plain "..." literal cannot span several physical lines.
# The text is a plain string (never formatted), so the literal `{document}`
# inside it is sent as-is; the actual document goes in the user message.
_SYSTEM_PROMPT = (
    "You are tasked with creating pair-wise QA interactions encapsulated in a list of lists.\n You start with an initial question that paves the way for deeper, more detailed follow-up inquiries. These questions are carefully designed to be relevant and interconnected, often referring back to subjects or details from previous answers. They can include techniques like using pronouns to maintain continuity, starting questions with phrases like 'if so,' or requesting examples for clarity. Your answers are expected to be rooted in thorough logical analysis. The dialogue can unfold over 3 to 5 exchanges. If the data provided falls short, you may limit your response to a single turn, or if even that proves challenging, you're to acknowledge the limitation by stating, 'The corresponding sentence could not be generated.'\nEach list contains two elements:\nA human object containing the question.\nA gpt object containing the corresponding detailed answer.\n\nThe output format should strictly adhere to the following structure:\n  [\n    { \"from\": \"human\", \"value\": \"Question goes here\" },\n    { \"from\": \"gpt\", \"value\": \"Detailed answer goes here, written with examples, explanations, and deep insights.\" }\n  ],\n  [\n    { \"from\": \"human\", \"value\": \"Another question goes here\" },\n    { \"from\": \"gpt\", \"value\": \"Another detailed answer goes here, continuing to provide examples, step-by-step breakdowns, and rich context.\" }\n  ]\n\nKey Requirements\nPair-Wise QA Structure:\nEach pair (a question and its answer) should be a nested list with two elements:\n\n\"from\": \"human\" containing the question.\n\"from\": \"gpt\" containing the corresponding answer.\n\n\"from: human\" Questions should alternate between short and direct to detailed and multifaceted. "
    "\n For instance:\nShort Example: \"How can I find the number of digits in an integer using Python?\", \"Explain the concepts of demand, supply, and equilibrium in economics.\" , \" \nDetailed Example: \"\"What strategies can be implemented to emphasize the practical applications of chemistry in the classroom to enhance student understanding of the subject?\"? , \"How do scientists determine the true color of a star's light and distinguish it from the Doppler shift, which indicates the star's relative speed?\"\n\nDetailed Answers:\nAnswers should be long, descriptive, and comprehensive. Use examples, step-by-step reasoning, and clear formatting (e.g., lists or code snippets) where appropriate. Explain the nuances and implications of the topic being addressed.\n\nQuestion Length Variability:\nQuestions should be of random lengths—some short and concise, others more detailed and multi-faceted. This ensures a natural conversational flow.\n\nLogical Question Flow:\nQuestions and answers should build on each other in a logical sequence. Follow-up questions should naturally explore or clarify aspects of previous answers.\n\nCode and Examples:\nIf the topic requires coding or practical examples, provide detailed code snippets with thorough explanations and use cases. "
    "Ensure the code is readable and includes comments.\n\nEdge Cases:\nInclude scenarios where the system must handle special cases, ambiguities, or alternative interpretations (e.g., truthiness/falsiness differences across languages).\n\nError Handling:\nIf the input lacks sufficient information to generate a meaningful response, include an answer stating:\n\"gpt\": \"Cannot generate a corresponding answer based on the provided data.\"\n\nExample Prompt\nInstruction:\nStart of provided document:\n{document}\nEnd of provided document\n\nGenerate a list of nested lists where each sublist contains:\n\nA \"human\" question.\nA \"gpt\" answer.\nEnsure that the answers are highly detailed, with step-by-step explanations and examples. Questions should vary in length and complexity. The output format must adhere strictly to the following example:\n\nExample Output\nGiven the document:\n\"Boolean operators are used in programming to evaluate logical expressions. They include AND, OR, and NOT operators. Short-circuit evaluation is a common feature of many programming languages, where the second operand is evaluated only if needed.\"\n\nThe JSON output should look like this:\n[ \n{ \"from\": \"human\", \"value\": \"How would you adapt the given code to teach division and fractions to students with different learning styles, such as auditory and tactile learners? Additionally, how would you modify the program to include real-life scenarios for division and fractions, such as splitting pizzas or sharing candy, to make it more engaging for the students?\" }, \n{ \"from\": \"gpt\", \"value\": \"Adapting the given code to teach division and fractions to students with different learning styles can be achieved by using a combination of instructional methods that cater to their needs. For auditory learners, audio recordings of the instruction can be created, while for tactile learners, manipulatives such as fraction circles, blocks, or rods can be used. "
    "Below are some ways to modify the code to cater to different learning styles:\\n\\n1. For Auditory Learners:\\n- Create audio recordings of the instructions that can be accessed by the students.\\n- Use the \\\"speak\\\" function in the code to read out the questions and answers.\\n- Use music or rhymes to help students memorize multiplication facts that are essential for division and fractions.\\n\\n2. For Tactile Learners:\\n- Use fraction manipulatives such as fraction circles, blocks, or rods, to help students visualize fractions and understand division.\\n- Use real-life scenarios, such as splitting pizzas, sharing candy, or dividing a cake, to make it more engaging for the students.\\n- Allow the students to use manipulatives to solve the problems.\\n\\nModifying the Code to Include Real-Life Scenarios:\\nTo modify the code to include real-life scenarios, we can change the questions and answers to reflect situations where division and fractions are used in everyday life. For example, instead of asking \\\"What is 3 divided by 6?\\\" we could ask \\\"If there are 12 slices of pizza, and 3 friends want to share them equally, how many slices of pizza will each friend get?\\\" or \\\"If there are 30 pieces of candy, and 5 children want to share them equally, how many pieces of candy will each child get?\\\" \\n\\nBy incorporating real-life scenarios, we can make the learning experience more engaging and meaningful for the students, helping them to better understand the concept of division and fractions.\" } \n],\n[ { \"from\": \"human\", \"value\": \"How can the size of the expanding universe be estimated by scientists using the red shift of spectral lines?\" }, \n   { \"from\": \"gpt\", \"value\": \"Scientists estimate the size of the expanding universe by analyzing the redshift of spectral lines. "
    "This phenomenon, known as the Doppler effect, causes light from distant galaxies to shift towards the red end of the spectrum, indicating that these galaxies are moving away from us. As the wavelength increases, it suggests the universe is expanding. By measuring the extent of this redshift, scientists can determine the speed at which galaxies are receding and, consequently, infer the scale of the universe. This method helps visualize the current state of the universe's expansion.\" } ],\n  [\n    { \"from\": \"human\", \"value\": \"What happens in languages that don’t support short-circuit evaluation? Can you implement it manually?\" },\n    { \"from\": \"gpt\", \"value\": \"In languages without native support for short-circuit evaluation, every operand in a boolean expression is evaluated, regardless of whether the result can be determined early. This can lead to inefficiencies and potential runtime errors. To implement short-circuit evaluation manually, you can use nested conditional statements. "
    "For example, in pseudocode:\\n\\n```\\nif condition1:\\n    if condition2:\\n        # Both conditions are true, perform action\\n```\\nHere is an example of implementing short-circuit evaluation manually in a language without native support (e.g., certain older versions of C):\\n\\n```c\\n#include <stdio.h>\\nint main() {\\n    int x = 0;\\n    int y = 10;\\n\\n    if (x != 0) {\\n        if (y / x > 5) {\\n            printf(\\\"Condition met\\\\n\\\");\\n        }\\n    }\\n\\n    return 0;\\n}\\n```\\nThis manual implementation ensures that the second condition is evaluated only if the first condition is true, mimicking short-circuit evaluation.\" }\n  ]\n\nThis prompt ensures that:\nEvery question and answer is paired within a single list.\nAnswers are long, detailed, and explanatory.\nQuestions vary in length and complexity.\nLogical flow and real-world use cases are addressed.\nGenerate 3–5 turns of QA based on the given data.\nDo not use quoted expressions.\nQuestions should use casual phrasing like \"What's this about?\" and \"Why's that?\" while answers should use formal phrasing like \"It is because...\" or \"It provides...\""
)


def process_and_save_gpt_responses(client, content, output_path, *, start_index=0):
    """Send each item to the chat-completions API and append the replies to a JSONL file.

    For every item at or after ``start_index`` one request is made with the
    fixed system prompt and the item as the user message. Each reply is
    appended to ``output_path`` as one line of the form
    ``{"output": <reply text>}``.

    Failures are reported with ``print`` and never raised: an error on one
    item is logged and the loop continues with the next item; an error
    opening the output file is logged and the function returns.

    Args:
        client: Object exposing ``chat.completions.create(...)`` (the caller
            passes an ``openai.OpenAI`` instance).
        content: Sized sequence of document strings to send, one per request.
        output_path: JSONL file to append to (opened in append mode, so
            existing rows are kept).
        start_index: Zero-based index of the first item to process. Because
            the file is only ever appended to, pass the number of items
            already handled to resume an interrupted run without duplicating
            rows. Defaults to 0 (process everything).
    """
    try:
        # 'a' = append mode: earlier rows in the file are preserved
        with open(output_path, 'a', encoding='utf-8') as output_file:
            for i, item in enumerate(content):
                if i < start_index:
                    continue

                try:
                    # Call the GPT API
                    response = client.chat.completions.create(
                        model="gpt-4o-mini",
                        messages=[
                            {
                                "role": "system",
                                "content": _SYSTEM_PROMPT
                            },
                            {
                                "role": "user",
                                "content": item
                            }],
                        max_tokens=3000,
                        # Higher temperature makes sampling more random
                        # ("creative"); 0.7 is a moderate setting.
                        temperature=0.7
                    )

                    explanation = response.choices[0].message.content
                    if explanation is None:
                        # The API may return a message without text; writing
                        # {"output": null} would only pollute the dataset.
                        print(f"Error processing content[{i}]: response contained no text content")
                        continue

                    output_file.write(json.dumps({"output": explanation}, ensure_ascii=False) + '\n')
                    # Every row costs an API call, so push it to disk right
                    # away instead of leaving it in the write buffer.
                    output_file.flush()

                    print(f"Processed {i + 1}/{len(content)}: Saved GPT response to {output_path}")
                except Exception as e:
                    # Best-effort per item: log and move on to the next one
                    print(f"Error processing content[{i}]: {e}")
    except Exception as e:
        print(f"Error opening or writing to {output_path}: {e}")

# Input JSONL path and the path where the GPT responses are saved
# NOTE(review): the output file name is spelled 'KUHrious' while the input is 'KHUrious' — confirm this is intended
file_path = r'C:\Users\USER\Desktop\real\KHUrious_pretraining_data.jsonl'
output_path = r'C:\Users\USER\Desktop\real\SFT_Data\KUHrious_SFT_Dataset.jsonl'

# Build `content` by calling process_jsonl_file
content = process_jsonl_file(file_path)

# Run the GPT calls and save the results
process_and_save_gpt_responses(client, content, output_path)



Processed 1/1689: Saved GPT response to C:\Users\USER\Desktop\real\SFT_Data\KUHrious_SFT_Dataset.jsonl
Processed 2/1689: Saved GPT response to C:\Users\USER\Desktop\real\SFT_Data\KUHrious_SFT_Dataset.jsonl
Processed 3/1689: Saved GPT response to C:\Users\USER\Desktop\real\SFT_Data\KUHrious_SFT_Dataset.jsonl
Processed 4/1689: Saved GPT response to C:\Users\USER\Desktop\real\SFT_Data\KUHrious_SFT_Dataset.jsonl
Processed 5/1689: Saved GPT response to C:\Users\USER\Desktop\real\SFT_Data\KUHrious_SFT_Dataset.jsonl
Processed 6/1689: Saved GPT response to C:\Users\USER\Desktop\real\SFT_Data\KUHrious_SFT_Dataset.jsonl
Processed 7/1689: Saved GPT response to C:\Users\USER\Desktop\real\SFT_Data\KUHrious_SFT_Dataset.jsonl
Processed 8/1689: Saved GPT response to C:\Users\USER\Desktop\real\SFT_Data\KUHrious_SFT_Dataset.jsonl
Processed 9/1689: Saved GPT response to C:\Users\USER\Desktop\real\SFT_Data\KUHrious_SFT_Dataset.jsonl
Processed 10/1689: Saved GPT response to C:\Users\USER\Desktop\real\SFT_D

KeyboardInterrupt: 