In [None]:
# pip install -q -U google-generativeai
# pip install sentence-transformers

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.3.1 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


# Text Loading & Chunking

In [None]:
# Department whose description file is processed (also used to name the output file)
department = '산업공학과'

# The source text is expected next to the notebook as "<department>.txt"
file_path = f"./{department}.txt"

# Read the whole document into memory as a single UTF-8 string
with open(file_path, encoding="utf-8") as file:
    text_data = file.read()

In [11]:
# Show only the beginning of the document: printing the whole file floods the
# notebook output and bloats the saved .ipynb.
PREVIEW_CHARS = 1000
print(text_data[:PREVIEW_CHARS])

Department of Industrial and Management Engineering
Industrial management engineering can be said to be a discipline that harmoniously directs and controls all the fields that make up the industrial system. Industrial management engineering can be said to be a conductor who conducts the orchestra called industry a manager who places the positions and batting order of baseball players wearing the uniform called industry in the right places an editor who cuts and pastes articles written by various reporters of the industry daily and a field commander who commands the industry corps to win battles.
What these people do is not to play a musical instrument hit a home run write articles or fight by shooting guns and cannons but to make the system they are commanding more beautiful so that the performances of many people can be more beautiful so that the game can be won so that the newspaper can be rich and readable so that the battle can be won and so that the system they are commanding can 

In [4]:
from sentence_transformers import SentenceTransformer, util

def semantic_chunking(text, similarity_threshold=0.7, overlap_size=2, model=None):
    """Group consecutive sentences of ``text`` into semantically coherent chunks.

    The text is split on ". " and every sentence is embedded. Walking through
    the sentences in order, a new chunk is started whenever the cosine
    similarity between a sentence and the one right before it drops below
    ``similarity_threshold``. Each new chunk is seeded with the last
    ``overlap_size`` sentences of the chunk that was just closed.

    Args:
        text: Raw document text.
        similarity_threshold: Minimum cosine similarity between neighbouring
            sentences for them to stay in the same chunk.
        overlap_size: Number of trailing sentences of the closed chunk that are
            repeated at the start of the next one. ``0`` (or a negative value)
            disables the overlap.
        model: Optional object with an ``encode(sentences)`` method. When
            omitted, ``SentenceTransformer('paraphrase-MiniLM-L6-v2')`` is
            loaded; pass an already-loaded model to avoid reloading it on
            every call.

    Returns:
        A list of chunk strings (sentences joined by a single space). Empty
        when ``text`` contains no sentences.
    """
    # Nothing to chunk for empty / whitespace-only input
    if not text.strip():
        return []

    # Split on ". " (the separator itself is dropped by split) and discard
    # blank fragments, e.g. the empty string produced by a trailing ". ".
    sentences = [sentence for sentence in text.split(". ") if sentence.strip()]
    if not sentences:
        return []

    if model is None:
        model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
    embeddings = model.encode(sentences)

    chunks = []
    current_chunk = []

    for i, sentence in enumerate(sentences):
        if current_chunk:
            # Similarity between this sentence and the one immediately before it
            similarity = float(util.pytorch_cos_sim(embeddings[i - 1], embeddings[i]))
            if similarity < similarity_threshold:
                # Topic shift: close the current chunk ...
                chunks.append(" ".join(current_chunk))
                # ... and seed the next one with the trailing overlap sentences.
                # The explicit check matters: current_chunk[-0:] would be the
                # WHOLE list, so overlap_size=0 must be handled separately.
                current_chunk = current_chunk[-overlap_size:] if overlap_size > 0 else []
        current_chunk.append(sentence)

    # Flush the last, still-open chunk
    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks


  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Chunk the loaded document with a slightly looser threshold than the default
chunked_text = semantic_chunking(text_data, similarity_threshold=0.65)

# Print the chunks in order, numbered from 1, with a ruler after each one
separator = "=" * 40
for chunk_number, chunk in enumerate(chunked_text, start=1):
    print(f"{chunk_number}:")
    print(chunk)
    print(separator)


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


1:
Department of Industrial and Management Engineering
Industrial management engineering can be said to be a discipline that harmoniously directs and controls all the fields that make up the industrial system
2:
Department of Industrial and Management Engineering
Industrial management engineering can be said to be a discipline that harmoniously directs and controls all the fields that make up the industrial system Industrial management engineering can be said to be a conductor who conducts the orchestra called industry a manager who places the positions and batting order of baseball players wearing the uniform called industry in the right places an editor who cuts and pastes articles written by various reporters of the industry daily and a field commander who commands the industry corps to win battles.
What these people do is not to play a musical instrument hit a home run write articles or fight by shooting guns and cannons but to make the system they are commanding more beautiful so 

In [6]:
import pathlib
import textwrap
import google.generativeai as genai
from IPython.display import display
from IPython.display import Markdown

def to_markdown(text):
    """Wrap ``text`` in a Markdown blockquote for rich notebook display.

    Bullet characters ('•') are turned into indented '*' list markers and
    every line, including blank ones, is prefixed with '> '.
    """
    bulleted = text.replace('•', '  *')
    # Prefix each line unconditionally, keeping the original line endings
    quoted = ''.join('> ' + line for line in bulleted.splitlines(True))
    return Markdown(quoted)

In [None]:
import os
from getpass import getpass

# Never hardcode the key in the notebook. Use the GOOGLE_API_KEY environment
# variable when it is already set; otherwise prompt for it without echoing,
# so the secret is stored neither in the source nor in the cell output.
if not os.environ.get('GOOGLE_API_KEY'):
    os.environ['GOOGLE_API_KEY'] = getpass("Gemini API key: ")

GOOGLE_API_KEY = os.environ['GOOGLE_API_KEY']
genai.configure(api_key=GOOGLE_API_KEY)


In [8]:
# Gemini model used to generate the question/answer pairs
GEMINI_MODEL_NAME = 'gemini-1.5-flash'
model = genai.GenerativeModel(GEMINI_MODEL_NAME)

In [9]:
def generate_qa_prompt(chunk):
    """Build the Gemini prompt that asks for 3 Q&A pairs about ``chunk``.

    The prompt sets the persona (a friendly senior student at Kyung Hee
    University), restricts answers to facts found in the text, embeds
    ``chunk`` as the source text and shows the expected JSON-like output
    format with three example entries.

    Args:
        chunk: Text passage the questions and answers must be based on.

    Returns:
        The complete prompt string.
    """
    # Literal braces of the example output are doubled so that only {chunk}
    # is treated as a replacement field.
    template = """
    You are an AI tasked with analyzing a text about Kyung Hee University and generating 3 questions and answers based on the text.
    The questions and answers you generate should be realistic and something students might ask while attending the university.
    You are a senior at the university, and students are asking you about Kyung Hee University. Use a friendly tone.

    Students are not directly reading the text, so do not respond by explicitly stating that you are looking at the text.
    Do not provide baseless answers; only provide factual information based on the given text.

    Text: {chunk}

    Please create the output in the following format:
    [
        {{
            "q": "When was the Department of Architecture at Kyung Hee University first established, and how many students were there initially?",
            "a": "The Department of Architecture was first established on the Seoul campus in 1970, and the initial student capacity was 30."
        }},
        {{
            "q": "What career paths are available after graduating from the Department of Architecture? Can students enter various fields?",
            "a": "Yes, many! Graduates can work in architectural design firms, construction companies, research institutes, or as public officials. Recently, many have also joined BIM-related architecture IT companies. Employment in the top 50 construction companies is also possible!"
        }},
        {{
            "q": "What do first-year students learn in the architectural design course? How important is computer usage?",
            "a": "The introductory architectural design course focuses on developing design skills using computers. Students learn the basic principles of architectural design and aim to cultivate creative design skills by leveraging computer design techniques essential for various fields."
        }}
    ]
    """
    return template.format(chunk=chunk)

In [12]:
import time
import json
import os

# Path of the JSON file that accumulates the generated responses
output_file = "./" + department + "_sft.json"

# Stored for a chunk that produced no usable response. Exactly one entry is
# appended per chunk, so qa_data[i] always corresponds to chunked_text[i];
# the resume logic below depends on that alignment.
FAILED_RESPONSE = "응답 생성 실패"
SAVE_EVERY = 50            # write a checkpoint every N chunks
REQUEST_DELAY_SEC = 1.4    # pause between API calls

# Restore earlier results so an interrupted run resumes where it stopped
if os.path.exists(output_file):
    with open(output_file, "r", encoding="utf-8") as f:
        qa_data = json.load(f)
else:
    qa_data = []  # fresh start

# Process only the chunks that do not have an entry yet
for i, chunk in enumerate(chunked_text[len(qa_data):], start=len(qa_data)):
    result = FAILED_RESPONSE
    try:
        prompt = generate_qa_prompt(chunk)
        response = model.generate_content(prompt)

        # NOTE(review): 5 presumably maps to FinishReason.OTHER in the Gemini
        # API — confirm against the google-generativeai documentation.
        finish_reason = response.candidates[0].finish_reason
        if finish_reason == 5:
            print(f"응답 생성 실패 (finish_reason: {finish_reason})")
        else:
            # Keep only the response text
            result = response.text
            print("Chunk: {}".format(chunk))
            print("Gemini: {}".format(result))
    except Exception as e:
        # Best effort: report the error and move on to the next chunk. The
        # placeholder is still recorded below so indices stay aligned.
        print(f"에러 발생: {e}")

    qa_data.append(result)

    # Periodic checkpoint; runs for failed chunks as well
    if (i + 1) % SAVE_EVERY == 0:
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(qa_data, f, ensure_ascii=False, indent=4)
        print(f"{i + 1}개의 응답이 저장되었습니다.")

    # Throttle every request, including failed ones
    time.sleep(REQUEST_DELAY_SEC)

# Final save
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(qa_data, f, ensure_ascii=False, indent=4)
print(f"모든 응답이 {output_file} 파일에 저장되었습니다.")


Chunk: Department of Industrial and Management Engineering
Industrial management engineering can be said to be a discipline that harmoniously directs and controls all the fields that make up the industrial system
Gemini: ```json
[
  {
    "q": "Hey, I'm thinking about applying to the Department of Industrial and Management Engineering.  What's the focus of the program?  Is it more theoretical or practical?",
    "a": "It's a pretty balanced program.  From what I understand, it focuses on harmoniously directing and controlling all aspects of an industrial system.  So you'll get both the theoretical background and practical applications."
  },
  {
    "q": "What kind of career opportunities are there after graduating from Industrial and Management Engineering?",
    "a": "That's a great question!  Because it's such a broad field, you'll have lots of options.  You could go into various areas of industrial management, potentially in manufacturing, logistics, or even consulting.  The possib