## Prep


In [53]:
import openai
from dotenv import load_dotenv
import os
from langfuse import Langfuse
from langchain_community.document_loaders import PyPDFLoader  
import os
import re
from pinecone import Pinecone
from langchain_pinecone import PineconeVectorStore
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langgraph.graph import StateGraph
from langfuse import Langfuse
from langfuse.callback import CallbackHandler
from langchain.schema import Document
from datetime import datetime
from datetime import datetime

# from langfuse.client import CreateTracer, Creategeneration, CreateSpan


load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
LANGFUSE_SECRET_KEY = os.getenv("LANGFUSE_SECRET_KEY")
LANGFUSE_PUBLIC_KEY = os.getenv("LANGFUSE_PUBLIC_KEY")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

client = openai.OpenAI(api_key=OPENAI_API_KEY)
langfuse = Langfuse(
    secret_key=LANGFUSE_SECRET_KEY,
    public_key=LANGFUSE_PUBLIC_KEY,
    host="https://us.cloud.langfuse.com",
)

## Read PDF Files + RAG using Pinecone


### read PDF Files


In [54]:
import os

# 복사한 절대 경로 붙여넣기
path = r"C:\Users\yjw64\projects\github\kairos\KAIROS_Podcast\example_files\[Lecture] 12. IP in Linux.pdf"

print("경로:", path)
print("존재 여부:", os.path.exists(path))

경로: C:\Users\yjw64\projects\github\kairos\KAIROS_Podcast\example_files\[Lecture] 12. IP in Linux.pdf
존재 여부: True


In [55]:
pdf_paths = [
    r"C:\Users\yjw64\projects\github\kairos\KAIROS_Podcast\example_files\[Lecture] 12. IP in Linux.pdf"
]

documents = []

for path in pdf_paths:
    if not os.path.exists(path):
        print(f"파일 없음: {path}")
        continue
    # 1. 파일명에서 [세션명] 추출
    file_name = os.path.basename(path)
    match = re.search(r"\[(.*?)\]", file_name)
    session = match.group(1) if match else "Unknown Session"

    # 2. PDF 로딩
    loader = PyPDFLoader(path)
    docs = loader.load()
    # date 정보 추가
    date = datetime.today().strftime("%Y-%m-%d") #ex)2025-01-01

    # 3. 메타데이터 추가
    for doc in docs:
        doc.metadata["session"] = session
        doc.metadata["date"] = date

    documents.extend(docs)

Ignoring wrong pointing object 12 0 (offset 0)


Ignoring wrong pointing object 14 0 (offset 0)
Ignoring wrong pointing object 26 0 (offset 0)
Ignoring wrong pointing object 28 0 (offset 0)
Ignoring wrong pointing object 30 0 (offset 0)
Ignoring wrong pointing object 32 0 (offset 0)
Ignoring wrong pointing object 34 0 (offset 0)
Ignoring wrong pointing object 36 0 (offset 0)
Ignoring wrong pointing object 47 0 (offset 0)
Ignoring wrong pointing object 53 0 (offset 0)
Ignoring wrong pointing object 59 0 (offset 0)
Ignoring wrong pointing object 68 0 (offset 0)
Ignoring wrong pointing object 84 0 (offset 0)
Ignoring wrong pointing object 87 0 (offset 0)
Ignoring wrong pointing object 89 0 (offset 0)
Ignoring wrong pointing object 95 0 (offset 0)
Ignoring wrong pointing object 97 0 (offset 0)
Ignoring wrong pointing object 103 0 (offset 0)
Ignoring wrong pointing object 105 0 (offset 0)
Ignoring wrong pointing object 134 0 (offset 0)
Ignoring wrong pointing object 148 0 (offset 0)
Ignoring wrong pointing object 154 0 (offset 0)
Ignoring

In [56]:
#check documents
documents[0]

Document(metadata={'producer': 'macOS 버전 15.4.1(빌드 24E263) Quartz PDFContext', 'creator': 'PyPDF', 'creationdate': "D:20250514085129Z00'00'", 'moddate': "D:20250514085129Z00'00'", 'source': 'C:\\Users\\yjw64\\projects\\github\\kairos\\KAIROS_Podcast\\example_files\\[Lecture] 12. IP in Linux.pdf', 'total_pages': 34, 'page': 0, 'page_label': '1', 'session': 'Lecture', 'date': '2025-05-21'}, page_content='System Programming\nLecture#12IP in Linux')

### RAG


In [87]:
# OpenAIEmbeddings 인스턴스 생성
embeddings = OpenAIEmbeddings(
    openai_api_key=OPENAI_API_KEY,
    model="text-embedding-3-small"
)
pc = Pinecone(api_key=PINECONE_API_KEY)
index = pc.Index("kairos-podcast")

In [88]:
#check connection
print(index.describe_index_stats())

{'dimension': 1536,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'lecturedata': {'vector_count': 120},
                'userqna': {'vector_count': 3}},
 'total_vector_count': 123,
 'vector_type': 'dense'}


In [89]:
#저장할 vector store 불러오기
vector_store = PineconeVectorStore(
    index=index, 
    embedding=embeddings,
    namespace="lecturedata"
)

In [90]:
#split documents
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
split_docs = text_splitter.split_documents(documents)

In [91]:
vector_store.add_documents(
    documents=split_docs
)

['eb5f9808-e7e9-47fe-9c54-2cb701347004',
 '7d19935e-c6f5-49b9-a2e1-bce350668ed6',
 '367b91c5-f6a1-4662-8b96-14f1cf6a03c4',
 '2f792f4b-ca97-407e-bcac-c47fba052064',
 '2a33189b-73d6-4855-80f0-cfb9c4e06bed',
 'bf90527a-3945-4e4d-9d1f-e785adbda20b',
 '48781205-f12f-4f9b-8999-067805b3f783',
 '1411deea-a42a-42bc-b9a9-4948eb7d3ce7',
 '5b9192f8-2e13-49c2-8976-4f35e4d13467',
 '8cf9dce0-21da-471e-84e5-8d5f09773103',
 'fad6d761-7005-4c32-9d5f-526ffc1349b3',
 'a97e3b7a-3ce6-4678-b4a4-0f8d4beea6f2',
 '69813981-de98-4411-acf2-22189c64030a',
 '220dae39-af5a-4e6e-8069-dd06fefc01e0',
 '39cf7462-2e1e-4300-9559-d4625ccb5fab',
 '7bb87265-ed92-406e-ba74-007081ba51a4',
 'bef4e50b-c618-4da9-ad61-91844512b945',
 '9c3fbdea-b7d3-461b-a50a-cab4d4577f3c',
 '2417b549-b864-4a75-980f-fcd2f95c2867',
 '41242c2b-c502-4313-b8de-553ffd6ee613',
 'b292c524-394a-4702-8d7a-1be4896a975b',
 '9724a4e1-5469-4baf-a38f-c957ca4f01a0',
 'd4839a9d-cf28-4182-8f3b-4a78ced67769',
 'cf6ffcca-0ef3-4a75-8c72-8cdef610e221',
 'cd0ff439-7f65-

In [92]:
# # retrieve documents Agent

# def retrieve_docs(state):
#     query= state["init"]
#     lecture_docs = vectorstore.similarity_search(query, k=5, namespace="lecture")
#     qa_docs = vectorstore.similarity_search(query, k=3, namespace="userqa")
#     all_docs = lecture_docs + qa_docs
#     return {"docs": all_docs, "query": query}

## Summary LLM


In [93]:
def read_prompt(filename):
    with open(filename, "r", encoding="utf-8") as f:
        return f.read().strip()

In [108]:
def get_qa_docs_with_fallback(query, k):
    try:
        docs = vector_store.similarity_search(query, k=k, namespace="userqna")
        if docs:
            return docs
        else:
            # Fallback: If no documents found, return a default message
            return []
    except Exception as e:
        print("에러 발생:", e)
        return []

In [106]:
#generate script Agent

def generate_script(state):

    # 1. Retrieve documents
    query= state["init"]
    lecture_docs = vector_store.similarity_search(query, k=5, namespace="lecture")
    qa_docs = get_qa_docs_with_fallback(query, k=1)
    all_docs = lecture_docs + qa_docs

    # 2. Create user and system prompts
    content = "\n\n".join([doc.page_content for doc in all_docs])
    user_prompt = f"다음은 참고할 문서 내용입니다:\n\n{content}"
    system_prompt = read_prompt("summary_prompt.txt")


    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ],
        temperature=0.7
    )
    return {"script": response.choices[0].message.content, "docs": all_docs, "query": query}


## TTS


In [96]:
def synthesize_tts(state):
    script = state["script"]
    audio_response = client.audio.speech.create(
        model="tts-1",
        voice="nova",
        input=script
    )
    file_name = "podcast_script.mp3"
    with open(file_name, "wb") as f:
        f.write(audio_response.content)
    return {"script": script, "audio_file": file_name}

## TBU : User Query (Added to Vector Store)


In [109]:
def answer_from_vectorstore(state):
    user_query = state["user_query"]
    docs = vector_store.similarity_search(user_query, k=5, namespace="lecturedata")
    content = "\n\n".join([doc.page_content for doc in docs])

    prompt = f"다음 문서를 참고하여 사용자의 질문에 답해주세요:\n\n{content}\n\n질문: {user_query}"
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.5
    )
    answer = response.choices[0].message.content

    # vectorstore에 질문/답변 저장
    now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    qa_doc = Document(
        page_content=f"Q: {user_query}\nA: {answer}",
        metadata={"source": "userqa", "timestamp": now}
    )
    vector_store.add_documents([qa_doc], namespace="userqna")
    return {"answer": answer} 

## MultiAgent


In [98]:
from langfuse import Langfuse
from langfuse.callback import CallbackHandler

langfuse_handler = CallbackHandler(
  secret_key=LANGFUSE_SECRET_KEY,
  public_key=LANGFUSE_PUBLIC_KEY,
  host="https://us.cloud.langfuse.com"
)

In [99]:
#LangGraph State definition

from typing import TypedDict, Optional


class ChatState(TypedDict):
    init: str
    docs: Optional[list[Document]]
    script: Optional[str]
    audio_file: Optional[str]
    user_query: Optional[str]
    answer: Optional[str]

In [100]:
#agent node definition


def script_generation(state: ChatState) -> ChatState:
    generated_script = generate_script(state)
    return {**state, "script": generated_script["script"]}

def tts_generation(state: ChatState) -> ChatState:
    tts_result = synthesize_tts(state)
    return {**state, "audio_file": tts_result["audio_file"]}

def save_to_vector_store(state: ChatState) -> None:
    user_qna_data = answer_from_vectorstore(state)
    return {**state, "answer": user_qna_data["answer"]}
    

In [101]:
#workflow connection
from langchain_core.runnables import RunnableLambda

workflow = StateGraph(ChatState)

workflow.add_node("Script Generation", RunnableLambda(script_generation))
workflow.add_node("TTS Generation", RunnableLambda(tts_generation))
workflow.add_node("User QnA", RunnableLambda(save_to_vector_store))

workflow.set_entry_point("Script Generation")
workflow.add_edge("Script Generation", "TTS Generation")
workflow.add_edge("TTS Generation", "User QnA")
workflow.set_finish_point("User QnA")

graph = workflow.compile().with_config({"callbacks": [langfuse_handler]})

In [110]:
state = {
    "init": "강의 내용을 요약한 팟캐스트를 만들어줘",
    "user_query" : "ip_finish_output 과 ip_finish_output2의 차이점은?",
    "docs": None,
    "script": None,
    "audio_file": None,
    "answer": None
}


result = graph.invoke(state)

print(result)

{'init': '강의 내용을 요약한 팟캐스트를 만들어줘', 'docs': None, 'script': '[인트로 음악]\n\n안녕하세요, 여러분! \'테크 토크\'에 오신 걸 환영합니다! 저는 여러분의 팟캐스트 호스트, 카이로스입니다. 오늘은 조금은 기술적인 이야기를 해보려고 해요. 바로 소프트웨어 개발에서 자주 언급되는 함수, `ip_finish_output`과 `ip_finish_output2`에 대한 이야기입니다.\n\n자, 여러분. 혹시 프로그래밍을 하면서 비슷한 이름의 함수들을 보고 "이게 뭐가 다른 거지?" 하고 헷갈렸던 적 있지 않나요? 오늘 다룰 주제가 바로 그런 케이스입니다. `ip_finish_output`과 `ip_finish_output2`, 이름만 보면 둘 다 무언가를 \'끝내는\' 역할을 하는 것 같죠? 하지만 구체적으로 어떻게 다르고, 왜 이렇게 이름이 비슷한 두 함수가 존재하는지 궁금하지 않으세요?\n\n아쉽게도, 제가 직접 그 차이점을 설명하는 문서를 가지고 있진 않아요. 그래서 오늘은 여러분이 이런 상황에 마주쳤을 때, 어떻게 접근하면 좋을지에 대해 이야기해보려고 합니다. \n\n먼저, 함수의 차이를 이해하려면 그 함수들이 정의된 문서나 소스 코드를 직접 살펴보는 게 가장 중요해요. 각 함수의 목적이 무엇인지, 어떤 매개변수를 받는지, 반환값은 무엇인지, 그리고 내부적으로 어떤 로직으로 구현되어 있는지를 살펴보는 것이죠.\n\n예를 들어, `ip_finish_output`과 `ip_finish_output2`라는 함수가 있다고 칩시다. 이 두 함수는 아마도 비슷한 기능을 수행하면서 약간의 차이점을 가질 가능성이 큽니다. 이런 경우, 함수의 이름만 보고 판단하기보다는, 문서화된 설명이나 주석을 참고하여 두 함수가 정확히 어떤 기능을 수행하는지 비교해보는 것이 중요해요.\n\n또한, 이 함수들이 사용되는 실제 사례를 살펴보는 것도 좋은 방법입니다. 소스 코드에서 이 함수들이 어떻게 사용되는지를 보면, 일반적으로 예상되는 사용 패턴

In [103]:
print(graph.get_graph().draw_mermaid())

---
config:
  flowchart:
    curve: linear
---
graph TD;
	__start__([<p>__start__</p>]):::first
	Script_Generation(Script Generation)
	TTS_Generation(TTS Generation)
	User_QnA(User QnA)
	__end__([<p>__end__</p>]):::last
	Script_Generation --> TTS_Generation;
	TTS_Generation --> User_QnA;
	__start__ --> Script_Generation;
	User_QnA --> __end__;
	classDef default fill:#f2f0ff,line-height:1.2
	classDef first fill-opacity:0
	classDef last fill:#bfb6fc

