In [3]:
import os, glob, uuid
from typing import List, Dict, Tuple
from dotenv import load_dotenv
from openai import OpenAI
load_dotenv()

import chromadb
from chromadb.config import Settings
from sentence_transformers import SentenceTransformer
from pypdf import PdfReader
import tiktoken

In [4]:
# Load environment variables in a file called .env

load_dotenv(override=True)
openai_api_key = os.getenv('OPENAI_API_KEY')
anthropic_api_key = os.getenv('ANTHROPIC_API_KEY')
google_api_key = os.getenv('GOOGLE_API_KEY')
serp_api_key = os.getenv("SERP_API_KEY")
perplexity_api_key = os.getenv("PEPLEXITY_API_KEY")
if openai_api_key:
    print(f"OpenAI API Key exists and begins {openai_api_key[:8]}")
else:
    print("OpenAI API Key not set")
    
if anthropic_api_key:
    print(f"Anthropic API Key exists and begins {anthropic_api_key[:7]}")
else:
    print("Anthropic API Key not set")

if google_api_key:
    print(f"Google API Key exists and begins {google_api_key[:8]}")
else:
    print("Google API Key not set")

if serp_api_key:
    print(f"serp_api_key exists and begins {serp_api_key[:8]}")
else:
    print("serp_api_key not set")

if perplexity_api_key:
    print(f"perplexity_api_key exists and begins {perplexity_api_key[:8]}")
else:
    print("perplexity_api_key not set")

# GPT 모델 선언
openai = OpenAI()
MODEL = 'gpt-4o-mini'

OpenAI API Key exists and begins sk-proj-
Anthropic API Key exists and begins sk-ant-
Google API Key exists and begins AIzaSyBU
serp_api_key exists and begins c3ee9cec
perplexity_api_key exists and begins pplx-r4f


In [5]:
OPENAI_MODEL="gpt-4o-mini"  # 원하면 바꾸세요 (ex. gpt-4.1-mini)

# 임베딩 모델 (문서/질의 공통, 다국어 강력)
EMBEDDING_MODEL="BAAI/bge-m3"

# 검색 설정
TOP_K=6
MMR=False             # True로 바꾸면 다양성 우선 검색
RERANK=False          # True로 바꾸면 cross-encoder 재랭킹(아래 옵션 참고)


In [6]:
DATA_DIR = "data"
DB_DIR = "vectordb"
COLLECTION_NAME = "rag_collection"

EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL", "BAAI/bge-m3")
CHUNK_TOKENS = 400
CHUNK_OVERLAP = 80

In [7]:
# ---------- 유틸 ----------
def read_text_file(path: str) -> str:
    with open(path, "r", encoding="utf-8", errors="ignore") as f:
        return f.read()

def read_pdf_file(path: str) -> str:
    reader = PdfReader(path)
    texts = []
    for page in reader.pages:
        txt = page.extract_text() or ""
        texts.append(txt)
    return "\n".join(texts)

def load_documents(data_dir: str) -> List[Tuple[str, str]]:
    docs = []
    patterns = ["**/*.txt", "**/*.md", "**/*.pdf"]
    for pat in patterns:
        for p in glob.glob(os.path.join(data_dir, pat), recursive=True):
            ext = os.path.splitext(p)[1].lower()
            if ext in [".txt", ".md"]:
                text = read_text_file(p)
            elif ext == ".pdf":
                text = read_pdf_file(p)
            else:
                continue
            if text.strip():
                docs.append((p, text))
    return docs

def token_chunk(text: str, max_tokens=CHUNK_TOKENS, overlap=CHUNK_OVERLAP) -> List[str]:
    enc = tiktoken.get_encoding("cl100k_base")
    toks = enc.encode(text)
    chunks = []
    start = 0
    while start < len(toks):
        end = min(start + max_tokens, len(toks))
        chunk = enc.decode(toks[start:end])
        chunks.append(chunk)
        if end == len(toks):
            break
        start = max(0, end - overlap)
    return chunks

# ---------- 임베딩 ----------
def get_embedder():
    model = SentenceTransformer(EMBEDDING_MODEL)
    # bge/e5 류는 보통 normalize 추천
    model.max_seq_length = 512
    return model

# ---------- 메인 ----------
def main():
    os.makedirs(DB_DIR, exist_ok=True)
    client = chromadb.PersistentClient(path=DB_DIR, settings=Settings(allow_reset=False))
    colls = [c.name for c in client.list_collections()]
    if COLLECTION_NAME in colls:
        collection = client.get_collection(COLLECTION_NAME)
    else:
        collection = client.create_collection(COLLECTION_NAME, metadata={"hnsw:space": "cosine"})

    embedder = get_embedder()

    docs = load_documents(DATA_DIR)
    if not docs:
        print(f"[INGEST] No documents found in ./{DATA_DIR}. Add files and rerun.")
        return

    ids, texts, metadatas = [], [], []
    for path, full_text in docs:
        chunks = token_chunk(full_text)
        for i, ch in enumerate(chunks):
            ids.append(str(uuid.uuid4()))
            texts.append(ch)
            metadatas.append({"source": path, "chunk_idx": i})

    print(f"[INGEST] Embedding {len(texts)} chunks with {EMBEDDING_MODEL} ...")
    embs = embedder.encode(texts, normalize_embeddings=True, show_progress_bar=True).tolist()

    print(f"[INGEST] Upserting to Chroma ({COLLECTION_NAME}) ...")
    collection.upsert(ids=ids, embeddings=embs, metadatas=metadatas, documents=texts)

    print(f"[DONE] {len(docs)} files, {len(texts)} chunks indexed at {DB_DIR}/")

if __name__ == "__main__":
    main()

Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given


[INGEST] Embedding 1 chunks with BAAI/bge-m3 ...


Batches: 100%|███████████████████████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.09s/it]

[INGEST] Upserting to Chroma (rag_collection) ...
[DONE] 1 files, 1 chunks indexed at vectordb/





In [10]:
def rag_answer(query: str, k: int = TOP_K, use_lexical: bool = True, use_rerank: bool = False) -> str:
    hits = retrieve(query, k=k, mmr=USE_MMR)
    if use_lexical:
        hits = bm25_mix(query, hits)
    if use_rerank or USE_RERANK:
        hits = rerank_cross_encoder(query, hits)
    answer = generate_answer(query, hits[:k])
    sources = "\n".join(
        f"[{i+1}] {h['meta'].get('source')} (chunk {h['meta'].get('chunk_idx')})"
        for i, h in enumerate(hits[:k], 1)
    )
    return f"{answer}\n\n=== SOURCES ===\n{sources}"


In [11]:
def get_collection():
    client = chromadb.PersistentClient(path=DB_DIR, settings=Settings(allow_reset=False))
    try:
        return client.get_collection(COLLECTION_NAME)
    except Exception:
        raise RuntimeError(
            f"Chroma 컬렉션 '{COLLECTION_NAME}'이 없습니다. 먼저 ingest.py를 실행해 색인하세요."
        )


In [15]:
import os, math
from typing import List, Dict, Tuple
from dotenv import load_dotenv
load_dotenv()

import chromadb
from chromadb.config import Settings
from sentence_transformers import SentenceTransformer, CrossEncoder

from rank_bm25 import BM25Okapi
import argparse

from openai import OpenAI

# ---------- 설정 ----------
DB_DIR = "vectordb"
COLLECTION_NAME = "rag_collection"
EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL", "BAAI/bge-m3")
TOP_K = int(os.getenv("TOP_K", "6"))
USE_MMR = os.getenv("MMR", "False").lower() == "true"
USE_RERANK = os.getenv("RERANK", "False").lower() == "true"

OPENAI_MODEL = os.getenv("OPENAI_MODEL", "gpt-4o-mini")

# ---------- 임베딩 & DB ----------
def get_embedder():
    model = SentenceTransformer(EMBEDDING_MODEL)
    model.max_seq_length = 512
    return model

def get_collection():
    client = chromadb.PersistentClient(path=DB_DIR, settings=Settings(allow_reset=False))
    return client.get_collection(COLLECTION_NAME)

# ---------- 검색 ----------
def retrieve(query: str, k: int = TOP_K, mmr: bool = USE_MMR) -> List[Dict]:
    col = get_collection()
    embedder = get_embedder()
    q_emb = embedder.encode([query], normalize_embeddings=True).tolist()[0]

    res = col.query(query_embeddings=[q_emb], n_results=max(k*3 if mmr else k, k))
    docs = res["documents"][0]
    metas = res["metadatas"][0]
    dists = res["distances"][0] if "distances" in res else [0.0]*len(docs)

    items = [{"doc": d, "meta": m, "dist": dist} for d, m, dist in zip(docs, metas, dists)]

    # 간단 MMR(유사도 다양성) — 임베딩 기반 재선택
    if mmr:
        emb_chunks = embedder.encode([it["doc"] for it in items], normalize_embeddings=True)
        selected = []
        cand_idx = set(range(len(items)))
        # greedy
        while cand_idx and len(selected) < k:
            if not selected:
                # 가장 가까운 것부터
                best = min(cand_idx, key=lambda i: items[i]["dist"])
                selected.append(best)
                cand_idx.remove(best)
                continue
            # 다양성 점수 계산: 후보와 이미 선택된 것들 간 최대 유사도 최소화
            def mmr_score(i):
                import numpy as np
                qsim = 1 - items[i]["dist"]  # cosine distance -> similarity approx
                diversity = max(np.dot(emb_chunks[i], emb_chunks[j]) for j in selected)
                lam = 0.75
                return lam*qsim - (1-lam)*diversity
            best = max(cand_idx, key=mmr_score)
            selected.append(best)
            cand_idx.remove(best)
        items = [items[i] for i in selected]
    else:
        items = sorted(items, key=lambda x: x["dist"])[:k]

    return items

# ---------- (옵션) Lexical + Re-Rank ----------
def bm25_mix(query: str, hits: List[Dict], alpha: float = 0.2) -> List[Dict]:
    """ 벡터 히트에서 BM25 점수와 혼합 (alpha는 BM25 가중치) """
    corpus = [h["doc"] for h in hits]
    bm25 = BM25Okapi([c.split() for c in corpus])
    bm_scores = bm25.get_scores(query.split())
    # 거리(dist)는 낮을수록 유리 → 유사도처럼 바꿔서 합산
    import numpy as np
    dist = np.array([h["dist"] for h in hits])
    sim = 1 - (dist / (dist.max() + 1e-9))
    mixed = alpha*bm_scores + (1-alpha)*sim
    order = mixed.argsort()[::-1]
    return [hits[i] for i in order]

def rerank_cross_encoder(query: str, hits: List[Dict], model_name: str = "BAAI/bge-reranker-v2-m3") -> List[Dict]:
    ce = CrossEncoder(model_name)
    pairs = [[query, h["doc"]] for h in hits]
    scores = ce.predict(pairs)
    ranked = sorted(zip(hits, scores), key=lambda x: x[1], reverse=True)
    return [h for h, _ in ranked]

# ---------- 생성 ----------
def build_prompt(query: str, contexts: List[Dict]) -> List[Dict]:
    context_block = "\n\n".join(
        [f"[{i+1}] SOURCE: {c['meta'].get('source')} (chunk {c['meta'].get('chunk_idx')})\n{c['doc']}"
         for i, c in enumerate(contexts)]
    )
    system = (
        "당신은 정확한 RAG 비서입니다. 제공된 '컨텍스트'만 근거로 한국어로 답하세요. "
        "모르면 모른다고 말하세요. 반드시 근거가 된 출처를 인덱스 번호로 함께 표기하세요."
    )
    user = (
        f"질문:\n{query}\n\n"
        f"컨텍스트(참조용):\n{context_block}\n\n"
        "요구사항:\n- 컨텍스트 범위를 벗어난 추측 금지\n- 핵심 요약 → 근거 표기 [1], [2]...\n"
    )
    return [
        {"role":"system", "content": system},
        {"role":"user", "content": user}
    ]

def generate_answer(query: str, contexts: List[Dict]) -> str:
    client = OpenAI()
    messages = build_prompt(query, contexts)
    resp = client.chat.completions.create(
        model=OPENAI_MODEL,
        messages=messages,
        temperature=0.2,
    )
    return resp.choices[0].message.content

# ---------- CLI ----------
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--q", required=True, help="질문 텍스트")
    parser.add_argument("--k", type=int, default=TOP_K, help="검색 개수")
    parser.add_argument("--lexical", action="store_true", help="BM25 가중 혼합 사용")
    parser.add_argument("--rerank", action="store_true", help="Cross-Encoder 재랭킹 사용")
    args = parser.parse_args()

    hits = retrieve(args.q, k=args.k, mmr=USE_MMR)
    if args.lexical:
        hits = bm25_mix(args.q, hits)
    if args.rerank or USE_RERANK:
        hits = rerank_cross_encoder(args.q, hits)

    answer = generate_answer(args.q, hits[:args.k])

    print("\n=== ANSWER ===\n")
    print(answer.strip())
    print("\n=== SOURCES ===")
    for i, h in enumerate(hits[:args.k], 1):
        src = h["meta"].get("source"); idx = h["meta"].get("chunk_idx")
        print(f"[{i}] {src} (chunk {idx})")

if __name__ == "__main__":
    import sys
    try:
        from IPython import get_ipython
        IN_JUPYTER = get_ipython() is not None
    except Exception:
        IN_JUPYTER = False

    if IN_JUPYTER and len(sys.argv) == 1:
        print("🔎 주피터 감지: 아래 중 하나로 실행하세요.\n"
              "1) %run rag.py --q '질문' --k 6 --lexical\n"
              "2) !python rag.py --q '질문' --k 6 --lexical\n"
              "3) (노트북 셀) sys.argv 지정 후 main() 호출")
    else:
        main()


Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
Number of requested results 6 is greater than number of elements in index 1, updating n_results = 1



=== ANSWER ===

질문이 구체적으로 무엇인지 명시되지 않았습니다. 질문 내용을 제공해 주시면 그에 대한 답변을 드리겠습니다.

=== SOURCES ===
[1] data\lg.txt (chunk 0)


In [21]:
import rag
from importlib import reload
reload(rag)  # rag.py를 방금 수정했다면 다시 로드

# 어떤 rag.py가 임포트됐는지 확인(경로 확인용)
print("Loaded rag from:", rag.__file__)

def _rag_answer(query: str, k: int = 6, use_lexical: bool = True, use_rerank: bool = False) -> str:
    hits = rag.retrieve(query, k=k, mmr=rag.USE_MMR)
    if use_lexical:
        hits = rag.bm25_mix(query, hits)
    if use_rerank or rag.USE_RERANK:
        hits = rag.rerank_cross_encoder(query, hits)
    answer = rag.generate_answer(query, hits[:k])
    src_lines = []
    for i, h in enumerate(hits[:k], 1):
        src = h["meta"].get("source"); idx = h["meta"].get("chunk_idx")
        src_lines.append(f"[{i}] {src} (chunk {idx})")
    return f"{answer}\n\n=== SOURCES ===\n" + "\n".join(src_lines)

# 모듈에 함수 부착
rag.rag_answer = _rag_answer
print("rag.rag_answer is ready!")


Loaded rag from: C:\Users\HyunJunLee\Documents\hj_git\llm_proposal_mvp\news_project\RAG_test\rag.py
rag.rag_answer is ready!


In [22]:
import rag  # rag.py가 같은 폴더에 있어야 합니다.

def chat(k=6, use_lexical=True, use_rerank=False):
    print("RAG Q&A 모드입니다. 종료하려면 'exit' 또는 'quit' 입력.")
    while True:
        try:
            q = input("\nQ> ").strip()
        except (EOFError, KeyboardInterrupt):
            print("\n종료합니다.")
            break

        if q.lower() in ("exit", "quit"):
            print("종료합니다.")
            break
        if not q:
            continue

        try:
            print("\n--- ANSWER ---")
            ans = rag.rag_answer(q, k=k, use_lexical=use_lexical, use_rerank=use_rerank)
            print(ans)
        except Exception as e:
            print(f"[에러] {type(e).__name__}: {e}")
            print("· ingest.py 먼저 실행했는지, · OPENAI_API_KEY, · 네트워크, · rag.py 경로를 확인하세요.")

chat(k=6, use_lexical=True, use_rerank=False)


RAG Q&A 모드입니다. 종료하려면 'exit' 또는 'quit' 입력.



Q>  lg는 어떤 시스템을써?


Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given



--- ANSWER ---


Number of requested results 6 is greater than number of elements in index 1, updating n_results = 1


LG는 AEM, GP1, Magento, Middleware, Jira 등의 시스템을 사용합니다. [1]

=== SOURCES ===
[1] data\lg.txt (chunk 0)



Q>  weekly 미팅은 어떻게 진행되지?


Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given



--- ANSWER ---


Number of requested results 6 is greater than number of elements in index 1, updating n_results = 1


weekly 미팅은 Jira 내에서 closed된 티켓 수, WIP(진행 중인 작업) 티켓 수, 리드타임(티켓 완료되기까지 걸린 시간) 등의 주간 현황을 확인하는 방식으로 진행된다. [1]

=== SOURCES ===
[1] data\lg.txt (chunk 0)



Q>  exit


종료합니다.
