In [1]:
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import HuggingFacePipeline
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFLoader
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import torch

# BGE M3 임베딩 모델 설정
embedding_model_name = "../bge-m3"
embeddings = HuggingFaceEmbeddings(
    model_name=embedding_model_name,
    model_kwargs={'device': 'cuda' if torch.cuda.is_available() else 'cpu'},
    encode_kwargs={'normalize_embeddings': True}
)

  from .autonotebook import tqdm as notebook_tqdm
  embeddings = HuggingFaceEmbeddings(


In [2]:
# Qwen2.5-1.5B-Instruct 모델 설정
model_name = "../Qwen2.5-1.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto"
)

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


In [3]:

# 텍스트 생성 파이프라인 설정
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=512,
    temperature=0.7,
    top_p=0.95,
    repetition_penalty=1.15
)

llm = HuggingFacePipeline(pipeline=pipe)

def create_rag_pipeline(pdf_path):
    # PDF 문서 로드
    loader = PyPDFLoader(pdf_path)
    documents = loader.load()
    
    # 텍스트 분할
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=800,
        chunk_overlap=200
    )
    texts = text_splitter.split_documents(documents)
    
    # 벡터 저장소 생성
    vectorstore = FAISS.from_documents(texts, embeddings)
    
    # RAG 체인 생성
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=vectorstore.as_retriever(search_kwargs={"k": 3}),
        return_source_documents=True
    )
    
    return qa_chain

def query_rag_pipeline(qa_chain, question):
    result = qa_chain({"query": question})
    return {
        "answer": result["result"],
        "source_documents": result["source_documents"]
    }



Device set to use cuda:0
  llm = HuggingFacePipeline(pipeline=pipe)


In [4]:
pdf_path = "sample.pdf"

# RAG 파이프라인 생성
qa_chain = create_rag_pipeline(pdf_path)
print(qa_chain.combine_documents_chain.llm_chain.prompt.template)

Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}

Question: {question}
Helpful Answer:


In [5]:
# 질문 예시
# question = "문서에서 설명하는 주요 내용은 무엇인가요?"
question = "What is the main topic of the document?"

# 질문에 대한 답변 생성
result = query_rag_pipeline(qa_chain, question)
print(result["answer"])
# print("\n참고 문서:")
# for doc in result["source_documents"]:
#     print(f"- {doc.page_content[:200]}...") 

  result = qa_chain({"query": question})


Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

adaptation of BERT and performance on downstream
document classification: Insights from social media.
In Findings of the Association for Computational
Linguistics: EMNLP 2021, pages 2400–2412, Punta
Cana, Dominican Republic. Association for Compu-
tational Linguistics.
Devendra Singh Sachan, Siva Reddy, William L. Hamil-
ton, Chris Dyer, and Dani Yogatama. 2021. End-to-
end training of multi-document reader and retriever
for open-domain question answering. In Advances
in Neural Information Processing Systems 34: An-
nual Conference on Neural Information Processing
Systems 2021, NeurIPS 2021, December 6-14, 2021,
virtual, pages 25968–25981.
Timo Schick, Jane Dwivedi-Yu, Roberto Dessì, Roberta
Raileanu, Maria Lomeli, Luke Zettlemoyer, Nicola
Cancedda, and Thomas Scialom. 2023. Toolformer:

model is tuned for better LLM input c