In [1]:
from unstructured.partition.auto import partition



In [None]:
import os
from unstructured.partition.pdf import partition_pdf
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline


def load_documents_from_pdf(pdf_path):
    """Return the text of every element parsed out of a PDF.

    The file at ``pdf_path`` is handed to ``partition_pdf``; each returned
    element that exposes a ``text`` attribute contributes that attribute's
    value to the result, in the order the elements were returned.

    Args:
        pdf_path: Path of the PDF file, passed to ``partition_pdf`` as
            ``filename``.

    Returns:
        A list with one entry per element that has a ``text`` attribute.
    """
    parsed_elements = partition_pdf(filename=pdf_path)
    # Elements without a ``text`` attribute are skipped rather than failing.
    return [item.text for item in parsed_elements if hasattr(item, 'text')]


def split_texts(texts, chunk_size=500, chunk_overlap=50):
    """Split each input text into chunks and return them as one flat list.

    A single ``CharacterTextSplitter`` is built through
    ``from_tiktoken_encoder`` and applied to every text in turn; the chunks
    are concatenated in input order.

    Args:
        texts: Iterable of strings to split.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        A flat list containing the chunks of all texts.
    """
    splitter = CharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    # Flatten: one pass over the texts, one pass over each text's chunks.
    return [piece for text in texts for piece in splitter.split_text(text)]

def build_vectorstore(chunks, persist_directory="my_chroma_db"):
    """Embed text chunks into a Chroma store and persist it.

    Args:
        chunks: The texts to index, passed to ``Chroma.from_texts``.
        persist_directory: Directory handed to Chroma as
            ``persist_directory``.

    Returns:
        The ``Chroma`` store built from ``chunks``, after ``persist()`` has
        been called on it.
    """
    embedding_model_id = "sentence-transformers/all-MiniLM-L6-v2"
    embedder = HuggingFaceEmbeddings(model_name=embedding_model_id)

    store = Chroma.from_texts(
        chunks,
        embedding=embedder,
        persist_directory=persist_directory,
    )
    store.persist()
    return store

# --- 1. Ingest: parse the PDF and cut its text into chunks -----------------
pdf_path = "your_document.pdf"
texts = load_documents_from_pdf(pdf_path)
chunks = split_texts(texts)

# --- 2. Index: embed the chunks into a persisted Chroma store --------------
vectorstore = build_vectorstore(chunks)

# --- 3. Load the local LLM --------------------------------------------------
# trust_remote_code=True lets transformers execute Python shipped in the model
# repository; only use it with repositories you trust.
# NOTE(review): confirm that this checkpoint's remote code registers a class
# for AutoModelForCausalLM; if loading fails with an "unrecognized
# configuration" error, check the model card for the loader class it expects.
model_name = "THUDM/chatglm-6b"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, device_map="auto")

text_generation_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    # Bound only the generated continuation. The previous max_length=1024
    # counted prompt + output tokens, and the "stuff" chain below packs k=3
    # retrieved chunks (chunk_size=500 each by default) plus the question into
    # the prompt, which can exceed 1024 on its own and leave no room to answer.
    max_new_tokens=512,
    temperature=0.2,
    do_sample=True,
    top_p=0.9
)
local_llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

# --- 4. Retrieval-augmented QA ----------------------------------------------
# Fetch the 3 nearest chunks per query and "stuff" them into a single prompt.
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
qa_chain = RetrievalQA.from_chain_type(
    llm=local_llm,
    chain_type="stuff",
    retriever=retriever
)

# --- 5. Ask a question -------------------------------------------------------
query = "what is this about"
result = qa_chain.run(query)
print("", result)


ImportError: cannot import name 'open_filename' from 'pdfminer.utils' (/Users/jianxin/Library/Python/3.9/lib/python/site-packages/pdfminer/utils.py)