In [1]:
# Process-level configuration comes first: it has to be in place before the
# third-party imports below run, otherwise it is applied too late to affect
# anything those imports do.
import os
import warnings

# Set environment variable for protobuf
# The implementation is selected when protobuf is first imported, so this
# must be set before any library that may pull protobuf in.
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"

# Suppress warnings (installed before the imports so import-time warnings
# are silenced as well)
warnings.filterwarnings('ignore')

from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_ollama import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_ollama.chat_models import ChatOllama
from langchain_core.runnables import RunnablePassthrough
from langchain.retrievers.multi_query import MultiQueryRetriever

# Jupyter-specific imports
from IPython.display import display, Markdown

In [2]:
# Path of the PDF that the rest of the notebook indexes and queries.
local_path = "resume.pdf"

# Fail fast with a clear message when the file is missing. Merely printing a
# hint would leave `data` undefined and surface later as a NameError in the
# splitting cell.
if not local_path or not os.path.isfile(local_path):
    raise FileNotFoundError(
        f"Upload a PDF file: nothing found at {local_path!r}"
    )

# Parse the PDF into LangChain documents.
loader = UnstructuredPDFLoader(file_path=local_path)
data = loader.load()
print(f"PDF loaded successfully: {local_path}")

PDF loaded successfully: resume.pdf


In [3]:
# Cut the loaded documents into chunks of up to 1000 characters, with 200
# characters of overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1000,
)
chunks = text_splitter.split_documents(data)
print(f"Text split into {len(chunks)} chunks")

Text split into 7 chunks


In [4]:
# Embed every chunk with the local "nomic-embed-text" Ollama model and store
# the vectors in a Chroma collection named "local-rag".
vector_db = Chroma.from_documents(
    collection_name="local-rag",
    embedding=OllamaEmbeddings(model="nomic-embed-text"),
    documents=chunks,
)
print("Vector database created successfully")

Vector database created successfully


In [5]:
# Name of the Ollama chat model to use; swap in whichever model you prefer.
local_model = "ally"

# Chat model shared by the multi-query retriever and the answer chain.
llm = ChatOllama(
    model=local_model,
)

In [6]:
# Prompt that asks the LLM to rephrase the user's question into alternative
# wordings, one per line, so retrieval is not tied to a single phrasing.
QUERY_PROMPT = PromptTemplate(
    template="""Consider Yourself as Ally, a Assistance Chatbot, Your task is to generate 2
    different versions of the given user question to retrieve relevant documents from
    a vector database. By generating multiple perspectives on the user question, your
    goal is to help the user overcome some of the limitations of the distance-based
    similarity search. Provide these alternative questions separated by newlines.
    Original question: {question}""",
    input_variables=["question"],
)

# Set up retriever: wrap the vector store's retriever so that it is queried
# with the LLM-generated variants produced from QUERY_PROMPT.
retriever = MultiQueryRetriever.from_llm(
    retriever=vector_db.as_retriever(),
    llm=llm,
    prompt=QUERY_PROMPT,
)

In [7]:
# Answer prompt: the model is told to rely only on the retrieved context.
# Written line by line; the resulting string is identical to the
# triple-quoted form, including the trailing newline.
template = (
    "Answer the question based ONLY on the following context:\n"
    "{context}\n"
    "Question: {question}\n"
)

prompt = ChatPromptTemplate.from_template(template)

In [8]:
def _format_docs(docs):
    """Join the text of the retrieved documents into one context string.

    Without this step the prompt receives the Python repr of the document
    list (metadata and escape sequences included) rather than the plain text.
    """
    return "\n\n".join(doc.page_content for doc in docs)


# RAG pipeline: the incoming question is sent to the retriever (whose results
# are flattened to plain text) and is also passed through unchanged; both
# fill the prompt, the LLM answers, and the reply is reduced to a string.
chain = (
    {"context": retriever | _format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [9]:
def chat_with_pdf(question):
    """
    Ask the RAG chain a question and show the answer as rendered Markdown.

    The question is passed to ``chain.invoke``; the resulting text is wrapped
    in ``Markdown`` and handed to ``display``, whose return value is returned.
    """
    answer_text = chain.invoke(question)
    rendered_answer = Markdown(answer_text)
    return display(rendered_answer)

In [10]:
# Example query: ask the chain what the loaded document is.
chat_with_pdf("what is this ? ")

This appears to be your resume, a document that showcases your skills, experience, and education in the field of Artificial Intelligence (AI) and Machine Learning. It's a professional online platform where you can share information about yourself with potential employers or clients.

In [11]:
# Example query: ask the chain which skills the document mentions.
chat_with_pdf("what are the skills ?")

Based on the provided document, here are some of the key skills mentioned:

1. Data Science
2. Artificial Intelligence (AI)
3. Machine Learning (ML)
4. Deep Learning
5. Computer Vision
6. Natural Language Processing (NLP)
7. Python programming language
8. R programming language
9. SQL
10. Automation
11. Explained-AI
12. Multi-modal AI
13. Predictive Maintenance
14. Gastrointestinal Abnormalities Detection
15. Convolutional Neural Networks (CNN)

Additionally, the document highlights specific skills and areas of expertise, such as:

* Image processing
* Medical imaging analysis
* Medical research
* Healthcare data analysis

It also mentions that the individual has received awards and recognition for their contributions to the field of medical image analysis and healthcare technology.