Loading pdf

In [32]:
from langchain_community.document_loaders import PDFPlumberLoader
loader = PDFPlumberLoader("resume 1.pdf")
docs = loader.load()

# Check the number of pages
print("Number of pages in the PDF:",len(docs))

# Load the random page content
docs[0].page_content



Number of pages in the PDF: 2


'Junyuan Fang\nSOFTWARE DEVELOPMENT · MULTiMODAL · DEEP LEARNiNG\nHelsinki, Finland\nGithub| Linkedin\nEducation\nUniversity of Helsinki Helsinki, Finland\nMASTER OF SCiENCE iN COMPUTER SCiENCE 2023 – 2025(Feb. Expected)\nBACHELOR OF SCiENCE iN COMPUTER SCiENCE 2021 – 2023\nNational University of Singapore (NUS) Singapore\nEXCHANGE iN COMPUTER SCiENCE BACHELOR STUDY 2022\nProjects\nEnd to end 3D scene reconstruction & open vocabulary understanding @AaltoVision\n• Used 2D vision language LLM and novel reconstruction model implemented 3D point cloud reconstruction with language feature\nembeddings in 3D, both 3D reconstruction and 3D segmentation can be accomplished in real time.\n• Used Gradio to implement interactional 3D scene open vocabulrary semantic segmentation and point cloud reconstruction demo.\n• Paper “NVSMask3D: Hard Visual Prompting withCamera Pose Interpolation for 3D Open Vocabulary Instance Segmentation” under review\n• Paper Gaussian Splatting in Mirrors accepted to BMV

In [33]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document

full_text = " ".join([doc.page_content for doc in docs])
chunk_size = len(full_text) // 5
text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=0)
chunks = text_splitter.split_text(full_text)
documents = [Document(page_content=chunk, metadata={"source": "resume 1.pdf"}) for chunk in chunks]


In [34]:
print("Number of chunks created: ", len(documents))

# Printing first few chunks
for i in range(len(documents)):
    print()
    print(f"CHUNK : {i+1}")
    print(documents[i].page_content)

Number of chunks created:  6

CHUNK : 1
Junyuan Fang
SOFTWARE DEVELOPMENT · MULTiMODAL · DEEP LEARNiNG
Helsinki, Finland
Github| Linkedin
Education
University of Helsinki Helsinki, Finland
MASTER OF SCiENCE iN COMPUTER SCiENCE 2023 – 2025(Feb. Expected)
BACHELOR OF SCiENCE iN COMPUTER SCiENCE 2021 – 2023
National University of Singapore (NUS) Singapore
EXCHANGE iN COMPUTER SCiENCE BACHELOR STUDY 2022
Projects
End to end 3D scene reconstruction & open vocabulary understanding @AaltoVision
• Used 2D vision language LLM and novel reconstruction model implemented 3D point cloud reconstruction with language feature
embeddings in 3D, both 3D reconstruction and 3D segmentation can be accomplished in real time.

CHUNK : 2
• Used Gradio to implement interactional 3D scene open vocabulrary semantic segmentation and point cloud reconstruction demo.
• Paper “NVSMask3D: Hard Visual Prompting withCamera Pose Interpolation for 3D Open Vocabulary Instance Segmentation” under review
• Paper Gaussian Sp

Creating embeddings for each chunk by using BERT model

In [35]:
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

# Instantiate the embedding model
embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/bert-base-nli-mean-tokens")

# Create the vector store 
vector = FAISS.from_documents(documents, embedder)

Fetching top 4 chunks by calculating the similarity between the query and the chunks's embeddings

In [36]:
retriever = vector.as_retriever(search_type="similarity", search_kwargs={"k": 4})
retrieved_docs = retriever.invoke("Which university applicant studied for a master degree?")
retrieved_docs

[Document(id='f3a44240-1ec3-4a80-9773-1a495c44da8a', metadata={'source': 'resume 1.pdf'}, page_content='Junyuan Fang\nSOFTWARE DEVELOPMENT · MULTiMODAL · DEEP LEARNiNG\nHelsinki, Finland\nGithub| Linkedin\nEducation\nUniversity of Helsinki Helsinki, Finland\nMASTER OF SCiENCE iN COMPUTER SCiENCE 2023 – 2025(Feb. Expected)\nBACHELOR OF SCiENCE iN COMPUTER SCiENCE 2021 – 2023\nNational University of Singapore (NUS) Singapore\nEXCHANGE iN COMPUTER SCiENCE BACHELOR STUDY 2022\nProjects\nEnd to end 3D scene reconstruction & open vocabulary understanding @AaltoVision\n• Used 2D vision language LLM and novel reconstruction model implemented 3D point cloud reconstruction with language feature\nembeddings in 3D, both 3D reconstruction and 3D segmentation can be accomplished in real time.'),
 Document(id='caeb4522-51bc-4902-846a-caaf479f1253', metadata={'source': 'resume 1.pdf'}, page_content='• Contributed to the design idea of side-scrolling C++ game. Handled different level software unit test

Loading the LLaMA 3.1 model

In [37]:
from langchain_community.llms import Ollama

llama3 = Ollama(model="llama3.1")



In [38]:
from langchain.chains import RetrievalQA
from langchain.chains.llm import LLMChain
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.prompts import PromptTemplate

prompt = """
1. Use the following pieces of context to answer the question at the end.
2. If you don't know the answer, just say that "I don't know" but don't make up an answer on your own.\n
3. Keep the answer crisp and limited to 3,4 sentences.

Context: {context}

Question: {question}

Helpful Answer:"""


QA_CHAIN_PROMPT = PromptTemplate.from_template(prompt) 

llm_chain = LLMChain(
                  llm=llama3, 
                  prompt=QA_CHAIN_PROMPT, 
                  callbacks=None, 
                  verbose=True)

document_prompt = PromptTemplate(
    input_variables=["page_content", "source"],
    template="Context:\ncontent:{page_content}\nsource:{source}",
)

combine_documents_chain = StuffDocumentsChain(
                  llm_chain=llm_chain,
                  document_variable_name="context",
                  document_prompt=document_prompt,
                  callbacks=None,
              )

qa = RetrievalQA(
                  combine_documents_chain=combine_documents_chain,
                  verbose=True,
                  retriever=retriever,
                  return_source_documents=True,
              )

In [39]:
qa.invoke("Which university applicant studied for a master degree?")



[1m> Entering new RetrievalQA chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
1. Use the following pieces of context to answer the question at the end.
2. If you don't know the answer, just say that "I don't know" but don't make up an answer on your own.

3. Keep the answer crisp and limited to 3,4 sentences.

Context: Context:
content:Junyuan Fang
SOFTWARE DEVELOPMENT · MULTiMODAL · DEEP LEARNiNG
Helsinki, Finland
Github| Linkedin
Education
University of Helsinki Helsinki, Finland
MASTER OF SCiENCE iN COMPUTER SCiENCE 2023 – 2025(Feb. Expected)
BACHELOR OF SCiENCE iN COMPUTER SCiENCE 2021 – 2023
National University of Singapore (NUS) Singapore
EXCHANGE iN COMPUTER SCiENCE BACHELOR STUDY 2022
Projects
End to end 3D scene reconstruction & open vocabulary understanding @AaltoVision
• Used 2D vision language LLM and novel reconstruction model implemented 3D point cloud reconstruction with language feature
embeddings in 3D, both 3D reconstr

{'query': 'Which university applicant studied for a master degree?',
 'result': 'Junyuan Fang studied for a Master of Science in Computer Science at the University of Helsinki. He is expected to graduate in 2025.',
 'source_documents': [Document(id='f3a44240-1ec3-4a80-9773-1a495c44da8a', metadata={'source': 'resume 1.pdf'}, page_content='Junyuan Fang\nSOFTWARE DEVELOPMENT · MULTiMODAL · DEEP LEARNiNG\nHelsinki, Finland\nGithub| Linkedin\nEducation\nUniversity of Helsinki Helsinki, Finland\nMASTER OF SCiENCE iN COMPUTER SCiENCE 2023 – 2025(Feb. Expected)\nBACHELOR OF SCiENCE iN COMPUTER SCiENCE 2021 – 2023\nNational University of Singapore (NUS) Singapore\nEXCHANGE iN COMPUTER SCiENCE BACHELOR STUDY 2022\nProjects\nEnd to end 3D scene reconstruction & open vocabulary understanding @AaltoVision\n• Used 2D vision language LLM and novel reconstruction model implemented 3D point cloud reconstruction with language feature\nembeddings in 3D, both 3D reconstruction and 3D segmentation can be a

In [40]:
#qa("Which university applicant studied for a master degree, and which year he started?")
print(qa("Which university applicant studied for a master degree, and which year he started?")["result"])




[1m> Entering new RetrievalQA chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
1. Use the following pieces of context to answer the question at the end.
2. If you don't know the answer, just say that "I don't know" but don't make up an answer on your own.

3. Keep the answer crisp and limited to 3,4 sentences.

Context: Context:
content:Software Development Docker, Google platform, React, Figma, Qt, RobotFramework, PostgreSQL, Flask, MySQL, Django,
JavaScript, HTML, CSS
Machine Learning Gradio, PyTorch, TensorFlow, Wandb, Keras, Pysyft
Language Chinese (native), Finnish (professional), English (professional), Swedish (beginner)
Management & Tools Git, Poetry, LATEX, Hydra
JULY, 2024 J.FANG · CURRiCULUM ViTAE
source:resume 1.pdf

Context:
content:Junyuan Fang
SOFTWARE DEVELOPMENT · MULTiMODAL · DEEP LEARNiNG
Helsinki, Finland
Github| Linkedin
Education
University of Helsinki Helsinki, Finland
MASTER OF SCiENCE iN COMPUTER SCiENCE 2023 – 2