In [1]:
!pip install langchain
!pip install openai
!pip install PyPDF2
!pip install faiss-cpu
!pip install tiktoken



In [2]:
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter

In [3]:
#environment variables
import os 
from dotenv import load_dotenv
load_dotenv()

os.environ["OPENAI_API_KEY"]=os.getenv("OPENAI_API_KEY")
## Langmith tracking
os.environ["LANGCHAIN_TRACING_V2"]="true"
os.environ["LANGCHAIN_API_KEY"]=os.getenv("LANGCHAIN_API_KEY")

In [12]:
# provide the path of  pdf file/files.
pdfreader = PdfReader('7.pdf')

In [13]:
from typing_extensions import Concatenate
# read text from pdf
raw_text = ''
for i, page in enumerate(pdfreader.pages):
    content = page.extract_text()
    if content:
        raw_text += content

In [14]:
raw_text

'Muhammad Fauzan Bin Abdul Hakim   \n No. 700, Jalan Hj. Buyong Bodeen,   \nKg. Tersusun Sengat, 31300,   \nKg. Kepayang,   \nPerak, Malaysia.   \nI.C. No:  940418 -08-5241  Age: 29  \nMobile No: +6 013 -9141265  Language:  Malay  :  \nEmail:  fauzan.abdhakim@gmail.com                      English:  \n \nOBJECTIVE  \nTo contribute to the field of Electronic Engineering and work for organizations that will further it  \nsupport, motivation, and a better work environment in a way that both benefits society and the \ngrowth of technology.  \n \nFORMER EDUCATIO N \n2023 BACHELOR OF ELECTRONICS \nENGINEERING WITH HONOURS  \nUniversiti Teknikal Malaysia Melaka  \nCurrent CGPA: 3.66/4.00  \n2016  DIPLOMA IN ELECTR ONICS \n(Robotics & Automation) \nMARA-Japan Indus trial Institute, \nCGPA: 3.73/4.00  \n2016  MALAYSIAN SKILLS DIPLOMA  \n(Electronic Product Development) \nAccreditation of Prior Skills \nAchievement. (PPT) \nDepartment of Skills Development, \nMalaysia. \nPROFESSIONAL CERTIFICATE

In [15]:
# We need to split the text using Character Text Split such that it sshould not increse token size
text_splitter = CharacterTextSplitter(
    separator = "\n",
    chunk_size = 800,
    chunk_overlap  = 200,
    length_function = len,
)
texts = text_splitter.split_text(raw_text)

In [16]:
print(texts)

['Muhammad Fauzan Bin Abdul Hakim   \n No. 700, Jalan Hj. Buyong Bodeen,   \nKg. Tersusun Sengat, 31300,   \nKg. Kepayang,   \nPerak, Malaysia.   \nI.C. No:  940418 -08-5241  Age: 29  \nMobile No: +6 013 -9141265  Language:  Malay  :  \nEmail:  fauzan.abdhakim@gmail.com                      English:  \n \nOBJECTIVE  \nTo contribute to the field of Electronic Engineering and work for organizations that will further it  \nsupport, motivation, and a better work environment in a way that both benefits society and the \ngrowth of technology.  \n \nFORMER EDUCATIO N \n2023 BACHELOR OF ELECTRONICS \nENGINEERING WITH HONOURS  \nUniversiti Teknikal Malaysia Melaka  \nCurrent CGPA: 3.66/4.00  \n2016  DIPLOMA IN ELECTR ONICS \n(Robotics & Automation) \nMARA-Japan Indus trial Institute, \nCGPA: 3.73/4.00', 'ENGINEERING WITH HONOURS  \nUniversiti Teknikal Malaysia Melaka  \nCurrent CGPA: 3.66/4.00  \n2016  DIPLOMA IN ELECTR ONICS \n(Robotics & Automation) \nMARA-Japan Indus trial Institute, \nCGPA:

In [17]:

#langchain dependencies
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

#prompt template
from langchain_core.prompts import ChatPromptTemplate

prompt=ChatPromptTemplate.from_messages(
    [("system", '''You are an AI bot designed to act as a professional for parsing resumes. You are given with resume and your job is to extract the following information from the resume:
    1. Full Name
    2. Email Id
    3. Phone
    4. Github Portfolio
    5. Linkedin Id
    6. Education Details
    7. Employment Details
    8. Technical Skills
    9. Soft Skills
    10. Certifications
    11. Publications
    Give the extracted information in json format only'''),
    ("user", "Content:{content}")
    ]
)
    
#OpenAI LLM
llm=ChatOpenAI(model="gpt-3.5-turbo")
output_parser = StrOutputParser()
chain=prompt|llm|output_parser

print (chain.invoke({'content': texts}))


{
    "Full Name": "Muhammad Fauzan Bin Abdul Hakim",
    "Email Id": "fauzan.abdhakim@gmail.com",
    "Phone": "+6 013-9141265",
    "Github Portfolio": null,
    "Linkedin Id": null,
    "Employment Details": [
        {
            "Company": "Nt Automation & Design Sdn.Bhd.",
            "Position": "Technical Specialist & Assistant Engineer [Robotics & Automation]",
            "Duration": "(July 2021 – October 2022)",
            "Responsibilities": [
                "Manage materials, scheduling, and manpower for projects.",
                "Support in designing and assembling electrical systems for automation machinery.",
                "Support in designing and implementing software systems (PLC) for automation machinery.",
                "Support in designing and assembling CAD models for machinery parts.",
                "Write technical reports for new products.",
                "Collect data and present the efficiency of new products.",
                "Provide support