In [1]:
from dotenv import load_dotenv
from langchain_groq import ChatGroq
import dotenv
import os

In [2]:
# Load variables via python-dotenv, then read the Groq API key from the
# process environment. The key is None when the variable is not set.
load_dotenv()
LLAMA_3_API_KEY = os.environ.get("LLAMA_3_API_KEY")

In [3]:
# Groq-hosted chat model used for the CV extraction chain.
# temperature=0 together with a fixed seed aims for repeatable output.
# The seed is passed through model_kwargs explicitly: given as a direct
# keyword, ChatGroq moves it into model_kwargs on its own and emits a
# "seed was transferred to model_kwargs" warning.
llm = ChatGroq(
    temperature=0,
    groq_api_key=LLAMA_3_API_KEY,
    model_name="llama-3.1-70b-versatile",
    model_kwargs={"seed": 42},
)

                    seed was transferred to model_kwargs.
                    Please confirm that seed is what you intended.


In [25]:
from pydantic import BaseModel, Field, EmailStr, field_validator
from typing import List, Optional
from datetime import date

# ForgivingDate model with date validation
class ForgivingDate(BaseModel):
    # Loose day/month/year triple. Day and month are range-checked
    # independently; nothing here checks that the three parts together form a
    # real calendar date, and the year is not range-checked at all.
    day: int
    month: int
    year: int

    @field_validator('day')
    @classmethod
    def day_validator(cls, v):
        """Check that the day lies in 0..31.

        Returns:
            The value unchanged when it is acceptable.

        Raises:
            ValueError: if the day is negative or greater than 31.
        """
        # A missing value is passed through untouched; comparing None with an
        # int below would raise TypeError instead of a validation error.
        if v is None:
            return v
        # NOTE(review): 0 is accepted - confirm whether it is meant as an
        # "unknown day" placeholder or whether the lower bound should be 1.
        if not 0 <= v <= 31:
            raise ValueError("Day not in range")
        return v

    @field_validator('month')
    @classmethod
    def month_validator(cls, v):
        """Check that the month lies in 0..12.

        Returns:
            The value unchanged when it is acceptable.

        Raises:
            ValueError: if the month is negative or greater than 12.
        """
        # Same pass-through for a missing value as in day_validator.
        if v is None:
            return v
        # NOTE(review): 0 is accepted - confirm whether it is meant as an
        # "unknown month" placeholder or whether the lower bound should be 1.
        if not 0 <= v <= 12:
            raise ValueError("Month not in range")
        return v

# Contact details of the candidate.
# Note: the email field is a plain optional string; no email-format validation
# is applied by this model.
class Contact(BaseModel):
    name: str
    phone_number: str
    # Optional; defaults to None when no email is given.
    email: Optional[str] = Field(None, description="Email address")
    linkedin: str
    # default_factory=str produces an empty string when no location is given.
    location: str = Field(
        default_factory=str,
        description="Complete street address wherever possible."
    )

# One position the candidate is applying for, with the years of experience
# attributed to it.
class Role(BaseModel):
    name: str = Field(description="The position the candidate is applying for")
    # "Kinh nghiệm" in the description is Vietnamese for "Experience".
    # NOTE(review): "deducted" in the description presumably means "deduced";
    # the string is left as is because it is runtime text, not a comment.
    num_experience: float = Field(description='Years of experience deducted from the (number of days between the dates)/365 in title "Kinh nghiệm"')

# # DateRange model containing start and end dates
# class DateRange(BaseModel):
#     start: ForgivingDate
#     end: ForgivingDate = Field(description="Date of the end", default=ForgivingDate(day=date.today().day, month=date.today().month, year=date.today().year))

# Skills model with various fields for skills information
# class Skills(BaseModel):
#     name: str = Field(description="Extract the technical tools in the following text. Technical tools are generally in 2-3 words")

# class Major(BaseModel):
#     name: str = Field(description="The major of the candidate")

# # Experience model containing details about work experiences
# class Experience(BaseModel):
#     dates: DateRange
#     title: str = Field(description="Title of the role")
#     num_experience: float = Field(description='Years of experience deducted from the (number of days between the dates)/365')
#     company: str = Field(description="The employer")
#     skills: List[Skills]
#     description: str = Field(description="Detailed description of the experience")

# Education model for education details
# class Education(BaseModel):
#     college: str = Field(description='Institution from which the person received their degree')
#     dates: DateRange

# Project model to capture project details
# class Project(BaseModel):
#     dates: DateRange
#     title: str = Field(description="Title of the role")
#     num_experience: float = Field(description='Years of experience deducted from the (number of days between the dates)/365')
#     name_project: str = Field(description="Name of the project")
#     skills: List[Skills]
#     description: str = Field(description="Detailed description of the project")

# class CertificateAward(BaseModel):
#     name: str = Field(description="The name of the certificate or award")
#     issuing_organization: str = Field(description="The organization that issued the certificate or award")


# Top-level model describing everything extracted about one candidate:
# contact details, targeted roles, languages, skills and majors.
class Candidate(BaseModel):
    # Nested contact block (name, phone, email, linkedin, location).
    contact: Contact
    # A list, so more than one targeted position can be returned.
    role: List[Role] = Field(description="The position the candidate is applying for")
    language: List[str] = Field(description="The spoken/written language")
    skills: List[str] = Field(description="Extract the technical tools in the following text. Technical tools are generally in 2-3 words")
    major: List[str] = Field(description="The major of the candidate")    
    # The fields below are disabled (commented out) and are not part of the model.
    # programming_language: List[str] = Field(description="The programming language")
    # language: List[str] = Field(description="The spoken/written language")
    # tool: List[str] = Field(description="Technical tool, generally in 2-3 words.")
    # soft_skills: List[str] = Field(description="Name of the soft skills")
    
    # education: List[Education]
    # experience: List[Experience]
    # projects: List[Project]
    
    # certificates: List[CertificateAward]

In [26]:
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate

# Set up a parser + inject instructions into the prompt template.
# The parser is given the Candidate model and supplies the format instructions
# that are embedded in the prompt below.
parser = JsonOutputParser(pydantic_object=Candidate)

# The template has two placeholders: {format_instructions} is filled in once
# here through partial_variables, {query} is supplied on every invocation.
# The exact whitespace inside the template literal is part of the prompt text.
prompt = PromptTemplate(
    template="""Extract the following structured information from the provided CV text. If information is missing, leave it blank. 
    \n{format_instructions}\n{query}\n""",
     input_variables=["query"],
     partial_variables={"format_instructions": parser.get_format_instructions()},
)

# Pipeline: render the prompt, send it to the model, parse the reply as JSON.
chain = prompt | llm | parser

In [27]:
from underthesea import text_normalize
def normalize(text):
    """Flatten line breaks and tabs to spaces, then apply ``text_normalize``.

    Args:
        text: Raw text to clean up.

    Returns:
        Whatever ``text_normalize`` returns for the flattened text.
    """
    # Newline, carriage return and tab are each mapped one-for-one to a space.
    flattened = text.translate(str.maketrans("\n\r\t", "   "))
    return text_normalize(flattened)

In [31]:
from langchain_community.document_loaders import PyPDFLoader

# NOTE(review): absolute, machine-specific path - consider making the data
# location configurable so the notebook can run elsewhere.
loader = PyPDFLoader(
     r'F:\Năm 3 - HK1\DoAnCS311\data\Temp\2.pdf'
)
# Materialize the lazily loaded documents, keeping their order.
pages = list(loader.lazy_load())
# Join the normalized page texts with a space; plain concatenation would fuse
# the last word of one page with the first word of the next.
text = " ".join(normalize(doc.page_content) for doc in pages)


In [32]:
# Inspect the loaded documents. The rendered output contains personal data
# taken from the CV (email address, phone number) - clear it before sharing.
pages

[Document(metadata={'source': 'F:\\Năm 3 - HK1\\DoAnCS311\\data\\Temp\\2.pdf', 'page': 0}, page_content='Truong Le Vinh Phuc\nDi An, Binh Duong \n• \ntruonglevinhphuc2006@gmail.com \n• \n+84 367 855 090 \n• \nlinkedin.com/in/sloweyne\ngithub.com/sloweyyy\n \n •  \nslowey.works\nSoftware Engineer with 1 year experience in React Native and Java for Android. Skilled in crafting user-centric mobile experiences and leveraging\ncutting-edge technologies.\nEDUCATION\nSoftware Engineer\nSep 2022 - Jun 2025\nUNIVERSITY OF INFORMATION TECHNOLOGY Ho Chi Minh, Vietnam\nGPA: \n8.37/10\nAchieved "Excellent" grade for Semester 2, Academic Year 2023-2024\nAchieved "Excellent" grade for Semester 1, Academic Year 2023-2024\nAchieved "Excellent" grade for Semester 2, Academic Year 2022-2023\nAchieved "Excellent" grade for Semester 1, Academic Year 2022-2023\nLANGUAGES\nEnglish\n \n(Professional working proficiency)\nSKILLS\nSoftware Development:\n \nReact Native, Node.js, MySQL, MongoDB, Java Spring Boot

In [33]:

# Run the extraction chain on the normalized CV text; the parsed result is
# shown as the cell output.
chain.invoke({"query":text})

{'contact': {'name': 'Truong Le Vinh Phuc Di An',
  'phone_number': '+84 367 855 090',
  'email': 'truonglevinhphuc2006@gmail.com',
  'linkedin': 'linkedin.com/in/sloweyne',
  'location': 'Di An, Binh Duong'},
 'role': [{'name': 'Software Engineer', 'num_experience': 1}],
 'language': ['English'],
 'skills': ['React Native',
  'Node.js',
  'MySQL',
  'MongoDB',
  'Java',
  'Spring Boot',
  'Android Development',
  'Firebase',
  'AWS',
  'Docker',
  'GitHub Actions',
  'CI/CD Pipelines',
  'Scrum',
  'JavaScript',
  'SQL',
  'NoSQL',
  'C/C++',
  'C#',
  '.NET Framework',
  'Problem-solving',
  'communication',
  'time management',
  'collaboration',
  'adaptability',
  'attention to detail',
  'critical thinking',
  'creativity'],
 'major': ['Software Engineer']}

In [9]:
import pandas as pd

# Run the extraction chain and keep its result.
result = chain.invoke({"query": text})

# Wrap the result in a one-row record list: a dict is used as is, a string
# goes into a single 'response' column, anything else is rejected.
if isinstance(result, dict):
    records = [result]
elif isinstance(result, str):
    records = [{"response": result}]
else:
    raise ValueError("Result format not supported")
df = pd.DataFrame(records)

# Save to a CSV file.
output_file = "output.csv"
df.to_csv(output_file, encoding='utf-8', index=False)

print(f"Result saved to {output_file}")


Result saved to output.csv
