In [165]:
from typing import Optional, List
from datetime import date
import pymupdf, os, json, ast
import vertexai
from llama_index.llms.openai import OpenAI
from llama_index.llms.ollama import Ollama
from llama_index.llms.vertex import Vertex
from llama_index.core.prompts import PromptTemplate
from langchain_google_vertexai import ChatVertexAI
from langchain_core.output_parsers import PydanticOutputParser
from langchain_core.prompts import ChatPromptTemplate
from google.oauth2 import service_account
from dotenv import load_dotenv
from pydantic import BaseModel, Field, EmailStr

In [91]:
load_dotenv()

True

In [3]:
input_folder_path = 'data/cv'
output_folder_path = 'data/output'

In [44]:
class Person(BaseModel):
    first_name: Optional[str] = Field(description="First name of the person")
    last_name: Optional[str] = Field(description="Last name of the person")
    email: Optional[EmailStr] = Field(description="Email address of the person")
    phone: Optional[str] = Field(description="Contact phone number")
    address: str = Field(description="Residential address")
    city: Optional[str] = Field(description="City of residence")
    country: Optional[str] = Field(description="Country of residence")
    date_of_birth: Optional[str] = Field(None, description="Date of birth of the person")


In [45]:
class WorkExperiences(BaseModel):
    job_title: Optional[str] = Field(description="Title of the job role")
    employer: str = Field(description="Name of the employer or organization")
    start_date: Optional[str] = Field(description="Start date of the job")
    end_date: Optional[str] = Field(description="End date of the job, if applicable")
    city: Optional[str] = Field(description="City where the job was based")
    country: Optional[str] = Field(description="Country where the job was based")
    description: Optional[str] = Field(description="Description of the responsibilities and achievements in the job")


In [46]:
class WorkExperiencesList(BaseModel):
    experiences: Optional[List[WorkExperiences]] = Field(description="List of all working experiences")

In [168]:
class Educations(BaseModel):
    """
    Information about academic background and education path (could include college/university, high school, specialization courses, etc.)
    """

    degree: Optional[str] = Field(description="Degree or qualification obtained")
    field_of_study: Optional[str] = Field(description="Field of study or specialization")
    institution_name: Optional[str] = Field(description="Name of the educational institution")
    start_date: Optional[str] = Field(description="Start date of the educational program")
    end_date: Optional[str] = Field(description="End date of the educational program, if applicable")
    city: Optional[str] = Field(description="City where the institution is located")
    country: Optional[str] = Field(description="Country where the institution is located")
    description: Optional[str] = Field(description="Additional details about the education")


In [169]:
class EducationsList(BaseModel):
    """
    List of Educations Pydantic objects
    """
    experiences: Optional[List[Educations]] = Field(description="List of academic education experiences")

In [26]:
class PersonSkills(BaseModel):
    skill_name: Optional[str] = Field(description="Name of the skill")
    skilltype: Optional[str] = Field(description="Type of skill, e.g., technical or soft skill")
    proficiency_level: Optional[str] = Field(description="Proficiency level in the skill, e.g., beginner, intermediate, advanced")
    years_of_experience: Optional[int] = Field(None, description="Years of experience with the skill")


In [None]:
class PersonSkillList(BaseModel):
    experiences: List[PersonSkills] = Field(description="List of all skills of the person")

In [51]:
class PersonLanguages(BaseModel):
    language_name: Optional[str] = Field(description="Language name")
    proficiency_level: Optional[str] = Field(description="Proficiency level in the language, e.g., native, fluent, intermediate")

In [52]:
class PersonLanguagesList(BaseModel):
    experiences: List[PersonLanguages] = Field(description="List of all languages spoken")

In [53]:
class Certifications(BaseModel):
    name: Optional[str] = Field(description="Name of the certification")
    issuing_body: Optional[str] = Field(description="Organization that issued the certification")
    issue_date: Optional[str] = Field(description="Date the certification was issued")
    expiration_date: Optional[str] = Field(description="Expiration date of the certification, if applicable")
    description: Optional[str] = Field(description="Additional details about the certification")

In [54]:
class CertificationList(BaseModel):
    experiences: List[Certifications] = Field(description="List of all certifications")

In [55]:
class Projects(BaseModel):
    name: Optional[str] = Field(description="Name of the project")
    description: Optional[str] = Field(None, description="Details about the project")
    start_date: Optional[str] = Field(description="Start date of the project")
    end_date: Optional[str] = Field(None, description="End date of the project, if applicable")

In [56]:
class ProjectsList(BaseModel):
    experiences: List[Projects] = Field(description="List of all projects")

In [7]:
doc = pymupdf.open(os.path.join(input_folder_path, 'Alessandra_Saitta_ITA.pdf')) # open a document
out = open(os.path.join(output_folder_path, "Alessandra_Saitta_ITA.txt"), "wb") # create a text output

In [8]:
for page in doc: # iterate the document pages
    text = page.get_text().encode("utf8") # get plain text (is in UTF-8)
    out.write(text) # write text of page
    out.write(bytes((12,))) # write page delimiter (form feed 0x0C)
out.close()

## Structured data


In [57]:
with open(os.path.join(output_folder_path, "Alessandra_Saitta_ITA.txt"), "rb") as file:  # open the file in binary read mode
    cv_text = file.read().decode('utf-8')  # read the content of the file

In [92]:
filename = os.getenv('GOOGLE_APPLICATION_CREDENTIALS')
credentials: service_account.Credentials = (
    service_account.Credentials.from_service_account_file(filename)
)

In [59]:
open_source_models = ['qwen2:7b', 'llama3.2:3b', 'llama3.1:latest']

In [60]:
llm = OpenAI(model="gpt-4o")
llm_open = Ollama(model=open_source_models[1], request_timeout=120.0)
llm_gemini = Vertex(model="gemini-1.5-pro", project=credentials.project_id, credentials=credentials, location="europe-west4")

In [69]:
prompt = PromptTemplate(
    """
    I will provide an input text containing unstructured data from a person’s resume. 
    This data may include details about work experience, personal information, academic background, skills, certifications, and more.

    Key considerations:

    - You will likely get personal information which are very detailed, you can likely parse all the information needed.
    - Information about city and country may be embedded in the address field; extract it from there if applicable.
    - When the expected data type is a list, you should output a list which has several Pydantic DTO elements in it. It should not be a string.
    - Info about work and academic experiences may contain multiple elements. Pay close attention to the requested data type and don't try to force it.
    - If any requested data is unavailable in the input, leave the corresponding fields with an empty string without making assumptions.
    - Be mindful that certain sections, like work experience and academic background, may include multiple entries.
    
    Your task is to parse this data into a structured format, accurately reflecting all the available information.

    {text}

    """
)

In [70]:
def create_json_response(dto, source_llm: str = 'google'):
    
    if source_llm == 'ollama':
        response = llm_open.structured_predict(
            dto, prompt, text=cv_text
            )
        print(response)
        # json_output = response.model_dump_json()
        json_output = ast.literal_eval(response).model_dump_json()
    elif source_llm == 'google':
        response = llm_gemini.structured_predict(
            dto, prompt, text=cv_text
            )
        json_output = response.model_dump_json()
    else:
        response = llm.structured_predict(
            dto, prompt, text=cv_text
            )
        json_output = response.model_dump_json()
    return json.dumps(json.loads(json_output), indent=2, ensure_ascii=False)

In [71]:
person = create_json_response(Person, source_llm='ollama')
print(person)

first_name='Alessandra' last_name='Saitta' email='alessandra.saitta93@gmail.com' phone='+39 3335462586' address='Via Cappellini, 7, 21049, Tradate, Italia (Abitazione)' city='Tradate' country='Italia' date_of_birth='26/04/1993'


ValueError: malformed node or string: Person(first_name='Alessandra', last_name='Saitta', email='alessandra.saitta93@gmail.com', phone='+39 3335462586', address='Via Cappellini, 7, 21049, Tradate, Italia (Abitazione)', city='Tradate', country='Italia', date_of_birth='26/04/1993')

In [73]:
work_experiences = create_json_response(WorkExperiencesList, source_llm='ollama')
print(work_experiences)

1 validation error for WorkExperiencesList
experiences
  Input should be a valid list [type=list_type, input_value="[{'start_date': '04/07/2...on': '', 'skills': []}]", input_type=str]
    For further information visit https://errors.pydantic.dev/2.9/v/list_type


SyntaxError: invalid syntax (<unknown>, line 1)

In [66]:
person_languages = create_json_response(PersonLanguagesList, source_llm='ollama')
print(person_languages)

1 validation error for PersonLanguagesList
experiences
  Input should be a valid list [type=list_type, input_value='[{"city": "Milano", "cou...oogle Cloud Platform"}]', input_type=str]
    For further information visit https://errors.pydantic.dev/2.9/v/list_type


AttributeError: 'str' object has no attribute 'model_dump_json'

In [31]:
skill_json_output = skill_response.model_dump_json()
print(json.dumps(json.loads(skill_json_output), indent=2, ensure_ascii=False))

{
  "id": 1,
  "name": "Python",
  "type": "technical"
}


In [29]:
person_skill_json_output = person_skill_response.model_dump_json()
print(json.dumps(json.loads(person_skill_json_output), indent=2, ensure_ascii=False))

{
  "id": 1,
  "person_id": 1,
  "skill_id": 1,
  "proficiency_level": "advanced",
  "years_of_experience": 3
}


## Langchain

In [108]:
vertexai.init(project=credentials.project_id, location="europe-west4")

In [115]:
llm = ChatVertexAI(model="gemini-1.5-pro")

In [141]:
prompt_text =  """
    I will provide an input text containing unstructured data from a person’s resume. 
    This data may include details about work experience, personal information, academic background, skills, certifications, and more.

    Key considerations:

    - You will likely get personal information which are very detailed, you can likely parse all the information needed.
    - Information about city and country may be embedded in the address field; extract it from there if applicable.
    - When the expected data type is a list, you should output a list which has several Pydantic DTO elements in it. It should not be a string.
    - Info about work and academic experiences may contain multiple elements. Pay close attention to the requested data type and don't try to force it.
    - If any requested data is unavailable in the input, leave the corresponding fields with an empty string without making assumptions.
    - Be mindful that certain sections, like work experience and academic background, may include multiple entries.
    
    Your task is to parse this data into a structured format, accurately reflecting all the available information.

    Wrap the output in `json` tags\n{format_instructions}

    """

In [145]:
prompt_text_list =  """
    I will provide an input text containing unstructured data from a person’s resume. 
    This data may include details about work experience, personal information, academic background, skills, certifications, and more.

    Key considerations:

    - You will likely get personal information which are very detailed, you can likely parse all the information needed.
    - Information about city and country may be embedded in the address field; extract it from there if applicable.
    - You should output a list which has several Pydantic DTO elements in it. It should not be a string or a list with a single element, except when 
    the person had just one single experience
    - Info about work and academic experiences may contain multiple elements.
    - If any requested data is unavailable in the input, leave the corresponding fields with an empty string without making assumptions.
    - Be mindful that certain sections, like work experience and academic background, may include multiple entries.
    
    Your task is to parse this data into a structured format, accurately reflecting all the available information.

    Wrap the output in `json` tags\n{format_instructions}

    """

In [129]:
class Person(BaseModel):
    first_name: str = Field(description="First name of the person")
    last_name: str = Field(description="Last name of the person")
    email: EmailStr = Field(description="Email address of the person")
    phone: str = Field(description="Contact phone number")
    address: str = Field(description="Residential address")
    city: str = Field(description="City of residence")
    country: str = Field(description="Country of residence")
    date_of_birth: str = Field(None, description="Date of birth of the person")

In [170]:
# Set up a parser
parser = PydanticOutputParser(pydantic_object=EducationsList)

# Prompt
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            prompt_text_list,
        ),
        ("human", "{text}"),
    ]
).partial(format_instructions=parser.get_format_instructions())

In [159]:
print(prompt.invoke({"text": cv_text}).to_string())

System: 
    I will provide an input text containing unstructured data from a person’s resume. 
    This data may include details about work experience, personal information, academic background, skills, certifications, and more.

    Key considerations:

    - You will likely get personal information which are very detailed, you can likely parse all the information needed.
    - Information about city and country may be embedded in the address field; extract it from there if applicable.
    - You should output a list which has several Pydantic DTO elements in it. It should not be a string or a list with a single element, except when 
    the person had just one single experience
    - Info about work and academic experiences may contain multiple elements.
    - If any requested data is unavailable in the input, leave the corresponding fields with an empty string without making assumptions.
    - Be mindful that certain sections, like work experience and academic background, may includ

In [171]:
chain = prompt | llm | parser

chain.invoke({"text": cv_text})

EducationsList(experiences=[Educations(degree='LAUREA MAGISTRALE IN INFORMATICA', field_of_study='Data Science e Perceptual Computing', institution_name='Università degli Studi di Milano', start_date='09/2018', end_date='22/04/2021', city='Milano', country='Italia', description='Esami: Algorithms for Massive Datasets, Natural Interaction, Methods\nfor Affective Computing, Bioinformatics, Information Management,\nDidactics for Computer Science, Intelligent Systems, Methods\nfor Image Processing, Web Algorithmics, Information Retrieval,\nStatistical Methods for Machine Learning, Audio Pattern Recognition,\nMultimedial Organization and Digitalization.\nSito Internet https://www.unimi.it/it/corsi/laurea-magistrale/informatica-magistrale \nVoto finale 104/110 \nLivello EQF Livello 7 EQF \nTesi Applicazione di algoritmi di apprendimento per la riqualificazione delle aree dismesse nelle regioni italiane'), Educations(degree='LAUREA TRIENNALE IN INFORMATICA', field_of_study=None, institution_n