# Resume Analysis 

### Goals
The goal is to analyze a group of resumes to find out the following:
    * Does the person have a bachelor's degree in computer science?
    * Does the person have at least 7 years work experience in Software Engineering?
    * OR (but not for the program to decide, just report back)
    * Does the person have a master's degree in computer science?
    * Does the person have at least 5 years work experience in Software Engineering?
    * Does the person have a variable list of skills based on the word/phrase occurrence in the resume?

### Basic logic
    1) Find PDF files in a directory.  There will be one to many.  Logic must iterate over each document.
    2) Ask the LLM to analyze for the list of skills
    3) Ask the LLM to analyze for the questions
    4) Skills analysis and question analysis are two different kinds of work for the LLM and, based on testing, should be kept in separate prompts rather than mixed.
    5) Output the full results as a CSV file in which the questions and skills are the columns and each row represents one resume.

In [1]:
from langchain_openai import OpenAI
from langchain import PromptTemplate
from langchain_core.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field

import fitz  # PyMuPDF
import os
import getpass

# Structured result of the skills pass; used as the target model of the
# PydanticOutputParser in word_occurs().
# NOTE: documented with comments rather than a class docstring on purpose --
# pydantic copies a model docstring into the JSON schema "description", which
# would presumably alter the format instructions sent to the LLM.
class Skills(BaseModel):
    # Mapping of skill word -> answer string. The word_occurs() prompt asks for
    # "yes"/"no" per word; the values are not validated beyond being strings.
    skillList: dict[str,str] = Field(description="list of skills that a person has") 

# One row of the final report: the answers for a single resume. Used as the
# target model of the PydanticOutputParser in questions().
# NOTE: documented with comments rather than a class docstring on purpose --
# pydantic copies a model docstring into the JSON schema "description", which
# would presumably alter the format instructions sent to the LLM.
class ResumeAnalysis(BaseModel):
    # Every answer is a free-form string; the Field descriptions below are the
    # questions as they appear in the schema given to the LLM.
    name: str = Field(description="What is the name of the person?")
    hasBachelors: str = Field(description="Does the person have a bachelor's degree in Computer Science?")
    has7yearsExperience: str = Field(description="Does the person have at least 7 years work experience?")
    hasMasters: str = Field(description="Does the person have a masters's degree in Computer Science?")
    has5yearsExperience: str = Field(description="Does the person have at least 5 years work experience?")
    # Required field with no default, so the questions pass must also return a
    # value for it; the driver loop later overwrites it with the result of the
    # dedicated skills pass.
    skills: Skills

def _set_if_undefined(var: str) -> None:
    """Prompt for environment variable ``var`` when it is unset or empty.

    Args:
        var: Name of the environment variable to check.

    If ``os.environ`` has no non-empty value for ``var``, the user is asked
    for one without echoing the input, and the answer is stored in
    ``os.environ[var]``. An existing non-empty value is left untouched.
    """
    if not os.environ.get(var):
        # ``getpass`` is imported as a module, so the prompt function has to be
        # reached as ``getpass.getpass``; calling the module object itself
        # raises ``TypeError: 'module' object is not callable``.
        os.environ[var] = getpass.getpass(f"Please provide your {var}")

# Make sure both API credentials are present, prompting for any that are missing.
for _credential in ("OPENAI_API_KEY", "LANGCHAIN_API_KEY"):
    _set_if_undefined(_credential)

# Optional: switch on LangSmith tracing and name the project it reports under.
os.environ.update(
    {
        "LANGCHAIN_TRACING_V2": "true",
        "LANGCHAIN_PROJECT": "Resume Analysis",
    }
)

# Instantiate the OpenAI model; temperature 0 is passed for every request.
model = OpenAI(temperature=0)

# Function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        A single string made of each page's ``get_text()`` output joined
        together with no separator; an empty string if there are no pages.
    """
    # Opening the document as a context manager closes the file handle when
    # extraction finishes or fails; the original never closed it.
    with fitz.open(pdf_path) as doc:
        return "".join(page.get_text() for page in doc)

def word_occurs(model, document_text, word_list=None):
    """Ask the LLM which skills from a word list a resume demonstrates.

    The prompt tells the model to count occurrences of each word and to answer
    yes when a word appears more than 2 times, otherwise no. The counting is
    done by the LLM, not by this code.

    Args:
        model: LangChain-compatible model placed in the middle of the chain.
        document_text: Plain text of the resume to analyze.
        word_list: Optional iterable of skill words to look for. When omitted,
            the original built-in list (java, docker, management, python) is
            used, so existing callers behave exactly as before.

    Returns:
        The ``Skills`` instance produced by parsing the model's reply.
    """
    if word_list is None:
        # Default kept inside the function so no mutable default is shared
        # between calls.
        word_list = ["java", "docker", "management", "python"]

    parser = PydanticOutputParser(pydantic_object=Skills)
    # Define the LangChain components
    prompt = PromptTemplate(
        input_variables=["document_text", "word_list"],
        partial_variables={"format_instructions": parser.get_format_instructions()},
        template="""
        You are an AI language model.
        Analyze the following resume and count the occurrences of each word in the word list to analyze if the person has a skill.  A person has a skill if the word appears more than 2 times and you are to reply with yes, othewise, no for each word in the word list.
        Document: {document_text}
        Word List: {word_list}
        Output: {format_instructions}
        """
    )
    # Prompt -> model -> parser: the parser turns the raw reply into Skills.
    chain = prompt | model | parser
    occurrence = chain.invoke({
        "document_text": document_text,
        "word_list": ", ".join(word_list)
    })
    return occurrence

def questions(model, document_text):
    """Ask the LLM the fixed set of qualification questions about a resume.

    Args:
        model: LangChain-compatible model placed in the middle of the chain.
        document_text: Plain text of the resume to analyze.

    Returns:
        The ``ResumeAnalysis`` instance produced by parsing the model's reply.
    """
    parser = PydanticOutputParser(pydantic_object=ResumeAnalysis)
    prompt = PromptTemplate(
        # Only variables that actually appear in the template are declared;
        # the original also listed "word_list", which this template never
        # uses and invoke() never supplies.
        input_variables=["document_text"],
        partial_variables={"format_instructions": parser.get_format_instructions()},
        template="""
        You are an AI language model.
        Analyze the following resume and answer the following questions:
            What is the name of the person?
            Does the person have a bachelor's degree in Computer Science?
            Does the person have at least 7 years work experience?
            Does the person have a masters's degree in Computer Science?
            Does the person have at least 5 years work experience?
        Document: {document_text}
        Output: {format_instructions}
        """
    )
    # Prompt -> model -> parser: the parser turns the raw reply into
    # ResumeAnalysis.
    chain = prompt | model | parser
    # Named ``analysis`` so the local does not shadow this function's own name.
    analysis = chain.invoke({
        "document_text": document_text
    })
    return analysis

def list_files_in_directory(directory):
    """Return the paths of all PDF files under ``directory``, recursively.

    Args:
        directory: Root directory to search; subdirectories are included.

    Returns:
        A sorted list of paths (``root`` joined with the file name) for every
        file whose extension is ``.pdf`` in any letter case. The list is empty
        when nothing matches or the directory does not exist.
    """
    pdf_paths = []
    for root, _dirs, names in os.walk(directory):
        pdf_paths.extend(
            os.path.join(root, name)
            for name in names
            # Compare case-insensitively so "CV.PDF" is not silently skipped.
            if name.lower().endswith(".pdf")
        )
    # os.walk yields entries in an unspecified order; sorting keeps the order
    # of processed resumes reproducible between runs.
    return sorted(pdf_paths)

# Driver: analyze every PDF found under ./resumes and collect one
# ResumeAnalysis per file.
directory = "./resumes"
files = list_files_in_directory(directory)

# Accumulates the per-resume results in the order the files are processed.
resumeAnalysis = []

for file in files:
    document_text = extract_text_from_pdf(file)
    # Two separate LLM calls per resume: one for the qualification questions,
    # one for the skill words.
    question = questions(model=model, document_text=document_text)
    skills = word_occurs(model=model, document_text=document_text)
    # Replace whatever the questions pass returned for ``skills`` with the
    # result of the dedicated skills pass.
    question.skills = skills
    resumeAnalysis.append(question)

# Results are only printed here; nothing in this cell writes a CSV file.
print(resumeAnalysis)


[ResumeAnalysis(name='Daniel M. Halverson', hasBachelors='Yes', has7yearsExperience='Yes', hasMasters='No', has5yearsExperience='Yes', skills=Skills(skillList={'java': 'yes', 'docker': 'yes', 'management': 'yes', 'python': 'no'}))]
