### This notebook for the Resume Parser portion of the project.

In [1]:
import streamlit as st
import openai
from langchain.document_loaders import PyPDFLoader
from dotenv import load_dotenv
import os
import openai
import PyPDF2
from langchain.prompts import PromptTemplate
from langchain.chat_models import ChatOpenAI, ChatAnthropic
import json
from langchain.output_parsers import PydanticOutputParser
from pydantic import ValidationError
from json import JSONDecodeError

from resume_template import Resume, InterviewProfile

# Load environment variables containing API keys
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")

os.environ["LANGCHAIN_API_KEY"] = str(os.getenv("LANGCHAIN_API_KEY"))
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
os.environ["LANGCHAIN_PROJECT"] = "elevated_ambitions_extraction"

In [2]:

def pdf_to_string(resume_pdf):
    """
    Convert a PDF file to a string.

    Parameters:
    resume_pdf (str): The path to the PDF file.

    Returns:
    str: The extracted text from the PDF.
    """
    with open(resume_pdf, 'rb') as f:
        pdf_reader = PyPDF2.PdfReader(f)
        num_pages = len(pdf_reader.pages)
        text = ''
        for i in range(num_pages):
            page = pdf_reader.pages[i]
            text += page.extract_text()
        return text

# Set up the LLM dictionary
llm_dict = {
    "gpt-4-1106-preview": ChatOpenAI(temperature=0, model="gpt-4-1106-preview"),
    "gpt-4": ChatOpenAI(temperature=0, model="gpt-4"),
    "gpt-3.5-turbo-1106": ChatOpenAI(temperature=0, model="gpt-3.5-turbo-1106"),
    "claude-2": ChatAnthropic(model="claude-2", max_tokens=20_000),
    "claude-instant-1": ChatAnthropic(model="claude-instant-1", max_tokens=20_000)
}

def extract_resume_fields(full_text, model):
    """
    Analyze a resume text and extract structured information using a specified language model.

    Parameters:
    full_text (str): The text content of the resume.
    model (str): The language model object to use for processing the text.

    Returns:
    dict: A dictionary containing structured information extracted from the resume.
    """
    # The Resume object is imported from the local resume_template file

    with open("../prompts/resume_extraction.prompt", "r") as f:
        template = f.read()

    parser = PydanticOutputParser(pydantic_object=Resume)

    prompt_template = PromptTemplate(
        template=template,
        input_variables=["resume"],
        partial_variables={"response_template": parser.get_format_instructions()},
    )

    # Invoke the language model and process the resume
    formatted_input = prompt_template.format_prompt(resume=full_text)
    llm = llm_dict.get(model, ChatOpenAI(temperature=0, model=model))
    output = llm.invoke(formatted_input.to_string())
    
    try:
        parsed_output = parser.parse(output.content)
        return parsed_output
    
    except ValidationError as e:
        print((f"Validation error: {e}"))
        print(output)
        return output.content
    
    except JSONDecodeError as e:
        print((f"JSONDecodeError error: {e}"))
        print(output)
        return output.content

def extract_resume_fields_json(full_text, model):
    """
    This is just like the function above but instead of using pydantic it uses a json template.
    
    Analyze a resume text and extract structured information using a specified language model.

    Parameters:
    full_text (str): The text content of the resume.
    model (str): The language model object to use for processing the text.

    Returns:
    dict: A dictionary containing structured information extracted from the resume.
    """
    # Load the prompt template and response template for resume analysis
    with open("../prompts/resume_extraction.prompt", "r") as f:
        template = f.read()
    with open("../templates/scale_profile_template.json", "r") as f:
        resume_template = f.read()

    # Format the input for the language model
    prompt_template = PromptTemplate(template=template, input_variables=['resume', 'response_template'])
    formatted_input = prompt_template.format(resume=full_text, response_template=resume_template)

    # Invoke the language model and process the resume
    llm = llm_dict.get(model, ChatOpenAI(temperature=0, model=model))
    analysis_output = llm.invoke(formatted_input)

    return analysis_output

def upgrade_experience_bullet(user_experience, bullet, model):
    """
    Enhance a bullet point in a user's experience section using a language model.

    Parameters:
    user_experience (dict): A dictionary containing details of a user's experience.
    bullet (str): The bullet point to be enhanced.
    model (str): The language model object to use for enhancement.

    Returns:
    str: The enhanced bullet point.
    """
    # Load the bullet enhancement template
    with open("../prompts/synthetic_bullet_builder.prompt", "r") as f:
        template = f.read()

    # Format the input for the language model
    prompt_template = PromptTemplate(template=template, input_variables=['user_summary', 'bullet_point'])
    formatted_input = prompt_template.format(user_summary=user_experience, bullet_point=bullet)

    # Invoke the language model and enhance the bullet point
    llm = llm_dict.get(model, ChatOpenAI(temperature=0, model=model))
    analysis_output = llm.invoke(formatted_input)

    return analysis_output.content

def upgrade_resume_bullets(extracted_resume):
    """
    Iterate through the work experience in a resume and upgrade each bullet point.

    Parameters:
    extracted_resume (Resume): A Resume object containing a structured resume.

    Returns:
    Resume: The Resume object with enhanced bullet points in the work experience section.
    """
    # Enhance bullet points for each work experience entry
    for experience in extracted_resume.work_experience:
        experience_desc = ' '.join([str(getattr(experience, field)) for field in ['company', 'title', 'duration', 'description']])
        # skip experience if there are no notable contributions
        if not experience.notable_contributions:
            continue
        else:
            for i, contribution in enumerate(experience.notable_contributions):
                # skip contribution if it is empty
                if contribution.notable_contribution == '':
                    continue
                else:
                    enhanced_contribution = upgrade_experience_bullet(experience_desc, contribution.notable_contribution)
                    # Update the notable contribution with the enhanced version
                    experience.notable_contributions[i].notable_contribution = enhanced_contribution

    return extracted_resume

def generate_questions(user_profile, model):
    """
    Generate interview questions based on a user's profile using a language model.

    Parameters:
    user_profile (dict): A dictionary containing the user's profile information.
    model (str): The language model object to use for question generation.

    Returns:
    dict: A dictionary containing generated interview questions.
    """
    # Load the question generation template and response template
    with open("../prompts/question_generation.prompt", "r") as f:
        question_template = f.read()

    parser = PydanticOutputParser(pydantic_object=InterviewProfile)

    prompt_template = PromptTemplate(
        template=question_template,
        input_variables=["user_profile"],
        partial_variables={"response_template": parser.get_format_instructions()},
    )

    # Invoke the language model and process the resume
    formatted_input = prompt_template.format_prompt(user_profile=user_profile)
    llm = llm_dict.get(model, ChatOpenAI(temperature=0, model=model))
    output = llm.invoke(formatted_input.to_string())
    
    try:
        parsed_output = parser.parse(output.content)
        return parsed_output
    
    except ValidationError as e:
        print((f"Validation error: {e}"))
        print(output)
        return output.content
    
    except JSONDecodeError as e:
        print((f"JSONDecodeError error: {e}"))
        print(output)
        return output.content

def generate_synthetic_responses(user_profile, questions, model):
    """
    Generate synthetic interview responses for a set of questions based on a user's profile.

    Parameters:
    user_profile (dict): A dictionary containing the user's profile information.
    questions (dict): A dictionary containing interview questions.
    model (str): The language model object to use for generating responses.

    Returns:
    dict: A dictionary containing synthetic responses to the interview questions.
    """
    # Load the interview response generation template
    with open("../prompts/synthetic_interview_responses.prompt", "r") as f:
        prompt_temp = f.read()

    parser = PydanticOutputParser(pydantic_object=InterviewProfile)

    prompt_template = PromptTemplate(
        template=prompt_temp,
        input_variables=["user_profile", "questions"],
        partial_variables={"response_template": parser.get_format_instructions()},
    )

    # Invoke the language model and process the resume
    formatted_input = prompt_template.format_prompt(user_profile=user_profile, questions=user_profile)
    llm = llm_dict.get(model, ChatOpenAI(temperature=0, model=model))
    output = llm.invoke(formatted_input.to_string())
    
    try:
        parsed_output = parser.parse(output.content)
        return parsed_output
    
    except ValidationError as e:
        print((f"Validation error: {e}"))
        print(output)
        return output.content
    
    except JSONDecodeError as e:
        print((f"JSONDecodeError error: {e}"))
        print(output)
        return output.content


In [3]:

# read in the resume_texts.pkl file and read in each resume
import pickle

with open('resume_texts.pkl', 'rb') as f:
    resume_texts = pickle.load(f)
# take a random sample of 10 resumes and process each of them

import random
sample_resumes = random.sample([resume for resume in resume_texts if resume != ''], 10)

In [4]:
import uuid

extraction_dict = {}
for resume in sample_resumes:
    resume_id = str(uuid.uuid4())
    extraction_dict[resume_id] = {'rawresume': resume}
    for model in ['gpt-4-1106-preview', 'gpt-3.5-turbo-1106', 'claude-2', 'claude-instant-1']:
        print(f'-------------------{model}-------------------')
        extraction_dict[resume_id][f'{model}_output'] = extract_resume_fields(resume, model=model)
        print (f'initial resume: {resume} \n {model} extractions: {extraction_dict[resume_id][f"{model}_output"]} \n\n')
        print(';;;')

-------------------gpt-4-1106-preview-------------------


KeyboardInterrupt: 

In [9]:
from tqdm import tqdm

def main(resume_pdf_path, user_id):
    # Step 1: Read and Extract Resume
    resume_text = pdf_to_string(resume_pdf_path)

    # Step 2: Extract Structured Resume Data
    extracted_resume = extract_resume_fields(resume_text, user_id)
    print(extracted_resume)

    # Step 3: Upgrade Resume
    # upgraded_resume = upgrade_resume_bullets(extracted_resume, user_id)

    # Step 4: Generate Interview Questions
    # interview_questions = generate_questions(upgraded_resume, user_id)
    interview_questions = generate_questions(extracted_resume, user_id)
    print(interview_questions)

    # Step 5: Generate Synthetic Interview Responses
    interview_responses = generate_synthetic_responses(extracted_resume, interview_questions, user_id)
    print(interview_responses)

    # Step 6: Final Processing (Combine data as needed)
    # ... (your code for final processing)

    # Create a progress bar to track progress through the path
    progress_bar = tqdm(total=6)

    # Update progress bar for each completed step
    progress_bar.update(1)
    progress_bar.set_description("Step 1: Read and Extract Resume")
    progress_bar.update(1)
    progress_bar.set_description("Step 2: Extract Structured Resume Data")
    progress_bar.update(1)
    progress_bar.set_description("Step 3: Upgrade Resume")
    progress_bar.update(1)
    progress_bar.set_description("Step 4: Generate Interview Questions")
    progress_bar.update(1)
    progress_bar.set_description("Step 5: Generate Synthetic Interview Responses")
    progress_bar.update(1)
    progress_bar.set_description("Step 6: Final Processing (Combine data as needed)")

    return extracted_resume, interview_questions, interview_responses

The code below is to run through an example of how you can test out different models on the same resume and judge the output.

In [6]:
for resume in extraction_dict:
    print(resume)
    print(extraction_dict[resume]['rawresume'])
    print(extraction_dict[resume]['gpt-4-1106-preview_output'])
    print(extraction_dict[resume]['gpt-3.5-turbo-1106_output'])
    print(extraction_dict[resume]['claude-2_output'])
    print(extraction_dict[resume]['claude-instant-1_output'])
    print(';;;')

7c4aeb3b-25c3-4c0a-88f6-97181b72e22a
Penny Jacobspenny.jacobs@gmail.com218-310-8761 Objective Energetic and experienced licensed massage therapist with 5+ years of experience in a dynamic hotel environment. Eager to provide the MGM Spa guests with the most current massage treatments and techniques available while ensuring a comfortable and relaxing spa experience. In previous roles performed 1,000+ Swedish massage sections, developed a unique massage style, and maintained a 50% repeat business rate. Work Experience Massage TherapistThe Hilton Hotel, New York, NY2016–Performed massage and body treatments as well as general spa duties.Utilized, maintained, and conducted inventory of supplies and products.Maintained records as required by federal, state, local and company regulations.Responded to guest inquiries and resolved guest service issues in a timely, friendly and efficient manner.Key achievements:Boosted sales 40% in four months thanks to top customer service.Developed a unique co

KeyError: 'gpt-4-1106-preview_output'

In [11]:
resume_pdf_path = '../pdf/LeoWalkerLinkedIn.pdf'
extracted_resume, interview_questions, interview_responses =  main(resume_pdf_path, 1)

InvalidRequestError: The model `1` does not exist