In [508]:
import PyPDF2
import google.generativeai as genai
import pandas as pd
import os
import json
import re

In [509]:
def input_pdf_setup(file_path):
    text = ''
    if file_path is not None:
        with open(file_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            for page in reader.pages:
                text += page.extract_text()
    return text

In [510]:
def clean_text(output):
    # Remove "```json" from the beginning
    output = re.sub(r'^```json\n', '', output)
    output = re.sub(r'^```JSON\n', '', output)

    # Remove text before the first '{'
    output = re.sub(r'^[^{]*', '', output)
    
    # Remove text after the last '}'
    output = re.sub(r'[^}]*$', '', output)
    
    # Remove "```" from the end, if it's still there
    output = re.sub(r'```$', '', output)

    return output

In [511]:
def clean_json(data):
    # Remove newlines and extra spaces
    cleaned_data = re.sub(r'\n\s*', '', data)
    
    # Remove extra spaces after colons and commas
    cleaned_data = re.sub(r'\s*:\s*', ': ', cleaned_data)
    cleaned_data = re.sub(r',\s*', ', ', cleaned_data)
    
    # Remove unnecessary escape characters
    cleaned_data = re.sub(r'\\', '', cleaned_data)
    
    return cleaned_data

In [512]:
def read_resumes(folder_path):
    # List all PDF files in the specified folder
    files = [f for f in os.listdir(folder_path) if f.endswith('.pdf')]
    
    # Prepare a list to hold data
    data = []
    
    # Process each file
    for file_name in files:
        file_path = os.path.join(folder_path, file_name)
        resume_text = input_pdf_setup(file_path)
        data.append({'file_name': file_name, 'resume_text': resume_text})
    
    # Create a DataFrame
    df = pd.DataFrame(data)
    return df

In [513]:
folder_path = '/Users/parthvinm/Desktop/NLP SSM/Final Project/Resume_Job-Align/Resumes'
df_resumes = read_resumes(folder_path)
df_resumes

Unnamed: 0,file_name,resume_text
0,Resume_Jash_09_SDE.pdf,"Jash Shah Baltimore, MD|+1 (667) 600-9472..."
1,Parthvi_Mehta_Resume.pdf,PARTHVI MEHTA \npmehta14@jh.edu | +1 (667) -...


In [514]:
prompt_structure = """Objective:  Parse a text-formatted resume efficiently and extract diverse applicant's data into a structured JSON format that adheres to the provided TypeScript interface schema.

Input: Text-formatted applicant's resume.

Steps:
1. Analyze Structure: Examine the text-formatted resume to understand its organization and identify key sections (e.g., education, experience, skills).
2. Convert to JSON: Map the extracted information to the corresponding fields in the schema, creating a structured JSON representation.
3. Optimize Output: Ensure the JSON is well-formatted, error-free, and handles missing values appropriately.
4. Handle Variations: Account for different resume styles and formatting to accurately extract relevant data.

Consider following TypeScript Interface for JSON schema:
```
interface Media {
  linkedin: string;
  github: string;
  devpost: string;
  medium: string;
  leetcode: string;
  dagshub: string;
  kaggle: string;
  instagram: string;
}

interface Education {
  degree: string;
  university: string;
  from: string;
  to: string;
  grade?: string;
  coursework?: string[];
}

interface SkillSection {
  name: string;
  skills: string[];
}

interface Work {
  role: string;
  company: string;
  from: string;
  to: string;
  description: string[];
}

interface Project {
  name: string;
  type: string;
  link?: string;
  from: string;
  to: string;
  description: string[];
}

interface Certification {
  name: string;
  by: string;
  link: string;
}

interface Achievements {
  [index: number]: string;
}

interface RESUME_DATA_SCHEMA {
  name: string;
  summary: string;
  phone: string;
  email: string;
  media: Media;
  education: Education[];
  skill_section: SkillSection[];
  work_experience: Work[];
  projects: Project[];
  certifications: Certification[];
  achievements: Achievements;
}
```

Desired Output: Write the Well-formatted JSON adhering to the RESUME_DATA_SCHEMA schema, handling missing values with empty strings or "None".
<JSON_OUTPUT_ACCORDING_TO_RESUME_DATA_SCHEMA>

The results should contain valid JSON only, without any delimiter or characters making invalid JSON format."""

In [515]:
prompt_job_description = """I will provide a job and company description for your analysis. In your analysis you have to identify the key words, expertise & requirements the job demands, but also from the company description.

Your task is consider it to meticulously assess the information and furnish the details like Job title, Keywords(key words are expertise & requirements the job demands), Job purpose, Job duties and responsibilities, Required qualifications, Preferred qualifications, Company name and Company details(points which include overview, mission, values or way of working) in bulleted points. 

Provide them only in JSON format - which is easily parse by any json parser - with following keys:
title, keywords, purpose, duties_responsibilities, required_qualifications, preferred_qualifications, company_name, company_info.

Output:
{
    "title": "RESULT", 
    "keywords": "RESULT", 
    purpose: "RESULT", 
    duties_responsibilities: "RESULT", 
    required_qualifications: "RESULT", 
    preferred_qualifications: "RESULT", 
    company_name: "RESULT",
    company_info: "RESULT",
}"""

In [516]:
model = genai.GenerativeModel('gemini-pro')

def generate_response(row, resume = True):
    if resume:
        response = model.generate_content([prompt_structure, row['resume_text']])
    else:
        response = model.generate_content([prompt_job_description, row['job_description']])
    return clean_text(response.text)

In [517]:
df_resumes['resume_json'] = df_resumes.apply(generate_response, axis=1)
df_resumes

Unnamed: 0,file_name,resume_text,resume_json
0,Resume_Jash_09_SDE.pdf,"Jash Shah Baltimore, MD|+1 (667) 600-9472...","{\n ""name"": ""Jash Shah"",\n ""summary"": null,\..."
1,Parthvi_Mehta_Resume.pdf,PARTHVI MEHTA \npmehta14@jh.edu | +1 (667) -...,"{\n ""name"": ""PARTHVI MEHTA"",\n ""summary"": nu..."


In [538]:
df_jobs = pd.read_csv('/Users/parthvinm/Desktop/NLP SSM/Final Project/Resume_Job-Align/jobs.csv', encoding='iso-8859-1')
df_jobs

Unnamed: 0,title,job_description
0,Software Intern - Generative AI Developer,About the job\n\nRole Overview:\n\n We are se...
1,Machine Learning Engineer,\nAbout the job\nSecond Spectrum is a Sports E...
2,Machine Learning Engineer,About the job\nZS is a place where passion ch...
3,AI Engineer,\nAbout the job\nLinkedIn is the worlds large...
4,Associate Software Engineer,\nJob Description\n\nThe Associate Software En...
5,Data Analyst,\nAbout the job\nMust have primary commercial ...
6,Data Analyst,\nAbout the job\n\nPurpose of the Position: As...
7,Data Scientist,\nAbout the job\nIt's fun to work in a company...
8,AI/ML Data Science Intern,\nAbout the job\nTitle: AI/ML Data Science Int...
9,Software Engineer intern,Description\n\nAbout us:\n\nOne team. Global c...


In [539]:
df_jobs['job_json'] = df_jobs.apply(generate_response, resume = False, axis=1)
df_jobs

Unnamed: 0,title,job_description,job_json
0,Software Intern - Generative AI Developer,About the job\n\nRole Overview:\n\n We are se...,"{\n ""title"": ""Generative AI Developer"",\n ..."
1,Machine Learning Engineer,\nAbout the job\nSecond Spectrum is a Sports E...,"{\n ""title"": ""Machine Learning Engineer"",\n..."
2,Machine Learning Engineer,About the job\nZS is a place where passion ch...,"{\n ""title"": ""Machine Learning Engineer"",\n..."
3,AI Engineer,\nAbout the job\nLinkedIn is the worlds large...,"{\n ""title"": ""Machine Learning Engineer"",\n..."
4,Associate Software Engineer,\nJob Description\n\nThe Associate Software En...,"{\n ""title"": ""Associate Software Engineer"",..."
5,Data Analyst,\nAbout the job\nMust have primary commercial ...,"{\n ""title"": ""Data Analyst"",\n ""keywords..."
6,Data Analyst,\nAbout the job\n\nPurpose of the Position: As...,"{\n ""title"": ""Data Analyst"",\n ""keywords..."
7,Data Scientist,\nAbout the job\nIt's fun to work in a company...,"{\n ""title"": ""Data Scientist"",\n ""keywor..."
8,AI/ML Data Science Intern,\nAbout the job\nTitle: AI/ML Data Science Int...,"{\n ""title"": ""AI/ML Data Science Intern"",\n..."
9,Software Engineer intern,Description\n\nAbout us:\n\nOne team. Global c...,"{\n ""title"": ""Slack Bot Automation Intern"",\n..."


In [522]:
work_exp_prompt = """You are going to write a JSON resume section of "Work Experience" for an applicant applying for job posts.

Step to follow:
1. Analyze my Work details to match job requirements.
2. Create a JSON resume section that highlights strongest matches
3. Optimize JSON section for clarity and relevance to the job description.
4. Do not make up any information, only use the provided work experience.

Instructions:
1. Focus: Craft three highly relevant work experiences aligned with the job description.
2. Content:
  2.1. Bullet points: 3 per experience, closely mirroring job requirements.
  2.2. Impact: Quantify each bullet point for measurable results.
  2.3. Storytelling: Utilize STAR methodology (Situation, Task, Action, Result) implicitly within each bullet point.
  2.4. Action Verbs: Showcase soft skills with strong, active verbs.
  2.5. Honesty: Prioritize truthfulness and objective language.
  2.6. Structure: Each bullet point follows "Did X by doing Y, achieved Z" format.
  2.7. Specificity: Prioritize relevance to the specific job over general achievements.
3. Style:
  3.1. Clarity: Clear expression trumps impressiveness.
  3.2. Voice: Use active voice whenever possible.
  3.3. Proofreading: Ensure impeccable spelling and grammar.

Consider following Work Details in JSON format and look for the description in each and carefully align it.

Consider following Job description delimited by <JOB_DETAIL></JOB_DETAIL> tag.
<JOB_DETAIL>
<JOB_DESCRIPTION>
</JOB_DETAIL>

Desired Output:
Provide JSON object as output like following:
{
  "work_experience": [
    {
      "role": "Software Engineer",
      "company": "Winjit Technologies",
      "location": "Pune, India"
      "from": "Jan 2020",
      "to": "Jun 2022",
      "description": [
        "Engineered 10+ RESTful APIs Architecture and Distributed services; Designed 30+ low-latency responsive UI/UX application features with high-quality web architecture; Managed and optimized large-scale Databases. (Systems Design)",  
        "Initiated and Designed a standardized solution for dynamic forms generation, with customizable CSS capabilities feature, which reduces development time by 8x; Led and collaborated with a 12 member cross-functional team. (Idea Generation)"  
        and so on ...
      ]
    },
    {
      "role": "Research Intern",
      "company": "IMATMI, Robbinsville",
      "location": "New Jersey (Remote)"
      "from": "Mar 2019",
      "to": "Aug 2019",
      "description": [
        "Conducted research and developed a range of ML and statistical models to design analytical tools and streamline HR processes, optimizing talent management systems for increased efficiency.",
        "Created 'goals and action plan generation' tool for employees, considering their weaknesses to facilitate professional growth.",
        and so on ...
      ]
    }
  ],
}
"""

In [523]:
prompt_project = """You are going to write a JSON resume section of "projects" for an applicant applying for job posts.

Step to follow:
1. Analyze my project details to match job requirements.
2. Create a JSON resume section that highlights strongest matches
3. Optimize JSON section for clarity and relevance to the job description.
4. Do not make up any information, only use the provided project details.
5. Keep the order of projects as the most relevant to the job description to the least relevant.


Instructions:
1. Focus: Craft three highly relevant project experiences aligned with the job description.
2. Content:
  2.1. Bullet points: 3 per experience, closely mirroring job requirements.
  2.2. Impact: Quantify each bullet point for measurable results.
  2.3. Storytelling: Utilize STAR methodology (Situation, Task, Action, Result) implicitly within each bullet point.
  2.4. Action Verbs: Showcase soft skills with strong, active verbs.
  2.5. Honesty: Prioritize truthfulness and objective language.
  2.6. Structure: Each bullet point follows "Did X by doing Y, achieved Z" format.
  2.7. Specificity: Prioritize relevance to the specific job over general achievements.
3. Style:
  3.1. Clarity: Clear expression trumps impressiveness.
  3.2. Voice: Use active voice whenever possible.
  3.3. Proofreading: Ensure impeccable spelling and grammar.

Consider following Project Details in JSON format.

Consider following Job description delimited by <JOB_DETAIL></JOB_DETAIL> tag.
<JOB_DETAIL>
<JOB_DESCRIPTION>
</JOB_DETAIL>

Desired Output:
Provide JSON object as output like following:
"projects": [
    {
      "name": "Search Engine for All file types - Sunhack Hackathon - Meta & Amazon Sponsored",
      "type": "Hackathon",
      "link": "https://devpost.com/software/team-soul-1fjgwo",
      "from": "Nov 2023",
      "to": "Nov 2023",
      "description": [
        "1st runner up prize in crafted AI persona, to explore LLM's subtle contextual understanding and create innovative collaborations between humans and machines.",
        "Devised a TabNet Classifier Model having 98.7% accuracy in detecting forest fire through IoT sensor data, deployed on AWS and edge devices 'Silvanet Wildfire Sensors' using technologies TinyML, Docker, Redis, and celery.",
        and So on ...
      ]
    }
    and So on ...
  ]"""

In [526]:
def generate_resume(resume_json, job_description):
    resume_json_object = json.loads(resume_json)
    work_experience = json.dumps(resume_json_object['work_experience'])
    projects = json.dumps(resume_json_object['projects'])
    
    work_exp = model.generate_content([work_exp_prompt, job_description, work_experience],
                                      generation_config={
                                               "temperature": 0.36,
                                                "max_output_tokens": 4000,
                                                "top_p": 0.95})
    print(clean_text(work_exp.text))
    work_exp_object = json.loads(clean_text(work_exp.text))
    
    projects = model.generate_content([prompt_project, job_description, projects],
                                      generation_config={
                                               "temperature": 0.35,
                                                "max_output_tokens": 4000,
                                                "top_p": 0.95})
    print(clean_text(projects.text))
    projects_object = json.loads(clean_text(projects.text))

    resume_json_object['work_experience'] = json.dumps(work_exp_object['work_experience'])
    resume_json_object['projects'] = json.dumps(projects_object['projects'])
    print('Work Ex: ', resume_json_object['work_experience'])
    print('Projects: ', resume_json_object['projects'] )
    return json.dumps(resume_json_object)

In [527]:
for job_index, job in df_jobs.iterrows():
    job_title = job['title']
    job_description = job['job_json']

    # Initialize an empty list to store the responses for all resumes for this job title
    responses = []

    # Iterate through each resume
    for resume_index, resume in df_resumes.iterrows():
        resume_json = resume['resume_json']
        response = generate_resume(resume_json, job_description)
        responses.append(clean_text(response))

    # Assign the list of responses to the corresponding new column in df_resumes
    df_resumes[job_title] = responses


{
  "work_experience": [
    {
      "role": "Software Engineer - I",
      "company": "Forcepoint, Mumbai, India",
      "location": null,
      "from": "July 2022",
      "to": "June 2023",
      "description": [
        "Implemented a robust Certificate Revocation Check feature in Python, bolstering Cloud Web Proxy security. Effectively detected and blocked expired or invalid certificates for accessed websites, substantially enhancing system integrity.",
        "Improved the code coverage by 12% by developing test cases for various components of the Cloud Web Proxy.",
        "Built a Quick Escalation Issue (EI) Analyzer that uses a Slack bot as a utility to provide most similar EI JIRA tickets to the EI ticket that is created. Utilized a combination of BERT and cosine similarity for the same which improved the EI ticket closing time by about 20%.",
        "Was awarded with the Best Business and Engineering Value award at the Forcepoint Global Hackathon 2022 for the same."
      ]

In [528]:
df_resumes.to_csv('new_resumes.csv', index=False)

In [532]:
improvement_prompt = """You are an experienced Technical Human Resource Manager,your task is to review the provided resume against the job description. 
  Please share your professional evaluation on whether the candidate's profile aligns with the role. 
 Highlight the strengths and weaknesses of the applicant in relation to the specified job requirements.
 Assign the percentage Matching based on Jd and the missing keywords with high accuracy. 
 Make sure you provide a detailed analysis of the resume and suggest areas of improvement.
"""

In [545]:
def generate_rec(job_description, resume_json):
    response = model.generate_content([improvement_prompt, job_description, resume_json],
                                      generation_config={
                                               "temperature": 0.35,
                                                "max_output_tokens": 4000,
                                                "top_p": 0.95})
    return response.text

In [547]:
for job_index, job in df_jobs.iterrows():
    job_title = job['title']
    job_description = job['job_json']

    # Initialize an empty list to store the responses for all resumes for this job title
    responses = []

    # Iterate through each resume
    for resume_index, resume in df_resumes.iterrows():
        resume_json = resume['resume_json']
        response = generate_rec(resume_json, job_description)
        responses.append(response)

    # Assign the list of responses to the corresponding new column in df_resumes
    df_resumes[job_title] = responses

In [548]:
df_resumes

Unnamed: 0,file_name,resume_text,resume_json,Job Description 1,Job Description 2,Software Intern - Generative AI Developer,Machine Learning Engineer,AI Engineer,Associate Software Engineer,Data Analyst,Data Scientist,AI/ML Data Science Intern,Software Engineer intern,Machine Learning Engineer intern,Data Engineering intern
0,Resume_Jash_09_SDE.pdf,"Jash Shah Baltimore, MD|+1 (667) 600-9472...","{\n ""name"": ""Jash Shah"",\n ""summary"": null,\...","{""name"": ""Jash Shah"", ""summary"": null, ""phone""...","{""name"": ""Jash Shah"", ""summary"": null, ""phone""...","**Professional Evaluation**\n\nThe candidate, ...",**Candidate's Profile Evaluation**\n\nThe cand...,"**Candidate Evaluation**\n\nThe candidate, Jas...","**Overall Evaluation**\n\nThe candidate, Jash ...",**Evaluation of Candidate Profile**\n\nThe can...,"**Evaluation Summary**\n\nThe candidate, Jash ...","**Overall Evaluation**\n\nThe candidate, Jash ...",**Strengths:**\n\n* **Strong technical skills:...,**Strengths:**\n\n* **Strong technical skills:...,"**Overall Evaluation:**\n\nThe candidate, Jash..."
1,Parthvi_Mehta_Resume.pdf,PARTHVI MEHTA \npmehta14@jh.edu | +1 (667) -...,"{\n ""name"": ""PARTHVI MEHTA"",\n ""summary"": nu...","{""name"": ""PARTHVI MEHTA"", ""summary"": null, ""ph...","{""name"": ""PARTHVI MEHTA"", ""summary"": null, ""ph...","**Evaluation**\n\nThe candidate, Parthvi Mehta...",**Candidate Profile Evaluation**\n\nThe candid...,"**Overall Evaluation**\n\nThe candidate, Parth...",**Evaluation of Candidate's Profile for Associ...,**Evaluation of Candidate Profile**\n\nThe can...,**Resume Evaluation for Data Scientist Role**\...,"**Overall Evaluation:**\n\nThe candidate, Part...","**Overall Assessment:**\n\nThe candidate, Part...",**Evaluation of Candidate Profile**\n\nThe can...,**Candidate Profile Evaluation**\n\n**Job Desc...


In [549]:
df_resumes.to_csv('resume_recs.csv', index=False)