In [231]:
import PyPDF2
import google.generativeai as genai
import pandas as pd
import os
import json
import re

In [232]:
def input_pdf_setup(file_path):
    text = ''
    if file_path is not None:
        with open(file_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            for page in reader.pages:
                text += page.extract_text()
    return text

In [233]:
def clean_text(output):
    # Remove "```json" from the beginning
    output = re.sub(r'^```json\n', '', output)

    # Remove text before the first '{'
    output = re.sub(r'^[^{]*', '', output)
    
    # Remove text after the last '}'
    output = re.sub(r'[^}]*$', '', output)
    
    # Remove "```" from the end, if it's still there
    output = re.sub(r'```$', '', output)

    return output

In [234]:
def clean_json(data):
    # Remove newlines and extra spaces
    cleaned_data = re.sub(r'\n\s*', '', data)
    
    # Remove extra spaces after colons and commas
    cleaned_data = re.sub(r'\s*:\s*', ': ', cleaned_data)
    cleaned_data = re.sub(r',\s*', ', ', cleaned_data)
    
    # Remove unnecessary escape characters
    cleaned_data = re.sub(r'\\', '', cleaned_data)
    
    return cleaned_data

In [235]:
def read_resumes(folder_path):
    # List all PDF files in the specified folder
    files = [f for f in os.listdir(folder_path) if f.endswith('.pdf')]
    
    # Prepare a list to hold data
    data = []
    
    # Process each file
    for file_name in files:
        file_path = os.path.join(folder_path, file_name)
        resume_text = input_pdf_setup(file_path)
        data.append({'file_name': file_name, 'resume_text': resume_text})
    
    # Create a DataFrame
    df = pd.DataFrame(data)
    return df

In [236]:
folder_path = '/Users/parthvinm/Desktop/NLP SSM/Final Project/Resume_Job-Align/Resumes'
df_resumes = read_resumes(folder_path)
df_resumes

Unnamed: 0,file_name,resume_text
0,Resume_Jash_09_SDE.pdf,"Jash Shah Baltimore, MD|+1 (667) 600-9472..."
1,Parthvi_Mehta_Resume.pdf,PARTHVI MEHTA \npmehta14@jh.edu | +1 (667) -...


In [237]:
prompt_structure = """Objective:  Parse a text-formatted resume efficiently and extract diverse applicant's data into a structured JSON format that adheres to the provided TypeScript interface schema.

Input: Text-formatted applicant's resume.

Steps:
1. Analyze Structure: Examine the text-formatted resume to understand its organization and identify key sections (e.g., education, experience, skills).
2. Convert to JSON: Map the extracted information to the corresponding fields in the schema, creating a structured JSON representation.
3. Optimize Output: Ensure the JSON is well-formatted, error-free, and handles missing values appropriately.
4. Handle Variations: Account for different resume styles and formatting to accurately extract relevant data.

Consider following TypeScript Interface for JSON schema:
```
interface Media {
  linkedin: string;
  github: string;
  devpost: string;
  medium: string;
  leetcode: string;
  dagshub: string;
  kaggle: string;
  instagram: string;
}

interface Education {
  degree: string;
  university: string;
  from: string;
  to: string;
  grade?: string;
  coursework?: string[];
}

interface SkillSection {
  name: string;
  skills: string[];
}

interface Work {
  role: string;
  company: string;
  from: string;
  to: string;
  description: string[];
}

interface Project {
  name: string;
  type: string;
  link?: string;
  from: string;
  to: string;
  description: string[];
}

interface Certification {
  name: string;
  by: string;
  link: string;
}

interface Achievements {
  [index: number]: string;
}

interface RESUME_DATA_SCHEMA {
  name: string;
  summary: string;
  phone: string;
  email: string;
  media: Media;
  education: Education[];
  skill_section: SkillSection[];
  work_experience: Work[];
  projects: Project[];
  certifications: Certification[];
  achievements: Achievements;
}
```

Desired Output: Write the Well-formatted JSON adhering to the RESUME_DATA_SCHEMA schema, handling missing values with empty strings or "None".
<JSON_OUTPUT_ACCORDING_TO_RESUME_DATA_SCHEMA>

The results should contain valid JSON only, without any delimiter or characters making invalid JSON format."""

In [238]:
prompt_job_description = """I will provide a job and company description for your analysis. In your analysis you have to identify the key words, expertise & requirements the job demands, but also from the company description.

Your task is consider it to meticulously assess the information and furnish the details like Job title, Keywords(key words are expertise & requirements the job demands), Job purpose, Job duties and responsibilities, Required qualifications, Preferred qualifications, Company name and Company details(points which include overview, mission, values or way of working) in bulleted points. 

Provide them only in JSON format - which is easily parse by any json parser - with following keys:
title, keywords, purpose, duties_responsibilities, required_qualifications, preferred_qualifications, company_name, company_info.

Output:
{
    "title": "RESULT", 
    "keywords": "RESULT", 
    purpose: "RESULT", 
    duties_responsibilities: "RESULT", 
    required_qualifications: "RESULT", 
    preferred_qualifications: "RESULT", 
    company_name: "RESULT",
    company_info: "RESULT",
}"""

In [239]:
model = genai.GenerativeModel('gemini-pro')

def generate_response(row, resume = True):
    if resume:
        response = model.generate_content([prompt_structure, row['resume_text']])
    else:
        response = model.generate_content([prompt_job_description, row['job_description']])
    return clean_text(response.text)

In [240]:
df_resumes['resume_json'] = df_resumes.apply(generate_response, axis=1)
df_resumes

Unnamed: 0,file_name,resume_text,resume_json
0,Resume_Jash_09_SDE.pdf,"Jash Shah Baltimore, MD|+1 (667) 600-9472...","{\n ""name"": ""Jash Shah"",\n ""summary"": ""None""..."
1,Parthvi_Mehta_Resume.pdf,PARTHVI MEHTA \npmehta14@jh.edu | +1 (667) -...,"{\n ""name"": ""PARTHVI MEHTA"",\n ""summary"": nu..."


In [241]:
input_text1 = """
About the job
Data Analyst Internship -Data Science & Gen-AI Team

At Peerlogic, we're pioneering the integration of artificial intelligence to revolutionize communication within the healthcare community. Guided by our core values of Curiosity, Accountability, Grit, and Enthusiasm (CAGE), we strive for excellence in innovating AI-driven solutions that improve, streamline, and transform professional interactions in medical settings. We're on the hunt for a Data Analysis II Intern to join our Data Science and Gen-AI team, a role critical to our mission of shaping the future of healthcare communication.

Location: Scottsdale, Arizona

Duration: 6 months, with potential for extension or transition to a full-time position

Role Overview

As a Data Analysis Intern, you will dive deep into the intricacies of AI and data science, contributing directly to projects at the cutting edge of technology and healthcare. Reporting to Dr. Amir Yazdavar, PhD, Principal Data Scientist, your work will support the development of AI solutions that embody our CAGE values, pushing the boundaries of what's possible in healthcare communication.

Key Responsibilitiaes

    Conduct advanced data analysis, utilizing statistical software and machine learning algorithms to extract insights from complex healthcare datasets.
    Collaborate with our AI development team to refine and enhance AI models, ensuring they meet the high standards required for healthcare applications.
    Engage in the entire lifecycle of AI solution development, from initial data collection and analysis to implementation and feedback collection.
    Develop and present clear, actionable insights to team members and stakeholders, aiding in decision-making and strategy development.
    Contribute to the creation of a robust knowledge base by documenting findings, methodologies, and best practices.

Qualifications

    Pursuing or recently completed a degree in Data Science, Computer Science, Statistics, Mathematics, or related field, with a strong academic record.
    Demonstrated experience with data analysis and statistical tools (e.g., Python, R), and a keen interest in AI and machine learning.
    A proactive learner with a knack for problem-solving and a passion for healthcare innovation.
    Exceptional communication skills, with the ability to convey complex ideas effectively to a diverse audience.
    Alignment with Peerlogic’s core values of Curiosity, Accountability, Grit, and Enthusiasm, and a strong desire to make a positive impact in healthcare.

What You'll Gain:

    A competitive stipend and flexible scheduling to accommodate your educational commitments.
    Hands-on experience in a fast-paced startup environment, working on groundbreaking projects at the intersection of AI and healthcare.
    Direct mentorship from Dr. Amir Yazdavar and the opportunity to learn from a team of experienced professionals.
    Access to a network of industry professionals and potential for future employment opportunities within Peerlogic.

Interested candidates should apply with a resume, cover letter detailing their interest and fit for the role, and any relevant project samples or portfolios.

Peerlogic is an equal-opportunity employer dedicated to building a diverse and inclusive team. We encourage applications from all qualified individuals who share our vision for improving healthcare communication. """

In [242]:
input_text2 = """ Machine Learning Engineering Intern- Visual Gen AI
Pyxer Inc. · Santa Clara, CA ·

About the job
We are pyxer, a visual gen AI startup in the apparel B2C space, on a mission to revolutionize clothes shopping. We are building an AI personal shopper that helps people find their perfect looks in both the digital and real world, by styling and shopping through their personal photos.


We are seeking an exceptional machine learning engineering intern who is creative, driven, and can get things done. You will have the unique opportunity to have your fingerprints on the next transformational consumer tech company, creating products that have the potential to be used by every person on the planet. 



Our product is already being actively used in 55 countries. You will get to watch your work come to life quickly as it will be frequently deployed into production. You will gain a deep sense of achievement, while acquiring critical insights and skills in generative AI for photos and videos.


We currently have two open positions available for this role.


In your application, please write a short cover letter on why you believe in our idea and mission. We consider each intern as a pyxer brand ambassador, so they must strongly believe in the idea and mission.


This role is ideal for you if:

You are passionate about our mission and are looking for a long-term employment opportunity beyond the internship (e.g. part-time role, full-time founding engineer).
You prefer collaborating in-person with a team, rather than remotely. This internship will take place entirely in-person at our office in the Santa Clara, CA area.


Key Responsibilities:

Train and fine-tune visual gen AI models to be deployed in real world, production settings.
Work with large-scale datasets and apply data preprocessing and augmentation methods.
Evaluate model performance through quantitative metrics and qualitative assessment.
Refine and optimize our pipelines to increase efficiency.
Collaborate with cross-functional teams to integrate AI solutions into broader systems.
Write clean, well-documented, and maintainable code adhering to best practices.


Qualifications:

Strong understanding of the fundamentals of deep learning.
Excellent Python programming and debugging skills, with a strong grasp of its essential data structures and algorithms.
Understanding of shell scripting and working with Ubuntu or other Linux distributions.
Proficiency in PyTorch.


Preferred Skills:

Prior experience or projects related to image processing, video processing, or computer vision.


Duration:

3 months, starting from June.


Compensation:

$5000 per month"""

In [243]:
df_jobs = [('Job Description 1',input_text1), ('Job Description 2',input_text2)]
df_jobs = pd.DataFrame(df_jobs, columns=['title','job_description'])
df_jobs

Unnamed: 0,title,job_description
0,Job Description 1,\nAbout the job\nData Analyst Internship -Data...
1,Job Description 2,Machine Learning Engineering Intern- Visual G...


In [244]:
df_jobs['job_json'] = df_jobs.apply(generate_response, resume = False, axis=1)
df_jobs

Unnamed: 0,title,job_description,job_json
0,Job Description 1,\nAbout the job\nData Analyst Internship -Data...,"{\n ""title"": ""Data Analysis II Intern"",\n ""k..."
1,Job Description 2,Machine Learning Engineering Intern- Visual G...,"{\n ""title"": ""Machine Learning Engineering ..."


In [245]:
work_exp_prompt = """You are going to write a JSON resume section of "Work Experience" for an applicant applying for job posts.

Step to follow:
1. Analyze my Work details to match job requirements.
2. Create a JSON resume section that highlights strongest matches
3. Optimize JSON section for clarity and relevance to the job description.
4. Do not make up any information, only use the provided work experience.

Instructions:
1. Focus: Craft three highly relevant work experiences aligned with the job description.
2. Content:
  2.1. Bullet points: 3 per experience, closely mirroring job requirements.
  2.2. Impact: Quantify each bullet point for measurable results.
  2.3. Storytelling: Utilize STAR methodology (Situation, Task, Action, Result) implicitly within each bullet point.
  2.4. Action Verbs: Showcase soft skills with strong, active verbs.
  2.5. Honesty: Prioritize truthfulness and objective language.
  2.6. Structure: Each bullet point follows "Did X by doing Y, achieved Z" format.
  2.7. Specificity: Prioritize relevance to the specific job over general achievements.
3. Style:
  3.1. Clarity: Clear expression trumps impressiveness.
  3.2. Voice: Use active voice whenever possible.
  3.3. Proofreading: Ensure impeccable spelling and grammar.

Consider following Work Details in JSON format and look for the description in each and carefully align it.

Consider following Job description delimited by <JOB_DETAIL></JOB_DETAIL> tag.
<JOB_DETAIL>
<JOB_DESCRIPTION>
</JOB_DETAIL>

Desired Output:
Provide JSON object as output like following:
{
  "work_experience": [
    {
      "role": "Software Engineer",
      "company": "Winjit Technologies",
      "location": "Pune, India"
      "from": "Jan 2020",
      "to": "Jun 2022",
      "description": [
        "Engineered 10+ RESTful APIs Architecture and Distributed services; Designed 30+ low-latency responsive UI/UX application features with high-quality web architecture; Managed and optimized large-scale Databases. (Systems Design)",  
        "Initiated and Designed a standardized solution for dynamic forms generation, with customizable CSS capabilities feature, which reduces development time by 8x; Led and collaborated with a 12 member cross-functional team. (Idea Generation)"  
        and so on ...
      ]
    },
    {
      "role": "Research Intern",
      "company": "IMATMI, Robbinsville",
      "location": "New Jersey (Remote)"
      "from": "Mar 2019",
      "to": "Aug 2019",
      "description": [
        "Conducted research and developed a range of ML and statistical models to design analytical tools and streamline HR processes, optimizing talent management systems for increased efficiency.",
        "Created 'goals and action plan generation' tool for employees, considering their weaknesses to facilitate professional growth.",
        and so on ...
      ]
    }
  ],
}
"""

In [246]:
prompt_project = """You are going to write a JSON resume section of "projects" for an applicant applying for job posts.

Step to follow:
1. Analyze my project details to match job requirements.
2. Create a JSON resume section that highlights strongest matches
3. Optimize JSON section for clarity and relevance to the job description.
4. Do not make up any information, only use the provided project details.
5. Keep the order of projects as the most relevant to the job description to the least relevant.


Instructions:
1. Focus: Craft three highly relevant project experiences aligned with the job description.
2. Content:
  2.1. Bullet points: 3 per experience, closely mirroring job requirements.
  2.2. Impact: Quantify each bullet point for measurable results.
  2.3. Storytelling: Utilize STAR methodology (Situation, Task, Action, Result) implicitly within each bullet point.
  2.4. Action Verbs: Showcase soft skills with strong, active verbs.
  2.5. Honesty: Prioritize truthfulness and objective language.
  2.6. Structure: Each bullet point follows "Did X by doing Y, achieved Z" format.
  2.7. Specificity: Prioritize relevance to the specific job over general achievements.
3. Style:
  3.1. Clarity: Clear expression trumps impressiveness.
  3.2. Voice: Use active voice whenever possible.
  3.3. Proofreading: Ensure impeccable spelling and grammar.

Consider following Project Details in JSON format.

Consider following Job description delimited by <JOB_DETAIL></JOB_DETAIL> tag.
<JOB_DETAIL>
<JOB_DESCRIPTION>
</JOB_DETAIL>

Desired Output:
Provide JSON object as output like following:
"projects": [
    {
      "name": "Search Engine for All file types - Sunhack Hackathon - Meta & Amazon Sponsored",
      "type": "Hackathon",
      "link": "https://devpost.com/software/team-soul-1fjgwo",
      "from": "Nov 2023",
      "to": "Nov 2023",
      "description": [
        "1st runner up prize in crafted AI persona, to explore LLM's subtle contextual understanding and create innovative collaborations between humans and machines.",
        "Devised a TabNet Classifier Model having 98.7% accuracy in detecting forest fire through IoT sensor data, deployed on AWS and edge devices 'Silvanet Wildfire Sensors' using technologies TinyML, Docker, Redis, and celery.",
        and So on ...
      ]
    }
    and So on ...
  ]"""

In [269]:
def generate_resume(resume_json, job_description):
    resume_json_object = json.loads(resume_json)
    work_experience = json.dumps(resume_json_object['work_experience'])
    projects = json.dumps(resume_json_object['projects'])
    work_exp = model.generate_content([work_exp_prompt, job_description, work_experience],
                                      generation_config={
                                               "temperature": 0.36,
                                                "max_output_tokens": 4000,
                                                "top_p": 0.95})
    # print(work_exp.text)
    work_exp_object = json.loads(clean_json(work_exp.text))
    projects = model.generate_content([prompt_project, job_description, projects],
                                      generation_config={
                                               "temperature": 0.35,
                                                "max_output_tokens": 4000,
                                                "top_p": 0.95})
    # print(projects.text)
    projects_object = json.loads(clean_json(projects.text))
    resume_json_object['work_experience'] = json.dumps(work_exp_object['work_experience'])
    resume_json_object['projects'] = json.dumps(projects_object['projects'])
    # print('Work Ex: ', resume_json_object['work_experience'])
    # print('Projects:', resume_json_object['projects'] )
    return json.dumps(resume_json_object)

In [270]:
for job_index, job in df_jobs.iterrows():
    job_title = job['title']
    job_description = job['job_json']

    # Initialize an empty list to store the responses for all resumes for this job title
    responses = []

    # Iterate through each resume
    for resume_index, resume in df_resumes.iterrows():
        resume_json = resume['resume_json']
        response = generate_resume(resume_json, job_description)
        responses.append(clean_text(response))

    # Assign the list of responses to the corresponding new column in df_resumes
    df_resumes[job_title] = responses


{
  "work_experience": [
    {
      "role": "Lead Course Assistant \u2013 Gateway Computing: Python",
      "company": "The Johns Hopkins University",
      "from": "August 2023",
      "to": "Present",
      "description": [
        "Mentoring, supporting, and helping students with coursework in the classroom and conducting quizzes and project discussion classes.",
        "Reviewing, editing, and grading student assessments and modules and helping with teaching logistics."
      ]
    },
    {
      "role": "Software Engineer - I",
      "company": "Forcepoint, Mumbai, India",
      "from": "July 2022",
      "to": "June 2023",
      "description": [
        "Implemented a robust Certificate Revocation Check feature in Python, bolstering Cloud Web Proxy security. Effectively detected and blocked expired or invalid certificates for accessed websites, substantially enhancing system integrity.",
        "Improved the code coverage by 12% by developing test cases for various components 

In [271]:
df_resumes['Job Description 1'][0]

'{"name": "Jash Shah", "summary": "None", "phone": "+1 (667) 600-9472", "email": "jshah48@jh.edu", "media": {"linkedin": "www.linkedin.com/in/jashshah09", "github": "github.com/jash09", "devpost": "None", "medium": "None", "leetcode": "None", "dagshub": "None", "kaggle": "None", "instagram": "None"}, "education": [{"degree": "Master of Science in Engineering in Computer Science", "university": "Johns Hopkins University, USA", "from": "August 2023", "to": "May 2025", "grade": "None", "coursework": ["Algorithms", "Machine Translation", "Natural Language Processing"]}, {"degree": "Bachelor of Technology in Information Technology", "university": "University of Mumbai, India", "from": "August 2018", "to": "July 2022", "grade": "8.81/10", "coursework": ["Data Structures and Algorithms", "Operating Systems", "Cloud Computing", "Object Oriented Programming", "DevOps"]}], "skill_section": [{"name": "Programming languages", "skills": ["Python", "C++", "C", "Java", "JavaScript", "HTML", "CSS", "P

In [272]:
df_resumes.to_csv('new_resumes.csv', index=False)