# Import Libraries and Set environment

In [None]:
import os
import pandas as pd
import openai
from util import extract_text_from_resume
import json
import re
import textwrap
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

from dotenv import load_dotenv, find_dotenv

# Load environment variables from .env file
_=load_dotenv(find_dotenv())

# Set OpenAI API key
openai.api_key = os.environ['OPENAI_API_KEY']




In [None]:
# Directory containing the resumes
resume_dir = "/Users/yunjaewon/ChatGPT/resumes/"
resume_list=os.listdir(resume_dir)
print(resume_list)


In [None]:
resume_text = extract_text_from_resume(resume_dir+'jaeDE.pdf')
#print(resume_text)

In [None]:
chat = ChatOpenAI(temperature=0.0)
chat

# Simple Extraction Trial (useless)

In [None]:
template_string="""Extract skills from a resume \
that is delimited by triple backticks \
into a style that is {style}. \
text: ```{text}```
"""

In [None]:
from langchain.prompts import ChatPromptTemplate

prompt_template = ChatPromptTemplate.from_template(template_string)
print(prompt_template.messages[0].prompt)
print(prompt_template.messages[0].prompt.input_variables)

In [None]:
result_style="""Python List"""
customer_messages = prompt_template.format_messages(
                    style=result_style,
                    text=resume_text)

In [None]:
print(type(customer_messages))
print(type(customer_messages[0]))

In [None]:
# Call the LLM to translate to the style of the customer message
result = chat(customer_messages)



In [None]:
wrapper = textwrap.TextWrapper(width=100)  # change 50 to any number that suits your needs
word_list = wrapper.wrap(text=result.content)

for line in word_list:
    print(line)

# Output Parsers

Let's start with defining how we would like the LLM output to look like:

In [None]:
from langchain.output_parsers import ResponseSchema
from langchain.output_parsers import StructuredOutputParser
from langchain.prompts import ChatPromptTemplate

## Trial 1: Extracting everythin at once
Problem: only extract first project from Branchy solution

In [None]:
contact_schema=ResponseSchema(name="contact",
description="Extracts personal contact information in JSON format. \
    The key values include 'name', 'email', 'phone number', and personal websites such as 'LinkedIn' or 'Github profiles'.")

education_schema=ResponseSchema(name="education",
description="Extracts information about educational background or degrees in JSON format.\
      The key values include 'institutions', 'degree types', 'majors', and 'graduation dates'.")

experience_schema=ResponseSchema(name="experience",
description="Extracts work experience details in JSON format. \
    Key values include 'job title', 'employer name', 'employment duration', and 'job description/responsibilities'.")

skills_schema=ResponseSchema(name="skills",
description="Extracts details about professional and technical skills in JSON format. \
    The key values is 'skills' and the values are listed in a Python list, delimited by commas.")

response_schemas=[contact_schema, education_schema, experience_schema, skills_schema]

In [None]:
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
format_instructions = output_parser.get_format_instructions()

In [None]:
print(format_instructions)


In [None]:
# Problem: only extract first project from Branchy solution
output_template="""\
For the following text, extract the following information:

contact:Extracts personal contact information in JSON format. \
    The key values include 'name', 'email', 'phone number', and personal websites such as 'LinkedIn' or 'Github profiles'.

education:Extracts information about educational background or degrees in JSON format.\
      The key values include 'institutions', 'degree types', 'majors', and 'graduation dates'.

experience:Extracts work experience details in JSON format. \
    Key values include 'job title', 'employer name', 'employment duration', and 'job description/responsibilities'.

skills:Extracts details about professional and technical skills in JSON format. \
    The key values is 'skills' and the values are listed in a Python list, delimited by commas.

text: {text}

{format_instructions} """

In [None]:
prompt = ChatPromptTemplate.from_template(template=output_template)

messages = prompt.format_messages(text=resume_text, 
                                format_instructions=format_instructions)

In [None]:
#print(messages[0].content)

In [None]:
response = chat(messages)

In [None]:
#print(response.content)

In [None]:
output_dict = output_parser.parse(response.content)
output_dict

In [None]:
type(output_dict)

In [None]:
output_dict.get('experience')

## Trial 2: Separate extracting experiences
Pretty Successful

In [None]:
contact_schema=ResponseSchema(name="contact",
description="Extracts personal contact information in JSON format. \
    The keys are 'name', 'email', and 'phone number'. \
    The corresponding values for these keys should be in the form of Python lists, delimited by commas.")

education_schema=ResponseSchema(name="education",
description="Extracts information about educational background or degrees in JSON format.\
            The keys are 'institutions', 'degree types', 'majors', and 'graduation dates'.\
            The corresponding values for these keys should be in the form of Python lists, delimited by commas.")

skills_schema=ResponseSchema(name="skills",
description="Extracts details about professional and technical skills in JSON format. \
    The key is 'skills' and the values are listed in a Python list, delimited by commas.")

response_schemas1=[contact_schema, education_schema, skills_schema]

work_experience_schema=ResponseSchema(name="work experiences",
description="Extracts work experience details and technical skills used in each work experience in JSON format. \
    the keys are 'job title', 'employer', 'employment duration', 'job description' and 'technical skills'.\
        The corresponding values for these keys should be in the form of Python lists, delimited by commas.")

project_schema=ResponseSchema(name="projects",
description="Extracts any project detail and technical skills used in each project in JSON format. \
    the keys are 'project name', 'project detail' and 'technical skills'.\
        The corresponding values for these keys should be in the form of Python lists, delimited by commas.")

response_schemas2=[work_experience_schema,project_schema]

In [None]:
output_parser1 = StructuredOutputParser.from_response_schemas(response_schemas1)
format_instructions1 = output_parser1.get_format_instructions()

output_parser2 = StructuredOutputParser.from_response_schemas(response_schemas2)
format_instructions2 = output_parser2.get_format_instructions()

In [None]:
#print(format_instructions1)

In [None]:
#print(format_instructions2)

In [None]:
output_template1="""\
For the following text, extract the following information:

contact: Extracts personal contact information in JSON format. \
    The keys are 'name', 'email', and 'phone number'. \
    The corresponding values for these keys should be in the form of Python lists, delimited by commas.

education: Extracts information about educational background or degrees in JSON format.\
            The keys are 'institutions', 'degree types', 'majors', and 'graduation dates'.\
            The corresponding values for these keys should be in the form of Python lists, delimited by commas.
            
skills: Extracts details about professional and technical skills in JSON format. \
    The key values is 'skills' and the values are listed in a Python list, delimited by commas.

text: {text}

{format_instructions1} """

output_template2="""\
For the following text, extract the following information:

work experiences: Extracts work experience details and technical skills used in each work experience in JSON format. \
    the keys are 'job title', 'employer', 'employment duration', 'job description' and 'technical skills'.\
        The corresponding values for these keys should be in the form of Python lists, delimited by commas.

projects: Extracts any project detail and technical skills used in each project in JSON format. \
    the keys are 'project name', 'project detail' and 'technical skills'.\
        The corresponding values for these keys should be in the form of Python lists, delimited by commas.
        
text: {text}

{format_instructions2} """

In [None]:
prompt1 = ChatPromptTemplate.from_template(template=output_template1)

messages1 = prompt1.format_messages(text=resume_text, 
                                format_instructions1=format_instructions1)
prompt2 = ChatPromptTemplate.from_template(template=output_template2)

messages2 = prompt2.format_messages(text=resume_text, 
                                format_instructions2=format_instructions2)

In [None]:
#print(messages1[0].content)
#print(messages2[0].content)


In [None]:
response1 = chat(messages1)


In [None]:
response2=chat(messages2)

In [None]:
#print(response1.content)

In [None]:
#print(response2.content)

In [None]:
output_dict1 = output_parser1.parse(response1.content)
output_dict2 = output_parser2.parse(response2.content)


In [None]:
output_dict2.get('work experiences')


# Calculate the matching ratio

In [None]:
job_requirement={"Minimum qualification":
                    {"Degree":"Bachelor",
                     "Major":["Physics", "Computer Science", "Electrical Engineering", "Mathmetics", "Machine Learning", "Statistics"],
                     "Skills":["Statistical Analysis"],
                     "Years of Experience":"3 years of experience"},
                "Preferred qualification":
                    {"Degree":"Phd",
                     "Major":["Computer Science"],
                     "Skills":["AWS","Time Series Analysis","Natural Language Processing"],
                     "Years of Experience":"5 years of experience"}
                    }

In [None]:
resume_info=output_dict1 | output_dict2
resume_info

## Using ChatGPT for comparision : Fail

In [None]:
#Trial 1
MinQualification_schema=ResponseSchema(name="Matching minimum qualification",
description="Compare items of 'Minimum qualification' from python dictionary: {job_requirement} and {resume_info}.\
            Then, extract only matching items in JSON format, \
            where the key is 'Matching minimum qualification'.")

PrefQualification_schema=ResponseSchema(name="Matching preferred qualification",
description="Compare items of 'Preferred qualification' from python dictionary: {job_requirement} and qualifications from {resume_info}. \
            Then, extract only matching items in JSON format, \
            where the key is 'Matching preferred qualification'.")


In [None]:
#Trial 2
MinQualification_schema=ResponseSchema(name="Matching minimum qualification",
description="Compare the 'Minimum qualification' elements from the Python dictionaries: {job_requirement} and {resume_info}.\
      Extract and present the common items in a JSON structure, where the corresponding key is 'Matching minimum qualification'.")

PrefQualification_schema=ResponseSchema(name="Matching preferred qualification",
description="Compare the 'Preferred qualification' elements from the Python dictionaries: {job_requirement} and {resume_info}.\
      Extract and present the common items in a JSON structure, where the corresponding key is 'Matching preferred qualification'.")


In [None]:
#Trial 3
MinQualification_schema=ResponseSchema(name="Matching minimum qualification",
description="Compare the 'Minimum qualification' elements from the Python dictionaries: {job_requirement} and {resume_info}.\
      Extract the common items in a JSON structure, where the corresponding key is 'Matching minimum qualification'.")

PrefQualification_schema=ResponseSchema(name="Matching preferred qualification",
description="Compare the 'Preferred qualification' elements from the Python dictionaries: {job_requirement} and {resume_info}.\
      Extract the common items in a JSON structure, where the corresponding key is 'Matching preferred qualification'.")


In [None]:
response_schemas=[MinQualification_schema, PrefQualification_schema]
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
format_instructions = output_parser.get_format_instructions()
print(format_instructions)

In [None]:
rubric_template="""
    For the following resume information and job requirement python dictionary, extract the following information:
    
    Matching minimum qualification: Compare the 'Minimum qualification' elements from the Python dictionaries: {job_requirement} and {resume_info}.\
      Extract the common items in a JSON structure, where the corresponding key is 'Matching minimum qualification'.
    
    Matching preferred qualification: Compare the 'Preferred qualification' elements from the Python dictionaries: {job_requirement} and {resume_info}.\
      Extract the common items in a JSON structure, where the corresponding key is 'Matching preferred qualification'.

    
    {format_instructions}
"""

In [None]:
prompt = ChatPromptTemplate.from_template(template=rubric_template)


In [None]:
messages = prompt.format_messages(resume_info=resume_info,
                                  job_requirement=job_requirement, 
                                format_instructions=format_instructions)

In [None]:
messages

In [None]:
chat = ChatOpenAI(temperature=0.0)



In [None]:
response=chat(messages)


In [None]:
print(response.content) ## Wrong output

In [None]:
output_dict = output_parser.parse(response.content)

In [None]:
output_dict

### Use langchain Agents for comparison : Fail

In [None]:
from langchain.agents.agent_toolkits import create_python_agent
from langchain.agents import load_tools, initialize_agent
from langchain.agents import AgentType
from langchain.tools.python.tool import PythonREPLTool
from langchain.python import PythonREPL


In [None]:
llm = ChatOpenAI(temperature=0)
agent = create_python_agent(
    llm,
    tool=PythonREPLTool(),
    verbose=True)

In [None]:
## Wrong!
agent.run(f"""Calculate the matching ratio between 'Matching minimum qualification' from {output_dict} and 'Minimum qualification' from {job_requirement}""") 

# Use Langchain to create Knowledge  (Because of format instructions gpt misses some info!!!!)

In [1]:
import os
import pandas as pd
import openai
from util import extract_text_from_resume
import json
import re
import textwrap
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.output_parsers import ResponseSchema
from langchain.output_parsers import StructuredOutputParser



def wraptext(text, width=100):
    wrapper = textwrap.TextWrapper(width=width)
    word_list = wrapper.wrap(text=text)

    for element in word_list:
        print(element)


from dotenv import load_dotenv, find_dotenv

# Load environment variables from .env file
_=load_dotenv(find_dotenv())

# Set OpenAI API key
openai.api_key = os.environ['OPENAI_API_KEY']
# Directory containing the resumes
resume_dir = "/Users/yunjaewon/ChatGPT/resumes/"
resume_list=os.listdir(resume_dir)
print(resume_list)

resume_text = extract_text_from_resume(resume_dir+'jaeDE.pdf')
#print(resume_text)

chat = ChatOpenAI(temperature=0.0)


['alan.pdf', '.DS_Store', 'jaewon.docx', 'oldresume.docx', 'jaeDE.pdf', 'lorraine.pdf', 'jaewon.pdf', 'mlscientist.pdf']


In [3]:
print(resume_text)

JAEWON YUN

Data Engineer

WW jaewon.yun@mail.utoronto.ca , 4387285493 @ Montreal, QC

Professional Experience

Data Engineer, Branchy Solution 2
01/2022 - present | Toronto, Canada
End-to-End AWS Cloud Migration:

Designed and executed robust ETL pipelines using AWS Glue Studio, Lambda, and
$3, leading to improved data volume handling and enhancing overall data
infrastructure scalability.

Utilized Databricks with PySpark for efficient data processing, streamlining data
transformation and load processes.

Constructed robust data modeling strategies within AWS Redshift and RDS,
bolstering the maintainability and scalability of data warehouse systems.

Managed IAM roles and security groups, strengthening data protection and ensuring
consistent system reliability.

Large Language Model-Powered Resume Rater Engine:

Engineered a Resume Rater Engine using OpenAl's large language model for
precise information extraction and candidate profiling, driving a 30% surge in
successful matches.

Em

In [4]:
name_schema=ResponseSchema(name="name",
description="Extracts name in JSON format. \
    The keys must be 'name'. Each corresponding value should be represented as a Python string.\
    If the name cannot be found, the key should still be included in the JSON object, but its corresponding value should be null.")

email_schema=ResponseSchema(name="email",
description="Extracts email in JSON format. \
    The keys must be 'email'. Each corresponding value should be represented as a Python string.\
    If the email cannot be found, the key should still be included in the JSON object, but its corresponding value should be null.")


phone_number_schema=ResponseSchema(name="phone number",
description="Extracts phone number in JSON format. \
    The keys must be 'phone number'.Each corresponding value should be represented as a Python string. \
    If the phone number cannot be found, the key should still be included in the JSON object, but its corresponding value should be null.")


education_schema=ResponseSchema(name="education",
description="Extract information about the individual's educational background in JSON format.\
      Each educational experience should be represented as a separate JSON object. \
    For each education instance, the keys must be 'institution', 'degree_type', 'major', and 'graduation_date'. \
    Each corresponding value should be represented as a Python string.\
    If any information cannot be found for a given key, ensure the key is still included in the JSON object, but assign its corresponding value as null.")

skills_schema=ResponseSchema(name="skills",
description="Extracts details about professional and technical skills in JSON format. \
    The key must be 'skills'.\
        Each corresponding value should be represented as a Python list, with individual items separated by commas. \
    If certain information cannot be found, the corresponding key should still be included in the JSON object, but its value should be an empty Python list.")

response_schemas1=[name_schema, email_schema, phone_number_schema, education_schema, skills_schema]

work_experience_schema=ResponseSchema(name="work experiences",
description="Follow steps below to extract work experiences: \
1. Begin by extracting details about each distinct job role from the work experience section.\
2. For every distinct job role, even if it is within the same company, create a separate JSON object.\
3. Each JSON object must include the following keys: 'job_title', 'employer', and 'employment_duration'.\
4. For the keys 'job_title', 'employer', and 'employment_duration', represent the corresponding values as Python strings.\
      For instance, the 'job_title' for a specific role might look like: 'Software Engineer'.\
5. If there is any key for which you cannot find the corresponding information, ensure that this key is still included in the JSON object. \
    If no details are found for the keys, assign their value as an empty Python string, for instance 'job_title': ''.\
6. Repeat these steps for each distinct job role identified in the work experience section.")

response_schemas2=[work_experience_schema]

project_schema=ResponseSchema(name="projects",
description="Follow the steps below to extract project details:\
1. Start by identifying and extracting details for each distinct project, the employer, the job title, and the technical skills utilized in each project.\
2. For each distinct project, create a separate JSON object.\
3. The JSON object for each project must include the following keys: 'project_name', 'employer', 'job_title', and 'technical_skills'.\
4. Represent the corresponding values for each key as Python lists or strings. \
Each individual item within the 'technical_skills' list should be separated by commas.\
For instance, a list of technical skills for a specific project might appear as: ['JavaScript', 'React', 'Firebase'].\
5. If you cannot find the information corresponding to any of the keys, ensure that this key is still included in the JSON object.\
However, in such cases, assign its value as an empty Python list or empty string.\
For example, if no technical skills are associated with a particular project, you should include: 'technical_skills': [] in the JSON object.\
Similarly, if no employer or job title is found related to the project, you should include: 'employer': '', 'job_title': '' in the JSON object.\
6. Repeat these steps for each distinct project identified.")

response_schemas3=[project_schema]


output_parser1 = StructuredOutputParser.from_response_schemas(response_schemas1)
format_instructions1 = output_parser1.get_format_instructions()

output_parser2 = StructuredOutputParser.from_response_schemas(response_schemas2)
format_instructions2 = output_parser2.get_format_instructions()

output_parser3 = StructuredOutputParser.from_response_schemas(response_schemas3)
format_instructions3 = output_parser3.get_format_instructions()

In [5]:
output_template1="""\
For the following text, extract the following information:

Extracts name in JSON format. \
    The keys must be 'name'. Each corresponding value should be represented as a Python string.\
    If the name cannot be found, the key should still be included in the JSON object, but its corresponding value should be null.

Extracts email in JSON format. \
    The keys must be 'email'. Each corresponding value should be represented as a Python string.\
    If the email cannot be found, the key should still be included in the JSON object, but its corresponding value should be null.
            
Extracts phone number in JSON format. \
    The keys must be 'phone number'.Each corresponding value should be represented as a Python string. \
    If the phone number cannot be found, the key should still be included in the JSON object, but its corresponding value should be null.

Extract information about the individual's educational background in JSON format.\
      Each educational experience should be represented as a separate JSON object. \
    For each education instance, the keys must be 'institution', 'degree_type', 'major', and 'graduation_date'. \
    Each corresponding value should be represented as a Python string.\
    If any information cannot be found for a given key, ensure the key is still included in the JSON object, but assign its corresponding value as null.

Extracts details about professional and technical skills in JSON format. \
    The key must be 'skills'.\
    Each corresponding value should be represented as a Python list, with individual items separated by commas. \
    If certain information cannot be found, the corresponding key should still be included in the JSON object, but its value should be an empty Python list.

text: {text}

{format_instructions1} """

output_template2="""\
For the following text, extract the following information:

Follow steps below to extract work experiences: 
1. Begin by extracting details about each distinct job role from the work experience section.\
2. For every distinct job role, even if it is within the same company, create a separate JSON object.\
3. Each JSON object must include the following keys: 'job_title', 'employer', and 'employment_duration'.\
4. For the keys 'job_title', 'employer', and 'employment_duration', represent the corresponding values as Python strings.\
      For instance, the 'job_title' for a specific role might look like: 'Software Engineer'.\
5. If there is any key for which you cannot find the corresponding information, ensure that this key is still included in the JSON object. \
    If no details are found for the keys, assign their value as an empty Python string, for instance 'job_title': ''.\
6. Repeat these steps for each distinct job role identified in the work experience section.
        
text: {text}

{format_instructions2} """

output_template3="""\
For the following text, extract the following information:

Follow the steps below to extract project details:
1. Start by identifying and extracting details for each distinct project, the employer, the job title, and the technical skills utilized in each project.
2. For each distinct project, create a separate JSON object.
3. The JSON object for each project must include the following keys: 'project_name', 'employer', 'job_title', and 'technical_skills'.
4. Represent the corresponding values for each key as Python lists or strings. \
Each individual item within the 'technical_skills' list should be separated by commas.\
For instance, a list of technical skills for a specific project might appear as: ['JavaScript', 'React', 'Firebase'].
5. If you cannot find the information corresponding to any of the keys, ensure that this key is still included in the JSON object.\
However, in such cases, assign its value as an empty Python list or empty string.\
For example, if no technical skills are associated with a particular project, you should include: 'technical_skills': [] in the JSON object.\
Similarly, if no employer or job title is found related to the project, you should include: 'employer': '', 'job_title': '' in the JSON object.
6. Repeat these steps for each distinct project identified.
        
text: {text}

{format_instructions3} """

prompt1 = ChatPromptTemplate.from_template(template=output_template1)

messages1 = prompt1.format_messages(text=resume_text, 
                                format_instructions1=format_instructions1)

prompt2 = ChatPromptTemplate.from_template(template=output_template2)

messages2 = prompt2.format_messages(text=resume_text, 
                                format_instructions2=format_instructions2)

prompt3 = ChatPromptTemplate.from_template(template=output_template3)

messages3 = prompt3.format_messages(text=resume_text, 
                                format_instructions3=format_instructions3)




In [6]:
response1 = chat(messages1)


In [7]:
response2 = chat(messages2)

In [8]:
response3 = chat(messages3)

In [9]:
output_dict1 = output_parser1.parse(response1.content)
output_dict2 = output_parser2.parse(response2.content)
output_dict3 = output_parser3.parse(response3.content)

In [10]:
output_dict1

{'name': 'JAEWON YUN',
 'email': 'jaewon.yun@mail.utoronto.ca',
 'phone number': '4387285493',
 'education': [{'institution': 'University of Toronto',
   'degree_type': 'Physics Specialist (HBSc)',
   'major': None,
   'graduation_date': '2020'}],
 'skills': ['Data Engineering/ETL Tools',
  'Data Modeling',
  'Data Warehousing',
  'PySpark',
  'Cypher Query Language',
  'Docker',
  'Databricks',
  'Airflow',
  'Graph Database (Neo4)j)',
  'Git',
  'Python',
  'SQL/NoSQL Databases',
  'SQL',
  'Amazon Web Services',
  'Redshift',
  'SageMaker',
  'Glue Studio',
  'EC2',
  'RDS',
  'Lambda',
  '$3',
  'Cloud Development Kit',
  'DynamoDB',
  'IAM',
  'Microsoft Azure',
  'Synapse Analytics',
  'Data Lake Storage',
  'Data Factory',
  'SQL Database',
  'Blob Storage',
  'Machine Learning',
  'Natural Language Processing',
  'Recommendation System',
  'Statistical Modeling',
  'Medical Image Processing',
  'Time Series Analysis',
  'TensorFlow/Keras']}

In [11]:

output_dict2

{'work experiences': "Follow steps below to extract work experiences: 1. Begin by extracting details about each distinct job role from the work experience section.2. For every distinct job role, even if it is within the same company, create a separate JSON object.3. Each JSON object must include the following keys: 'job_title', 'employer', and 'employment_duration'.4. For the keys 'job_title', 'employer', and 'employment_duration', represent the corresponding values as Python strings.      For instance, the 'job_title' for a specific role might look like: 'Software Engineer'.5. If there is any key for which you cannot find the corresponding information, ensure that this key is still included in the JSON object.     If no details are found for the keys, assign their value as an empty Python string, for instance 'job_title': ''.6. Repeat these steps for each distinct job role identified in the work experience section."}

In [12]:
output_dict3

{'projects': [{'project_name': 'End-to-End AWS Cloud Migration',
   'employer': 'Branchy Solution 2',
   'job_title': 'Data Engineer',
   'technical_skills': ['AWS Glue Studio',
    'Lambda',
    '$3',
    'Databricks',
    'PySpark',
    'AWS Redshift',
    'RDS']},
  {'project_name': 'Large Language Model-Powered Resume Rater Engine',
   'employer': 'Unknown',
   'job_title': 'Unknown',
   'technical_skills': ['OpenAl',
    'Resume Rater Engine',
    'Apache Airflow',
    'Docker',
    'Neo4j',
    'machine learning',
    'predictive analytics']},
  {'project_name': 'NLP-Powered Job Recommendation Engine with AWS',
   'employer': 'Unknown',
   'job_title': 'Unknown',
   'technical_skills': ['NLP-powered job recommendation system',
    'AWS SageMaker',
    'EC2',
    'APIs',
    'Python',
    '$3',
    'DynamoDB',
    'RDS']},
  {'project_name': 'COVID Hospitalization Data Analysis Utilizing Microsoft Azure',
   'employer': 'Unknown',
   'job_title': 'Unknown',
   'technical_skills': 

In [13]:
resume_info=output_dict1 | output_dict2 | output_dict3
resume_info

{'name': 'JAEWON YUN',
 'email': 'jaewon.yun@mail.utoronto.ca',
 'phone number': '4387285493',
 'education': [{'institution': 'University of Toronto',
   'degree_type': 'Physics Specialist (HBSc)',
   'major': None,
   'graduation_date': '2020'}],
 'skills': ['Data Engineering/ETL Tools',
  'Data Modeling',
  'Data Warehousing',
  'PySpark',
  'Cypher Query Language',
  'Docker',
  'Databricks',
  'Airflow',
  'Graph Database (Neo4)j)',
  'Git',
  'Python',
  'SQL/NoSQL Databases',
  'SQL',
  'Amazon Web Services',
  'Redshift',
  'SageMaker',
  'Glue Studio',
  'EC2',
  'RDS',
  'Lambda',
  '$3',
  'Cloud Development Kit',
  'DynamoDB',
  'IAM',
  'Microsoft Azure',
  'Synapse Analytics',
  'Data Lake Storage',
  'Data Factory',
  'SQL Database',
  'Blob Storage',
  'Machine Learning',
  'Natural Language Processing',
  'Recommendation System',
  'Statistical Modeling',
  'Medical Image Processing',
  'Time Series Analysis',
  'TensorFlow/Keras'],
 'work experiences': "Follow steps be

In [None]:
resume_info.keys()

In [None]:
job_requirement={"Minimum qualification":
                    {"Degree":"Bachelor",
                     "Major":["Physics", "Computer Science", "Electrical Engineering", "Mathmetics", "Machine Learning", "Statistics"],
                     "Skills":["Statistical Analysis"],
                     "Years of Experience":"3 years of experience"},
                "Preferred qualification":
                    {"Degree":"Phd",
                     "Major":["Computer Science"],
                     "Skills":["AWS","Time Series Analysis","Natural Language Processing"],
                     "Years of Experience":"5 years of experience"}
                    }

In [None]:
from py2neo import Graph, Node, Relationship

# Establish connection
graph = Graph("bolt://localhost:7687", auth=("neo4j", "Apple1018!"))  # replace with your details

# Clear the graph for this example
graph.delete_all()



In [None]:
resume=resume_info

In [None]:

def save_to_neo4j(resume):
    try:
        # Create the 'Person' node
        person = Node("Person", name=resume['name'], email=resume['email'], phone_number=resume['phone number'])
        graph.create(person)

        # Create the 'Education' nodes
        for edu in resume['education']:
            institution = Node("Institution", name=edu['institution'])
            graph.merge(institution, "Institution", "name")
            
            # Create 'Degree' node
            degree = Node("DegreeType", name=edu['degree_type'])
            graph.merge(degree, "DegreeType", "name")

            # 'studied_at' relation
            rel = Relationship(person, "STUDIED_AT", institution)
            graph.create(rel)

            # 'graduated_in' relation
            rel = Relationship(person, "GRADUATED_IN", institution, year=edu['graduation_date'])
            graph.create(rel)

            # 'obtained' relation
            rel = Relationship(person, "OBTAINED", degree)
            graph.create(rel)

        # Create 'Skills' nodes
        for skill in resume['skills']:
            skill_node = Node("Skill", name=skill)
            graph.merge(skill_node, "Skill", "name")

            # 'has_skill' relation
            rel = Relationship(person, "HAS_SKILL", skill_node)
            graph.create(rel)

        # Create 'WorkExperience' nodes
        for exp in resume['work experiences']:
            employer = Node("Employer", name=exp['employer'])
            graph.merge(employer, "Employer", "name")
            
            job_title = Node("JobTitle", title=exp['job_title'], duration=exp['employment_duration'])
            graph.merge(job_title, "JobTitle", "title")

            # 'held_position' relation
            rel = Relationship(person, "HELD_POSITION", job_title)
            graph.create(rel)
            
            # 'within' relation
            rel = Relationship(job_title, "WITHIN", employer)
            graph.create(rel)

        # Create 'Project' nodes
        for proj in resume['projects']:
            project = Node("Project", name=proj['project_name'])
            graph.merge(project, "Project", "name")

            # 'worked_on' relation
            rel = Relationship(person, "WORKED_ON", project)
            graph.create(rel)

            for skill in proj['technical_skills']:
                skill_node = Node("Skill", name=skill)
                graph.merge(skill_node, "Skill", "name")

                # 'used_skill' relation
                rel = Relationship(project, "USED_SKILL", skill_node)
                graph.create(rel)
    except Exception as e:
        print(f"An error occurred: {e}")


save_to_neo4j(resume)


# Just use GPT3.5 without Langchain

In [14]:
import os
import openai
import tiktoken
import os
import pandas as pd
import openai
from util import extract_text_from_resume
import json
import re
import textwrap

def wraptext(text, width=100):
    wrapper = textwrap.TextWrapper(width=width)
    word_list = wrapper.wrap(text=text)

    for element in word_list:
        print(element)


from dotenv import load_dotenv, find_dotenv

# Load environment variables from .env file
_=load_dotenv(find_dotenv())

# Set OpenAI API key
openai.api_key = os.environ['OPENAI_API_KEY']
# Directory containing the resumes
resume_dir = "/Users/yunjaewon/ChatGPT/resumes/"
resume_list=os.listdir(resume_dir)
print(resume_list)

resume_text = extract_text_from_resume(resume_dir+'jaeDE.pdf')
#print(resume_text)

['alan.pdf', '.DS_Store', 'jaewon.docx', 'oldresume.docx', 'jaeDE.pdf', 'lorraine.pdf', 'jaewon.pdf', 'mlscientist.pdf']


In [15]:
def get_completion(prompt, model="gpt-3.5-turbo"):
    messages = [{"role": "user", "content": prompt}]
    response = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        temperature=0,
    )
    return response.choices[0].message["content"]

def get_completion_from_messages(messages, 
                                 model="gpt-3.5-turbo", 
                                 temperature=0, 
                                 max_tokens=500):
    response = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        temperature=temperature, # this is the degree of randomness of the model's output
        max_tokens=max_tokens, # the maximum number of tokens the model can ouptut 
    )
    return response.choices[0].message["content"]

def get_completion_and_token_count(messages, 
                                   model="gpt-3.5-turbo", 
                                   temperature=0, 
                                   max_tokens=500):
    
    response = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        temperature=temperature, 
        max_tokens=max_tokens,
    )
    
    content = response.choices[0].message["content"]
    
    token_dict = {
'prompt_tokens':response['usage']['prompt_tokens'],
'completion_tokens':response['usage']['completion_tokens'],
'total_tokens':response['usage']['total_tokens'],
    }

    return content, token_dict