-
Notifications
You must be signed in to change notification settings - Fork 0
/
bot.py
314 lines (255 loc) · 12.2 KB
/
bot.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
# Job Interview Coach
### Mock Interview with Feedback
# LICENSE AND DISCLAIMER:
# Copyright 2023, Jozsef Szalma
# Creative Commons Attribution-NonCommercial 4.0 International Public License
# Gradio code was reused from / informed by: https://www.gradio.app/guides/creating-a-chatbot-fast
# Before repurposing this code for an HR use-case consider:
# OpenAI's usage policies (https://openai.com/policies/usage-policies) explicitly prohibit:
# "Activity that has high risk of economic harm, including [...] Automated determinations of eligibility for [...] employment [...]"
# The EU AI Act proposal (https://eur-lex.europa.eu/resource.html?uri=cellar:e0649735-a372-11eb-9585-01aa75ed71a1.0001.02/DOC_1&format=PDF)
# contains the following language:
# "AI systems used in employment, workers management and access to self-employment,
# notably for the recruitment and selection of persons [...] should also be classified as high-risk"
# KNOWN ISSUES:
# - incomplete error handling around job description, e.g. if an invalid JD URL is provided the code won't fall back to the copy-pasted JD
# - if no JD and/or CV are provided GPT-4 might on occasion ignore instructions to only ask one interview question at a time
# - the current workflow consumes a lot of tokens as the JD and the CV aren't summarized, but considered as-is for each question
# - the scraping logic breaks once the job is in the "no longer accepting applications" status
#env variables
import os
#API
import openai
#UI
import gradio as gr
#to digest the Job Description and the Resume
import requests
from bs4 import BeautifulSoup
import pdfplumber
import io
import re
#Reads the OpenAI API key from the environment variable KEY (or replace the right-hand side with your key)
#NOTE(review): nothing visible in this file loads a .env file, so KEY presumably has to be present in the process environment - confirm
openai.api_key = os.getenv("KEY")
#OpenAI Parameters
#Two prompts are used here; imagine a two-person interview panel: one conducts the interview, the other evaluates it
#Model, sampling temperature and max_tokens cap passed to the interviewer completion request in btn_handler
INTERVIEWER_MODEL = 'gpt-4-0613'
INTERVIEWER_TEMPERATURE = 0.4
INTERVIEWER_TOKEN_LIMIT = 300
#System prompt for the interviewer; step 10 asks the model to append {interview ended},
#the control marker btn_handler looks for (via extract_control) before it starts the review
INTERVIEWER_PROMPT = """
Role:
Interviewer in a job interview coaching application; your role is to interview the candidate.
Do not provide feedback, that is done after the interview by a human.
Follow the interview script, don't ask more than one question per message.
Interview script:
1) Welcome the candidate
2) Check if a CV was automatically provided by the system, ask the candidate to provide their CV if not.
3) If the CV was provided by the system ask the candidate to confirm if you have their correct CV by showing a short summary.
4) Check if a Job Description was automatically provided by the system, ask the candidate to provide the JD they are interviewing for if not.
5) If the JD was provided by the system ask the candidate to confirm if you have the correct JD by showing a short summary.
6) Compare and contrast Candidate Resume and Job Description and ask the first clarification question from the candidate to establish overlaps and disconnects between JD and CV.
7) Ask the 2nd clarification question from the candidate to establish overlaps and disconnects between JD and CV.
8) Ask the 3rd clarification question from the candidate to establish overlaps and disconnects between JD and CV.
9) Ask the candidate for their motivation to apply to this job, if not yet discussed.
10) Thank the candidate, explain that feedback will be provided at a later stage and append to your last message {interview ended}
"""
#Model, sampling temperature and max_tokens cap passed to the reviewer completion request in btn_handler
REVIEWER_MODEL = 'gpt-4-0613'
REVIEWER_TEMPERATURE = 0.2
REVIEWER_TOKEN_LIMIT = 2000
#System prompt for the reviewer; asks for a 1-10 rating and a recommended answer per question,
#returned as JSON following the template embedded in the prompt
REVIEWER_PROMPT = """
Role:
Job interview coach in a job interview coaching application.
Task:
Your role is to review a conversation between the interviewer and the candidate and provide feedback.
Only consider job relevant questions, additional chatter (e.g. confirming data) can be ignored.
Rate answers on a scale from 1 (worst) to 10 (best).
Recommend an alternative answer for each question.
Provide your response as a valid, but human readable JSON, see template:
{
"questions": [
{
"question_number": 1,
"question_text": "Could you please ellaborate on...",
"candidate_answer": "I think...",
"recommended_answer": "",
"answer_correctness_rating": 9
}
],
"overall_rating": "90%"
}
"""
#Module-level state shared by the handlers below:
#linkedin_jd_cache maps a job URL to the JD text scraped for it (filled by extract_linkedin_jd),
#linkedin_jd and candidate_cv hold the most recently loaded JD / CV text and are read by btn_handler
linkedin_jd_cache = {}
linkedin_jd = ""
candidate_cv = ""
#For the sake of simplicity I'm providing an option to scrape the JD from LinkedIn directly
#This is probably against LinkedIn's T&Cs, so use at your own risk
#Also, the scraping logic breaks once the job is in the "no longer accepting applications" status
def _scrape_linkedin_jd(URL):
    """Download a LinkedIn job posting and return its text, or None on failure.

    The returned text is the job title, company name and job description,
    one per line; a part that cannot be found on the page is replaced by a
    short "Couldn't find ..." note so the caller still gets the rest.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"
    }
    try:
        #a timeout keeps an unresponsive server from blocking the UI callback forever
        response = requests.get(URL, headers=headers, timeout=15)
    except requests.exceptions.RequestException as err:
        #covers malformed URLs, DNS failures, refused connections and timeouts
        print("Failed to retrieve the webpage. Error:", err)
        return None

    if response.status_code != 200:
        print("Failed to retrieve the webpage. Status code:", response.status_code)
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    #(tag name, CSS class, note used when the element is missing)
    sections = (
        ('h1', 'topcard__title', "Couldn't find job title on LinkedIn \n"),
        ('a', 'topcard__org-name-link', "Couldn't find company name on LinkedIn \n"),
        ('div', 'description__text', "Couldn't find job description on LinkedIn \n"),
    )
    parts = []
    for tag_name, css_class, missing_note in sections:
        tag = soup.find(tag_name, class_=css_class)
        parts.append(tag.text.strip() if tag is not None else missing_note)

    #joining with a line break keeps title, company and description from running into each other;
    #the substitution then collapses any run of line breaks into one
    return re.sub('\n+', '\n', "\n".join(parts))


def extract_linkedin_jd(URL, copy_paste):
    """Load the job description (JD) and remember it for the interview.

    Args:
        URL: LinkedIn job posting URL; takes priority when non-empty.
        copy_paste: JD text pasted by the user; used when no URL is given or
            when the URL cannot be loaded.

    Returns:
        The JD text, or a short status message ("no JD provided" /
        "couldn't load JD from LinkedIn") when nothing usable is available.

    Side effects:
        Stores the result in the module-level ``linkedin_jd`` and caches
        successfully scraped pages in ``linkedin_jd_cache`` keyed by URL.
    """
    global linkedin_jd_cache
    global linkedin_jd

    if URL:
        if URL in linkedin_jd_cache:
            #already scraped: reuse the text, but still fall through so that
            #linkedin_jd is switched back to this posting
            print(f"Text for {URL} already loaded.")
            jd = linkedin_jd_cache[URL]
        else:
            jd = _scrape_linkedin_jd(URL)
            if jd is not None:
                linkedin_jd_cache[URL] = jd
        if jd is None:
            #the URL could not be loaded: prefer the pasted JD over an error message
            jd = copy_paste if copy_paste else "couldn't load JD from LinkedIn"
    elif copy_paste:
        jd = copy_paste
    else:
        jd = "no JD provided"

    linkedin_jd = jd
    return jd
#Loading the Candidate's Resume
def load_cv(cv_pdf):
    """Extract the text of the candidate's CV and remember it for the interview.

    Args:
        cv_pdf: raw bytes of the uploaded PDF (the file input is configured
            with type='binary'); None or empty when nothing was uploaded.

    Returns:
        The extracted text with runs of line breaks collapsed into one, or
        "no CV provided" when there is no upload to read.

    Side effects:
        Stores the result in the module-level ``candidate_cv``.
    """
    global candidate_cv

    #nothing uploaded: record that explicitly instead of handing None to the PDF parser
    if not cv_pdf:
        candidate_cv = "no CV provided"
        return candidate_cv

    #converting between Gradio's feed and what pdfplumber can digest
    with pdfplumber.open(io.BytesIO(cv_pdf)) as pdf:
        #a page without a text layer (e.g. a scanned image) may yield no text; treat it as empty
        page_texts = [page.extract_text() or "" for page in pdf.pages]

    #pages are joined with a line break so the last line of one page does not
    #fuse with the first line of the next; extra line breaks are then removed
    text = re.sub('\n+', '\n', "\n".join(page_texts))
    candidate_cv = text
    return text
#extracting control messages from streaming text enclosed in curly braces
#this can be used, inter alia, for the interviewer to indicate the end of the interview
#TODO replace this with a more formalized solution detailed here: https://platform.openai.com/docs/guides/gpt/function-calling
def extract_control(text):
    """Split streamed text into its visible part and its control message.

    A control message is whatever sits between curly braces. While the text
    is still streaming the closing brace may not have arrived yet; in that
    case everything after the opening brace counts as the (partial) control
    message and everything before it is the visible text.

    Returns:
        A tuple (text_without_control, control); control is "" when the text
        contains no opening brace.
    """
    control_re = r'{(.*?)}'
    found = re.search(control_re, text)

    if found:
        #complete {...} group: report the first one with its braces stripped
        #and remove every complete group from the visible text
        control = found.group().replace("{", "").replace("}", "")
        return re.sub(control_re, '', text), control

    brace_at = text.find('{')
    if brace_at == -1:
        #no control sequence at all
        return text, ""

    #opening brace without a closing one yet: hide everything from the brace onwards
    return text[:brace_at], text[brace_at + 1:]
#this is the handler function that gets triggered when the submit button is pressed
#contains the prompt engineering logic as well
def _panel_context(system_prompt):
    """Return the system messages both models start from: role prompt, JD and CV."""
    return [
        {"role": "system", "content": system_prompt},
        {"role": "system", "content": "Job Description: " + linkedin_jd},
        {"role": "system", "content": "Candidate's CV: " + candidate_cv},
    ]


def btn_handler(message, history):
    """Stream the interviewer's reply and, once the interview ends, the review.

    Generator used as the chat callback. It sends the interviewer prompt, the
    JD, the CV and the conversation so far to the interviewer model and yields
    the growing reply with any {control message} hidden. When the reply
    carries the "interview ended" control message, the full transcript is sent
    to the reviewer model and the feedback is streamed after the reply.

    Args:
        message: the candidate's latest chat message.
        history: earlier turns as (candidate message, interviewer message) pairs.

    Yields:
        The text to display for the current turn so far.
    """
    #this part handles the standard interview
    history_openai_format = _panel_context(INTERVIEWER_PROMPT)
    #translating the Gradio chat history into OpenAI format
    for human, assistant in history:
        history_openai_format.append({"role": "user", "content": human})
        history_openai_format.append({"role": "assistant", "content": assistant})
    history_openai_format.append({"role": "user", "content": message})

    #submitting the interviewer inference request to the API
    response = openai.ChatCompletion.create(
        model=INTERVIEWER_MODEL,
        messages=history_openai_format,
        temperature=INTERVIEWER_TEMPERATURE,
        max_tokens=INTERVIEWER_TOKEN_LIMIT,
        stream=True,
        n=1,
    )

    partial_message = ""
    control_message = ""
    trimmed_message = ""
    #yielding the streamed message to the chat window, while ensuring control messages don't become visible
    for chunk in response:
        delta = chunk['choices'][0]['delta']
        if len(delta) == 0:
            continue
        #a delta may carry only the role and no 'content' key; treat that as empty text
        partial_message += delta.get('content') or ""
        trimmed_message, control_message = extract_control(partial_message)
        yield trimmed_message

    #only continue to the evaluation once the interviewer has signalled the end;
    #the comparison tolerates stray whitespace or capitalisation in the marker
    if control_message.strip().lower() != "interview ended":
        return

    print("starting eval")
    eval_prompt = _panel_context(REVIEWER_PROMPT)

    #relabelling the turns so the reviewer model doesn't get confused and continue the interview
    history_adj_format = []
    for human, assistant in history:
        history_adj_format.append({"role": "candidate", "content": human})
        history_adj_format.append({"role": "interviewer", "content": assistant})
    #the turn that just finished is not in `history`; without it the candidate's
    #final answer would never reach the reviewer
    history_adj_format.append({"role": "candidate", "content": message})
    history_adj_format.append({"role": "interviewer", "content": trimmed_message})
    eval_prompt.append({"role": "system", "content": str(history_adj_format).replace("\"","")})

    #making the review inference call to the API
    response = openai.ChatCompletion.create(
        model=REVIEWER_MODEL,
        messages=eval_prompt,
        temperature=REVIEWER_TEMPERATURE,
        max_tokens=REVIEWER_TOKEN_LIMIT,
        stream=True,
        n=1,
    )

    #continuing the streaming where we left off
    partial_message = trimmed_message + r"<br>"
    for chunk in response:
        delta = chunk['choices'][0]['delta']
        if len(delta) == 0:
            continue
        partial_message += delta.get('content') or ""
        yield partial_message
#Interview tab: chat window whose replies are streamed by the btn_handler generator
chat_tab = gr.ChatInterface(fn=btn_handler).queue()

#Job Description tab: a LinkedIn URL or pasted text goes in, the loaded JD is echoed back
jd_tab = gr.Interface(
    fn=extract_linkedin_jd,
    inputs=[
        gr.Textbox(value="", label="Job Description LinkedIn URL"),
        gr.Textbox(value="", label="or copy-paste Job Description here"),
    ],
    outputs=[gr.Textbox()],
    allow_flagging="never",
)

#CV Upload tab: the uploaded file is handed to load_cv as raw bytes, the extracted text is echoed back
cv_tab = gr.Interface(
    fn=load_cv,
    inputs=[gr.File(type='binary')],
    outputs=[gr.Textbox()],
    allow_flagging="never",
)

#The three tabs in the order the candidate is expected to go through them
demo = gr.TabbedInterface(
    [cv_tab, jd_tab, chat_tab],
    ["CV Upload", "Job Description", "Interview"],
).queue()

if __name__ == "__main__":
    #server_name 0.0.0.0 makes the app listen on all network interfaces, not just localhost
    demo.launch(server_name="0.0.0.0")