In [24]:
import os
import openai
# import sys
OPENAI_API_KEY = os.environ['OPENAI_API_KEY']

openai.api_key = OPENAI_API_KEY

In [30]:
print("Available engines: ")
print([data['id'] for data in openai.Engine.list()['data']])

Available engines: 
['whisper-1', 'babbage', 'text-davinci-003', 'davinci', 'text-davinci-edit-001', 'babbage-code-search-code', 'text-similarity-babbage-001', 'code-davinci-edit-001', 'text-davinci-001', 'gpt-4-0613', 'ada', 'babbage-code-search-text', 'babbage-similarity', 'gpt-4', 'gpt-3.5-turbo-0613', 'gpt-3.5-turbo-16k-0613', 'code-search-babbage-text-001', 'text-curie-001', 'gpt-3.5-turbo', 'gpt-3.5-turbo-16k', 'code-search-babbage-code-001', 'text-ada-001', 'text-similarity-ada-001', 'curie-instruct-beta', 'gpt-3.5-turbo-0301', 'ada-code-search-code', 'ada-similarity', 'code-search-ada-text-001', 'text-search-ada-query-001', 'davinci-search-document', 'ada-code-search-text', 'text-search-ada-doc-001', 'davinci-instruct-beta', 'text-similarity-curie-001', 'code-search-ada-code-001', 'ada-search-query', 'text-search-davinci-query-001', 'curie-search-query', 'davinci-search-query', 'babbage-search-document', 'ada-search-document', 'text-search-curie-query-001', 'gpt-4-0314', 'text-

# Extract resume names

In [70]:
from langchain.chat_models import ChatOpenAI
from langchain import LLMChain
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
)

chat = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

In [71]:
def get_name(resume_text):

    template = """You are a helpful assistant that extracts the applicant name from the resume. Only output the full name in this format:\n \
    first_name, last_name \n\n """
    system_message_prompt = SystemMessagePromptTemplate.from_template(template)
    human_template = "Resume: \n\n {resume_text}"
    human_message_prompt = HumanMessagePromptTemplate.from_template(human_template)

    chat_prompt = ChatPromptTemplate.from_messages([system_message_prompt, human_message_prompt])
    # chat_prompt.format_messages(resume_text="resume_text")
    chain = LLMChain(llm=chat, prompt=chat_prompt)
    name = chain.run(resume_text=resume_text)

    return name


In [90]:
def get_name_from_file(filename):
    # assumes the filename is in the format: firstname_lastname_resume.txt
    basename = os.path.splitext(filename)[0]
    names = basename.split("_")
    full_name = names[0] + " " + names[1]
    return full_name

In [318]:
metadata_list

[{'filename': 'Abraham_Lincoln_resume.txt', 'full_name': 'Abraham Lincoln'},
 {'filename': 'Agatha_Christie_resume.txt', 'full_name': 'Agatha Christie'},
 {'filename': 'Alberto_Santos-Dumont_resume.txt',
  'full_name': 'Alberto Santos-Dumont'},
 {'filename': 'Amadeo_Avogadro_resume.txt', 'full_name': 'Amadeo Avogadro'},
 {'filename': 'Andy_Warhol_resume.txt', 'full_name': 'Andy Warhol'},
 {'filename': 'Anne_Frank_resume.txt', 'full_name': 'Anne Frank'},
 {'filename': 'Audrey_Hepburn_resume.txt', 'full_name': 'Audrey Hepburn'},
 {'filename': 'Barack_Obama_resume.txt', 'full_name': 'Barack Obama'},
 {'filename': 'Che_Guevara_resume.txt', 'full_name': 'Che Guevara'},
 {'filename': 'Cleopatra_resume.txt', 'full_name': 'Cleopatra resume'},
 {'filename': 'Coco_Chanel_resume.txt', 'full_name': 'Coco Chanel'},
 {'filename': 'Dalai_Lama_resume.txt', 'full_name': 'Dalai Lama'},
 {'filename': 'David_Bowie_resume.txt', 'full_name': 'David Bowie'},
 {'filename': 'Diego_Maradona_resume.txt', 'full_n

In [320]:
import os

path = r"resume_data"
metadata_list = []

name_list = []
# iterate over files in that directory
for filename in os.listdir(path):
    file_path = os.path.join(path, filename)
    if os.path.isfile(file_path) and (filename != ".DS_Store"):
        # print(f'Loading file: {file_path}')
        full_name = get_name_from_file(filename)
        # print(full_name)
        metadata = {"filename": filename, "full_name": full_name}
        metadata_list.append(metadata)

# Load resume_data via DirectoryLoader

In [321]:
from langchain.document_loaders import DirectoryLoader
loader = DirectoryLoader(path, glob="*.txt")
docs = loader.load()

In [322]:
type(docs[0])

langchain.schema.Document

In [324]:
docs[60].metadata

{'source': 'resume_data\\Steve_Jobs_resume.txt'}

In [325]:
# update docs with metadata_list
for i in range(len(docs)):
    docs[i].metadata = metadata_list[i]

In [326]:
docs[60].metadata

{'filename': 'Steve_Jobs_resume.txt', 'full_name': 'Steve Jobs'}

# Initiate chroma db

In [327]:
from langchain.embeddings.openai import OpenAIEmbeddings
embedding_function = OpenAIEmbeddings()
from langchain.vectorstores import Chroma

In [328]:
# create ids for each document
ids = [f"{str(i)}" for i in range(1, len(docs)+1)]

In [329]:
# create the vectorstore
vectordb = Chroma.from_documents(
    documents = docs,
    embedding = embedding_function,
    ids=ids,
    persist_directory='chroma/full_resume/',
    collection_name="resume_full"
)

In [330]:
vectordb.persist()

## similarity search demo

In [267]:

vectordb._collection.get(ids=[ids[-1]])

{'ids': ['skills_66'],
 'embeddings': None,
 'documents': ['"Explicitly mentioned technical skills:\\n- Data integration\\n- Data analytics\\n- Data mining\\n- .Net development\\n- Programming languages: perl, laravel, reactjs, pytorch, nltk\\n- Google technologies: Google BigQuery, Google Cloud Compute Engine\\n- Database technologies: HBase, Solr\\n- Data analytics tools: Weka, Spotfire, Matplotlib\\n- Agile methodologies: Scrum\\n- Data mining techniques: Regression analysis, Collaborative filtering, NLTK, CUDA\\n\\nImplied or directly stated technical skills from work experiences:\\n- .Net development\\n- Perl\\n- Laravel\\n- Web services\\n- Dask\\n- ReactJS\\n- Google technologies: Google BigQuery, Google Cloud Compute Engine\\n- Solr\\n- Data analytics: Classification, Matplotlib, Weka, Spotfire\\n- Agile methodologies: Scrum\\n- Data mining techniques: PyTorch, Regression analysis, Collaborative filtering, NLTK, CUDA\\n\\nConsolidated and organized technical skills:\\nExplicitl

In [336]:
resumedb = Chroma(persist_directory="chroma/full_resume/",
                    collection_name="resume_full",
                    embedding_function=embedding_function)

skillsdb = Chroma(persist_directory="chroma/skills/", 
                  collection_name="resume_skills",
                  embedding_function=embedding_function)

workdb = Chroma(persist_directory="chroma/work/",
                collection_name="resume_work",
                embedding_function=embedding_function)

In [348]:
query = "Software engineer with AWS experience"
# for db in [skillsdb, workdb]:
#     results = db.similarity_search(query)
#     skills_results =

resume_results = resumedb.similarity_search_with_relevance_scores(query, k=10)
skill_results = skillsdb.similarity_search_with_relevance_scores(query)
work_results = workdb.similarity_search_with_relevance_scores(query)

In [360]:
import pandas as pd

def get_matches_resume(query, k=10, match_type="resume"):
    if match_type == "work":
        db = Chroma(persist_directory="chroma/work/",
                collection_name="resume_work",
                embedding_function=embedding_function)

    elif match_type == "skills":
        db = Chroma(persist_directory="chroma/skills/", 
                  collection_name="resume_skills",
                  embedding_function=embedding_function)
    else:
        db = Chroma(persist_directory="chroma/full_resume/",
                    collection_name="resume_full",
                    embedding_function=embedding_function)

    results = db.similarity_search_with_relevance_scores(query, k=k)
    distance = [f"{result[1]:.2f}" for result in results]
    full_name = [result[0].metadata['full_name'] for result in results]
    content = [result[0].page_content for result in results]
    df = pd.DataFrame({"full_name": full_name, "distance": distance, "content": content})   
    return df

In [367]:
matches = get_matches_resume('machine_learning engineer', k=3, match_type="resume")

In [368]:
matches

Unnamed: 0,full_name,distance,content
0,Michael Jordan,0.32,MICHAEL JORDAN\n\nMSc in Computer Engineering ...
1,Vladimir Lenin,0.33,Vladimir Lenin\n\nEmail: vladimir.lenin@email....
2,Abraham Lincoln,0.34,Abraham Lincoln\n\nProfessional Summary: Accom...


In [349]:
print("\n**Matching full resume:")
for rr in resume_results:
    print(rr[0].metadata['full_name'])
    print(f"{rr[1]:.2f}")

print("\n**Matching skills:")
for sr in skill_results:
    print(sr[0].metadata['full_name'])
    print(f"{sr[1]:.2f}")

print("\n**Matching work experience:")
for wr in work_results:
    print(wr[0].metadata['full_name'])
    print(f"{wr[1]:.2f}")


**Matching full resume:
Johannes Gutenberg
0.29
Abraham Lincoln
0.32
Roger Federer
0.35
Winston Churchill
0.35
John Steinbeck
0.36
Audrey Hepburn
0.36
Coco Chanel
0.36
Virginia Woolf
0.37
Steve Jobs
0.37
George Orwell
0.38

**Matching skills:
Audrey Hepburn
0.31
Johannes Gutenberg
0.35
Pierre-Auguste Renoir
0.36
John Steinbeck
0.36

**Matching work experience:
Johannes Gutenberg
0.30
Audrey Hepburn
0.30
Mahatma Gandhi
0.32
John Steinbeck
0.33


In [228]:
for result in results:
    print(result)
    # print('Metadata: ', result[0].metadata)
    # print('Score: ',result[1])
    print("------------------")
    