In [24]:
import os
import openai
# import sys
OPENAI_API_KEY = os.environ['OPENAI_API_KEY']

openai.api_key = OPENAI_API_KEY

In [30]:
print("Available engines: ")
print([data['id'] for data in openai.Engine.list()['data']])

Available engines: 
['whisper-1', 'babbage', 'text-davinci-003', 'davinci', 'text-davinci-edit-001', 'babbage-code-search-code', 'text-similarity-babbage-001', 'code-davinci-edit-001', 'text-davinci-001', 'gpt-4-0613', 'ada', 'babbage-code-search-text', 'babbage-similarity', 'gpt-4', 'gpt-3.5-turbo-0613', 'gpt-3.5-turbo-16k-0613', 'code-search-babbage-text-001', 'text-curie-001', 'gpt-3.5-turbo', 'gpt-3.5-turbo-16k', 'code-search-babbage-code-001', 'text-ada-001', 'text-similarity-ada-001', 'curie-instruct-beta', 'gpt-3.5-turbo-0301', 'ada-code-search-code', 'ada-similarity', 'code-search-ada-text-001', 'text-search-ada-query-001', 'davinci-search-document', 'ada-code-search-text', 'text-search-ada-doc-001', 'davinci-instruct-beta', 'text-similarity-curie-001', 'code-search-ada-code-001', 'ada-search-query', 'text-search-davinci-query-001', 'curie-search-query', 'davinci-search-query', 'babbage-search-document', 'ada-search-document', 'text-search-curie-query-001', 'gpt-4-0314', 'text-

# Extract resume names

In [70]:
from langchain.chat_models import ChatOpenAI
from langchain import LLMChain
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
)

chat = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

In [71]:
def get_name(resume_text):

    template = """You are a helpful assistant that extracts the applicant name from the resume. Only output the full name in this format:\n \
    first_name, last_name \n\n """
    system_message_prompt = SystemMessagePromptTemplate.from_template(template)
    human_template = "Resume: \n\n {resume_text}"
    human_message_prompt = HumanMessagePromptTemplate.from_template(human_template)

    chat_prompt = ChatPromptTemplate.from_messages([system_message_prompt, human_message_prompt])
    # chat_prompt.format_messages(resume_text="resume_text")
    chain = LLMChain(llm=chat, prompt=chat_prompt)
    name = chain.run(resume_text=resume_text)

    return name


In [90]:
def get_name_from_file(filename):
    # assumes the filename is in the format: firstname_lastname_resume.txt
    basename = os.path.splitext(filename)[0]
    names = basename.split("_")
    full_name = names[0] + " " + names[1]
    return full_name

In [106]:
import os

path = r"resume_data"
metadata_list = []

name_list = []
# iterate over files in that directory
for filename in os.listdir(path):
    file_path = os.path.join(path, filename)
    if os.path.isfile(file_path) and (filename != ".DS_Store"):
        # print(f'Loading file: {file_path}')
        full_name = get_name_from_file(filename)
        print(full_name)
        metadata = {"filename": filename, "full_name": full_name}
        metadata_list.append(metadata)

Abraham Lincoln
Agatha Christie
Alberto Santos-Dumont
Amadeo Avogadro
Andy Warhol
Anne Frank
Audrey Hepburn
Barack Obama
Che Guevara
Cleopatra resume
Coco Chanel
Dalai Lama
David Bowie
Diego Maradona
Elvis Presley
Emily Brontë
Eva Perón
Fidel Castro
Freddie Mercury
Frederick Douglass
Galileo Galilei
George Orwell
George Washington
Helen Keller
J.K. Rowling
Jim Morrison
Johannes Gutenberg
John D.
John F.
John Lennon
John Steinbeck
Joseph Stalin
Julius Caesar
Kurt Cobain
Leonardo da
Leon Trotsky
Leo Tolstoy
Louis Pasteur
Mahatma Gandhi
Mao Zedong
Marie Antoinette
Marie Curie
Marlon Brando
Martin Luther
Michael Jordan
Mikhail Gorbachev
Muhammad Ali
Nelson Mandela
Nikola Tesla
Oprah Winfrey
Pierre-Auguste Renoir
Pierre Curie
Plato resume
Pope Francis
Princess Diana
Queen Elizabeth
Roger Federer
Rosalind Franklin
Rosa Parks
Stephen Hawking
Steve Jobs
Thomas Edison
Virginia Woolf
Vladimir Lenin
Walt Disney
Winston Churchill


# Load resume_data via DirectoryLoader

In [96]:
from langchain.document_loaders import DirectoryLoader
loader = DirectoryLoader("resume_data", glob="*.txt")
docs = loader.load()

In [97]:
len(docs)

66

In [100]:
docs[10].metadata

{'source': 'resume_data\\Coco_Chanel_resume.txt'}

# Initiate chroma db

In [110]:
from langchain.embeddings.openai import OpenAIEmbeddings
embedding_function = OpenAIEmbeddings()
from langchain.vectorstores import Chroma
persist_directory = 'chroma/'

In [114]:
ids = [f"resume_{str(i)}" for i in range(1, len(docs)+1)]

In [161]:
# create the vectorstore
vectordb = Chroma.from_documents(
    documents = docs,
    embedding = embedding_function,
    collection_name="full_resume",
    persist_directory='chroma/full_resume/',
    ids=ids
)

In [120]:
# update docs with metadata_list
for i in range(len(docs)):
    docs[i].metadata = metadata_list[i]
    vectordb.update_document(ids[i], docs[i])


## similarity search demo

In [144]:
query = "Machine learning engineer"
results = vectordb.similarity_search_with_score("software engineer with aws experience", k=3)

In [155]:
score = results[0][1]

0.31041303277015686

In [159]:
for result in results:
    
    print(f'Metadata: ', result[0].metadata)
    print('Score: ',result[1])
    

Metadata:  {'filename': 'Johannes_Gutenberg_resume.txt', 'full_name': 'Johannes Gutenberg'}
Score:  0.31041303277015686
Metadata:  {'filename': 'Abraham_Lincoln_resume.txt', 'full_name': 'Abraham Lincoln'}
Score:  0.3441496789455414
Metadata:  {'filename': 'Roger_Federer_resume.txt', 'full_name': 'Roger Federer'}
Score:  0.3596689701080322
