In [1]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.chains.question_answering import load_qa_chain
from langchain.vectorstores import Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAI
import pinecone
import os

  from tqdm.autonotebook import tqdm


# PDF LOADER

We can produce documents from PDF files using LangChain's PyPDFDirectoryLoader. Each document is an instance of LangChain's Document class, containing both the document text and its metadata.

In [3]:
# Read every PDF found under ./CVs/ into LangChain Document objects.
# Each Document carries a `page` entry in its metadata, so the count printed
# below is the number of loaded Documents, not necessarily the number of files.
loader = PyPDFDirectoryLoader(
    "./CVs/",
)
docs = loader.load()
print(
    "Documents: ",
    len(docs),
)

Documents:  4


In [4]:
# Display the first loaded Document (its page_content and metadata) as the cell output.
docs[0]

Document(page_content='Jan Küster ·Consultant and Software Engineer ·Bremen, Germany ·info@jankuester.com ·+/four.lnum/nine.lnum /one.lnum/seven.lnum/six.lnum *** *** **\nJ/A.sc/N.sc Kü/S.sc/T.sc/E.sc/R.sc R/E.sc/S.sc/U.sc/M.sc/E.sc\nStatus: M.Sc. Digital Media, Fullstack JS Engineer\nFields: Project Management, Software Development, Consulting\nTech: Meteor, Javascript, Bootstrap, Mongodb, Git, Webstorm, Sourcetree\nLoves: Global Game Jam, Sci-Fi series, Stackover/f_low, Fitness and Martial\nArts\nExperience\n/two.lnum/zero.lnum/one.lnum/six.lnum / /zero.lnum/nine.lnum Fullstack Javascript Engineer University of Bremen\nInvent a realtime classroom management using Meteor and React\nDesign software architecture and leading development\n/two.lnum/zero.lnum/one.lnum/four.lnum - /two.lnum/zero.lnum/one.lnum/six.lnum IT Consultant for IBM XPages and Notes Domino We/four.lnumIT GmbH Bremen\nRealize projects in XPages and We/four.lnumIT Aveedo, monitor project status, conduct reports\nIntegr

The `page_content` attribute holds the text extracted from the document.

In [5]:
# Show the raw text of the first Document. print() is used (rather than a bare
# expression) so that newlines are rendered instead of shown as "\n" escapes.
print(
    "Document page content 0: \n",
    docs[0].page_content,
    sep="\n",
)

Document page content 0: 

Jan Küster ·Consultant and Software Engineer ·Bremen, Germany ·info@jankuester.com ·+/four.lnum/nine.lnum /one.lnum/seven.lnum/six.lnum *** *** **
J/A.sc/N.sc Kü/S.sc/T.sc/E.sc/R.sc R/E.sc/S.sc/U.sc/M.sc/E.sc
Status: M.Sc. Digital Media, Fullstack JS Engineer
Fields: Project Management, Software Development, Consulting
Tech: Meteor, Javascript, Bootstrap, Mongodb, Git, Webstorm, Sourcetree
Loves: Global Game Jam, Sci-Fi series, Stackover/f_low, Fitness and Martial
Arts
Experience
/two.lnum/zero.lnum/one.lnum/six.lnum / /zero.lnum/nine.lnum Fullstack Javascript Engineer University of Bremen
Invent a realtime classroom management using Meteor and React
Design software architecture and leading development
/two.lnum/zero.lnum/one.lnum/four.lnum - /two.lnum/zero.lnum/one.lnum/six.lnum IT Consultant for IBM XPages and Notes Domino We/four.lnumIT GmbH Bremen
Realize projects in XPages and We/four.lnumIT Aveedo, monitor project status, conduct reports
Integrated Camu

The `metadata` attribute is a dictionary describing where the text came from, such as the path of the source PDF and the page number within it.

In [6]:
# Show the metadata dictionary attached to the first Document.
print(
    "Document metadata 0: \n",
    docs[0].metadata,
    sep="\n",
)

Document metadata 0: 

{'source': 'CVs/Jan Kuster CV.pdf', 'page': 0}


________________

# Chunk our data up into smaller documents

In [7]:
# Break the loaded Documents into chunks of at most 2000 characters with no
# overlap between consecutive chunks; each chunk is embedded separately later.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=0,
    chunk_size=2000,
)
texts = text_splitter.split_documents(documents=docs)

In [8]:
# Report how many chunks the splitter produced.
print(f'Now we have {len(texts)} documents')

Now we have 6 documents


________________

# Create embeddings of our documents to get ready for similarity search

In [12]:
# API credentials for Pinecone and OpenAI.
#
# Secrets are read from the environment so they are never stored in (or
# committed with) the notebook. Export PINECONE_API_KEY and OPENAI_API_KEY
# before starting the kernel; a missing variable raises KeyError naming it.
#
# SECURITY: any key that was ever hard-coded in this cell must be treated as
# compromised and revoked/rotated in the Pinecone and OpenAI dashboards.
PINECONE_API_KEY = os.environ["PINECONE_API_KEY"]
OPEN_AI_KEY = os.environ["OPENAI_API_KEY"]

In [15]:
# Build the OpenAI embeddings client, authenticating with OPEN_AI_KEY.
embeddings = OpenAIEmbeddings(
    openai_api_key=OPEN_AI_KEY,
)

In [16]:
# Open a Pinecone session and make sure the target index exists.
pinecone.init(environment='gcp-starter', api_key=PINECONE_API_KEY)

index_name = "langchaintest"
existing_indexes = pinecone.list_indexes()
if index_name not in existing_indexes:
    # First run: create the index. The dimension must equal the length of the
    # vectors produced by `embeddings` (presumably 1536 for the default OpenAI
    # embedding model -- confirm if the model is changed).
    pinecone.create_index(index_name, dimension=1536)

# Handle to the index named by `index_name`.
index = pinecone.Index(index_name)

In [18]:
# Embed every chunk and store the vectors in the Pinecone index.
# from_documents() is used instead of from_texts() on the bare page_content so
# that each chunk's metadata (source file, page number) is stored alongside its
# vector and comes back with every search hit.
# NOTE(review): re-running this cell appears to insert the same chunks again
# under new ids -- confirm, and clear the index before re-ingesting if so.
docsearch = Pinecone.from_documents(texts, embeddings, index_name=index_name)

_____________________________

In [19]:
# Create the completion model (temperature 0) and wrap it in a
# question-answering chain of type "stuff".
llm = OpenAI(
    openai_api_key=OPEN_AI_KEY,
    temperature=0,
)
chain = load_qa_chain(llm=llm, chain_type="stuff")

____________

# Query those docs to get the answer back

In [20]:
# Retrieve the chunks most similar to the question and let the QA chain answer
# from them. The hits are kept in `relevant_docs` rather than `docs` so the
# loaded PDF pages in `docs` are not overwritten (re-running the chunking cell
# would otherwise split search results instead of the source documents).
query = "Cuales son las habilidades de james bond?"
relevant_docs = docsearch.similarity_search(query)
chain.run(input_documents=relevant_docs, question=query)

'\nJames Bond es conocido por sus habilidades de combate, habilidades de espionaje, habilidades de conducción y habilidades de supervivencia.'

In [41]:
# Ask for a summary of one candidate. Search hits go into `relevant_docs` so
# the loaded PDF pages held in `docs` are not overwritten by this cell.
query = "Hazme un resumen de James Bond"
relevant_docs = docsearch.similarity_search(query)
chain.run(input_documents=relevant_docs, question=query)

' James Bond es un agente secreto altamente calificado con licencia para matar. Tiene una amplia experiencia en operaciones de inteligencia encubiertas, infiltración en entornos de alto riesgo, uso de técnicas y gadgets de espionaje avanzados, vigilancia, análisis y prevención de amenazas. Está capacitado en técnicas de espionaje avanzadas, entrenamiento en combate, manejo de armas de fuego y habilidades tácticas. Habla inglés fluido, francés de nivel intermedio y ruso básico.'

In [42]:
# Ask which candidate is strongest in English. Search hits go into
# `relevant_docs` so the loaded PDF pages held in `docs` are not overwritten.
query = "Que candidatos se desenvuelve mejor en el idioma ingles"
relevant_docs = docsearch.similarity_search(query)
chain.run(input_documents=relevant_docs, question=query)

' James Bond is fluent in English.'

In [21]:
# Ask which candidate would be the best programmer hire. Search hits go into
# `relevant_docs` so the loaded PDF pages held in `docs` are not overwritten.
query = "Si quiero contratar a un programador ¿Cual seria el mejor candidato"
relevant_docs = docsearch.similarity_search(query)
chain.run(input_documents=relevant_docs, question=query)

' El mejor candidato para un programador sería Jan Küster. Tiene una Maestría en Medios Digitales, es un ingeniero Fullstack JS y tiene experiencia en proyectos de gestión de proyectos, desarrollo de software y consultoría. Sus habilidades técnicas incluyen Meteor, Javascript, Bootstrap, Mongodb, Git, Webstorm y Sourcetree.'