In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.chains.question_answering import load_qa_chain
from langchain.vectorstores import Pinecone
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAI
import pinecone
import os

# PDF LOADER

We can produce documents from PDF files using LangChain's PyPDFDirectoryLoader. Each document is an instance of LangChain's Document class, containing both the document text and its metadata.

In [19]:
# Load every PDF found under ./CVs/ into a list of LangChain Document objects
# and report how many were produced.
loader = PyPDFDirectoryLoader(path="./CVs/")
docs = loader.load()
print("Documents: ", len(docs))

Documents:  4


In [20]:
# Display the first loaded Document; its repr starts with the page_content text.
docs[0]

Document(page_content='CHARLES RAMBO\n+1 123-456-7890\nyou@provider.com\nlinkedin.com/in/charles-rambo\ngithub.com/fizixmastr\nEDUCATION\nMaster of Science |Photonics Aug. 2019 – May 2021\nUniversity of Eastern Finland Joensuu, Finland\nBachelor of Arts |Major: Physics, Minor: Education Aug. 2016 – May 2018\nAustin College Sherman, TX\nAssociate of Liberal Sciences Aug. 2015 – May 2016\nNorth Lake College Irving, TX\nWORK EXPERIENCE\nIntegration Engineering Intern June 2018 – August 2019\nFinisar Corp. Sherman, TX\n•Worked in ISO 4 cleanroom developing applications to improve efficiency and creating specs\n•Employed metrology and microscopy for failure analysis and developing process for wet etching\n•Member of Emergency Response Team\nLaboratory Assistant January 2016 – July 2016\nNorth Lake College Irving, TX\n•Inventoried and maintained Physics Department lab equipment\n•Physics tutoring\nAssistant Manager December 2006 – August 2015\nSun & Ski Sports Austin, TX\n•Led a team of 20+ 

The page content corresponds to the document text.

In [21]:
# Print a heading followed by the raw text of the first document.
# sep="\n" puts the text on its own line, exactly as two separate prints would.
print("Document page content 0: \n", docs[0].page_content, sep="\n")

Document page content 0: 

CHARLES RAMBO
+1 123-456-7890
you@provider.com
linkedin.com/in/charles-rambo
github.com/fizixmastr
EDUCATION
Master of Science |Photonics Aug. 2019 – May 2021
University of Eastern Finland Joensuu, Finland
Bachelor of Arts |Major: Physics, Minor: Education Aug. 2016 – May 2018
Austin College Sherman, TX
Associate of Liberal Sciences Aug. 2015 – May 2016
North Lake College Irving, TX
WORK EXPERIENCE
Integration Engineering Intern June 2018 – August 2019
Finisar Corp. Sherman, TX
•Worked in ISO 4 cleanroom developing applications to improve efficiency and creating specs
•Employed metrology and microscopy for failure analysis and developing process for wet etching
•Member of Emergency Response Team
Laboratory Assistant January 2016 – July 2016
North Lake College Irving, TX
•Inventoried and maintained Physics Department lab equipment
•Physics tutoring
Assistant Manager December 2006 – August 2015
Sun & Ski Sports Austin, TX
•Led a team of 20+ employees
•Ran socia

The metadata is a dictionary that records where the text came from, such as the source file path and the page number within that PDF.

In [22]:
# Print a heading followed by the metadata dictionary of the first document.
# sep="\n" puts the dictionary on its own line, exactly as two separate prints would.
print("Document metadata 0: \n", docs[0].metadata, sep="\n")

Document metadata 0: 

{'source': 'CVs\\Charles Rambo CV.pdf', 'page': 0}


________________

# Chunk our data up into smaller documents

In [23]:
# Split the loaded documents into smaller chunks: chunk_size=2000 with no
# overlap between consecutive chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=0,
)
texts = text_splitter.split_documents(docs)

In [24]:
# Report how many chunks the splitter produced.
print(f"Now we have {len(texts)} documents")

Now we have 6 documents


________________

# Create embeddings of our documents to get ready for similarity search

In [25]:
# API keys for Pinecone and OpenAI.
# Secrets must never be stored in the notebook itself: they are read from
# environment variables set outside of it (e.g. in the shell that launches
# Jupyter). A missing variable raises KeyError naming that variable.
# NOTE: keys that were previously hard-coded in this cell must be considered
# exposed and should be revoked and regenerated.
PINECONE_API_KEY = os.environ["PINECONE_API_KEY"]
OPEN_AI_KEY = os.environ["OPEN_AI_KEY"]

In [27]:
# Initialize OpenAI embeddings with the OPEN_AI_KEY variable defined in the
# API-key cell (the same variable the LLM cell uses), so this cell does not
# depend on a separate lookup that fails when the environment is not set up.
embeddings = OpenAIEmbeddings(openai_api_key=OPEN_AI_KEY)

In [28]:
# Initialize the Pinecone client.
pinecone.init(
    api_key=PINECONE_API_KEY,
    environment="gcp-starter",
)

index_name = "langchaintest"

# Vector size of the index. It must equal the size of the vectors produced by
# `embeddings` (1536 for OpenAI's text-embedding-ada-002 — confirm if the
# embedding model is ever changed).
EMBEDDING_DIMENSION = 1536

# Create the index only when it does not exist yet, so re-running this cell
# does not attempt to create it twice.
if index_name not in pinecone.list_indexes():
    pinecone.create_index(index_name, dimension=EMBEDDING_DIMENSION)

# Handle to the `index_name` index for direct Pinecone operations.
index = pinecone.Index(index_name)

In [32]:
# Embed every chunk and store it in the Pinecone index.
# from_documents keeps each chunk's metadata (source file, page) next to its
# text, so search hits can be traced back to the CV they came from; passing
# only page_content would discard that information.
# NOTE(review): each run of this cell stores the chunks again — presumably
# under fresh ids, which would duplicate vectors; verify before re-running.
docsearch = Pinecone.from_documents(texts, embeddings, index_name=index_name)

_____________________________

In [45]:
# Build the question-answering chain: an OpenAI LLM created with temperature=0
# is wrapped in a QA chain of type "stuff".
llm = OpenAI(
    openai_api_key=OPEN_AI_KEY,
    temperature=0,
)
chain = load_qa_chain(llm, chain_type="stuff")

____________

# Query those docs to get the answer back

In [44]:
query = "Cuales son las habilidades de james bond?"
# Keep the search hits under their own name: reusing `docs` here would
# overwrite the PDFs loaded at the top of the notebook and make a later re-run
# of the chunking cell split the search hits instead of the CVs.
relevant_docs = docsearch.similarity_search(query)
chain.run(input_documents=relevant_docs, question=query)

' Las habilidades de James Bond incluyen técnicas de espionaje, vigilancia, entrenamiento de combate, infiltración, resolución de problemas, comunicación efectiva, adaptabilidad y trabajo en equipo. También habla inglés fluido, francés de nivel intermedio y ruso básico.'

In [41]:
query = "Hazme un resumen de James Bond"
# Keep the search hits under their own name: reusing `docs` here would
# overwrite the PDFs loaded at the top of the notebook and make a later re-run
# of the chunking cell split the search hits instead of the CVs.
relevant_docs = docsearch.similarity_search(query)
chain.run(input_documents=relevant_docs, question=query)

' James Bond es un agente secreto altamente calificado con licencia para matar. Tiene una amplia experiencia en operaciones de inteligencia encubiertas, infiltración en entornos de alto riesgo, uso de técnicas y gadgets de espionaje avanzados, vigilancia, análisis y prevención de amenazas. Está capacitado en técnicas de espionaje avanzadas, entrenamiento en combate, manejo de armas de fuego y habilidades tácticas. Habla inglés fluido, francés de nivel intermedio y ruso básico.'

In [42]:
query = "Que candidatos se desenvuelve mejor en el idioma ingles"
# Keep the search hits under their own name: reusing `docs` here would
# overwrite the PDFs loaded at the top of the notebook and make a later re-run
# of the chunking cell split the search hits instead of the CVs.
relevant_docs = docsearch.similarity_search(query)
chain.run(input_documents=relevant_docs, question=query)

' James Bond is fluent in English.'

In [43]:
query = "Si quiero contratar a un programador ¿Cual seria el mejor candidato"
# Keep the search hits under their own name: reusing `docs` here would
# overwrite the PDFs loaded at the top of the notebook and make a later re-run
# of the chunking cell split the search hits instead of the CVs.
relevant_docs = docsearch.similarity_search(query)
chain.run(input_documents=relevant_docs, question=query)

' El mejor candidato para un programador sería Jan Küster. Tiene un M.Sc. en Digital Media, es un ingeniero Fullstack JS, y tiene experiencia en proyectos de gestión de proyectos, desarrollo de software y consultoría. Sus habilidades técnicas incluyen Meteor, Javascript, Bootstrap, Mongodb, Git, Webstorm y Sourcetree.'