In [23]:
import langchain
from pinecone import Pinecone , ServerlessSpec
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone as langchainpinecode
from langchain.llms import huggingface_pipeline
import os

In [2]:
def read_doc(directory):
    """Load the PDF files found in ``directory``.

    Args:
        directory: Folder path handed to ``PyPDFDirectoryLoader``.

    Returns:
        The result of the loader's ``load()`` call; in this notebook that is a
        list of ``Document`` objects carrying ``source`` and ``page`` metadata.
    """
    return PyPDFDirectoryLoader(directory).load()


In [None]:

# Load the PDFs stored under documents/ and display what was read.
doc = read_doc(directory="documents/")
doc

[Document(metadata={'source': 'documents\\Pavankumar_Patil_Resume.pdf', 'page': 0}, page_content='Bengaluru 560040\n6360670648\npatilpavan7263@gmail.com\nhttps://github.com/patilpavan5656\nhttps://www.linkedin.com/in/pavan-patil-30b579214/\nPAVANKUMARPATIL\nI seek challenging opportunities where I can fully utilize my skills to contribute to the success of the organization.CAREER OBJECTIVE\nSOFTWARE ENGINEERING INTERN, 02/2023 - 07/2023\nCodeCraft Technologies Pvt Ltd, Bengaluru\nEXPERIENCE\nImplemented new features into existing software systems using HTML, CSS, and JavaScript.•\nUtilized version control systems like Git and GitHub to manage source code changes during development cycles.•\nAnalyzed code and corrected errors to optimize output.•\nCollaborated with other developers on coding projects in an Agile environment.•\n1. WIFI Handoff using NS2 simulator\n2. Implementation of Smart Home Automation System\n3. HetNet Simulation in MATLAB\n4.\xa0Client and server communication usin

In [6]:
def chunk_data(docs, chunk_size=512, chunk_overlap=50):
    """Split documents into smaller pieces for embedding.

    Args:
        docs: Documents to split (passed straight to ``split_documents``).
        chunk_size: ``chunk_size`` forwarded to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` forwarded to the same splitter.

    Returns:
        The value returned by the splitter's ``split_documents(docs)``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(docs)



In [7]:
# Split the loaded pages into chunks using chunk_data's default size/overlap.
documents=chunk_data(docs=doc)

In [8]:
# Display the chunked documents for inspection.
documents

[Document(metadata={'source': 'documents\\Pavankumar_Patil_Resume.pdf', 'page': 0}, page_content='Bengaluru 560040\n6360670648\npatilpavan7263@gmail.com\nhttps://github.com/patilpavan5656\nhttps://www.linkedin.com/in/pavan-patil-30b579214/\nPAVANKUMARPATIL\nI seek challenging opportunities where I can fully utilize my skills to contribute to the success of the organization.CAREER OBJECTIVE\nSOFTWARE ENGINEERING INTERN, 02/2023 - 07/2023\nCodeCraft Technologies Pvt Ltd, Bengaluru\nEXPERIENCE\nImplemented new features into existing software systems using HTML, CSS, and JavaScript.•'),
 Document(metadata={'source': 'documents\\Pavankumar_Patil_Resume.pdf', 'page': 0}, page_content='Utilized version control systems like Git and GitHub to manage source code changes during development cycles.•\nAnalyzed code and corrected errors to optimize output.•\nCollaborated with other developers on coding projects in an Agile environment.•\n1. WIFI Handoff using NS2 simulator\n2. Implementation of Smar

In [11]:
# Embedding model: BAAI/bge-base-en-v1.5, loaded through LangChain's
# HuggingFaceEmbeddings wrapper.
embedding = HuggingFaceEmbeddings(
    model_name="BAAI/bge-base-en-v1.5",
)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [14]:
# Sanity check: embed a sample sentence and look at the vector length,
# which is the dimension the vector index has to be created with.
vector=embedding.embed_query("hello, how are you?")
len(vector)

768

In [None]:
# The Pinecone API key is read from a local `keys` module instead of being
# written inline in the notebook.
# NOTE(review): presumably keys.py is excluded from version control — confirm.
from keys import piconapikey
pc = Pinecone(api_key=piconapikey)

In [22]:
import time

index_name = "resumevectore"  # change if desired

# The index dimension must equal the length of the vectors being stored.
# BAAI/bge-base-en-v1.5 produces 768-dimensional vectors (see the
# `len(vector)` check earlier in this notebook); an index created with any
# other dimension rejects every upsert coming from `embedding`.
EMBEDDING_DIMENSION = 768

existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]

# Only create the index when it is missing, so re-running this cell is safe.
# NOTE: an index that was already created with a different dimension is NOT
# fixed by this guard — delete it in Pinecone first, then re-run this cell.
if index_name not in existing_indexes:
    pc.create_index(
        name=index_name,
        dimension=EMBEDDING_DIMENSION,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    # Poll until Pinecone reports the new index as ready before using it.
    while not pc.describe_index(index_name).status["ready"]:
        time.sleep(1)

index = pc.Index(index_name)

In [24]:
# Display the index handle returned by pc.Index.
index

<pinecone.data.index.Index at 0x1ab8d053310>

In [25]:
from langchain_pinecone import PineconeVectorStore

# LangChain vector store built on the Pinecone index handle, using
# `embedding` as its embedding model.
vector_store = PineconeVectorStore(
    embedding=embedding,
    index=index,
)

In [None]:
# Store the chunks in the vector store; the call returns the ids of the
# stored records (shown in the output below).
# NOTE(review): re-running this cell presumably inserts the same chunks again
# under new ids — confirm before re-executing.
indexs=vector_store.add_documents(documents)

['fe20e324-5484-408b-8f28-eb9197460949',
 '2e75b8cc-c30e-4f68-83bf-bb2b103fd2da',
 '5a1f5bdf-70b6-4734-a7d1-6a371def4eeb',
 'c8f3b80b-eef9-42dc-8039-6c6051470e46',
 '3cc94931-b7b2-4682-adfa-a97fe8ecc965',
 'de9ea501-0812-409e-a196-251fcbbb3621',
 '4cff1064-2a32-4f8b-89fa-2b763bc8d979',
 '1b62a79d-d067-43fe-92ae-aaf268f28beb']

In [28]:
# Record ids kept as a literal; they match the ids printed by the
# add_documents cell.
# NOTE(review): presumably pasted so the upload cell need not be re-run — confirm.
indexs = ['fe20e324-5484-408b-8f28-eb9197460949',
 '2e75b8cc-c30e-4f68-83bf-bb2b103fd2da',
 '5a1f5bdf-70b6-4734-a7d1-6a371def4eeb',
 'c8f3b80b-eef9-42dc-8039-6c6051470e46',
 '3cc94931-b7b2-4682-adfa-a97fe8ecc965',
 'de9ea501-0812-409e-a196-251fcbbb3621',
 '4cff1064-2a32-4f8b-89fa-2b763bc8d979',
 '1b62a79d-d067-43fe-92ae-aaf268f28beb']

In [29]:
def retrieve_query(query, k = 2):
    """Return the ``k`` stored chunks most similar to ``query``.

    The search goes through ``vector_store`` (the LangChain
    ``PineconeVectorStore``), not the raw Pinecone ``index`` handle: the raw
    handle has no ``similarity_search`` method, so calling it there raises
    ``AttributeError``.

    Args:
        query: Natural-language query text.
        k: Number of matches to return (default 2).

    Returns:
        The result of ``vector_store.similarity_search(query, k=k)``.
    """
    return vector_store.similarity_search(query, k=k)

In [30]:
from langchain.chains.question_answering import load_qa_chain


In [None]:
# `huggingface_pipeline` (imported at the top of the notebook) is a module, so
# calling it directly raises "TypeError: 'module' object is not callable".
# The LLM wrapper is the `HuggingFacePipeline` class inside that module;
# `from_model_id` builds it from a model id and a pipeline task.
# NOTE(review): the model id below is a small placeholder choice — swap in the
# model you actually want to answer with.
llm = huggingface_pipeline.HuggingFacePipeline.from_model_id(
    model_id="google/flan-t5-base",
    task="text2text-generation",
    pipeline_kwargs={"max_new_tokens": 256},
)
# "stuff" places all retrieved documents into a single prompt.
chains = load_qa_chain(llm, chain_type="stuff")

In [None]:
def retrieve_answer(query):
    """Answer ``query`` using chunks retrieved from the vector store.

    Looks up matching chunks with ``retrieve_query``, prints them, and then
    runs the question-answering chain ``chains`` over those chunks.

    Args:
        query: The question to answer.

    Returns:
        The value returned by ``chains.run`` for the retrieved chunks and
        ``query``.
    """
    context_docs = retrieve_query(query=query)
    # Print the retrieved chunks so the supporting context is visible.
    print(context_docs)
    return chains.run(input_documents=context_docs, question=query)


In [None]:
# Run one sample question through retrieval + QA and print the answer.
our_query = "how many questions can be make over the data"
answer = retrieve_answer(our_query)
print(answer)