In [38]:
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import CTransformers

import os
from dotenv import load_dotenv

In [39]:
# Let python-dotenv populate the process environment before any lookups.
load_dotenv()

# Pinecone settings: the API key is looked up in the environment (None when
# the variable is not set); the environment name is fixed for this notebook.
pinecone_api_key = os.environ.get("PINECONE_API_KEY")
pinecone_api_env = "gcp-starter"

In [40]:
# Extract data from PDF
def load_pdf(data):
    """Load the PDF files found in a directory.

    Args:
        data: Directory to scan; forwarded to ``DirectoryLoader`` as ``path``.

    Returns:
        The result of ``DirectoryLoader.load()`` for files matching the
        ``*.pdf`` glob, each one read with ``PyPDFLoader``.
    """
    pdf_loader = DirectoryLoader(path=data, glob='*.pdf', loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [41]:
# Load the PDFs from the local data folder. Only the document count is
# displayed: echoing the whole list would flood the cell output with page text.
extracted_data = load_pdf('data/')
len(extracted_data)

In [42]:
# Create text chunks
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into overlapping text chunks.

    Args:
        extracted_data: Documents to split; passed to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: Forwarded to the splitter as ``chunk_size``. The default
            (500) is the value that was previously hard-coded.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``. The
            default (20) is the value that was previously hard-coded.

    Returns:
        The chunks produced by ``split_documents`` for ``extracted_data``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(documents=extracted_data)

In [43]:
# Chunk every loaded document, then show how many chunks were produced.
text_chunks = text_split(extracted_data)
len(text_chunks)
# text_chunks  # uncomment to inspect the chunks themselves

10484


In [44]:
# Embedding model
def download_embeddings_model():
    """Build the sentence-embedding model wrapper used by this notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``sentence-transformers/all-MiniLM-L6-v2`` model name.
    """
    return HuggingFaceEmbeddings(
        model_name='sentence-transformers/all-MiniLM-L6-v2'
    )



In [45]:
# Instantiate the embedding model once; the cells below reuse `embeddings`.
embeddings = download_embeddings_model()

  embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
  from .autonotebook import tqdm as notebook_tqdm


In [46]:
# Sanity check: embed a short string and print the length of the returned vector.
query_result = embeddings.embed_query("Hello world")
print("Length", len(query_result))

Length 384


In [47]:
# Display the raw embedding values returned for the test query.
query_result

[-0.03447728604078293,
 0.031023267656564713,
 0.006734997034072876,
 0.02610897831618786,
 -0.0393621064722538,
 -0.16030247509479523,
 0.066924087703228,
 -0.00644137617200613,
 -0.04745044559240341,
 0.014758842997252941,
 0.07087539881467819,
 0.05552750080823898,
 0.019193340092897415,
 -0.02625138871371746,
 -0.01010960340499878,
 -0.026940476149320602,
 0.022307490929961205,
 -0.022226663306355476,
 -0.149692565202713,
 -0.01749301701784134,
 0.007676353678107262,
 0.05435230955481529,
 0.003254495793953538,
 0.03172589838504791,
 -0.08462144434452057,
 -0.0294061116874218,
 0.05159566551446915,
 0.048124078661203384,
 -0.0033147772774100304,
 -0.058279260993003845,
 0.04196932166814804,
 0.022210663184523582,
 0.128188818693161,
 -0.022338958457112312,
 -0.011656227521598339,
 0.06292832642793655,
 -0.03287624567747116,
 -0.09122596681118011,
 -0.03117532841861248,
 0.05269956216216087,
 0.04703476279973984,
 -0.0842030718922615,
 -0.030056225135922432,
 -0.020744914188981056,
