In [46]:
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone as PineconeStore
from langchain.document_loaders import PyPDFLoader,DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.llms import ctransformers
from langchain.llms import CTransformers
import pinecone


In [23]:
PINECONE_API_KEY = "3b6967cd-0235-4073-9d75-31e3ff580e38"

In [24]:
#extracting daata from the pdf
def load_pdf(data):
    loader = DirectoryLoader(data,glob="*.pdf",loader_cls=PyPDFLoader)

    documents = loader.load()

    return documents

In [25]:
extracted_data = load_pdf("data/")

In [26]:
# creating data Chunks

def text_split(extracted_data):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size = 40,chunk_overlap = 15)
    text_chunks = text_splitter.split_documents(extracted_data)

    return text_chunks 

In [27]:
text_chunks = text_split(extracted_data)
print("length of my chunk:", len(text_chunks))

length of my chunk: 3125


In [9]:
#downloading embedding model 
def download_hugging_face_embedings():
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    return embeddings

In [10]:
embedding = download_hugging_face_embedings()

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [11]:
embedding

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [12]:
query_result = embedding.embed_query("Hello World")
print("Length", len(query_result))

Length 384


In [13]:
query_result

[-0.034477222710847855,
 0.031023189425468445,
 0.00673493230715394,
 0.02610897831618786,
 -0.03936203941702843,
 -0.16030248999595642,
 0.06692398339509964,
 -0.006441492587327957,
 -0.04745054617524147,
 0.01475894171744585,
 0.07087540626525879,
 0.05552757903933525,
 0.019193286076188087,
 -0.026251375675201416,
 -0.010109512135386467,
 -0.026940539479255676,
 0.022307513281702995,
 -0.022226620465517044,
 -0.14969266951084137,
 -0.017493048682808876,
 0.00767626753076911,
 0.0543522946536541,
 0.003254437353461981,
 0.03172598034143448,
 -0.0846213847398758,
 -0.029406018555164337,
 0.05159567669034004,
 0.048124030232429504,
 -0.0033148014917969704,
 -0.05827920511364937,
 0.04196932911872864,
 0.022210698574781418,
 0.12818889319896698,
 -0.022338908165693283,
 -0.011656295508146286,
 0.06292832642793655,
 -0.03287626430392265,
 -0.09122607111930847,
 -0.03117542713880539,
 0.052699606865644455,
 0.0470348484814167,
 -0.08420299738645554,
 -0.030056146904826164,
 -0.02074474096

In [28]:
text_chunks

[Document(page_content='About TSEC', metadata={'source': 'data\\About TSEC.pdf', 'page': 0}),
 Document(page_content='Thadomal Shahani  Engineering College -', metadata={'source': 'data\\About TSEC.pdf', 'page': 0}),
 Document(page_content='College - MUMBAI > Governing Council &', metadata={'source': 'data\\About TSEC.pdf', 'page': 0}),
 Document(page_content='Council & Trust', metadata={'source': 'data\\About TSEC.pdf', 'page': 0}),
 Document(page_content='The Hyderabad (Sind) National', metadata={'source': 'data\\About TSEC.pdf', 'page': 0}),
 Document(page_content='National Collegiate Board well  known', metadata={'source': 'data\\About TSEC.pdf', 'page': 0}),
 Document(page_content='well  known as HSNC board is a', metadata={'source': 'data\\About TSEC.pdf', 'page': 0}),
 Document(page_content='board is a charitable trust', metadata={'source': 'data\\About TSEC.pdf', 'page': 0}),
 Document(page_content='established by Sindhi Community in', metadata={'source': 'data\\About TSEC.pdf'

In [29]:
pinecone_instance = pinecone.Pinecone(api_key=PINECONE_API_KEY)

In [30]:
index_name = "mprbu"

In [31]:
import os 
os.environ['PINECONE_API_KEY'] = '3b6967cd-0235-4073-9d75-31e3ff580e38'

In [32]:
docsearch = PineconeStore.from_texts(
    [t.page_content for t in text_chunks],
    embedding,
    index_name=index_name
)

In [35]:
#if we already have an index we can load it like this
#docsearch=PineconeStore.from_existing_index(index_name,embeddings)

query = "I want Information about Archana Patankar"

docs=docsearch.similarity_search(query,k=3)
print("Result",docs)

Result [Document(page_content='2 Dr. Archana Patankar  Ph.D. (Engg.)'), Document(page_content='7 Prof. Anand Patwardhan'), Document(page_content='7 82 Aaditya Chinchkhedkar Informa tion')]


In [36]:
prompt_template = """
Use the following pieces of information to answer the user's question.
if you don't know the answer, just say that you don't know , don't try to make up answer.

Context:{context}
Question:{question}

Only return the helpful answer below and nothing else.
Helpful answer:
"""

In [39]:
PROMPT = PromptTemplate(input_variables=["context", "question"], template=prompt_template)
chain_type_kwargs = {"prompt": PROMPT}


In [47]:
llm = CTransformers(model="model\llama-2-7b-chat.ggmlv3.q4_0.bin",
model_type = "llama",config={'max_new_tokens':512,'temperature':0.8})

In [50]:
qa = RetrievalQA.from_chain_type(
    llm = llm,
    chain_type = "stuff",
    retriever = docsearch.as_retriever(search_kwargs={'k':2}),
    chain_type_kwargs=chain_type_kwargs
)

In [51]:
while True:
    user_input = input(f"Input Prompt: ")
    result = qa({"query": user_input})
    print("Response : ",result["result"])

  warn_deprecated(


Response :  Dr. Archana Patankar is a highly experienced professor in the field of engineering with over 10 years of experience in teaching and research. She has published several papers in reputed journals and has presented her work at various conferences. Her areas of expertise include signal processing, image processing, and machine learning.
Response :  The number of computers available for Computer Engineering students at XYZ University cannot be determined based on the information provided. The number of computers available varies depending on various factors such as the size of the university, the number of students enrolled, and the availability of resources. Additionally, it is important to note that computer engineering students typically have access to a variety of computing resources, including both hardware and software, as part of their academic program.
