In [52]:
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone as PineconeStore
import pinecone
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.llms import CTransformers
from pydantic import BaseModel


In [53]:
PINECONE_API_KEY = "pcsk_7S39mV_JdpZCGQWN7rN2P5VUUYrUWBVHQqXMsqUjwDxq1bxYbdp9CikVvTAMqdyNk7w63F"
PINECONE_API_ENV = "us-east-1"


In [57]:
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import PyPDFLoader

# Define the function to load and extract PDF data
def load_pdf(data_path):
    loader = DirectoryLoader(data_path,
                             glob="*.pdf",
                             loader_cls=PyPDFLoader)
    
    documents = loader.load()
    
    return documents


In [58]:
# Call the function with the directory path where PDFs are stored
extracted_data = load_pdf("data/")

# Print the extracted data to check the results
print(extracted_data)


[Document(metadata={'source': 'data\\tww.pdf', 'page': 0}, page_content='Q: What is TechWithWarriors?\nA: TechWithWarriors is a tech company that offers internships and certifications in AI, ML, DL, NLP,\nand digital marketing.\nQ: Where is TechWithWarriors based?\nA: TechWithWarriors is an online-based company, providing services globally.\nQ: How can I apply for an internship at TechWithWarriors?\nA: You can apply for an internship through our LinkedIn page or contact us via email.\nQ: What technologies does TechWithWarriors focus on?\nA: TechWithWarriors focuses on AI, Machine Learning, Deep Learning, Natural Language\nProcessing, and Digital Marketing.\nQ: What is the duration of an internship at TechWithWarriors?\nA: The duration of an internship at TechWithWarriors varies, but it typically lasts 3-6 months.\nQ: Is the internship paid?\nA: Currently, internships at TechWithWarriors are unpaid, but we offer a certificate upon completion.\nQ: Do I need prior experience to apply?\nA:

In [59]:
#extracted_data

In [60]:
#Create text chunks
def text_split(extracted_data):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size = 500, chunk_overlap = 20)
    text_chunks = text_splitter.split_documents(extracted_data)

    return text_chunks

In [61]:
text_chunks = text_split(extracted_data)
print("length of my chunk:", len(text_chunks))

length of my chunk: 11


In [62]:
#text_chunks

In [78]:
#download embedding model
def download_hugging_face_embeddings():
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    return embeddings

In [79]:
embeddings = download_hugging_face_embeddings()


In [80]:
embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [81]:

import os
from pinecone.grpc import PineconeGRPC as Pinecone

os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))


In [82]:
!pip install protoc_gen_openapiv2



In [83]:
index_name="support"

In [85]:
# Embed each chunk and upsert the embeddings into your Pinecone index.
from langchain_pinecone import PineconeVectorStore

docsearch = PineconeVectorStore.from_documents(
    documents=text_chunks,
    index_name=index_name,
    embedding=embeddings, 
)

In [86]:
import pinecone
from langchain.vectorstores import Pinecone


In [87]:
#If we already have an index we can load it like this
docsearch=Pinecone.from_existing_index(index_name, embeddings)



In [73]:
query = "how much long internship?"

docs=docsearch.similarity_search(query, k=1)

print("Result", docs)

Result []


In [74]:
prompt_template="""
Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Question: {question}
Answer: {answer}

Only return the helpful answer below and nothing else.
Helpful answer:
"""

In [75]:
PROMPT=PromptTemplate(template=prompt_template, input_variables=["Question ", "Answer"])
chain_type_kwargs={"prompt": PROMPT}

In [76]:
llm=CTransformers(model="model/llama-2-7b-chat.ggmlv3.q4_0.bin",
                  model_type="llama",
                  config={'max_new_tokens':512,
                          'temperature':0.3})

In [77]:
# Chain Type Arguments - including 'document_variable_name' to specify the variable name for the context
chain_type_kwargs = {
    "document_variable_name": "context",  # Ensuring the context variable name is properly set
}

# Initialize the RetrievalQA chain with the correct setup
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=docsearch.as_retriever(search_kwargs={'k': 2}),
    return_source_documents=True,
    chain_type_kwargs=chain_type_kwargs  # Pass the correct configuration for context
)

In [51]:
while True:
    user_input=input(f"Input Prompt:")
    result=qa({"query": user_input})
    print("Response : ", result["result"])

  result=qa({"query": user_input})


Response :   The duration of an internship at TechWithWarriors typically lasts 3-6 months.
Response :   You can join the WhatsApp group by sending a message to the admin with your name and email address.
Response :   You will be added to the WhatsApp group after you have completed the application process and been accepted into the internship program.
Response :   What is the duration of the Digital Marketing internship offered by TechWithWarriors?


KeyboardInterrupt: 