#**Step 1: Install All the Required Packages**

In [52]:
!pip -q install langchain
!pip -q install bitsandbytes accelerate xformers einops
!pip -q install datasets loralib sentencepiece
!pip -q install pypdf

!pip -q install sentence_transformers

In [53]:
!pip install chromadb

In [54]:
!pip install openai
!pip install tiktoken

#**Step 2: Import All the Required Libraries**

In [55]:
from langchain.document_loaders import PyPDFLoader

In [56]:
from langchain.document_loaders import TextLoader

In [57]:
from langchain.document_loaders import Docx2txtLoader


In [58]:
from langchain.text_splitter import CharacterTextSplitter

In [59]:
from langchain.embeddings import HuggingFaceEmbeddings

In [60]:
from langchain.vectorstores import Chroma

In [61]:
from huggingface_hub import notebook_login

In [62]:
import torch
import transformers

In [63]:
from transformers import AutoTokenizer, AutoModelForCausalLM

In [64]:
from transformers import pipeline

In [65]:
from langchain import HuggingFacePipeline

In [66]:
from langchain.chains import ConversationalRetrievalChain

In [67]:
from langchain.memory import ConversationBufferMemory

In [68]:
from langchain.embeddings.openai import OpenAIEmbeddings

In [69]:
from langchain.chat_models import ChatOpenAI


In [70]:
import os

In [71]:
import sys

#**Step 3: Load the Documents and Extract Text From Them**

In [72]:
!mkdir docs

In [81]:
document=[]
for file in os.listdir("docs"):
  if file.endswith(".pdf"):
    pdf_path="./docs/"+file
    loader=PyPDFLoader(pdf_path)
    document.extend(loader.load())
  elif file.endswith('.docx') or file.endswith('.doc'):
    doc_path="./docs/"+file
    loader=Docx2txtLoader(doc_path)
    document.extend(loader.load())
  elif file.endswith('.txt'):
    text_path="./docs/"+file
    loader=TextLoader(text_path)
    document.extend(loader.load())

In [82]:
document

[Document(page_content=" Timestamp     Time Day period Day of Week  Month Day     Month  Dew Point Process Dew Point  Contactor Pressure Process Contactor Pressure  Natural Gas Moisture Process Natural Gas Moisture  Contactor Temperature Process Contactor Temperature  Glycol Moisture Process Glycol Moisture  Water Inlet Temperature Process Water Inlet Temperature  Glycol Inlet Temperature Process Glycol Inlet Temperature  Out Glycol Temperature Process Out Glycol Temperature  Temperature Process Temperature  Out Water Temperature Process Out Water Temperature  Stripping Gas Process Stripping Gas  Pressure Process Pressure  Dry Glycol Process Dry Glycol  Glycol Flow Process Glycol Flow\n01-09-2023 00:00:00      Night      Friday          1 September  -3.075649   Don't Criticize          172.956649                     Normal                  3.45                     Critical                   42.2                           Low             0.80               Efficient                    3

In [83]:
len(document)

1

#**Step 4: Split the Document into Chunks**

In [84]:
document_splitter=CharacterTextSplitter(separator='\n', chunk_size=500, chunk_overlap=100)

In [85]:
document_chunks=document_splitter.split_documents(document)



In [86]:
len(document_chunks)

101

In [87]:
document_chunks[0]

Document(page_content='Timestamp     Time Day period Day of Week  Month Day     Month  Dew Point Process Dew Point  Contactor Pressure Process Contactor Pressure  Natural Gas Moisture Process Natural Gas Moisture  Contactor Temperature Process Contactor Temperature  Glycol Moisture Process Glycol Moisture  Water Inlet Temperature Process Water Inlet Temperature  Glycol Inlet Temperature Process Glycol Inlet Temperature  Out Glycol Temperature Process Out Glycol Temperature  Temperature Process Temperature  Out Water Temperature Process Out Water Temperature  Stripping Gas Process Stripping Gas  Pressure Process Pressure  Dry Glycol Process Dry Glycol  Glycol Flow Process Glycol Flow', metadata={'source': './docs/df_100.txt'})

In [88]:
document_chunks[1]

Document(page_content="01-09-2023 00:00:00      Night      Friday          1 September  -3.075649   Don't Criticize          172.956649                     Normal                  3.45                     Critical                   42.2                           Low             0.80               Efficient                    34.78                          Normal                     52.05                              Low                   47.64                           Good   175.224011            Not Keep              43.727277                           Bad      96.216138                Normal 29.568994  Normal Pressure        1.63           Worrying     1.408146             Changed", metadata={'source': './docs/df_100.txt'})

#**Step 5: Download the Embeddings from Hugging Face, Download the Sentence Transformer Embeddings**

In [89]:
embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')


In [90]:
os.environ["OPENAI_API_KEY"]="sk-k"

In [91]:
embeddings = OpenAIEmbeddings()

In [92]:
embeddings

OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x7b87eebb2f50>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x7b87eeb0d420>, model='text-embedding-ada-002', deployment='text-embedding-ada-002', openai_api_version='', openai_api_base=None, openai_api_type='', openai_proxy='', embedding_ctx_length=8191, openai_api_key='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', openai_organization=None, allowed_special=set(), disallowed_special='all', chunk_size=1000, max_retries=2, request_timeout=None, headers=None, tiktoken_enabled=True, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, retry_min_seconds=4, retry_max_seconds=20, http_client=None)

#**Step 6: Setting Up Chroma as our Vector Database**

Converting the Document Chunks into Embedding and save them to the vector store

In [93]:
vectordb=Chroma.from_documents(document_chunks,embedding=embeddings, persist_directory='./data')

In [94]:
vectordb.persist()

#**Step 7: Login into Hugging Face Account to Download the Model**

In [98]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

#**Step 8: Download the Llama 2 7B Chat Model**

In [99]:
!pip install accelerate
!pip install bitsandbytes

In [100]:
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf",
                                          use_auth_token=True,)


model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf",
                                             device_map='auto',
                                             torch_dtype=torch.float16,
                                             use_auth_token=True,
                                              #load_in_8bit=True,
                                              load_in_4bit=True
                                             )



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



#**Step 9: Creating a Hugging Face Pipeline**

In [101]:
pipe=pipeline("text-generation",
              model=model,
              tokenizer=tokenizer,
              torch_dtype=torch.bfloat16,
              device_map='auto',
              max_new_tokens=512,
              min_new_tokens=-1,
              top_k=30

              )

In [102]:
llm=HuggingFacePipeline(pipeline=pipe, model_kwargs={'temperature':0})

In [103]:
llm=ChatOpenAI(temperature=0.7, model_name='gpt-3.5-turbo')

In [104]:
llm

ChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x7b87e428af20>, async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x7b87eebb1780>, openai_api_key='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', openai_proxy='')

#**Step 10: Creating a memory object which is necessary to track inputs/outputs and hold a conversation**

In [105]:
memory=ConversationBufferMemory(memory_key='chat_history', return_messages=True)

#**Step 11: Creating a Conversation Retrieval QA Chain**

The ConversationalRetrievalQA chain builds on RetrievalQAChain to provide a chat history component.



In [106]:
#Create our Q/A Chain
pdf_qa=ConversationalRetrievalChain.from_llm(llm=llm,
                                             retriever=vectordb.as_retriever(search_kwargs={'k':6}),
                                             verbose=False, memory=memory)

In [107]:
result=pdf_qa({"question":"Give me the complete report of the incidents on 01-09-2023?"})

In [108]:
result['answer']

"On 01-09-2023, there were two incidents recorded in the report. Here are the details:\n\n1. Incident at 00:00:00:\n   - Time: 00:00:00\n   - Day: Friday\n   - Date: 1 September\n   - Value 1: -3.075649\n   - Value 2: Don't Criticize\n   - Value 3: 172.956649\n   - Value 4: Normal\n   - Value 5: 3.45\n   - Value 6: Critical\n   - Value 7: 42.2\n   - Value 8: Low\n   - Value 9: 0.80\n   - Value 10: Efficient\n   - Value 11: 34.78\n   - Value 12: Normal\n   - Value 13: 52.05\n   - Value 14: Low\n   - Value 15: 47.64\n   - Value 16: Good\n   - Value 17: 175.224011\n   - Value 18: Not Keep\n   - Value 19: 43.727277\n   - Value 20: Bad\n   - Value 21: 96.216138\n   - Value 22: Normal\n   - Value 23: 29.568994\n   - Value 24: Normal Pressure\n   - Value 25: 1.63\n   - Value 26: Worrying\n   - Value 27: 1.408146\n   - Value 28: Changed\n\n2. Incident at 00:46:00:\n   - Time: 00:46:00\n   - Day: Friday\n   - Date: 1 September\n   - Value 1: -7.789073\n   - Value 2: Don't Criticize\n   - Value 

In [111]:
#Create our Q/A Chain
pdf_qa=ConversationalRetrievalChain.from_llm(llm=llm,
                                             retriever=vectordb.as_retriever(search_kwargs={'k':6}),
                                             verbose=False, memory=memory)

result=pdf_qa({"question":"Report Dew Point in 11-01-2023?"})
result['answer']

'The Dew Point on 11-01-2023 was 33.85637.'

In [112]:
#Create our Q/A Chain
pdf_qa=ConversationalRetrievalChain.from_llm(llm=llm,
                                             retriever=vectordb.as_retriever(search_kwargs={'k':6}),
                                             verbose=False, memory=memory)

result=pdf_qa({"question":" Report Dew Point of the incidents on 01-09-2023"})
result['answer']

'In the incidents recorded on 01-09-2023, the Dew Point was -7.789073 degrees.'

In [113]:
#Create our Q/A Chain
pdf_qa=ConversationalRetrievalChain.from_llm(llm=llm,
                                             retriever=vectordb.as_retriever(search_kwargs={'k':6}),
                                             verbose=False, memory=memory)

result=pdf_qa({"question":" Report Natural Gas Moisture of the incidents on 01-09-2023"})
result['answer']

'The Natural Gas Moisture in the incidents recorded on 01-09-2023 was 0.88.'

In [116]:
#Create our Q/A Chain
pdf_qa=ConversationalRetrievalChain.from_llm(llm=llm,
                                             retriever=vectordb.as_retriever(search_kwargs={'k':6}),
                                             verbose=False, memory=memory)

result=pdf_qa({"question":" Report Process Natural Gas Moisture of the incidents on 01-09-2023"})
result['answer']

'The Process Natural Gas Moisture in the incident recorded on 01-09-2023 was 51.26.'

In [117]:
#Create our Q/A Chain
pdf_qa=ConversationalRetrievalChain.from_llm(llm=llm,
                                             retriever=vectordb.as_retriever(search_kwargs={'k':6}),
                                             verbose=False, memory=memory)

result=pdf_qa({"question":"Tell me what happened with Dew Point in 01-09-2023?"})
result['answer']

'On 01-09-2023, the Dew Point was 48.94, which is categorized as "Good."'

In [118]:
#Create our Q/A Chain
pdf_qa=ConversationalRetrievalChain.from_llm(llm=llm,
                                             retriever=vectordb.as_retriever(search_kwargs={'k':6}),
                                             verbose=False, memory=memory)

result=pdf_qa({"question":"According to our database, how was the Dew Point in 01-09-2023?"})
result['answer']

'According to the data in our database, the Dew Point on 01-09-2023 was 48.94.'

In [121]:
#Create our Q/A Chain
pdf_qa=ConversationalRetrievalChain.from_llm(llm=llm,
                                             retriever=vectordb.as_retriever(search_kwargs={'k':6}),
                                             verbose=False, memory=memory)

result=pdf_qa({"question":"According to our database, how was the Process Dew Point in 01-09-2023?"})
result['answer']

'According to our database, the Process Dew Point on 01-09-2023 was -7.789073.'

In [122]:
#Create our Q/A Chain
pdf_qa=ConversationalRetrievalChain.from_llm(llm=llm,
                                             retriever=vectordb.as_retriever(search_kwargs={'k':6}),
                                             verbose=False, memory=memory)

result=pdf_qa({"question":"Give me the Process Dew Point report of the incidents on 01-09-2023?"})
result['answer']

'The Process Dew Point in the incident recorded on 01-09-2023 was -7.789073.'