In [1]:
import os
from getpass import getpass

os.environ["OPENAI_API_KEY"] = getpass("Paste your OpenAI API key here and hit enter:")

In [None]:
!pip install openai
!pip install chromadb

In [4]:
import os
import pathlib
import re

from langchain.docstore.document import Document
from langchain.document_loaders import TextLoader

from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS

In [None]:
!pip install pypdf
!pip install tiktoken

# PyPdfloader

In [None]:
from langchain.document_loaders import PyPDFLoader

loader = PyPDFLoader("cucm_b_troubleshooting-guide-1251-extracted (1).pdf")
pages = loader.load_and_split()

## Davinvi v1

In [None]:
import tiktoken
# create a GPT-4 encoder instance
enc = tiktoken.encoding_for_model("gpt-4")

total_word_count = sum(len(doc.page_content.split()) for doc in pages)
total_token_count = sum(len(enc.encode(doc.page_content)) for doc in pages)

print(f"\nTotal word count: {total_word_count}")
print(f"\nEstimated tokens: {total_token_count}")
print(f"\nEstimated cost of embedding: ${total_token_count * 0.2 / 1000}")

## Ada v2

In [None]:
total_word_count = sum(len(doc.page_content.split()) for doc in pages)
total_token_count = sum(len(enc.encode(doc.page_content)) for doc in pages)

print(f"\nTotal word count: {total_word_count}")
print(f"\nEstimated tokens: {total_token_count}")
print(f"\nEstimated cost of embedding: ${total_token_count * 0.0004  / 1000}")

## Ada v1

In [None]:
total_word_count = sum(len(doc.page_content.split()) for doc in pages)
total_token_count = sum(len(enc.encode(doc.page_content)) for doc in pages)

print(f"\nTotal word count: {total_word_count}")
print(f"\nEstimated tokens: {total_token_count}")
print(f"\nEstimated cost of embedding: ${total_token_count * 0.0040  / 1000}")

## Babbage v1

In [None]:
total_word_count = sum(len(doc.page_content.split()) for doc in pages)
total_token_count = sum(len(enc.encode(doc.page_content)) for doc in pages)

print(f"\nTotal word count: {total_word_count}")
print(f"\nEstimated tokens: {total_token_count}")
print(f"\nEstimated cost of embedding: ${total_token_count * 0.0050  / 1000}")

## Curie v1

In [None]:
total_word_count = sum(len(doc.page_content.split()) for doc in pages)
total_token_count = sum(len(enc.encode(doc.page_content)) for doc in pages)

print(f"\nTotal word count: {total_word_count}")
print(f"\nEstimated tokens: {total_token_count}")
print(f"\nEstimated cost of embedding: ${total_token_count * 0.020  / 1000}")

# PyMuPDF

In [None]:
!pip install PyMuPDF

In [None]:
from langchain.document_loaders import PyMuPDFLoader

loader = PyMuPDFLoader("cucm_b_troubleshooting-guide-1251-extracted (1).pdf")
pages = loader.load_and_split()

In [None]:
total_word_count = sum(len(doc.page_content.split()) for doc in pages)
total_token_count = sum(len(enc.encode(doc.page_content)) for doc in pages)

print(f"\nTotal word count: {total_word_count}")
print(f"\nEstimated tokens: {total_token_count}")


In [None]:
pages

In [3]:
from langchain.vectorstores import Chroma
from langchain.document_loaders import TextLoader

In [None]:
persist_directory = './db/'
embeddings = OpenAIEmbeddings()
vectordb = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
vectordb = Chroma.from_documents(documents=pages, embedding= embeddings,persist_directory=persist_directory)  

In [None]:
vectordb.persist()
vectordb = None

In [6]:
persist_directory = './db/'
embeddings = OpenAIEmbeddings()
vectordb = Chroma(persist_directory=persist_directory, embedding_function=embeddings)

In [7]:
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
)

system_template="""Use the following pieces of context to answer the users question.
Take note of the sources and include them in the answer in the format: "SOURCES: source1 source2", use "SOURCES" in capital letters regardless of the number of sources.
If you don't know the answer, just say that "I don't know", don't try to make up an answer.
----------------
{summaries}"""
messages = [
    SystemMessagePromptTemplate.from_template(system_template),
    HumanMessagePromptTemplate.from_template("{question}")
]
prompt = ChatPromptTemplate.from_messages(messages)

In [8]:
    
chain_type_kwargs = {"prompt": prompt}

In [9]:
from langchain.chains import RetrievalQAWithSourcesChain
from langchain import OpenAI

chain = RetrievalQAWithSourcesChain.from_chain_type(OpenAI(model_name="text-embedding-ada-002",temperature=0), chain_type="stuff", retriever=vectordb.as_retriever(),  return_source_documents=True,
    chain_type_kwargs=chain_type_kwargs)

In [None]:


from IPython.display import display, Markdown
def print_result(result):
  output_text = f"""### Question: 
  {query}
  ### Answer: 
  {result['answer']}
  ### Sources: 
  {result['sources']}
  ### All relevant sources:
  {' '.join(list(set([doc.metadata['source'] for doc in result['source_documents']])))}
  """
  display(Markdown(output_text))

In [10]:
query = "Enable Email Alert for Core Dump"
result = chain(query)
print(result)

{'question': 'Enable Email Alert for Core Dump', 'answer': ' 1.1\n 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n 1, 1, 1, 1,\n 1, 1, 1, 1,\n\n 1, 1, 1, 1, 1,\n 1, 1,', 'sources': '', 'source_documents': [Document(page_content='Refer to the Troubleshooting Guide for Cisco Unified Communications Manager for more information on the\nkerneldump utility and troubleshooting.\nEnable Email Alert for Core Dump\nUse this procedure to configure the Real-Time Monitoring Tool to email the administrator whenever a core\ndump occurs.\nProcedure\nStep 1\nSelect System > Tools > Alert > Alert Central.\nStep 2\nRight-click CoreDumpFileFound alert and select Set Alert Properties.\nStep 3\na)Follow the wizard prompts to set your preferred criteria:\nIn the Alert Properties: Email Notification popup, make sure that Enable Email is checked and click\nC

In [None]:
!pip install PyPDF2

### Reading the pdf and storing the data in text

In [None]:
import PyPDF2

def read_pdf(pdf_file):
  """Reads a PDF file and returns the text content.

  Args:
    pdf_file: The path to the PDF file to read.

  Returns:
    The text content of the PDF file.
  """

  with open(pdf_file, 'rb') as f:
    pdf_reader = PyPDF2.PdfReader(f)
    text = ''
    for page in pdf_reader.pages:
      text += page.extract_text()

    return text

if __name__ == '__main__':
  pdf_file = 'cucm_b_troubleshooting-guide-1251-extracted (1).pdf'
  text = read_pdf(pdf_file)

  print(text)

  file1=open(r"./cucm_b_troubleshooting-guide-1251-extracted.txt","a")
file1.writelines(text)


In [None]:

pip install PyMuPDF

In [None]:
import fitz
doc = fitz.open('cucm_b_troubleshooting-guide-1251-extracted (1).pdf')
text = ""
for page in doc:
   text+=page.get_text()
file1=open(r"./cucm_b_troubleshooting-guide-1251-extracted.txt","a")
file1.writelines(text)


## CharacterTextSplitter

The main model using chatgpt 3.5 turbo 

In [None]:
!pip install textract

In [2]:
import textract
doc = textract.process(".\cucm_b_troubleshooting-guide-1251-extracted (1).pdf")

In [3]:
# Step 1: Convert PDF to text
import textract
doc = textract.process("./cucm_b_troubleshooting-guide-1251-extracted (1).pdf")

# Step 2: Save to .txt and reopen (helps prevent issues)
with open('./cucm_b_troubleshooting-guide-1251-extracted.txt', 'w') as f:
    f.write(doc.decode('utf-8'))

with open('./cucm_b_troubleshooting-guide-1251-extracted.txt', 'r') as f:
    text = f.read()

In [54]:

from langchain.text_splitter import CharacterTextSplitter
text_splitter = CharacterTextSplitter(        
    separator = "\n\n",
    chunk_size = 1000,
    chunk_overlap  = 24,
    length_function = len,
)

In [55]:
chunks = text_splitter.split_text(text)

In [56]:
docsearch = Chroma.from_texts(chunks, embeddings, metadatas=[{"source": f"{i}-pl"} for i in range(len(chunks))])

In [57]:
chunks[0]

'CHAPTER\n\n1\n\nTroubleshooting Overview\n\nThis section provides the necessary background information and available resources to troubleshoot the Cisco\n\nUnified Communications Manager.\n\n• Cisco Unified Serviceability, on page 1\n\n• Cisco Unified Communications Operating System Administration, on page 2\n\n• General Model of Problem Solving, on page 2\n\n• Network Failure Preparation, on page 3\n\n• Where to Find More Information, on page 3\n\nCisco Unified Serviceability\n\nCisco Unified Serviceability, a web-based troubleshooting tool for Unified Communications Manager, provides\n\nthe following functionality to assist administrators troubleshoot system problems:\n\n• Saves Unified Communications Manager services alarms and events for troubleshooting and provides\n\nalarm message definitions.\n\n• Saves Unified Communications Manager services trace information to various log files for troubleshooting.\n\nAdministrators can configure, collect, and view trace information.'

In [20]:
from langchain.vectorstores import Chroma
from langchain.document_loaders import TextLoader

In [58]:
from langchain.chains import RetrievalQAWithSourcesChain

In [12]:
persist_directory = './db/'
embeddings = OpenAIEmbeddings()
vectordb = Chroma(persist_directory=persist_directory, embedding_function=embeddings)

In [13]:
from langchain import OpenAI

chain = RetrievalQAWithSourcesChain.from_chain_type(OpenAI(temperature=0), chain_type="stuff", retriever=vectordb.as_retriever())

In [None]:
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
)

In [None]:
system_template="""Use the following pieces of context to answer the users question.
Take note of the sources and include them in the answer in the format: "SOURCES: source1 source2", use "SOURCES" in capital letters regardless of the number of sources.
If you don't know the answer, just say that "I don't know", don't try to make up an answer.
----------------
{summaries}"""
messages = [
    SystemMessagePromptTemplate.from_template(system_template),
    HumanMessagePromptTemplate.from_template("{question}")
]
prompt = ChatPromptTemplate.from_messages(messages)

In [14]:
chain({"question": "how to t to verify if the service is currently running."}, return_only_outputs=True)


{'answer': ' To verify if the service is currently running, go to Unified Communications Manager Administration, choose Navigation > Cisco Unified Serviceability, choose Tools > Control Center – Feature Services, and from the Servers column, choose the server. The Status column will display which services are running for the chosen server.\n',
 'sources': 'cucm_b_troubleshooting-guide-1251-extracted (1).pdf'}

#### end

In [None]:

# metadatas = [ {"source" : "Troubleshooting Guide for Cisco Unified Communications Manager, Release 12.5(1)"}]
texts = text_splitter.create_documents([cucm_b_troubleshooting_guide])
print(texts[0])


In [None]:
split_docs = text_splitter.split_documents(texts)


In [None]:

import tiktoken
enc = tiktoken.encoding_for_model("text-embedding-ada-002")

total_word_count = sum(len(doc.page_content.split()) for doc in split_docs)
total_token_count = sum(len(enc.encode(doc.page_content)) for doc in split_docs)

print(f"\nTotal word count: {total_word_count}")
print(f"\nEstimated tokens: {total_token_count}")
print(f"\nEstimated cost of embedding: ${total_token_count * 0.0004 / 1000}")

### Create Vector Store using OpenAI

In [None]:
from langchain.vectorstores import Chroma
from langchain.document_loaders import TextLoader

In [None]:
persist_directory = './db/'
embeddings = OpenAIEmbeddings()
vectordb = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
vectordb = Chroma.from_documents(documents=split_docs, embedding= embeddings,persist_directory=persist_directory)  

In [None]:
query = "Audit Logging"
docs = db.similarity_search(query)

In [None]:
vectordb.persist()
vectordb = None

In [None]:
vectordb = Chroma(persist_directory=persist_directory, embedding_function=embeddings)

In [64]:
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
)

In [65]:
system_template="""Use the following pieces of context to answer the users question.
Take note of the sources and include them in the answer in the format: "SOURCES: source1 source2", use "SOURCES" in capital letters regardless of the number of sources.
If you don't know the answer, just say that "I don't know", don't try to make up an answer.
----------------
{summaries}"""
messages = [
    SystemMessagePromptTemplate.from_template(system_template),
    HumanMessagePromptTemplate.from_template("{question}")
]
prompt = ChatPromptTemplate.from_messages(messages)

In [None]:
from langchain.chains import RetrievalQAWithSourcesChain
from langchain import OpenAI

chain = RetrievalQAWithSourcesChain.from_chain_type(OpenAI(model_name="text-embedding-ada-002",temperature=0), chain_type="stuff", retriever=vectordb.as_retriever())

In [71]:
# chain({"question": "Enable Email Alert for Core Dump"}, return_only_outputs=True)
result = chain({"question": "Enable Email Alert for Core Dump"}, return_only_outputs=True)
print_result(result)

### Question: 
  verify which Cisco CallManager services are active
  ### Answer: 
   To enable an email alert for a core dump, select System > Tools > Alert > Alert Central, right-click CoreDumpFileFound alert and select Set Alert Properties, follow the wizard prompts to set your preferred criteria, and set the default Email server.
Source: 17-pl, 16-pl, 14-pl, 15-pl.

  

In [70]:
from IPython.display import display, Markdown
def print_result(result):
  output_text = f"""### Question: 
  {query}
  ### Answer: 
  {result['answer']}

  """
  display(Markdown(output_text))

## RecursiveCharacterTextSplitter

In [None]:
!pip install nltk

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size = 1000,
    chunk_overlap  = 20,
    length_function = len,
)
texts = text_splitter.create_documents([text])
print(texts[0])

In [None]:
split_text = text_splitter.split_text(text)
split_text[:2]

In [None]:
enc = tiktoken.encoding_for_model("gpt-4")

total_word_count = sum(len(doc.page_content.split()) for doc in split_docs)
total_token_count = sum(len(enc.encode(doc.page_content)) for doc in split_docs)

print(f"\nTotal word count: {total_word_count}")
print(f"\nEstimated tokens: {total_token_count}")
print(f"\nEstimated cost of embedding: ${total_token_count * 0.0004 / 1000}")