# Setup

In [8]:
!pip install param



In [9]:
from langchain.vectorstores import DocArrayInMemorySearch
from langchain.document_loaders import TextLoader

In [10]:
#Create a temporary database with your files
def load_db(file, chain_type, k):
    """Build a conversational retrieval chain over a single PDF file.

    The file is read with ``PyPDFLoader``, split into overlapping chunks,
    embedded with ``OpenAIEmbeddings`` and indexed in a
    ``DocArrayInMemorySearch`` store. No memory object is attached to the
    chain; the caller supplies the chat history on every call.

    Args:
        file: Path handed to ``PyPDFLoader``.
        chain_type: Forwarded unchanged as ``chain_type`` to
            ``ConversationalRetrievalChain.from_llm``.
        k: Forwarded to the retriever as ``search_kwargs={'k': k}``.

    Returns:
        The chain produced by ``ConversationalRetrievalChain.from_llm``,
        configured to also return source documents and the generated question.

    Note:
        Reads the module-level name ``llm_name``, which is not defined in the
        visible part of this notebook -- TODO confirm it is set before calling.
    """
    #Read the PDF into LangChain documents
    raw_docs = PyPDFLoader(file).load()

    #Cut the documents into 1000-character chunks that overlap by 150
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
    chunks = splitter.split_documents(raw_docs)

    #Embed every chunk and keep the vectors in an in-memory store
    vector_store = DocArrayInMemorySearch.from_documents(chunks, OpenAIEmbeddings())

    #Similarity-search retriever over that store
    chunk_retriever = vector_store.as_retriever(
        search_type="similarity",
        search_kwargs={'k': k},
    )

    #Assemble and hand back the chatbot chain
    return ConversationalRetrievalChain.from_llm(
        llm=ChatOpenAI(model_name=llm_name, temperature=0),
        chain_type=chain_type,
        retriever=chunk_retriever,
        return_source_documents=True,
        return_generated_question=True,
    )

In [12]:
#Python library allowing you to manage attributes when creating interactive
#visualizations
#import param

#More packages for interactive visualization
!pip install langchain[docarray]
#!pip install "pydantic<2.0"
#Jupyter Bokeh required when visualizing in Colab specifically
#!pip install jupyter_bokeh

zsh:1: no matches found: langchain[docarray]


In [13]:
#Create a class using panel and param that creates an interactive chatbot
#that interacts with the database

#the cbfs class inherits from param.Parameterized
class cbfs(param.Parameterized):
    """Chatbot state and Panel views for question answering over a loaded PDF.

    Besides ``param`` this class relies on module-level names that must exist
    when its methods run: ``pn`` (Panel), ``load_db``, and the widgets
    ``file_input``, ``button_load`` and ``inp``.
    """

    #List to store conversation history as (question, answer) tuples
    chat_history = param.List([])
    #String for chatbot's latest response
    answer = param.String("")
    #Latest question sent to document database
    db_query = param.String("")
    #List of documents/context retrieved from database
    db_response = param.List([])

    def __init__(self, **params):
        """Set up empty chat panels and build a QA chain for the demo file."""
        super().__init__(**params)
        #Rows (user / chatbot) rendered in the conversation view
        self.panels = []
        #Initialize one file for demonstration
        self.loaded_file = "/content/drive/MyDrive/MachineLearning-Lecture01.pdf"
        #Call load_db() to create the QA chain ("stuff" chain type, k=4)
        self.qa = load_db(self.loaded_file, "stuff", 4)

    def call_load_db(self, count):
        """Load the file chosen in ``file_input`` and rebuild the QA chain.

        Args:
            count: Click count of the load button; ``0`` means nothing has
                been clicked yet.

        Returns:
            A Markdown pane naming the currently loaded file.
        """
        if count == 0 or file_input.value is None:  # init or no file specified
            return pn.pane.Markdown(f"Loaded File: {self.loaded_file}")

        file_input.save("temp.pdf")  # local copy
        #Show the button as "outline" while loading and always restore it,
        #even when load_db raises
        button_load.button_style = "outline"
        try:
            self.qa = load_db("temp.pdf", "stuff", 4)
        finally:
            button_load.button_style = "solid"
        #Only report the new file name once its chain was actually built
        self.loaded_file = file_input.filename
        #Clear chat history
        self.clr_history()
        return pn.pane.Markdown(f"Loaded File: {self.loaded_file}")

    def convchain(self, query):
        """Send ``query`` to the QA chain and return the updated conversation.

        Args:
            query: Text entered by the user; a falsy value yields an empty
                placeholder box without calling the chain.

        Returns:
            A scrollable ``pn.WidgetBox`` containing all chat rows so far.
        """
        if not query:
            return pn.WidgetBox(pn.Row('User:', pn.pane.Markdown("", width=600)), scroll=True)
        result = self.qa({"question": query, "chat_history": self.chat_history})
        #Store question and answer in chat history
        self.chat_history.append((query, result["answer"]))
        #Update database query with the question the chain generated
        self.db_query = result["generated_question"]
        #Update database response with the new source documents
        self.db_response = result["source_documents"]
        self.answer = result['answer']
        #Add user and chatbot responses as new rows; the background colour is
        #set through `styles`, like every other pane in this class
        self.panels.extend([
            pn.Row('User:', pn.pane.Markdown(query, width=600)),
            pn.Row('ChatBot:', pn.pane.Markdown(self.answer, width=600, styles={'background-color': '#F6F6F6'}))
        ])
        inp.value = ''  #clears loading indicator when cleared
        return pn.WidgetBox(*self.panels, scroll=True)

    #The dependency name must match the parameter name exactly; the previous
    #'db_query ' (trailing space) named a parameter that does not exist
    @param.depends('db_query')
    def get_lquest(self):
        """Show the latest DB query, or a notice if none has been made yet."""
        if not self.db_query:
            return pn.Column(
                pn.Row(pn.pane.Markdown("Last question to DB:", styles={'background-color': '#F6F6F6'})),
                pn.Row(pn.pane.Str("no DB accesses so far"))
            )
        return pn.Column(
            pn.Row(pn.pane.Markdown("DB query:", styles={'background-color': '#F6F6F6'})),
            pn.pane.Str(self.db_query)
        )

    @param.depends('db_response')
    def get_sources(self):
        """Show the retrieved documents, one per row; ``None`` when there are none."""
        if not self.db_response:
            return
        rlist = [pn.Row(pn.pane.Markdown("Result of DB lookup:", styles={'background-color': '#F6F6F6'}))]
        rlist.extend(pn.Row(pn.pane.Str(doc)) for doc in self.db_response)
        return pn.WidgetBox(*rlist, width=600, scroll=True)

    @param.depends('convchain', 'clr_history')
    def get_chats(self):
        """Show the conversation history with each exchange in its own row."""
        if not self.chat_history:
            return pn.WidgetBox(pn.Row(pn.pane.Str("No History Yet")), width=600, scroll=True)
        rlist = [pn.Row(pn.pane.Markdown("Current Chat History variable", styles={'background-color': '#F6F6F6'}))]
        rlist.extend(pn.Row(pn.pane.Str(exchange)) for exchange in self.chat_history)
        return pn.WidgetBox(*rlist, width=600, scroll=True)

    def clr_history(self, count=0):
        """Reset the chat history.

        Args:
            count: Unused; accepts the argument passed when this method is
                registered as a button click callback.
        """
        self.chat_history = []


NameError: name 'param' is not defined

In [4]:
#Load the Panel notebook extension first, before any widget or pane is
#created or displayed (it was previously called after the whole layout
#had been built)
pn.extension()

#Chatbot state; building it also builds the QA chain for the demo file
cb = cbfs()

#Widgets. cbfs methods look these up by their module-level names
#(file_input, button_load, inp), so the names must not change
file_input = pn.widgets.FileInput(accept='.pdf')
button_load = pn.widgets.Button(name="Load DB", button_type='primary')
button_clearhistory = pn.widgets.Button(name="Clear History", button_type='warning')
button_clearhistory.on_click(cb.clr_history)
inp = pn.widgets.TextInput( placeholder='Enter text here…')

#Bind the callbacks to the widgets that drive them
bound_button_load = pn.bind(cb.call_load_db, button_load.param.clicks)
conversation = pn.bind(cb.convchain, inp)

#Illustration shown on the Configure tab
jpg_pane = pn.pane.Image( './img/convchain.jpg')

#Tab 1: text input and the running conversation
tab1 = pn.Column(
    pn.Row(inp),
    pn.layout.Divider(),
    pn.panel(conversation,  loading_indicator=True, height=300),
    pn.layout.Divider(),
)
#Tab 2: last query sent to the database and the documents it returned
tab2= pn.Column(
    pn.panel(cb.get_lquest),
    pn.layout.Divider(),
    pn.panel(cb.get_sources ),
)
#Tab 3: contents of the chat history
tab3= pn.Column(
    pn.panel(cb.get_chats),
    pn.layout.Divider(),
)
#Tab 4: file upload, load button, clear-history button and illustration
tab4=pn.Column(
    pn.Row( file_input, button_load, bound_button_load),
    pn.Row( button_clearhistory, pn.pane.Markdown("Clears chat history. Can use to start a new topic" )),
    pn.layout.Divider(),
    pn.Row(jpg_pane.clone(width=400))
)
#Assemble the tabs under a title; the bare name on the last line displays it
dashboard = pn.Column(
    pn.Row(pn.pane.Markdown('# ChatWithYourData_Bot')),
    pn.Tabs(('Conversation', tab1), ('Database', tab2), ('Chat History', tab3),('Configure', tab4))
)
dashboard

NameError: name 'cbfs' is not defined

In [21]:
#Create one module-level OpenAIEmbeddings instance, passing the API key explicitly.
#NOTE(review): neither OpenAIEmbeddings nor OPENAI_API_KEY is imported/defined in
#the visible cells, and load_db() builds its own OpenAIEmbeddings() rather than
#using this one -- confirm where this cell belongs.
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

# Document loading

In [4]:
#Create a DirectoryLoader pointed at the folder of example documents.
#NOTE(review): this is an absolute, machine-specific path, and DirectoryLoader
#is not imported in the visible cells -- confirm both before re-running.
loader = DirectoryLoader("/Users/kyli/Desktop/lab_gpt_example_docs")

In [5]:
#Load the documents through the loader created above; the result is kept in
#`documents` and split into chunks further down
documents = loader.load()

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
#Display the loaded documents. This prints the full page_content of every
#document, so it is best done with a single document as a test
documents

[Document(metadata={'source': '/Users/kyli/Desktop/lab_gpt_example_docs/01 Pathophys of TBI.pdf'}, page_content='Pathophysiology of Traumatic Brain Injury\n\nKatherine R. Giordano and Jonathan Lifshitz\n\n2.1\n\nIntroduction\n\nIn broad terms, traumatic brain injury (TBI) is induced by mechanical forces applied to the head that displace the brain within the skull and dis- rupt neurological function. TBI can result from rotation, acceleration/deceleration of the brain, focal cavitation, blast-wave exposure, or a com- bination of biomechanical conditions. Following the mechanical force, pathophysiological pro- cesses are initiated, which extend the classifica- tion of TBI from an event to a complex disease [1]. Resultant clinical symptoms from the mechanical force and elements of the pathophys- iology contribute to the designation of injury severity (mild, moderate, severe, debilitating, recoverable, fatal). Symptoms after TBI are highly variable even within each pathoanatomi- cal classi

In [7]:
#Create a RecursiveCharacterTextSplitter (other text splitting algorithms are
#available) with chunk_size=1000 and chunk_overlap=200. load_db() uses the same
#chunk size but an overlap of 150.
#NOTE(review): the import of RecursiveCharacterTextSplitter appears in a later
#cell of this notebook -- confirm it has been run before this one.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

In [8]:
#Split the loaded documents with the splitter above; the resulting chunks are
#stored in `docs`, which is indexed in the next cell
docs = text_splitter.split_documents(documents)

In [9]:
#Look at one chunk of text (the third item of docs)
docs[2]

Document(metadata={'source': '/Users/kyli/Desktop/lab_gpt_example_docs/01 Pathophys of TBI.pdf'}, page_content='mechanical force, the location of the impact, pre- injury lifestyle, and genetics. However, even when all factors are considered, the complexity of the pathophysiology can make it difficult to determine which individuals recover and which may experience chronic morbidities [2]. The vast majority of TBIs are diffuse with symptoms that resolve within an acute to subacute time course (1–10 days post-injury). Persistent symptoms can lead to the diagnosis of post-concussion syn- drome, which can last for months or years after the injury and is estimated to occur in about 10–15% of milder cases [3]. Common enduring symptoms of TBI can be broadly categorized as cognitive, somatic, or emotional, and the occur- rence of long-term negative outcomes after TBI increases with injury severity [2, 3]. Collectively, TBI, whether focal, diffuse, and mixed focal/dif- fuse, is a leading cause o

In [23]:
#Import a text-splitting algorithm - there are multiple options available
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [24]:
#Deliberately small splitter settings, passed to r_splitter in the next cell
#How many characters in each text chunk
chunk_size = 26
#How many characters overlap between each text chunk
chunk_overlap = 4

In [26]:
#Build the demo splitter r_splitter from the chunk_size / chunk_overlap
#values set in the previous cell
r_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

In [None]:
#Get rid of the Chroma db - if you're doing a new practice run or the final product
#db.delete()