In [8]:
#pip install unstructured
#brew install unoconv
#pip install odfpy
#pip install langchain
#pip install chromadb
#pip install pypdf

import os
import glob
import nbformat
from nbconvert import PythonExporter
from PyPDF2 import PdfReader
import subprocess  
import pandas as pd
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import UnstructuredPowerPointLoader
from langchain.document_loaders import UnstructuredWordDocumentLoader
from langchain.document_loaders import UnstructuredHTMLLoader
from langchain.document_loaders import TextLoader
from langchain.docstore.document import Document

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import CharacterTextSplitter,RecursiveCharacterTextSplitter
from langchain import OpenAI
from langchain.chains import RetrievalQA

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

def read_file(file_path):
    '''
    This function reads a file among eligible file types:
         {'.txt', '.pdf', '.odt', '.ods', '.docx', '.xlsx', '.ipynb', '.py', '.pptx', '.html'},
    and converts the file into langchain.docstore.document.Document object
    return: [Document(s)]
    '''       

    content, content_sheets, data, pages, doc = None, None, None, None, None

    file_extension = os.path.splitext(file_path)[1]
    
    if file_extension in {'.txt', '.pdf', '.odt', '.ods', '.docx', '.xlsx', '.ipynb', '.py', '.pptx', '.html'}:

        # Read the file based on its extension
        if file_extension == '.pdf':
            print("this is a pdf")
            loader = PyPDFLoader(file_path)
            pages = loader.load_and_split() # a list of Document objects, separate by page

        elif file_extension == '.docx':
            print("this is a docx")
            loader = UnstructuredWordDocumentLoader(file_path)
            data = loader.load() # a list of one Document object

        elif file_extension == '.pptx':
            print("this is a pptx")
            loader = UnstructuredPowerPointLoader(file_path)
            data = loader.load() # a list of one Document object

        elif file_extension == '.html':
            print("this is a html")
            loader = UnstructuredHTMLLoader(file_path)
            data = loader.load() # a list of one Document object

        elif file_extension in ('.txt', '.py'):
            print("this is a txt or py")
            loader = TextLoader(file_path, encoding='utf8')
            data = loader.load() # a list of one Document object

        elif file_extension == '.odt':
            print("this is a odt")
            cmd = ['unoconv', '--stdout', '-f', 'txt', file_path]
            p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            stdout, stderr = p.communicate()
            content = stdout.decode('utf-8') # a string

        elif file_extension == '.ipynb':
            print("this is a ipynb")
            with open(file_path, 'r', encoding='utf-8') as f:
                notebook = nbformat.read(f, as_version=nbformat.NO_CONVERT)
            exporter = PythonExporter()
            content, _ = exporter.from_notebook_node(notebook) # content is a string            

        elif file_extension in ('.ods', '.xlsx'):
            print("this is a ods or xlsx")
            excel_file = pd.ExcelFile(file_path)
            # Iterate over sheets and extract data
            content_sheets = {}
            for sheet_name in excel_file.sheet_names:
                content_sheets[sheet_name] = excel_file.parse(sheet_name) 
                # each sheet is saved as a DataFrame

        else:
            pass


        # save the file as a Document object
        if data:
            print(file_path, "has data.")
            return data

        elif pages:
            print(file_path, "has pages.")
            return pages

        elif content:
            print(file_path, "has content.")
            content = content.replace("  ","")
            doc = Document(page_content=content, metadata={"source": file_path})
            return [doc]

        elif content_sheets:
            print(file_path, "has content_sheets.")
            docs = []
            for sheet_name in content_sheets:
                content_in_sheet = str(content_sheets[sheet_name])
                # remove large amounts of space and NaN in xlsx and ods
                content_in_sheet = content_in_sheet.replace("  ","")
                content_in_sheet = content_in_sheet.replace("NaN", "")
                sheet = Document(page_content=content_in_sheet, metadata={"source": file_path, "sheet": sheet_name})
                docs.append(sheet)
            return docs

        else:
            pass

    return []

In [9]:
def folder_recursion(folder_path):
    """
    This function loops through the contents in a root folder:
    if a content is a single file, run read_file();
    if a content is a foler, recurse folder_recursion().
    return: [Documents], all eligible document types in the root folder and all subfolders
    """

    documents = []
    
    # Loop through files in the folder
    for file_name in os.listdir(folder_path):

        file_path = os.path.join(folder_path, file_name)
        
        if os.path.isdir(file_path): # file_path is a folder
            documents += folder_recursion(file_path)
        
        else: # file_path is a single file
            documents += read_file(file_path)
    
    return documents

In [10]:
pwd

'/Users/jian/Documents/GitHub/PersonalNoteBot'

In [11]:
folder_path = '../../../NLP_Kai/notes2/' #the folder directory containing your notes
documents = folder_recursion(folder_path)
len(documents)

this is a odt
../../../NLP_Kai/notes2/6_machine learning framework.odt has content.
this is a docx
../../../NLP_Kai/notes2/7_NLP models and tasks.docx has data.
this is a ods or xlsx
../../../NLP_Kai/notes2/4_important notes.xlsx has content_sheets.
this is a docx
../../../NLP_Kai/notes2/11_文件存在哪里save file.docx has data.
this is a pdf
../../../NLP_Kai/notes2/9_T5.pdf has pages.
this is a txt or py
../../../NLP_Kai/notes2/3_hw1.py has data.
this is a txt or py
../../../NLP_Kai/notes2/8_requirements.txt has data.
this is a html
../../../NLP_Kai/notes2/LangChain 0.0.147.html has data.
this is a ipynb
../../../NLP_Kai/notes2/1_Deep_Learning_Solution.ipynb has content.
this is a odt
../../../NLP_Kai/notes2/folder1/folder2/machine learning framework.odt has content.
this is a odt
../../../NLP_Kai/notes2/folder1/folder3/machine learning framework.odt has content.
this is a ods or xlsx
../../../NLP_Kai/notes2/folder1/folder3/目录.ods has content_sheets.
this is a ods or xlsx
../../../NLP_Kai/not

125

In [12]:
print(f"Read in {len(documents)} documents (and pages) from directory {folder_path}")

Read in 125 documents (and pages) from directory ../../../NLP_Kai/notes2/


In [4]:
# split each document into a good size

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
split_documents = text_splitter.split_documents(documents)
len(split_documents)

1052

In [202]:
# define embedding
os.environ["OPENAI_API_KEY"] = "..."
embeddings = OpenAIEmbeddings(openai_api_key=os.environ["OPENAI_API_KEY"])

In [221]:
# attention: we need persist_directory to save our embeddings for later use
# OpenAI charges on embedding and run(query)

# a folder named 'db2' is created inside folder_path, db2 has your embeddings

persist_directory = 'db2'
vectordb = Chroma.from_documents(documents=split_documents, embedding=embeddings, persist_directory=persist_directory)
vectordb.persist() # need .persist() in ipynb to save to local drive. in python script, this is automatic
vectordb = None
vectordb = Chroma(persist_directory=persist_directory, embedding_function=embeddings) # successfully load embedding from local

Using embedded DuckDB with persistence: data will be stored in: db2
Using embedded DuckDB with persistence: data will be stored in: db2


In [222]:
# chatbot QA function
qa = RetrievalQA.from_chain_type(llm=OpenAI(), chain_type='stuff', retriever=vectordb.as_retriever())
query = 'how to pre-train a transformer?'
qa.run(query)

' Pre-training a transformer typically involves using a large amount of unlabeled data, such as the Common Crawl Corpus (C4), and training for a long time using an "inverse square root" learning rate schedule. Additionally, increasing the batch size and the number of training steps can confer benefits.'

In [223]:
# similarity search to return related documents
similar_docs = vectordb.similarity_search_with_score(query)

In [224]:
# we will return metadata
similar_docs

[(Document(page_content='Longer training Our baseline model uses a relatively small amount of pre-training (1⁄4as\nmuch as BERT (Devlin et al., 2018),1⁄16as much as XLNet (Yang et al., 2019),1⁄64as\nmuch as RoBERTa (Liu et al., 2019c), etc.). Fortunately, C4 is big enough that we\ncan train for substantially longer without repeating data (which can be detrimental,\nas shown in Section 3.4.2). We found in Section 3.6 that additional pre-training can\nindeed be helpful, and that both increasing the batch size and increasing the number of\ntraining steps can confer this beneﬁt. We therefore pre-train our models for 1million\nsteps on a batch size of 211sequences of length 512, corresponding to a total of about\n1trillion pre-training tokens (about 32×as many as our baseline). In Section 3.4.1, we\nshowed that pre-training on the RealNews-like, WebText-like, and Wikipedia + TBC\ndata sets outperformed pre-training on C4 on a few downstream tasks. However, these', metadata={'source': 'notes

In [227]:
query = "do we have a requirements.txt file?"
qa.run(query)
# the document name in metadata is NOT used in similarity search.
# will add document name into the beginning of text

' No, we do not have a requirements.txt file.'

In [234]:
vectordb.similarity_search_with_score("Flask==2.0.2")
# 只搜“Flask”找不到正确的doc，这里也没有找到requirements.txt里含有Flask的那一部分

[(Document(page_content='tensorflow-io-gcs-filesystem==0.23.1\ntensorflow-probability==0.16.0\ntermcolor==1.1.0\nterminado==0.16.0\ntf-estimator-nightly==2.8.0.dev2021122109\nthreadpoolctl==3.0.0\ntinycss2==1.1.1\ntokenizers==0.13.2\ntoml==0.10.2\ntoolz==0.12.0\ntorch==1.12.0\ntorchvision==0.13.0\ntornado==6.2\ntqdm==4.62.3\ntraitlets==5.4.0\ntransformers==4.27.4\ntrio==0.19.0\ntrio-websocket==0.9.2\ntyping_extensions==4.0.1\ntzdata==2022.2\ntzlocal==4.2\nurllib3==1.26.7\nvalidators==0.20.0\nvirtualenv==20.10.0\nwcwidth==0.2.5\nwebencodings==0.5.1\nWerkzeug==2.0.2\nwidgetsnbextension==4.0.3\nwrapt==1.13.3\nwsproto==1.0.0\nxgboost==1.5.0\nzipp==3.7.0', metadata={'source': 'notes/requirements.txt'}),
  0.4120103716850281),
 (Document(page_content='ipykernel==6.16.0\nipython==8.5.0\nipython-genutils==0.2.0\nipywidgets==8.0.2\nitsdangerous==2.0.1\njedi==0.18.1\nJinja2==3.0.3\njoblib==1.1.0\njsonschema==4.16.0\njupyter==1.0.0\njupyter-console==6.4.4\njupyter-core==4.11.1\njupyter_client==7.

In [236]:
qa.run("文件存在哪里")

' 文件的存储位置总是从working directory出发，而不是test.py的位置出发，使用Slash和back slash的方向在IBM和Mac中不一样，如果用r string则两个方向都可以，就像第二个存储路径的写法。'

In [235]:
vectordb.similarity_search_with_score("文件存在哪里")

[(Document(page_content='文件存在哪里？\n\n有如下路径：\n\nA\\B\\C\\D\\test.py \tfolder D中有python file test.py\n\nA\\B\\data\t\tfolder B中有folder data\n\n我们的working directory是A\\B\n\n文件test.py中有存储命令：df.to_csv(“mytest.csv”)，那么这个df会被存到B folder下。\n\n如果把存储命令改为：df.to_csv(r“data\\mytest.csv”)，那么这个df会被存到B\\data folder下。\n\n存储的位置总是从working directory出发，而不是test.py的位置出发。\n\n使用Slash和back slash的方向在IBM和Mac中不一样，如果用r string则两个方向都可以，就像第二个存储路径的写法', metadata={'source': 'notes/文件存在哪里save file.docx'}),
  0.331976056098938),
 (Document(page_content='Unnamed: 2 \n0 \\\n1  \n2  \n3  \n4  \n5  \n6  \n7  \n8  \n9  \n10 tech stack \n11 TFIDF, GloVe, bert, RNN, cosine similarity, BM25 \n12Open source models GPT2, Bert, T5, Bart, GPT-J... \n13 open source models + adapter \n14 open source models + LangChain/GPT-index \n15 OpenAI API \n16 build plugin \n17 \n18 \n19 \n20 \n21 \n22 \n23 \n24 \n25 \n26 这样的model可以回答domain之内和之外的问题 \n27 \n28 \n29 \n30 \n31 老师finetune过generative task吗？ \n32 \n33 \n34 \n35 \n36 \n37 \n38 \n39 \n40 \n41

In [None]:
# # To cleanup, you can delete the collection
# vectordb.delete_collection()
# vectordb.persist()

# # Or just nuke the persist directory 如果需要删除db2：
# !rm -rf db2/

In [None]:
# 给chatbot加上memory

In [3]:
persist_directory = 'db'
embeddings = OpenAIEmbeddings(openai_api_key=os.environ["OPENAI_API_KEY"])
vectordb = Chroma(persist_directory=persist_directory, embedding_function=embeddings)

Using embedded DuckDB with persistence: data will be stored in: db


In [5]:
#pip install unstructured
#brew install unoconv
#pip install odfpy
#pip install langchain
#pip install chromadb
#pip install pypdf

import os
import nbformat
from nbconvert import PythonExporter
import subprocess  
import pandas as pd
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import UnstructuredPowerPointLoader
from langchain.document_loaders import UnstructuredWordDocumentLoader
from langchain.document_loaders import UnstructuredHTMLLoader
from langchain.document_loaders import TextLoader
from langchain.docstore.document import Document

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain import OpenAI
from langchain.chains import RetrievalQA, ConversationChain
from langchain.memory import ConversationSummaryMemory, ConversationBufferMemory
from langchain.agents import Tool, initialize_agent

os.environ["OPENAI_API_KEY"] = "..."

llm = OpenAI(
    temperature=0,
    openai_api_key=os.environ["OPENAI_API_KEY"],
    model_name="text-davinci-003"
)


# chatbot QA function
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type='stuff', 
    retriever=vectordb.as_retriever()
)

source_tool = Tool(
    name="retrievalQA tool",
    func=qa.run,
    description="Always check retrivealQA tool first when answering questions"
)

memory = ConversationSummaryMemory(llm=llm,memory_key="chat_history")

tools = [source_tool]

conversational_agent = initialize_agent(
    agent='conversational-react-description', 
    tools=tools, 
    llm=llm,
    verbose=True,
    max_iterations=3,
    memory=memory,
)

instruction = " AI must use retrievalQA tool first. If cannot find answer in retrievalQA tool, AI's output must start with 'The question is not in the scope of the documents provided.', and then AI can answer the question based on chat history or parametric knowledge."

In [6]:
query = "who is the author of Ancient and Modern Physics?" + instruction
conversational_agent(query)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Thought: Do I need to use a tool? Yes
Action: retrievalQA tool
Action Input: who is the author of Ancient and Modern Physics?[0m
Observation: [36;1m[1;3m I don't know.[0m
Thought:[32;1m[1;3m Do I need to use a tool? No
AI: The question is not in the scope of the documents provided. However, based on our previous conversation, I believe the author of Ancient and Modern Physics is Albert Einstein.[0m

[1m> Finished chain.[0m


{'input': "who is the author of Ancient and Modern Physics? AI must use retrievalQA tool first. If cannot find answer in retrievalQA tool, AI's output must start with 'The question is not in the scope of the documents provided.', and then AI can answer the question based on chat history or parametric knowledge.",
 'chat_history': '',
 'output': 'The question is not in the scope of the documents provided. However, based on our previous conversation, I believe the author of Ancient and Modern Physics is Albert Einstein.'}

In [7]:
query = "how to pre-train a transformer?" + instruction
conversational_agent(query)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Thought: Do I need to use a tool? Yes
Action: retrievalQA tool
Action Input: how to pre-train a transformer?[0m
Observation: [36;1m[1;3m Pre-train a transformer using a simple denoising objective and then separately fine-tune on each of the downstream tasks. Use a standard encoder-decoder Transformer as proposed by Vaswani et al. (2017). Use teacher forcing and a cross-entropy loss for optimization. Pre-train for 524,288 steps on C4 with a maximum sequence length of 512 and a batch size of 128 sequences.[0m
Thought:[32;1m[1;3m Do I need to use a tool? No
AI: This is a great way to pre-train a transformer. It is important to remember to use teacher forcing and a cross-entropy loss for optimization, and to pre-train for a large number of steps. Additionally, it is important to use a standard encoder-decoder Transformer as proposed by Vaswani et al. (2017).[0m

[1m> Finished chain.[0m


{'input': "how to pre-train a transformer? AI must use retrievalQA tool first. If cannot find answer in retrievalQA tool, AI's output must start with 'The question is not in the scope of the documents provided.', and then AI can answer the question based on chat history or parametric knowledge.",
 'chat_history': '\nThe human asked what the AI thinks of artificial intelligence. The AI thinks artificial intelligence is a force for good because it will help humans reach their full potential. The human then asked who is the author of Ancient and Modern Physics, and the AI used a retrievalQA tool before responding that the question was not in the scope of the documents provided, but based on the previous conversation, the author is Albert Einstein.',
 'output': 'This is a great way to pre-train a transformer. It is important to remember to use teacher forcing and a cross-entropy loss for optimization, and to pre-train for a large number of steps. Additionally, it is important to use a stan

In [8]:
vectordb.similarity_search_with_score("how to pre-train a transformer?")

[(Document(page_content='Exploring the Limits of Transfer Learning\n3.1 Baseline\nOur goal for our baseline is to reﬂect typical, modern practice. We pre-train a standard\nTransformer (described in Section 2.1) using a simple denoising objective and then separately\nﬁne-tune on each of our downstream tasks. We describe the details of this experimental\nsetup in the following subsections.\n3.1.1 Model\nFor our model, we use a standard encoder-decoder Transformer as proposed by Vaswani et al.\n(2017). While many modern approaches to transfer learning for NLP use a Transformer\narchitecture consisting of only a single “stack” (e.g. for language modeling (Radford et al.,\n2018; Dong et al., 2019) or classiﬁcation and span prediction (Devlin et al., 2018; Yang et al.,\n2019)), we found that using a standard encoder-decoder structure achieved good results\non both generative and classiﬁcation tasks. We explore the performance of diﬀerent model\narchitectures in Section 3.2.', metadata={'sour

In [9]:
query = "who did you think is the author of the physics book?" + instruction
conversational_agent(query)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Thought: Do I need to use a tool? Yes
Action: retrievalQA tool
Action Input: who did you think is the author of the physics book?[0m
Observation: [36;1m[1;3m I don't know.[0m
Thought:[32;1m[1;3m Do I need to use a tool? No
AI: The question is not in the scope of the documents provided, but based on the previous conversation, the author is Albert Einstein.[0m

[1m> Finished chain.[0m


{'input': "who did you think is the author of the physics book? AI must use retrievalQA tool first. If cannot find answer in retrievalQA tool, AI's output must start with 'The question is not in the scope of the documents provided.', and then AI can answer the question based on chat history or parametric knowledge.",
 'chat_history': '\nThe human asked what the AI thinks of artificial intelligence. The AI thinks artificial intelligence is a force for good because it will help humans reach their full potential. The human then asked who is the author of Ancient and Modern Physics, and the AI used a retrievalQA tool before responding that the question was not in the scope of the documents provided, but based on the previous conversation, the author is Albert Einstein. The human then asked how to pre-train a transformer, and the AI suggested using teacher forcing and a cross-entropy loss for optimization, as well as a standard encoder-decoder Transformer as proposed by Vaswani et al. (2017