In [None]:
!pip -q install huggingface chromadb transformers langchain

In [2]:
!pip install InstructorEmbedding

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting InstructorEmbedding
  Downloading InstructorEmbedding-1.0.0-py2.py3-none-any.whl (13 kB)
Installing collected packages: InstructorEmbedding
Successfully installed InstructorEmbedding-1.0.0


In [3]:
from langchain.embeddings import HuggingFaceEmbeddings

In [4]:
from langchain.embeddings import HuggingFaceInstructEmbeddings

In [73]:
hfEmbed = HuggingFaceEmbeddings()

In [None]:
hfInstructEmbed = HuggingFaceInstructEmbeddings(
 query_instruction="Represent the query for retrieval: "   
)

In [7]:
text = "This is a test document."

In [8]:
query_result = hfEmbed.embed_query(text)

In [9]:
instructor_result = hfInstructEmbed.embed_query(text)

In [10]:
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader

In [11]:
spaceLoad = TextLoader('/content/linux_play.txt')

In [12]:
from langchain.text_splitter import CharacterTextSplitter

In [13]:
documents = spaceLoad.load()

In [14]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [15]:
hfCharSplitter = CharacterTextSplitter.from_huggingface_tokenizer(tokenizer, 
                                                                  chunk_size=100,
                                                                  chunk_overlap=0)

In [16]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
recurSplitter = RecursiveCharacterTextSplitter(chunk_size=100,
                                               chunk_overlap=20,
                                               length_function=len)

In [17]:
char_documents = hfCharSplitter.split_documents(documents)



In [51]:
with open('/content/linux_play.txt') as lin:
  txt_lin = lin.read()

recurse_documents = recurSplitter.create_documents([txt_lin])

In [19]:
recurse_documents[0]

Document(page_content='BEGIN;\nLinux Playbook', metadata={})

# Now work on embedding

In [None]:
from langchain.vectorstores import Chroma
# Supplying a persist_directory will store the embeddings on disk

charSplit_directory = 'charSplit_db'

plain_chroma = Chroma.from_documents(documents=char_documents,
                                     embeddings=hfEmbed,
                                     persist_directory=charSplit_directory)

In [74]:
# Supplying a persist_directory will store the embeddings on disk

recurse_directory = 'recurse_db'

recurse_chroma = Chroma.from_documents(documents=recurse_documents,
                                     embeddings=hfInstructEmbed,
                                     persist_directory=recurse_directory)



In [22]:
query = "sed command"
plain_chroma.similarity_search(query)

[]

In [75]:
query = "sed command"
recurse_chroma.similarity_search(query)

[Document(page_content='b) sed commands\n    The following needs to practiced based on the SED15Command_guide.pdf', metadata={}),
 Document(page_content="# sed -e '/Cop/,$p' file\n  # sed -i -e reg_exp file to directly modify file", metadata={}),
 Document(page_content="s/<YEAR>/2022/' MIT.LICENSE\n  # sed -e 's/^/#/' MIT.LICENSE", metadata={}),
 Document(page_content="# sed -e '1r input_file' output_file (to copy onefile into another)", metadata={})]

In [76]:
recurse_chroma.similarity_search_with_score(query)

[(Document(page_content='b) sed commands\n    The following needs to practiced based on the SED15Command_guide.pdf', metadata={}),
  0.5646041035652161),
 (Document(page_content="# sed -e '/Cop/,$p' file\n  # sed -i -e reg_exp file to directly modify file", metadata={}),
  0.8786867260932922),
 (Document(page_content="s/<YEAR>/2022/' MIT.LICENSE\n  # sed -e 's/^/#/' MIT.LICENSE", metadata={}),
  0.9263590574264526),
 (Document(page_content="# sed -e '1r input_file' output_file (to copy onefile into another)", metadata={}),
  0.9417576789855957)]

In [None]:
!pip -q install faiss-cpu

In [27]:
from langchain.vectorstores.faiss import FAISS
faissDb = FAISS.from_documents(documents=recurse_documents,
                                     embedding=hfEmbed)

In [28]:
faissRetriever = faissDb.as_retriever()

In [29]:
faissRetriever.get_relevant_documents("How many sed commands?")

[Document(page_content='b) sed commands\n    The following needs to practiced based on the SED15Command_guide.pdf', metadata={}),
 Document(page_content='- go to the line number in the file and edit it using vim\n    - counting matches', metadata={}),
 Document(page_content="# sed -e '/Cop/,$p' file\n  # sed -i -e reg_exp file to directly modify file", metadata={}),
 Document(page_content="- sed commands:\n  # sed -e 's/<COPYRIGHT HOLDER>/solverbot/' -e 's/<YEAR>/2022/' MIT.LICENSE | head", metadata={})]

In [30]:
mmr_retriever = faissDb.as_retriever(search_type="mmr")

In [31]:
mmr_retriever.get_relevant_documents("How many Sed Commands")

[Document(page_content='b) sed commands\n    The following needs to practiced based on the SED15Command_guide.pdf', metadata={}),
 Document(page_content='- count the number of occurence of the pattern', metadata={}),
 Document(page_content='- Print a range of lines\n    - Print 3rd or 5th line', metadata={}),
 Document(page_content="- grep commands:\n  # grep 'patter' fileName.ext\n  # grep 'patter' -n fileName.ext", metadata={})]

#Startin to use LLMs and Chains

In [32]:
from langchain.chains.summarize import load_summarize_chain
from langchain.chains import AnalyzeDocumentChain
from langchain import PromptTemplate, HuggingFaceHub, LLMChain

In [33]:
!pip -q install configparser

In [34]:
import os
import configparser

data = configparser.ConfigParser()
data.read_file(open('/content/apidata.config'))
os.environ['HUGGINGFACEHUB_API_TOKEN'] = data["OPENAI"]["HFREAD"]
os.environ['SERPAPI_API_KEY'] = data["OPENAI"]["SERPAPI_KEY"]
os.environ['OPENAI_API_KEY'] = data["OPENAI"]["KEY"]

In [35]:
template = """Question: {question}

Answer: Let's think step by step."""

prompt = PromptTemplate(template=template, 
                        input_variables=["question"])

llm_chain = LLMChain(prompt=prompt, 
                     llm=HuggingFaceHub(repo_id="EleutherAI/gpt-j-6B", 
                                                       model_kwargs={"temperature":0.1,
                                                                     "max_length":128}))

In [36]:
llm_chain.run("Tell me about yourself.")

"\n\nQuestion: What do you do?\n\nAnswer: I'm a writer.\n\nQuestion: What do you write?\n\nAnswer: I write about the world.\n\nQuestion: What do you write about?\n\nAnswer: I write about the world.\n\nQuestion: What do you write about?\n\nAnswer: I write about the world.\n\nQuestion: What do you write about?\n\nAnswer: I write about the world.\n\nQuestion: What do you write about?\n\nAnswer"

In [77]:
from langchain.chains.summarize import load_summarize_chain

hugLLM = HuggingFaceHub(repo_id="google/flan-ul2",
                        model_kwargs={"temperature":0.1,
                                      "max_length":512})

summary_chain = load_summarize_chain(llm=hugLLM, 
                                     chain_type="map_reduce")

In [84]:
#Using summarize on Analyze documents chain
summarize_document_chain = AnalyzeDocumentChain(combine_docs_chain=summary_chain)

In [None]:
summarize_document_chain.run(txt_lin)

In [78]:
from langchain.chains import RetrievalQA

qa = RetrievalQA.from_chain_type(llm=hugLLM, 
                                 chain_type="stuff", 
                                 retriever=recurse_chroma.as_retriever())

In [80]:
qa.run("How many Lines are there")

'Count the lines, all and only data lines'

In [81]:
from langchain.chains import LLMChain
prompt_template = """Use the context below to write a 400 word blog post about the topic below:
    Context: {context}
    Topic: {topic}
    Blog post:"""

PROMPT = PromptTemplate(
    template=prompt_template, input_variables=["context", "topic"]
)

chain = LLMChain(llm=hugLLM, 
                 prompt=PROMPT)

In [67]:
def generate_answer(topic):
    docs = recurse_chroma.similarity_search(topic, k=4)
    inputs = [{"context": doc.page_content, "topic": topic} for doc in docs]
    print(chain.apply(inputs))

In [83]:
generate_answer("Return all the Sed commands")

[{'text': 'sed is a stream editor that can be used to modify text files. It is a'}, {'text': "sed -e '/Cop/,$p' file sed -"}, {'text': "s/YEAR>/2022/' MIT.LICENSE # sed"}, {'text': "- sed commands: # sed -e 's/COPYRIGHT"}]
