In [None]:
!pip -q install huggingface chromadb transformers langchain

In [2]:
!pip install InstructorEmbedding

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting InstructorEmbedding
  Downloading InstructorEmbedding-1.0.0-py2.py3-none-any.whl (13 kB)
Installing collected packages: InstructorEmbedding
Successfully installed InstructorEmbedding-1.0.0


In [3]:
from langchain.embeddings import HuggingFaceEmbeddings

In [4]:
from langchain.embeddings import HuggingFaceInstructEmbeddings

Embeddings create a vector representation of a piece of text. This is useful because it means we can think about text in the vector space, and do things like semantic search where we look for pieces of text that are most similar in the vector space.

The base `Embeddings` class in LangChain exposes two methods: `embed_documents` and `embed_query`. The largest difference is that these two methods have different interfaces: one works over multiple documents, while the other works over a single document (the query text). Besides this, another reason for having these as two separate methods is that some embedding providers have different embedding methods for documents (to be searched over) vs. queries (the search query itself).

In [None]:
# Sentence-transformers embedder. The model is named explicitly (it is the
# value this notebook previously resolved by default) so a change in the
# library's default cannot silently alter the embeddings.
hfEmbed = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
)

In [6]:
# Show which model hfEmbed is configured with.
hfEmbed.model_name

'sentence-transformers/all-mpnet-base-v2'

In [None]:
# Instructor embedder with a custom instruction prepended to queries.
# The model is named explicitly (it is the value this notebook previously
# resolved by default) so the embeddings stay reproducible across releases.
hfInstructEmbed = HuggingFaceInstructEmbeddings(
    model_name="hkunlp/instructor-large",
    query_instruction="Represent the query for retrieval: ",
)

In [8]:
# Show which model hfInstructEmbed is configured with.
hfInstructEmbed.model_name

'hkunlp/instructor-large'

In [9]:
# Sample sentence that the following cells embed with each embedder.
text = "This is a test document."

In [10]:
# Embed the sample sentence through the single-text (query) interface.
query_result = hfEmbed.embed_query(text)

In [None]:
# Show the vector's length and a short preview rather than dumping every
# component into the notebook output.
len(query_result), query_result[:5]

In [12]:
# Embed the same sentence through the batch (documents) interface; the input
# is a one-element list, so the result presumably holds one vector — confirm.
doc_result = hfEmbed.embed_documents([text])

In [None]:
# Show how many vectors came back, the length of the first one, and a short
# preview rather than dumping every component into the notebook output.
len(doc_result), len(doc_result[0]), doc_result[0][:5]

In [14]:
# Embed the same sentence with the instructor-based embedder's query interface.
instructor_result = hfInstructEmbed.embed_query(text)

In [None]:
# Show the vector's length and a short preview rather than dumping every
# component into the notebook output.
len(instructor_result), instructor_result[:5]

In [16]:
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader

In [20]:
# Loader for the playbook text file.
# NOTE(review): this is an absolute path (looks like a Colab upload location);
# confirm the file exists there before running.
spaceLoad = TextLoader('/content/linux_play.txt')

In [21]:
from langchain.text_splitter import CharacterTextSplitter

In [22]:
# Read the file through the loader; the result feeds the splitters'
# split_documents calls further down.
documents = spaceLoad.load()

In [None]:
from transformers import AutoTokenizer

# GPT-2 tokenizer, handed to CharacterTextSplitter.from_huggingface_tokenizer
# in a later cell.
tokenizer = AutoTokenizer.from_pretrained("gpt2")

In [34]:
# Character splitter that breaks on blank lines, with chunk length measured
# by plain len() and no overlap between neighbouring chunks.
plainCharSplitter = CharacterTextSplitter(
    separator='\n\n',
    chunk_size=500,
    chunk_overlap=0,
    length_function=len,
)

In [24]:
# Character splitter built from the Hugging Face tokenizer, with no overlap.
# NOTE(review): chunk_size here is presumably counted in tokenizer tokens
# rather than characters — confirm against the LangChain docs.
hfCharSplitter = CharacterTextSplitter.from_huggingface_tokenizer(
    tokenizer,
    chunk_size=100,
    chunk_overlap=0,
)

In [25]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Recursive splitter: chunk length measured by len(), with a 20-unit overlap
# between consecutive chunks.
recurSplitter = RecursiveCharacterTextSplitter(
    chunk_size=100,
    chunk_overlap=20,
    length_function=len,
)

In [35]:
# Split the loaded documents with the plain character splitter.
plain_docs = plainCharSplitter.split_documents(documents)



In [36]:
# Inspect the first chunk produced by the plain character splitter.
plain_docs[0]

Document(page_content='BEGIN;\nLinux Playbook\n\nThe command and scenarios has to be executed inside the\nKali Docker image. The docker image is called linux_playg.\nThe docker will be contain the\nset of set of files, prepared for this plabook and\nuploaded on to dockerhub.', metadata={'source': '/content/linux_play.txt'})

In [None]:
# Split the loaded documents with the tokenizer-based character splitter.
char_documents = hfCharSplitter.split_documents(documents)

In [28]:
# Inspect the first chunk produced by the tokenizer-based splitter.
char_documents[0]

Document(page_content='BEGIN;\nLinux Playbook\n\nThe command and scenarios has to be executed inside the\nKali Docker image. The docker image is called linux_playg.\nThe docker will be contain the\nset of set of files, prepared for this plabook and\nuploaded on to dockerhub.', metadata={'source': '/content/linux_play.txt'})

In [31]:
# Read the raw playbook text directly (not through the loader), then let the
# recursive splitter wrap the resulting chunks in Document objects.
with open('/content/linux_play.txt') as playbook_file:
    txt_lin = playbook_file.read()

recurse_documents = recurSplitter.create_documents(
    [txt_lin],
)

In [32]:
# Inspect the first chunk produced by the recursive splitter.
recurse_documents[0]

Document(page_content='BEGIN;\nLinux Playbook', metadata={})

# Now work on embedding

In [38]:
# Embed the first recursive-splitter chunk. The text is taken from the chunk
# itself instead of a pasted copy, so it cannot drift if the source file or
# the splitter settings change.
hfEmbed_recurse = hfEmbed.embed_query(recurse_documents[0].page_content)

In [None]:
# Show the vector's length and a short preview rather than dumping every
# component into the notebook output.
len(hfEmbed_recurse), hfEmbed_recurse[:5]

In [40]:
# Embed the first tokenizer-splitter chunk. The text is taken from the chunk
# itself instead of a pasted copy, so it cannot drift if the source file or
# the splitter settings change.
hfEmbed_charSplit = hfEmbed.embed_query(char_documents[0].page_content)

In [None]:
# Show the vector's length and a short preview rather than dumping every
# component into the notebook output.
len(hfEmbed_charSplit), hfEmbed_charSplit[:5]

In [None]:
### Try the same with the plain character-split chunks (plain_docs) here

In [None]:
from langchain.vectorstores import Chroma

# Supplying a persist_directory will store the embeddings on disk

plain_directory = 'plain_db'

# Vector store over the plain character-split chunks.
# The keyword is `embedding` (singular): `embeddings=` is not a parameter of
# Chroma.from_documents, so hfEmbed would not be used as the embedding function.
plain_chroma = Chroma.from_documents(documents=plain_docs,
                                     embedding=hfEmbed,
                                     persist_directory=plain_directory)

In [45]:
# Supplying a persist_directory will store the embeddings on disk

charSplit_directory = 'charSplit_db'

# Vector store over the tokenizer-split chunks. It is bound to its own name so
# it does not overwrite `plain_chroma`, the store built from plain_docs.
# The keyword is `embedding` (singular): `embeddings=` is not a parameter of
# Chroma.from_documents, so hfEmbed would not be used as the embedding function.
charSplit_chroma = Chroma.from_documents(documents=char_documents,
                                         embedding=hfEmbed,
                                         persist_directory=charSplit_directory)



In [46]:
# Supplying a persist_directory will store the embeddings on disk

recurse_directory = 'recurse_db'

# Vector store over the recursive-splitter chunks.
# The keyword is `embedding` (singular): `embeddings=` is not a parameter of
# Chroma.from_documents, so hfEmbed would not be used as the embedding function.
recurse_chroma = Chroma.from_documents(documents=recurse_documents,
                                       embedding=hfEmbed,
                                       persist_directory=recurse_directory)

