In [None]:
!pip -q install huggingface chromadb transformers langchain

In [2]:
from langchain.embeddings import HuggingFaceEmbeddings

In [None]:
# Create the embedding wrapper with library defaults (no model name is passed).
hfEmbed = HuggingFaceEmbeddings()

In [4]:
# Show which model the default-constructed embedder resolved to.
hfEmbed.model_name

'sentence-transformers/all-mpnet-base-v2'

In [6]:
# Sample sentence used to try out a single-query embedding.
text = "This is a test document."

In [7]:
# Embed the sample sentence as a single query.
query_result = hfEmbed.embed_query(text)

In [None]:
# Show the vector length and its first few components rather than dumping
# the entire embedding into the cell output.
len(query_result), query_result[:5]

In [9]:
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader

In [10]:
# Loader for the playbook text file. The path is absolute (/content/...),
# so the file has to be present at that location before this cell runs.
spaceLoad = TextLoader('/content/linux_play.txt')

In [11]:
from langchain.text_splitter import CharacterTextSplitter

In [12]:
# Read the text file through the loader; the result feeds the splitter.
documents = spaceLoad.load()

In [13]:
from transformers import AutoTokenizer

# GPT-2 tokenizer, handed to CharacterTextSplitter.from_huggingface_tokenizer.
tokenizer = AutoTokenizer.from_pretrained("gpt2")

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [14]:
# Character splitter built from the GPT-2 tokenizer, with a chunk size of 100
# and no overlap between consecutive chunks.
# NOTE(review): chunk_size is presumably counted in tokenizer tokens — confirm.
hfCharSplitter = CharacterTextSplitter.from_huggingface_tokenizer(
    tokenizer,
    chunk_size=100,
    chunk_overlap=0,
)

In [15]:
# Split the loaded documents into chunks with the configured splitter.
char_documents = hfCharSplitter.split_documents(documents)



In [16]:
# Peek at the first chunk to sanity-check the split.
char_documents[0]

Document(page_content='BEGIN;\nLinux Playbook\n\nThe command and scenarios has to be executed inside the\nKali Docker image. The docker image is called linux_playg.\nThe docker will be contain the\nset of set of files, prepared for this plabook and\nuploaded on to dockerhub.', metadata={'source': '/content/linux_play.txt'})

# Now work on embedding

In [17]:
# Embed a short snippet taken from the beginning of the first chunk.
hfEmbed_recurse = hfEmbed.embed_query("BEGIN;\nLinux Playbook")

In [None]:
# Show the vector length and its first few components rather than dumping
# the entire embedding into the cell output.
len(hfEmbed_recurse), hfEmbed_recurse[:5]

In [None]:
### Build the plain_* vector stores from the character-split documents

In [19]:
from langchain.vectorstores import Chroma

In [None]:
# Directory in which Chroma keeps the collection on disk.
persist = 'chroma_db'

# Index the chunks with the HuggingFace embedder. The keyword is `embedding`
# (singular): `embeddings=` is not a parameter of from_documents, so the
# embedder would not be bound and the store would not match the reload that
# uses hfEmbed.
plain_chroma = Chroma.from_documents(documents=char_documents,
                                     embedding=hfEmbed,
                                     persist_directory=persist)

In [21]:
# Re-open the on-disk Chroma collection, using the same embedder for queries.
reload_chroma = Chroma(
    embedding_function=hfEmbed,
    persist_directory=persist,
)



In [None]:
# FAISS has no persist_directory: the index is built in memory, then written
# to a folder with save_local and read back with load_local.

from langchain.vectorstores import FAISS

# Folder that receives the saved FAISS index files.
faiss = 'faiss_db'

plain_faiss = FAISS.from_documents(documents=char_documents,
                                   embedding=hfEmbed)

plain_faiss.save_local(faiss)

# load_local takes the folder path (there is no `persist_directory` keyword)
# and must point at the FAISS folder saved above, not the Chroma directory.
# NOTE(review): newer LangChain releases also require
# allow_dangerous_deserialization=True here — confirm for the installed version.
reload_faiss = FAISS.load_local(folder_path=faiss,
                                embeddings=hfEmbed)

In [None]:
# Index the chunks in a hosted Pinecone index, then re-attach to that index
# by name. Credentials are read from the environment; the placeholders are
# kept as defaults so the cell behaves as before when the variables are unset.
import os

import pinecone
from langchain.vectorstores import Pinecone

pinecone.init(
    api_key=os.environ.get("PINECONE_API_KEY", "YOUR_API_KEY"),  # find at app.pinecone.io
    environment=os.environ.get("PINECONE_ENV", "YOUR_ENV"),  # next to api key in console
)

# NOTE(review): the index presumably has to exist already, with a dimension
# matching the embedder's output — confirm in the Pinecone console.
pindex = 'test-index'

# The keyword is `embedding` (singular); with `embeddings=` the required
# `embedding` argument is missing and the call fails.
plain_pine = Pinecone.from_documents(documents=char_documents,
                                     embedding=hfEmbed,
                                     index_name=pindex)

reload_pine = Pinecone.from_existing_index(pindex,
                                           embedding=hfEmbed)

In [None]:
from langchain.vectorstores.pgvector import PGVector

import os

# Assemble the Postgres connection string for PGVector. Each setting is taken
# from a PGVECTOR_* environment variable, falling back to the literal default.
CONNECTION_STRING = PGVector.connection_string_from_db_params(
    driver=os.getenv("PGVECTOR_DRIVER", "psycopg2"),
    host=os.getenv("PGVECTOR_HOST", "yourhost"),
    # The port arrives as text from the environment, so convert it to int.
    port=int(os.getenv("PGVECTOR_PORT", "5432")),
    database=os.getenv("PGVECTOR_DATABASE", "postgres"),
    user=os.getenv("PGVECTOR_USER", "postgres"),
    password=os.getenv("PGVECTOR_PASSWORD", "postgres"),
)

In [None]:
# Names used only by the type annotation on the search result; without these
# imports the annotated assignment raises NameError when the cell runs.
from typing import List, Tuple

from langchain.docstore.document import Document

# Index the split chunks in Postgres with the notebook's embedder. The names
# are `hfEmbed` and `char_documents`; `hfembed` and `docs` are never defined.
db = PGVector.from_documents(
    embedding=hfEmbed,
    documents=char_documents,
    collection_name="test_index",
    connection_string=CONNECTION_STRING,
)

# Run a similarity search; the annotation describes (document, score) pairs.
query = "What is Linux?"
docs_with_score: List[Tuple[Document, float]] = db.similarity_search_with_score(query)