In [1]:
!pip install weaviate-client openai tiktoken langchain sentence-transformers transformers > /dev/null

In [2]:
from langchain.embeddings import (
    HuggingFaceEmbeddings, 
    SentenceTransformerEmbeddings
)

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os
from langchain.vectorstores import Weaviate

In [3]:
def get_text_splits(text_file, chunk_size=150, chunk_overlap=15, encoding="utf-8"):
  """Read a text file and split its contents into overlapping chunks.

  Args:
    text_file: Path of the text file to read.
    chunk_size: Maximum chunk length, measured with ``len`` (characters).
    chunk_overlap: Number of characters shared by neighbouring chunks.
    encoding: Encoding used to decode the file. Pass a different value if the
      file is not UTF-8.

  Returns:
    The list of chunks returned by ``RecursiveCharacterTextSplitter.split_text``.
  """
  # Decode explicitly: falling back to the platform default encoding can turn
  # UTF-8 bytes into mojibake (a non-breaking space showing up as "\u00c2\u00a0").
  with open(text_file, 'r', encoding=encoding) as txt:
    data = txt.read()

  text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size,
                                                 chunk_overlap=chunk_overlap,
                                                 length_function=len)
  return text_splitter.split_text(data)

In [None]:
# Load the open-source all-MiniLM-L6-v2 embedding model through LangChain.
# NOTE(review): `embeddings` is not referenced by any later cell shown in this notebook; confirm it is still needed.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

In [15]:
import weaviate
from weaviate.embedded import EmbeddedOptions

# Create a client backed by an embedded Weaviate instance.
# The Hugging Face key is read from the HUGGINGFACE_APIKEY environment variable
# instead of being written into the notebook; when the variable is unset the
# header stays empty, exactly as before.
client = weaviate.Client(
  embedded_options=EmbeddedOptions(),
  additional_headers={
        "X-HuggingFace-Api-Key": os.environ.get("HUGGINGFACE_APIKEY", "")
    }
)

embedded weaviate is already listing on port 6666


In [16]:
# Start from a clean slate: remove every class from the schema, then display the
# schema to confirm that no classes remain.
# NOTE(review): presumably this also discards any objects stored under those classes; confirm before running against real data.
client.schema.delete_all()
client.schema.get()

{'classes': []}

The Retrievers & Vectorizers modules, such as text2vec-* or img2vec-*, convert data objects to vectors.

The Readers & Generators modules process data after it has been retrieved from Weaviate, for example to answer questions or summarize text.

The other modules include everything else, such as a spellcheck module.

In [17]:
# The only property of the class: the paragraph text itself.
content_property = {
    "dataType": ["text"],
    "description": "The content of the paragraph",
    # Property-level settings for the text2vec-huggingface module.
    "moduleConfig": {
        "text2vec-huggingface": {
            "skip": False,
            "vectorizePropertyName": False,
        },
    },
    "name": "content",
}

# The "Paragraph" class, vectorized by the text2vec-huggingface module.
paragraph_class = {
    "class": "Paragraph",
    "description": "A written paragraph",
    # Class-level settings for the text2vec-huggingface module.
    "moduleConfig": {
        "text2vec-huggingface": {
            "model": "sentence-transformers/all-MiniLM-L6-v2",
            "options": {
                "waitForModel": True,
                "useGPU": False,
                "useCache": True,
            },
        },
    },
    "properties": [content_property],
    "vectorizer": "text2vec-huggingface",
}

# Full schema payload: a single class.
schema = {"classes": [paragraph_class]}

In [18]:
# Register the classes described by `schema` with the Weaviate instance.
client.schema.create(schema)

In [10]:
# Split mail_collector.txt into text chunks with get_text_splits.
# The path is relative, so it is resolved against the kernel's working directory.
mail_docs = get_text_splits("mail_collector.txt")

In [None]:
# Number of chunks returned by get_text_splits.
len(mail_docs)

50

In [None]:
# Peek at the first chunk to sanity-check the split.
mail_docs[0]

'Space via IFTTT <action@ifttt.com>\nAstronomy Picture of the Day:'

In [19]:
# Import every chunk as a "Paragraph" object through the client's batch interface.
with client.batch as batch:
    # Batch size used by the batcher for this import.
    batch.batch_size = 5
    for chunk in mail_docs:
        # Queue through the handle bound by the `with` statement rather than
        # reaching back to client.batch; each chunk becomes the "content" property.
        batch.add_data_object({"content": chunk}, "Paragraph")

Embedded weaviate wasn't listening on port 6666, so starting embedded weaviate again
Started /root/.cache/weaviate-embedded: process ID 10433


In [20]:
# Fetch stored objects to confirm that the import worked.
# NOTE(review): the cell displays every returned object in full; consider trimming the output before sharing the notebook.
client.data_object.get()

{'deprecations': None,
 'objects': [{'class': 'Paragraph',
   'creationTimeUnix': 1682739842536,
   'id': '05d37ca2-78ee-4ed9-bd2e-6437936c22f8',
   'lastUpdateTimeUnix': 1682739842536,
   'properties': {'content': "be similar to one in which our own Sun formed over 4.5 billion years ago. Hubble's stunning image of the stellar nursery was released to celebrate the"},
   'vectorWeights': None},
  {'class': 'Paragraph',
   'creationTimeUnix': 1682739834475,
   'id': '08e41f4b-1bc7-458c-bcd7-44fbe9c68dd1',
   'lastUpdateTimeUnix': 1682739834475,
   'properties': {'content': 'Perseus, it lies at the edge of a large, star-forming molecular cloud. This Hubble Space Telescope close-up frames a region just over 1 light-year'},
   'vectorWeights': None},
  {'class': 'Paragraph',
   'creationTimeUnix': 1682739815426,
   'id': '0a1704ad-8495-4c93-8cde-dbe79bc3b08d',
   'lastUpdateTimeUnix': 1682739815426,
   'properties': {'content': 'Space via IFTTT <action@ifttt.com>\nAstronomy Picture of the D

In [21]:
# Concepts for the semantic search; passed to with_near_text() in the query cell.
nearText = {"concepts": ["Photograph"]}

In [22]:
# Build the Get query one step at a time: request the "content" property of
# Paragraph objects, search near the nearText concepts, cap the result at two
# objects, then execute it.
paragraph_query = client.query.get("Paragraph", ["content"])
paragraph_query = paragraph_query.with_near_text(nearText)
paragraph_query = paragraph_query.with_limit(2)
result = paragraph_query.do()

In [23]:
import json
# Pretty-print the query response with 4-space indentation.
# json.dumps escapes non-ASCII characters by default, so they appear as \uXXXX sequences in the output.
print(json.dumps(result, indent=4))

{
    "data": {
        "Get": {
            "Paragraph": [
                {
                    "content": "2023-04-26 10:44:41+05:30Was this a lucky shot? Although many amazing photographs are taken by someone who just happened\u00c2\u00a0to be in the right place at"
                },
                {
                    "content": "that it took many hours of exposure with a telescope in Seven Persons, Alberta , Canada to create the featured image.April 24, 2023via NASA"
                }
            ]
        }
    }
}


In [24]:
# Wrap the existing client in LangChain's Weaviate vector store. The two strings
# match the class name ("Paragraph") and the property name ("content") used in the schema.
vectorstore = Weaviate(client, 
                       "Paragraph", 
                       "content")

In [None]:
# Run a "Photograph" similarity search through the LangChain vector store.
vectorstore.similarity_search("Photograph")