# Introduction

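This notebook demonstrates the Weaviate Spark connector end to end: it builds a small Spark DataFrame, writes it to Weaviate, and reads the stored objects back.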

# Imports

In [1]:
from pyspark.sql import SparkSession
import pandas as pd
import numpy as np
import os
import weaviate
import uuid

# Setup

In [2]:
# Attach to the remote Spark Connect endpoint (sc:// URL) used by this demo.
spark = (
    SparkSession.builder
    .remote("sc://spark:15002")
    .getOrCreate()
)

  if LooseVersion(pandas.__version__) < LooseVersion(minimum_pandas_version):
  if LooseVersion(pyarrow.__version__) < LooseVersion(minimum_pyarrow_version):
  if LooseVersion(grpc.__version__) < LooseVersion(minimum_grpc_version):


In [3]:
# Connect to the Weaviate instance used by this demo.
client = weaviate.Client(url="http://weaviate:8080")

# Start from a clean slate by dropping every class in the schema.
# NOTE(review): destructive -- presumably also removes the objects stored
# under those classes, so only point this at a throwaway instance.
client.schema.delete_all()

# Demo

Create some data:

In [4]:
# Fixed seed so that "Restart Kernel -> Run All" regenerates the same vectors.
SEED = 42
# Dimensionality of the toy embedding attached to every book.
VECTOR_DIM = 2

# Kids' book titles; entry i of each list below describes the same book.
titles = [
    "The Cat in the Hat",
    "Where the Wild Things Are",
    "Green Eggs and Ham",
    "Curious George",
    "Goodnight Moon",
]

# Kids' book authors
authors = [
    "Dr. Seuss",
    "Maurice Sendak",
    "Dr. Seuss",
    "H.A. Rey",
    "Margaret Wise Brown",
]

# Kids' book descriptions
descriptions = [
    "A mischievous cat in a tall hat",
    "A young boy named Max who sails to an island inhabited by strange creatures",
    "A character named Sam-I-Am who tries to convince another character to try green eggs and ham",
    "A curious monkey who gets into all sorts of trouble",
    "A bedtime story that features a great green room",
]

# zip() silently truncates to the shortest input, so fail loudly if the lists
# ever drift out of sync instead of quietly dropping books.
assert len(titles) == len(authors) == len(descriptions), "book lists differ in length"

rng = np.random.default_rng(SEED)

# Assemble the books as a pandas DataFrame. Only the "vector" column is random;
# everything else comes from the lists above.
books_pdf = pd.DataFrame(
    {
        # Deterministic UUIDv5 derived from the concatenated title, author and
        # description, so the same book always gets the same id.
        "id": [
            str(uuid.uuid5(uuid.NAMESPACE_DNS, "".join(item)))
            for item in zip(titles, authors, descriptions)
        ],
        "title": titles,
        "author": authors,
        "description": descriptions,
        # One VECTOR_DIM-long list of floats per book; the row count follows
        # the data rather than a hard-coded 5.
        "vector": rng.random((len(titles), VECTOR_DIM)).tolist(),
    }
)

# Convert the pandas DataFrame to a PySpark DataFrame. The pandas frame keeps
# its own name so `df` only ever refers to the Spark DataFrame.
df = spark.createDataFrame(books_pdf)

# Show the PySpark DataFrame
df.show()

+--------------------+--------------------+-------------------+--------------------+--------------------+
|                  id|               title|             author|         description|              vector|
+--------------------+--------------------+-------------------+--------------------+--------------------+
|40291b05-a9da-5fb...|  The Cat in the Hat|          Dr. Seuss|A mischievous cat...|[0.01772842184581...|
|adc373dd-19f5-55b...|Where the Wild Th...|     Maurice Sendak|A young boy named...|[0.18736113349957...|
|9cb9e3bf-86d0-5fd...|  Green Eggs and Ham|          Dr. Seuss|A character named...|[0.95506783539131...|
|8f39aadf-c249-563...|      Curious George|           H.A. Rey|A curious monkey ...|[0.02401788748345...|
|584aa8cd-5b1f-581...|      Goodnight Moon|Margaret Wise Brown|A bedtime story t...|[0.66434099099570...|
+--------------------+--------------------+-------------------+--------------------+--------------------+



Create the schema in Weaviate:

In [5]:
# Class definition for the books. Every property is plain text, so the
# property list is generated from the property names.
schema = {
    "class": "Book",
    "properties": [
        {"name": prop_name, "dataType": ["text"]}
        for prop_name in ("title", "author", "description")
    ],
}

# Register the class with the Weaviate instance.
client.schema.create_class(schema)

Write the DataFrame to Weaviate:

In [6]:
# Append the books to the "Book" class through the Weaviate Spark connector.
# The "id" column is renamed to "uuid" and then passed as the connector's
# "id" option; the "vector" option names the column holding the embedding.
# NOTE(review): the rename is presumably needed because "id" cannot be used
# as a regular property name -- confirm against the connector docs.
(
    df.withColumnRenamed("id", "uuid")
    .write.format("io.weaviate.spark.Weaviate")
    .option("batchSize", 200)
    .option("scheme", "http")
    .option("host", "weaviate:8080")
    .option("id", "uuid")
    .option("className", "Book")
    .option("vector", "vector")
    .mode("append")
    .save()
)

Check the content in Weaviate:

In [7]:
# Read the stored Book objects back, vectors included, to confirm the write.
client.data_object.get(
    class_name="Book",
    with_vector=True,
)

{'deprecations': [],
 'objects': [{'class': 'Book',
   'creationTimeUnix': 1691767469657,
   'id': '40291b05-a9da-5fb2-90a8-951f49d1dc40',
   'lastUpdateTimeUnix': 1691767469657,
   'properties': {'author': 'Dr. Seuss',
    'description': 'A mischievous cat in a tall hat',
    'title': 'The Cat in the Hat'},
   'vector': [0.017728422, 0.33793798],
   'vectorWeights': None},
  {'class': 'Book',
   'creationTimeUnix': 1691767469657,
   'id': '584aa8cd-5b1f-5814-b98c-144cde544ac6',
   'lastUpdateTimeUnix': 1691767469657,
   'properties': {'author': 'Margaret Wise Brown',
    'description': 'A bedtime story that features a great green room',
    'title': 'Goodnight Moon'},
   'vector': [0.664341, 0.95618093],
   'vectorWeights': None},
  {'class': 'Book',
   'creationTimeUnix': 1691767469657,
   'id': '8f39aadf-c249-563e-953e-f2677496333d',
   'lastUpdateTimeUnix': 1691767469657,
   'properties': {'author': 'H.A. Rey',
    'description': 'A curious monkey who gets into all sorts of trouble