In [1]:
import weaviate
from dotenv import load_dotenv
import os
from langchain.schema.document import Document
import json

Load Weaviate and OpenAI API keys from .env file

In [2]:
load_dotenv('.env')

True

Helper functions to extract data from a directory of JSON files, convert it to LangChain Document objects, and insert those documents into the Weaviate cluster

In [3]:
#helper function to cleanly print json
def jprint(data_in):
    """Print ``data_in`` to stdout as JSON text indented by two spaces.

    Args:
        data_in: Any object that ``json.dumps`` can serialize.
    """
    rendered = json.dumps(data_in, indent=2)
    print(rendered)
    
#function to extract data from directory of json files
def extract_data_from_directory(directory_path: str) -> list:
    """Load every ``.json`` file in a directory and concatenate their records.

    Files are processed in sorted filename order so that repeated runs yield
    the records in the same order regardless of how the filesystem lists them.

    Args:
        directory_path: Directory whose ``*.json`` files should be read.
            Entries not ending in ``.json`` are ignored.

    Returns:
        A single list containing the items of every file's top-level JSON
        array, in filename order. Empty if the directory has no JSON files.

    Raises:
        ValueError: If a file's top-level JSON value is not a list.
        json.JSONDecodeError: If a file does not contain valid JSON.
        OSError: If the directory or a file cannot be read.
    """
    # os.listdir returns entries in arbitrary order; sort for reproducible runs.
    json_files = sorted(f for f in os.listdir(directory_path) if f.endswith('.json'))
    combined_data = []

    for json_file in json_files:
        file_path = os.path.join(directory_path, json_file)

        # Read as UTF-8 explicitly instead of relying on the platform default.
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)

        # list.extend() on a dict or str would silently add keys/characters,
        # which only fails much later when the chunks are converted.
        if not isinstance(data, list):
            raise ValueError(
                f"Expected a JSON array at the top level of {file_path!r}, "
                f"got {type(data).__name__}"
            )
        combined_data.extend(data)

    return combined_data
    
#function to convert python dictionary to LangChain Document object
def dict_to_doc(documents):
    """Convert transcript chunk dictionaries into LangChain ``Document`` objects.

    Args:
        documents: Iterable of chunk dicts. Each is read for the keys
            ``text``, ``Topics``, ``episode_title``, ``date_posted``,
            ``guest``, ``seq_num`` and ``video_url``.

    Returns:
        A list with one ``Document`` per chunk, in input order. The chunk text
        becomes ``page_content``; the remaining values are stored in
        ``metadata`` under the property names used when inserting into
        Weaviate. ``timestamp`` is always the fixed string ``"0:00"``.
    """
    def _as_document(chunk):
        # Look up the text first, then the metadata keys, in a fixed order.
        body = chunk['text']
        meta = {
            "chunk_topics": chunk['Topics'],
            "ep_title": chunk['episode_title'],
            "ep_date": chunk['date_posted'],
            "ep_guest": chunk['guest'],
            "timestamp": "0:00",
            "seq_id": chunk["seq_num"],
            "ep_link": chunk["video_url"],
        }
        return Document(page_content=body, metadata=meta)

    return [_as_document(chunk) for chunk in documents]

#function to insert LangChain Documents into Weaviate cluster
def insert_documents(documents):
    """Create Weaviate objects for each document and for each of its topics.

    For every document one ``Chunk_Node`` object is created from its
    ``page_content`` and metadata, followed by one ``Topic_Node`` object per
    entry in the document's ``chunk_topics`` metadata.

    Uses the notebook-level ``client`` and issues one
    ``client.data_object.create`` call per object.

    Args:
        documents: Iterable of objects exposing ``page_content`` and a
            ``metadata`` mapping with the keys listed in ``copied_fields``.
    """
    copied_fields = (
        "chunk_topics",
        "ep_title",
        "ep_date",
        "ep_guest",
        "timestamp",
        "seq_id",
        "ep_link",
    )

    for document in documents:
        meta = document.metadata

        # Chunk object: body text first, then the metadata copied verbatim.
        chunk_properties = {"chunk_body": document.page_content}
        chunk_properties.update((field, meta[field]) for field in copied_fields)
        client.data_object.create(chunk_properties, "Chunk_Node")

        # One topic object per topic, tagged with the episode title.
        for topic in meta["chunk_topics"]:
            topic_properties = {"topic": topic, "ep_title": meta["ep_title"]}
            client.data_object.create(topic_properties, "Topic_Node")

#function to delete weaviate class schema
def delete_class(class_name):
    """Delete ``class_name`` from the Weaviate schema if it exists.

    Does nothing when the notebook-level ``client`` reports that the class is
    not present in the schema.

    Args:
        class_name: Name of the Weaviate class to remove.
    """
    if not client.schema.exists(class_name):
        return
    client.schema.delete_class(class_name)
            
            

Instantiate Weaviate client and define schema for Chunk and Topic nodes

In [4]:
# Build API-key credentials from the WEAVIATE_API_KEY environment variable.
# NOTE(review): os.getenv returns None when a variable is unset, so a missing
# key is passed through as None here — confirm the .env file defines both keys.
auth_config = weaviate.AuthApiKey(api_key=os.getenv("WEAVIATE_API_KEY"))

# Notebook-level Weaviate client; insert_documents() and delete_class() use
# this global directly.
client = weaviate.Client(
    url="https://streamlit-hackathon-llm-2zcna00b.weaviate.network",
    auth_client_secret=auth_config,
    additional_headers={
        # Presumably consumed by the text2vec-openai vectorizer configured on
        # the classes defined below — TODO confirm.
        "X-OpenAI-Api-Key": os.getenv("OPENAI_API_KEY")
    }
)

In [5]:
# Schema for the Chunk_Node class. chunk_body and chunk_topics carry no
# per-property module config; every other property is flagged "skip" for the
# text2vec-openai module.
chunk_node_definition = {
    "class": "Chunk_Node",
    "vectorizer": "text2vec-openai",
    "moduleConfig": {
        # vectorizeClassName is turned off here (the original note states the
        # Weaviate default is true).
        "text2vec-openai": {"vectorizeClassName": False},
    },
    "vectorIndexConfig": {"distance": "cosine"},
    "properties": [
        {"name": "chunk_body", "dataType": ["text"]},
        {"name": "chunk_topics", "dataType": ["text[]"]},
        # Metadata properties: identical shape, differing only in name/type.
        *[
            {
                "name": prop_name,
                "dataType": [prop_type],
                "moduleConfig": {"text2vec-openai": {"skip": True}},
            }
            for prop_name, prop_type in (
                ("ep_title", "text"),
                ("ep_date", "date"),
                ("ep_guest", "text"),
                ("timestamp", "text"),
                ("seq_id", "int"),
                ("ep_link", "text"),
            )
        ],
    ],
}

# Schema for the Topic_Node class: a topic string plus the title of the
# episode it came from.
topic_node_definition = {
    "class": "Topic_Node",
    "vectorizer": "text2vec-openai",
    "moduleConfig": {"text2vec-openai": {"vectorizeClassName": False}},
    "vectorIndexConfig": {"distance": "cosine"},
    "properties": [
        # The topic text has no per-property module config.
        {"name": "topic", "dataType": ["text"]},
        # The episode title is flagged "skip" for the text2vec-openai module.
        {
            "name": "ep_title",
            "dataType": ["text"],
            "moduleConfig": {"text2vec-openai": {"skip": True}},
        },
    ],
}

In [6]:
# Display the schema currently stored in the cluster.
client.schema.get()

{'classes': []}

In [6]:
# Remove both classes if they already exist (delete_class is a no-op for a
# missing class), so the create_class calls below start from a clean schema.
delete_class("Topic_Node")
delete_class("Chunk_Node")

Create Chunk and Topic node classes in Weaviate cluster

In [7]:
# Register the two class definitions built above with the cluster.
client.schema.create_class(chunk_node_definition)
client.schema.create_class(topic_node_definition)

In [8]:
# Pretty-print the schema as returned by the cluster after class creation.
jprint(client.schema.get())

{
  "classes": [
    {
      "class": "Chunk_Node",
      "invertedIndexConfig": {
        "bm25": {
          "b": 0.75,
          "k1": 1.2
        },
        "cleanupIntervalSeconds": 60,
        "stopwords": {
          "additions": null,
          "preset": "en",
          "removals": null
        }
      },
      "moduleConfig": {
        "text2vec-openai": {
          "baseURL": "https://api.openai.com",
          "model": "ada",
          "modelVersion": "002",
          "type": "text",
          "vectorizeClassName": false
        }
      },
      "multiTenancyConfig": {
        "enabled": false
      },
      "properties": [
        {
          "dataType": [
            "text"
          ],
          "indexFilterable": true,
          "indexSearchable": true,
          "moduleConfig": {
            "text2vec-openai": {
              "skip": false,
              "vectorizePropertyName": false
            }
          },
          "name": "chunk_body",
          "tokenization": "

Extract data from the directory of JSON files, convert it to LangChain Document objects, and insert them into the Weaviate cluster

In [9]:
# Read every JSON file under "LangChain Documents" (path relative to the
# working directory), wrap each chunk in a Document, and insert them.
# NOTE(review): insert_documents passes no explicit object id, so re-running
# this cell presumably creates duplicate objects — confirm, and drop/recreate
# the classes before re-inserting.
transcript_dicts = extract_data_from_directory("LangChain Documents")
transcript_docs = dict_to_doc(transcript_dicts)
insert_documents(transcript_docs)

Make sure all chunks/topics were inserted into Weaviate cluster

In [10]:
# Print the aggregate meta count for each class to check what was inserted.
jprint(client.query.aggregate("Chunk_Node").with_meta_count().do())
jprint(client.query.aggregate("Topic_Node").with_meta_count().do())

{
  "data": {
    "Aggregate": {
      "Chunk_Node": [
        {
          "meta": {
            "count": 99
          }
        }
      ]
    }
  }
}
{
  "data": {
    "Aggregate": {
      "Topic_Node": [
        {
          "meta": {
            "count": 433
          }
        }
      ]
    }
  }
}
