In [1]:
import weaviate
from dotenv import load_dotenv
import os

load_dotenv()
  
WCS_API_KEY = os.getenv("WCS_API_KEY")
WEAVIATE_CLUSTER_URL= os.getenv("WEAVIATE_CLUSTER_URL")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
  
# Connect to a WCS instance
weaviate_client = weaviate.connect_to_wcs(
    cluster_url=WEAVIATE_CLUSTER_URL,
    auth_credentials=weaviate.auth.AuthApiKey(WCS_API_KEY),
    headers = {
        'X-OpenAI-Api-Key': OPENAI_API_KEY
    }
    )



In [2]:
import weaviate.classes.config as wvcc

weaviate_client.collections.delete_all()
collection = weaviate_client.collections.create(
    name = "WeaviateBlogChunk",
    vectorizer_config=wvcc.Configure.Vectorizer.text2vec_openai(),
    properties=[
        wvcc.Property(name = "content", data_type=wvcc.DataType.TEXT),
        wvcc.Property(name = "author", data_type=wvcc.DataType.TEXT),
    ]
)

In [3]:
import re

def chunk_list(lst, chunk_size):
    """Break a list into chunks of the specified size."""
    return [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)]

def split_into_sentences(text):
    """Split text into sentences using regular expressions."""
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s', text)
    return [sentence.strip() for sentence in sentences if sentence.strip()]

def read_and_chunk_index_files(main_folder_path):
    """Read index.md files from subfolders, split into sentences, and chunk every 5 sentences."""
    blog_chunks = []
    for folder_name in os.listdir(main_folder_path):
        subfolder_path = os.path.join(main_folder_path, folder_name)
        if os.path.isdir(subfolder_path):
            index_file_path = os.path.join(subfolder_path, "index.mdx")
            if os.path.isfile(index_file_path):
                with open(index_file_path, 'r', encoding='utf-8') as file:
                    content = file.read()
                    sentences = split_into_sentences(content)
                    sentence_chunks = chunk_list(sentences, 5)
                    sentence_chunks = [' '.join(chuck) for chuck in sentence_chunks]
                    blog_chunks.extend(sentence_chunks)
    return blog_chunks

main_folder_path = './blog'
blog_chunks = read_and_chunk_index_files(main_folder_path)

In [4]:
len(blog_chunks)

1780

In [5]:
blog_chunks[0]

"---\ntitle: 'Accelerating Vector Search up to +40% with Intel’s latest Xeon CPU - Emerald Rapids'\nslug: intel\nauthors: [zain, asdine, john]\ndate: 2024-03-26\nimage: ./img/hero.png\ntags: ['engineering', 'research']\ndescription: 'Boosting Weaviate using SIMD-AVX512, Loop Unrolling and Compiler Optimizations'\n---\n\n![HERO image](./img/hero.png)\n\n**Overview of Key Sections:**\n- [**Vector Distance Calculations**](#vector-distance-calculations) Different vector distance metrics popularly used in Weaviate. - [**Implementations of Distance Calculations in Weaviate**](#vector-distance-implementations) Improvements under the hood for implementation of Dot product and L2 distance metrics. - [**Intel’s 5th Gen Intel Xeon Processor, Emerald Rapids**](#enter-intel-emerald-rapids)  More on Intel's new 5th Gen Xeon processor. - [**Benchmarking Performance**](#lets-talk-numbers) Performance numbers on microbenchmarks along with simulated real-world usage scenarios. What’s the most important 

In [6]:
blogs = weaviate_client.collections.get("WeaviateBlogChunk")

for idx, blog_chunk in enumerate(blog_chunks):
    upload = blogs.data.insert(
        properties={
            "content": blog_chunk
        }
    )