# Indexing Stage
In the initial indexing stage, text data must first be collected as documents and metadata. In this implementation, this is done by scraping a website. The data must then be split into "nodes", each of which represents a "chunk" of the data containing a certain portion of information. The nodes are then indexed via an embedding model; we plan to use OpenAI's Ada v2 embedding model. Together, the embeddings and metadata create a rich representation to aid in retrieval.

In [1]:
# Silence DeprecationWarning process-wide. The author's stated intent is to hide
# Pydantic deprecation noise that surfaces through llama_index; note that this
# filter applies to every DeprecationWarning, not only Pydantic's.
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)

## Hard-coded stuff in this cell that will be replaced in the cloud function
* The OpenAI key will be an environment variable
* Weaviate IP address that we will work on finding programmatically

In [2]:
# !pip install weaviate-client
# !pip install openai
# !pip install llama-index
# !pip install python-dotenv

import weaviate
import pandas as pd
import os

from dotenv import load_dotenv
from datetime import datetime, timezone
from llama_index import Document
# Suppress Pydantic warnings. This filter ignores every Warning subclass
# process-wide, not just Pydantic's.
# NOTE(review): presumably placed ahead of the remaining llama_index imports so
# that import-time warnings are hidden too — confirm before moving it.
import warnings
warnings.simplefilter(action='ignore', category=Warning)


from llama_index.node_parser import SimpleNodeParser
from llama_index.storage.storage_context import StorageContext
from llama_index.vector_stores.weaviate import WeaviateVectorStore
from llama_index.indices.vector_store.base import VectorStoreIndex
from llama_index.vector_stores.types import ExactMatchFilter, MetadataFilters

# Load the .env file
load_dotenv()

# Retrieve the OpenAI API key from the environment variables
OPENAI_KEY = os.getenv("OPENAI_KEY")

# Fail fast with an actionable message: os.getenv returns None when the variable
# is missing, and assigning None into os.environ would otherwise raise an opaque
# "TypeError: str expected, not NoneType".
if not OPENAI_KEY:
    raise RuntimeError(
        "OPENAI_KEY is not set; define it in the environment or in a .env file."
    )

# Set the OpenAI key as an Environment Variable (for when it's run on GCS)
os.environ["OPENAI_API_KEY"] = OPENAI_KEY

# Current Weaviate IP (hard-coded for now; to be discovered programmatically
# once this runs as a cloud function)
WEAVIATE_IP_ADDRESS = "34.42.138.162"

In [3]:
def create_date(date_string):
    """
    Turn a scrape timestamp into a timezone-aware RFC 3339 string.

    The input carries no timezone information; it is labelled as UTC
    (no clock conversion is performed).

    Parameters:
    - date_string (str): Timestamp in the format "%Y-%m-%dT%H-%M-%S",
      e.g. "2023-10-07T06-47-45".

    Returns:
    - str: ISO 8601 / RFC 3339 date-time string with a "+00:00" offset,
      e.g. "2023-10-07T06:47:45+00:00".

    Raises:
    - ValueError: if date_string does not match the expected format.
    """
    # Parse into a naive datetime first, using the hyphen-separated time format
    naive_dt = datetime.strptime(date_string, "%Y-%m-%dT%H-%M-%S")
    # Attach UTC so isoformat() emits an explicit offset
    utc_dt = naive_dt.replace(tzinfo=timezone.utc)
    return utc_dt.isoformat()

In [4]:
# Connect to the Weaviate instance over plain HTTP on port 8080
client = weaviate.Client(url=f"http://{WEAVIATE_IP_ADDRESS}:8080")

# # Delete existing schema (caution: this deletes the current structure)
# client.schema.delete_all()

# # Here we use the schema created in the previous cell.
# client.schema.create(schema)
# print("Schema was created.")

## Hard-coded stuff in this cell that will be replaced in the cloud function
* data_directory will be the bucket
* csv_file will be the new file added to the bucket

In [5]:
data_directory = "./sample_data"
csv_file = 'descript.com_2023-10-07T06-47-45.csv'

# Get the website address and timestamp from the filename, which follows
# "<websiteAddress>_<timestamp>.csv". Drop the extension, then split on the LAST
# underscore only: the timestamp format used here (e.g. 2023-10-07T06-47-45)
# has no underscore, but a website address may contain one, which would make a
# plain split('_') fail with "too many values to unpack".
websiteAddress, timestamp = csv_file.rsplit('.', 1)[0].rsplit('_', 1)

# Read in the CSV
df = pd.read_csv(f"{data_directory}/{csv_file}")

# Manually assemble the documents: one Document per CSV row, using the row's
# 'text' column as content and its 'key' column as the document id. Every
# document from this file shares the same website/timestamp metadata.
# NOTE(review): the timestamp is stored exactly as it appears in the filename;
# create_date() is defined in this file but is not applied here — confirm
# whether the Weaviate schema expects an RFC 3339 date for this field.
documents = []
for _, row in df.iterrows():
    document = Document(
        text=row['text'],
        metadata={
            'websiteAddress': websiteAddress,
            'timestamp': timestamp,
        },
    )
    document.doc_id = row['key']
    documents.append(document)

In [6]:
# Wrap the Weaviate client as a LlamaIndex vector store that targets the
# "Pages" index and keeps the document text under the "text" key.
vector_store = WeaviateVectorStore(weaviate_client=client, index_name="Pages", text_key="text")

# Build an index on top of the existing vector store; service_context=None
# leaves the service configuration at the library's defaults.
index = VectorStoreIndex.from_vector_store(vector_store=vector_store, service_context=None)

# Insert the documents one at a time.
# NOTE(review): presumably each insert chunks and embeds the document before
# writing to Weaviate — confirm against the llama_index version in use.
for document in documents:
    index.insert(document)

In [7]:
# Sanity check: number of documents assembled from the CSV (shown as the cell output)
len(documents)

187