In [1]:
# Weaviate schema with a single class, TextChunk. Every property shares the
# same shape (one data type, inverted index enabled), so the property list is
# generated from (name, data type, description) triples.
schema = {
    "classes": [
        {
            "class": "TextChunk",
            "description": "A segmented portion of text from a scraped webpage with full details.",
            "invertedIndexConfig": {"indexTimestamps": True},
            "vectorizer": "text2vec-openai",
            "moduleConfig": {
                "generative-openai": {"model": "gpt-3.5-turbo"},
            },
            "properties": [
                {
                    "name": prop_name,
                    "dataType": [prop_type],
                    "description": prop_description,
                    "indexInverted": True,
                }
                for prop_name, prop_type, prop_description in (
                    ("key", "string", "The identifier for the text chunk."),
                    ("text", "string", "The content of the text chunk."),
                    (
                        "pageURL",
                        "string",
                        "The specific URL of the scraped webpage this chunk belongs to.",
                    ),
                    (
                        "websiteAddress",
                        "string",
                        "The address of the website this chunk comes from.",
                    ),
                    (
                        "timestamp",
                        "date",
                        "The date and time when the chunk was scraped.",
                    ),
                )
            ],
        }
    ]
}


In [2]:
import csv
import json
import os
from datetime import datetime, timezone
from os import listdir

import weaviate

def create_date(date_string):
    """Turn a ``YYYY-MM-DDTHH-MM-SS`` stamp into an RFC 3339 string.

    The parsed value carries no timezone, so it is labelled as UTC before
    being formatted (the clock fields themselves are not shifted).

    Raises:
        ValueError: if ``date_string`` does not match the expected format.
    """
    naive = datetime.strptime(date_string, "%Y-%m-%dT%H-%M-%S")
    return naive.replace(tzinfo=timezone.utc).isoformat()


# Initialize Weaviate client.
# The OpenAI key is taken from the OPENAI_API_KEY environment variable so a
# real secret never has to be pasted into (and saved with) the notebook. The
# original placeholder is kept as the fallback when the variable is unset.
client = weaviate.Client(
    # Weaviate instance URL
    url="http://34.66.77.236:8080",
    additional_headers={
        "X-OPENAI-Api-Key": os.environ.get("OPENAI_API_KEY", "sk-OPEN_AI_KEY"),
    },
)

# Delete existing schema (caution: this deletes the current structure)
client.schema.delete_all()

# Here we use the schema created in the previous cell.
client.schema.create(schema)
print("Schema was created.")

# Function to load data from CSV and extract website name and timestamp from filename
def load_csv_data(directory):
    """Read every ``*.csv`` file in ``directory`` into one list of row dicts.

    File names are expected to look like ``<website>_<timestamp>.csv`` where
    the timestamp part is in the format accepted by ``create_date``. Each row
    (as produced by ``csv.DictReader``) gets two extra keys taken from the
    file name: ``websiteAddress`` and ``timestamp``.

    Args:
        directory: path of the directory to scan (not recursive).

    Returns:
        list of dicts, one per CSV row, files visited in sorted name order.

    Raises:
        ValueError: if a CSV file name has no ``_`` separator or its
            timestamp part cannot be parsed by ``create_date``.
    """
    all_data = []
    # listdir() returns names in arbitrary order; sorting makes the row order
    # (and therefore csv_data[0], which later cells rely on) reproducible.
    for filename in sorted(listdir(directory)):
        if not filename.endswith('.csv'):
            continue
        stem = filename.rsplit('.', 1)[0]
        # Split on the LAST underscore only, so website names that themselves
        # contain '_' no longer break the two-value unpacking.
        website_name, timestamp = stem.rsplit('_', 1)
        timestamp = create_date(timestamp)
        # newline='' lets the csv module handle line endings itself, which is
        # required for quoted fields containing embedded newlines.
        with open(f"{directory}/{filename}", mode='r', newline='') as file:
            for row in csv.DictReader(file):
                row['websiteAddress'] = website_name
                row['timestamp'] = timestamp
                all_data.append(row)
    return all_data

# Nominal number of whitespace-separated words per text chunk.
text_chunk_size = 500

# Load CSV data. TODO: Scraper needs to be modified to save CSVs here
data_directory = '../data'
csv_data = load_csv_data(data_directory)

def split_into_chunks(string, text_chunk_size):
    """Split ``string`` into chunks of ``text_chunk_size`` words each.

    Words are the whitespace-separated tokens of ``string``; inside a chunk
    they are joined by single spaces. The final group of words is not emitted
    as its own chunk: it is merged into the preceding chunk (separated by a
    space), so the last chunk may be longer than ``text_chunk_size`` words.
    If the text fits in a single group, that group is returned on its own.

    Args:
        string: text to split.
        text_chunk_size: number of words per chunk; must be at least 1.

    Returns:
        list of chunk strings; empty when ``string`` has no words.

    Raises:
        ValueError: if ``text_chunk_size`` is smaller than 1.
    """
    if text_chunk_size < 1:
        raise ValueError("text_chunk_size must be a positive integer")

    words = string.split()
    chunks = [
        " ".join(words[start:start + text_chunk_size])
        for start in range(0, len(words), text_chunk_size)
    ]

    # Fold the trailing group into the previous chunk. The separating space
    # matters: without it the last word of one group and the first word of
    # the next would be fused into a single token.
    if len(chunks) > 1:
        tail = chunks.pop()
        chunks[-1] = f"{chunks[-1]} {tail}"
    return chunks

Schema was created.


In [None]:
# Preview only the first few rows; displaying the whole list would flood the
# cell output with every scraped page's full text.
csv_data[:5]

In [10]:
from weaviate.batch import Batch 
from weaviate.util import generate_uuid5 

# The first loaded row supplies the website/timestamp pair used for the
# "already imported?" check.
_first_row = csv_data[0]
csv_website_address = _first_row['websiteAddress']
csv_timestamp = _first_row['timestamp']

def text_chunk_exists(client, website_address, timestamp):
    """Report whether any TextChunk exists for a website/timestamp pair.

    Sends a raw GraphQL ``Get`` query filtered on ``websiteAddress`` and
    ``timestamp`` and prints the raw response.

    Args:
        client: object exposing ``client.query.raw(query)`` that returns the
            GraphQL response as a dict.
        website_address: value to match against ``websiteAddress``.
        timestamp: date string to match against ``timestamp``.

    Returns:
        True if the response lists at least one TextChunk, False otherwise.

    Raises:
        RuntimeError: if the response carries a GraphQL ``errors`` entry.
    """
    # json.dumps produces a double-quoted literal with quotes, backslashes
    # and control characters escaped, so the values cannot terminate the
    # GraphQL string early or inject extra query text.
    website_literal = json.dumps(str(website_address), ensure_ascii=False)
    timestamp_literal = json.dumps(str(timestamp), ensure_ascii=False)

    # limit: 1 -- a single hit is enough to answer an existence check.
    query = f"""
    {{
      Get {{
        TextChunk (
          limit: 1
          where: {{
            operator: And
            operands: [{{
                path: ["websiteAddress"],
                operator: Equal,
                valueString: {website_literal}
            }}, {{
                path: ["timestamp"],
                operator: Equal,
                valueDate: {timestamp_literal}
            }}]
        }}) {{
          __typename
        }}
      }}
    }}
    """

    result = client.query.raw(query)
    print(result)

    # Surface GraphQL-level failures explicitly instead of failing later with
    # an opaque KeyError/TypeError while digging into the response.
    if "errors" in result:
        raise RuntimeError(f"Weaviate query failed: {result['errors']}")

    matches = result['data']['Get']['TextChunk']
    return bool(matches)  # True if a TextChunk exists, False otherwise

# Example call with literal values:
# text_chunk_exists(client, "www.bubble.com", "2023-10-03T15:30:00+00:00")
already_exists = text_chunk_exists(
    client=client,
    website_address=csv_website_address,
    timestamp=csv_timestamp,
)

{'data': {'Get': {'TextChunk': []}}}


In [16]:
from weaviate.batch import Batch 
from weaviate.util import generate_uuid5 

# Import every CSV row as one or more TextChunk objects, unless the
# existence check already found data for this website/timestamp.
if not already_exists:
    # Configure the batch size
    client.batch.configure(batch_size=20)

    for data in csv_data:
        # Split text into chunks using the previously defined function
        chunks = split_into_chunks(data["text"], text_chunk_size)
        for i, chunk in enumerate(chunks):
            # Create each chunk as a separate TextChunk object
            text_chunk = {
                "key": f"{data['page']}_chunk_{i}",
                "text": chunk,
                "pageURL": data["page"],
                "websiteAddress": data["websiteAddress"],
                "timestamp": data["timestamp"]
            }

            # Deterministic UUID for this TextChunk. The page has to be part
            # of the seed: all rows of one CSV share the same website and
            # timestamp, so a seed built only from those plus the chunk index
            # gives chunk i of every page the same UUID and the pages
            # overwrite each other on import.
            text_chunk_id = generate_uuid5(
                f"{data['websiteAddress']}_{text_chunk['key']}_{data['timestamp']}"
            )

            # Add the data object to Weaviate using batch
            client.batch.add_data_object(data_object=text_chunk, class_name="TextChunk", uuid=text_chunk_id)

    # Flush the batch to make sure all data is submitted
    client.batch.flush()

In [None]:
# NOT BATCHING
# if not already_exists:
#     for data in csv_data:
#         # Split text into chunks using the previously defined function
#         chunks = split_into_chunks(data["text"], text_chunk_size)
#         for i, chunk in enumerate(chunks):
#             # Create each chunk as a separate TextChunk object
#             text_chunk = {
#                 "key": f"{data['key']}_chunk_{i}",
#                 "text": chunk,
#                 "pageURL": data["key"],
#                 "websiteAddress": data["websiteAddress"],
#                 "timestamp": data["timestamp"]
#             }

#             # Generate a unique UUID for this TextChunk
#             text_chunk_id = generate_uuid(f"{data['websiteAddress']}_chunk_{i}{data['timestamp']}")

#             # Add the data object to Weaviate
#             client.data_object.create(data_object=text_chunk, class_name="TextChunk", uuid=text_chunk_id)

In [17]:
# Display the schema as the server reports it, to compare against the dict
# that was submitted (the captured output shows server-side defaults and the
# 'string' properties reported as 'text').
client.schema.get()

{'classes': [{'class': 'TextChunk',
   'description': 'A segmented portion of text from a scraped webpage with full details.',
   'invertedIndexConfig': {'bm25': {'b': 0.75, 'k1': 1.2},
    'cleanupIntervalSeconds': 60,
    'indexTimestamps': True,
    'stopwords': {'additions': None, 'preset': 'en', 'removals': None}},
   'moduleConfig': {'generative-openai': {'model': 'gpt-3.5-turbo'},
    'text2vec-openai': {'model': 'ada',
     'modelVersion': '002',
     'type': 'text',
     'vectorizeClassName': True}},
   'multiTenancyConfig': {'enabled': False},
   'properties': [{'dataType': ['text'],
     'description': 'The identifier for the text chunk.',
     'indexFilterable': True,
     'indexSearchable': True,
     'moduleConfig': {'text2vec-openai': {'skip': False,
       'vectorizePropertyName': False}},
     'name': 'key',
     'tokenization': 'whitespace'},
    {'dataType': ['text'],
     'description': 'The content of the text chunk.',
     'indexFilterable': True,
     'indexSearc

In [None]:
# Check the imported objects via a raw GraphQL query that selects every
# TextChunk property.
# NOTE(review): the query has no explicit limit, so the server's default
# result limit presumably applies and not all objects may be returned - confirm.
query = """
{
  Get {
    TextChunk {
      key
      text
      pageURL
      websiteAddress
      timestamp
    }
  }
}
"""

# Send the query as-is and print the raw response dict.
result = client.query.raw(query)
print(result)

In [23]:
# Semantic search restricted to one website snapshot.
text_to_search = "What is computer vision?"

# Both conditions must hold: exact website address AND exact scrape timestamp.
where_filter = dict(
    operator="And",
    operands=[
        dict(
            path=["websiteAddress"],
            operator="Equal",
            valueString="www.chooch.com",
        ),
        dict(
            path=["timestamp"],
            operator="Equal",
            valueDate="2023-10-03T15:30:00+00:00",
        ),
    ],
)

# Fetch the text of the two chunks nearest to the search phrase.
results = (
    client.query
    .get('TextChunk', ['text'])
    .with_limit(2)
    .with_near_text({'concepts': [text_to_search]})
    .with_where(where_filter)
    .do()
)

print(results)

{'data': {'Get': {'TextChunk': [{'text': 'the field’s boundaries in addressing these issues. Techniques such as histogram equalization, gamma correction, SIFT, SURF, RPCA, and the use of CNNs, GNNs, and semi-supervised and unsupervised learning techniques, along with data augmentation strategies, have all been instrumental in overcoming these challenges. Continued investment in research, development, and training of the next generation of computer vision scientists is vital for the field’s evolution. As computer vision advances, it will play an increasingly important role in driving efficiency and innovation in many sectors of the economy and society. Despite the challenges faced, the future of computer vision technology remains promising, with immense potential to reshape our world. Computer vision platforms of tomorrow The most recent wave of generative AI technologies will prove instrumental in shaping the next iterations of computer vision solutions. Today’s computer vision platfor