In [1]:
schema = {
    "classes": [
        {
            "class": "TextChunk",
            "description": "A segmented portion of text from a scraped webpage.",
            "vectorizer": "text2vec-openai",
            "moduleConfig": {
                "generative-openai": {
                    "model": "gpt-3.5-turbo"
                }
            },
            "properties": [
                {
                    "name": "key",
                    "dataType": ["string"],
                    "description": "The identifier for the text chunk.",
                    "indexInverted": True
                },
                {
                    "name": "text",
                    "dataType": ["string"],
                    "description": "The content of the text chunk.",
                    "indexInverted": True
                }
            ]
        },
        {
            "class": "Page",
            "description": "Details of a specific webpage scraped during a session.",
            "properties": [
                {
                    "name": "pageURL",
                    "dataType": ["string"],
                    "description": "The specific URL of the scraped webpage.",
                    "indexInverted": True
                },
                {
                    "name": "chunks",
                    "dataType": ["TextChunk"],
                    "description": "Segmented content chunks from the scraped page."
                }
            ]
        },
        {
            "class": "ScrapeSession",
            "description": "Represents a specific scrape event at a certain timestamp.",
            "invertedIndexConfig": {
                "indexTimestamps": True
            },
            "properties": [
                {
                    "name": "timestamp",
                    "dataType": ["date"],
                    "description": "The date and time when the scrape session occurred.",
                    "indexInverted": True
                },
                {
                    "name": "pages",
                    "dataType": ["Page"],
                    "description": "Webpages scraped during the session."
                }
            ]
        },
        {
            "class": "Website",
            "description": "Represents a website which can have multiple scrape sessions.",
            "properties": [
                {
                    "name": "websiteAddress",
                    "dataType": ["string"],
                    "description": "The address of the website.",
                    "indexInverted": True
                },
                {
                    "name": "scrapeSessions",
                    "dataType": ["ScrapeSession"],
                    "description": "The scrape sessions associated with the website."
                }
            ]
        }
    ]
}

# To print, for example, the properties of the Website class:
print(schema["classes"][-1]["properties"])

[{'name': 'websiteAddress', 'dataType': ['string'], 'description': 'The address of the website.', 'indexInverted': True}, {'name': 'scrapeSessions', 'dataType': ['ScrapeSession'], 'description': 'The scrape sessions associated with the website.'}]


In [2]:
import weaviate
import csv
from os import listdir
from datetime import datetime, timezone

def create_date(date_string):
    # convert it to datetime object
    dt_object = datetime.strptime(date_string, "%Y-%m-%dT%H-%M-%S")
    # convert datetime object to RFC 3339 string (with timezone)
    rfc3339_string = dt_object.replace(tzinfo=timezone.utc).isoformat()
    return rfc3339_string


# Initialize Weaviate client
client = weaviate.Client(
    url="http://34.66.77.236:8080",
    additional_headers={
        "X-OPENAI-Api-Key": "sk-OPEN_AI_KEY",  # Replace with your OpenAI key
    }
)

# Delete existing schema (caution: this deletes the current structure)
client.schema.delete_all()

# Here we use the schema created in the previous cell.
client.schema.create(schema)
print("Schema was created.")

# Function to load data from CSV and extract website name and timestamp from filename
def load_csv_data(directory):
    all_data = []
    for filename in listdir(directory):
        if filename.endswith('.csv'):
            website_name, timestamp = filename.rsplit('.', 1)[0].split('_')
            timestamp = create_date(timestamp)
            with open(f"{directory}/{filename}", mode='r') as file:
                reader = csv.DictReader(file)
                for row in reader:
                    row['websiteAddress'] = website_name
                    row['timestamp'] = timestamp
                    all_data.append(row)
    return all_data

# Load CSV data. TODO: Scraper needs to be modified to save CSVs here
data_directory = '../data'

csv_data = load_csv_data(data_directory)

text_chunk_size = 500

def split_into_chunks(string, text_chunk_size):
    words = string.split()
    chunks = []
    chunk = ""
    
    # Split into chunks
    for idx, word in enumerate(words):
        if idx % text_chunk_size == 0 and idx > 0:
            chunks.append(chunk.strip())
            chunk = ""
        chunk += word + " "
    
    # Add last chunk to list by concatenating with last chunk in the list
    if chunk and len(chunks) > 0:
        chunks[-1] += chunk.strip()
        # chunks.append(chunk.strip())
        
    # If there's only one chunk, return list with one chunk
    elif chunk:
        chunks.append(chunk.strip())
    return chunks

Schema was created.


In [None]:
csv_data

In [4]:
from weaviate.batch import Batch 
from weaviate.util import generate_uuid5 

# create a "Website" for the website
csv_website_address = csv_data[0]['websiteAddress']

print(csv_website_address)

websites_in_weaviate = client.query.raw(f"""
{{
  Get {{
    Website(where: {{path: ["websiteAddress"], operator: Equal, valueString: "{csv_website_address}"}}){{
      websiteAddress
    }}
  }}
}}
""")

if len(websites_in_weaviate['data']['Get']['Website'])==0:
    website_uuid = generate_uuid5(csv_website_address)
    client.data_object.create({"websiteAddress": csv_website_address}, "Website", website_uuid)

# Create a "ScrapeSession" for the session
csv_timestamp = csv_data[0]['timestamp']  # assuming csv_data[0] has the timestamp
scrapesessions_in_weaviate = client.query.raw(f"""
{{
  Get {{
    ScrapeSession(where: {{path: ["timestamp"], operator: Equal, valueDate: "{csv_timestamp}"}}) {{
      timestamp
    }}
  }}
}}
""")

print(scrapesessions_in_weaviate)

if len(scrapesessions_in_weaviate['data']['Get']['ScrapeSession']) == 0:
    scrapesession_uuid = generate_uuid5(csv_timestamp)
    client.data_object.create({"timestamp": csv_timestamp}, "ScrapeSession", scrapesession_uuid)
    
    # Add to_object_class_name='Website' as an argument
    client.batch.add_reference(
        from_object_uuid=website_uuid, 
        from_object_class_name='Website',
        from_property_name='scrapeSessions',
        to_object_uuid=scrapesession_uuid,
        to_object_class_name='ScrapeSession'
    )

# For each entry in the list csv_data
for data in csv_data:
    # Create a "Page" entry for the webpage
    pageAddress = data['page']
    page_uuid = generate_uuid5(pageAddress)

    client.data_object.create({"pageURL": pageAddress}, "Page", page_uuid)

    # Add 'ScrapeSession' into to_object_class_name
    client.batch.add_reference(
        from_object_uuid=scrapesession_uuid,
        from_object_class_name='ScrapeSession',
        from_property_name='pages',
        to_object_uuid=page_uuid,
        to_object_class_name='Page'
    )

    # Split the text into chunks and create TextChunks
    chunks = split_into_chunks(data['text'], text_chunk_size)

    for chunk in chunks:
        textchunk_uuid = generate_uuid5(chunk)
        client.data_object.create({"text": chunk}, "TextChunk", textchunk_uuid)

        # Add 'Page' into to_object_class_name
        client.batch.add_reference(
            from_object_uuid=page_uuid,
            from_object_class_name='Page',
            from_property_name='chunks',
            to_object_uuid=textchunk_uuid,
            to_object_class_name='TextChunk'
        )

# Submit the batch to weaviate
# status_objects = client.batch.create_objects()
# status_references = client.batch.create_references()

# Flush to Weaviate
client.batch.configure(batch_size=20)
client.batch.flush()

www.chooch.com
{'data': {'Get': {'ScrapeSession': []}}}


In [5]:
client.schema.get()

{'classes': [{'class': 'TextChunk',
   'description': 'A segmented portion of text from a scraped webpage.',
   'invertedIndexConfig': {'bm25': {'b': 0.75, 'k1': 1.2},
    'cleanupIntervalSeconds': 60,
    'stopwords': {'additions': None, 'preset': 'en', 'removals': None}},
   'moduleConfig': {'generative-openai': {'model': 'gpt-3.5-turbo'},
    'text2vec-openai': {'model': 'ada',
     'modelVersion': '002',
     'type': 'text',
     'vectorizeClassName': True}},
   'multiTenancyConfig': {'enabled': False},
   'properties': [{'dataType': ['text'],
     'description': 'The identifier for the text chunk.',
     'indexFilterable': True,
     'indexSearchable': True,
     'moduleConfig': {'text2vec-openai': {'skip': False,
       'vectorizePropertyName': False}},
     'name': 'key',
     'tokenization': 'whitespace'},
    {'dataType': ['text'],
     'description': 'The content of the text chunk.',
     'indexFilterable': True,
     'indexSearchable': True,
     'moduleConfig': {'text2vec-o

In [None]:
# Confirm data was properly loaded
query = """
{
  Get {
    Website {
      websiteAddress
      scrapeSessions {
        ... on ScrapeSession {
          pages {
            ... on Page {
              pageURL
              chunks {
                ... on TextChunk {
                  text
                }
              }
            }
          }
        }
      }
    }
  }
}
"""

result = client.query.raw(query)
print(result)


In [7]:
# # Setup the query for the Website class and run it
# query_result = (
#     client.query
#     .get("Website", ["websiteAddress", "scrapeSessions { timestamp, pages { pageURL, chunks { key, text } } }"])  # specify the properties you're interested in here
#     .with_where(main_filter)
#     .with_near_text({"concepts": [search_text]})
#     .do()
# )

# print(query_result)

In [9]:
search_text = "What is computer vision?"

# Define the where filter for websiteAddress
website_addr_filter = {
    "path": ["websiteAddress"],
    "operator": "Equal",
    "valueString": "www.chooch.com"   # Assuming websiteAddress is type string
}

# Define the where filter for timestamp
timestamp_filter = {
    "path": ["scrapeSessions", "ScrapeSession", "timestamp"], # Following the property path
    "operator": "Equal",
    "valueDate": "2023-10-03T15:30:00+00:00"   # Assuming timestamp is type date
}

# Define the main where filter object
main_filter = {
    "operator": "And",
    "operands": [
        website_addr_filter,
        timestamp_filter
    ]
}

# Setup the query for the Website class and run it
query_result = (
    client.query
    .get("Website", ["websiteAddress", "scrapeSessions { timestamp, pages { pageURL, chunks { key, text } } }"])  # specify the properties you're interested in here
    .with_where(main_filter)
    .with_near_text({"concepts": [search_text]})
    .do()
)

print(query_result)

{'errors': [{'locations': [{'column': 320, 'line': 1}], 'message': 'Cannot query field "timestamp" on type "WebsiteScrapeSessionsObj". Did you mean to use an inline fragment on "ScrapeSession"?', 'path': None}, {'locations': [{'column': 331, 'line': 1}], 'message': 'Cannot query field "pages" on type "WebsiteScrapeSessionsObj". Did you mean to use an inline fragment on "ScrapeSession"?', 'path': None}]}
