In [None]:
from llama_index import Document
from llama_index.schema import MetadataMode

In [27]:
def _string_property(name, description):
    """Return a property definition of data type ``string`` with ``indexInverted`` enabled."""
    return {
        "name": name,
        "dataType": ["string"],
        "description": description,
        "indexInverted": True
    }


def _reference_property(name, target_class, description):
    """Return a property definition whose data type is another class in this schema."""
    return {
        "name": name,
        "dataType": [target_class],
        "description": description,
    }


def _generative_openai_config():
    """Return a fresh ``moduleConfig`` dict selecting the gpt-3.5-turbo generative model."""
    return {
        "generative-openai": {
            "model": "gpt-3.5-turbo"
        }
    }


# Website -> ScrapedSession -> ScrapedPage, linked through reference properties.
_website_class = {
    "class": "Website",
    "description": "A website which can have multiple scraped sessions.",
    "properties": [
        _string_property("websiteAddress", "The address of the website."),
        _reference_property(
            "scrapedSessions",
            "ScrapedSession",
            "Sessions when the website was scraped.",
        ),
    ]
}

_scraped_session_class = {
    "class": "ScrapedSession",
    "description": "A specific scraping session of a website.",
    "vectorizer": "text2vec-openai",
    "moduleConfig": _generative_openai_config(),
    "properties": [
        _string_property(
            "timestamp",
            "Timestamp when the scraping session occurred.",
        ),
        _reference_property(
            "scrapedPages",
            "ScrapedPage",
            "Pages scraped in this session.",
        ),
    ]
}

_scraped_page_class = {
    "class": "ScrapedPage",
    "description": "Content of a specific page from a scraping session.",
    "vectorizer": "text2vec-openai",
    "moduleConfig": _generative_openai_config(),
    "properties": [
        _string_property("pageURL", "The URL of the scraped page."),
        _string_property("textContent", "The content text of the scraped page."),
    ]
}

# Schema definition passed to `client.schema.create` in a later cell.
schema = {
    "classes": [
        _website_class,
        _scraped_session_class,
        _scraped_page_class,
    ]
}


In [32]:
import csv
import os
from os import listdir

import weaviate

# Initialize Weaviate client.
# Connection settings are read from the environment so that no credential is
# stored in the notebook:
#   OPENAI_API_KEY - required; forwarded to Weaviate in the X-OPENAI-Api-Key header.
#   WEAVIATE_URL   - optional; defaults to the endpoint this notebook has been using.
# SECURITY: an OpenAI key used to be hard-coded in this cell. Treat that key as
# leaked and revoke it.
openai_api_key = os.environ.get("OPENAI_API_KEY")
if not openai_api_key:
    raise RuntimeError(
        "Set the OPENAI_API_KEY environment variable before running this cell."
    )

client = weaviate.Client(
    url=os.environ.get("WEAVIATE_URL", "http://35.226.115.132:8080"),
    additional_headers={
        "X-OPENAI-Api-Key": openai_api_key,
    }
)

# Delete existing schema (caution: this deletes the current structure; per the
# Weaviate documentation, deleting a class also removes the objects stored in it).
client.schema.delete_all()

# Here we use the schema created in the previous cell.
client.schema.create(schema)
print("Schema was created.")

# Function to load data from CSV and extract website name and timestamp from filename
def load_csv_data(directory, encoding=None):
    """Read every ``*.csv`` file in ``directory`` into one flat list of row dicts.

    File names are expected to look like ``<website>_<timestamp>.csv``. The
    stem is split on its LAST underscore, so the website part may itself
    contain underscores (e.g. ``my_site_2023-10-01.csv``).

    Args:
        directory: Path of the directory to scan (not recursive).
        encoding: Text encoding handed to ``open``. ``None`` (the default)
            keeps the platform default encoding.

    Returns:
        A list with one dict per CSV data row, as produced by
        ``csv.DictReader``, each extended with the keys ``'websiteAddress'``
        and ``'timestamp'`` taken from the file name. Files are processed in
        sorted file-name order so the result is deterministic.

    Raises:
        ValueError: If a ``.csv`` file name contains no underscore separating
            the website name from the timestamp.
    """
    all_data = []
    # listdir() returns entries in arbitrary order; sort for reproducible output.
    for filename in sorted(listdir(directory)):
        if not filename.endswith('.csv'):
            continue
        stem = filename.rsplit('.', 1)[0]
        website_name, separator, timestamp = stem.rpartition('_')
        if not separator:
            raise ValueError(
                f"Cannot derive website and timestamp from file name {filename!r}; "
                "expected '<website>_<timestamp>.csv'."
            )
        # newline='' is what the csv module documents for files given to a reader.
        with open(f"{directory}/{filename}", mode='r', newline='', encoding=encoding) as file:
            for row in csv.DictReader(file):
                row['websiteAddress'] = website_name
                row['timestamp'] = timestamp
                all_data.append(row)
    return all_data

# Load CSV data. TODO: Scraper needs to be modified to save CSVs here
data_directory = '../data'

csv_data = load_csv_data(data_directory)

# UUID caches so that each Website and each ScrapedSession is created only once.
# Sessions are keyed by (websiteAddress, timestamp), i.e. one per source CSV file.
created_websites = {}
created_sessions = {}

# Populate Weaviate.
# Cross-reference properties (scrapedSessions, scrapedPages) cannot be filled
# with nested dicts: every object is created on its own and then linked with
# client.data_object.reference.add(from_uuid, from_property_name, to_uuid).
# Each row is assumed to carry 'page' and 'text' columns - confirm against the
# scraper's CSV output.
for data in csv_data:
    website_address = data['websiteAddress']
    session_key = (website_address, data['timestamp'])

    # 1. Website (one per distinct address).
    if website_address in created_websites:
        website_uuid = created_websites[website_address]
    else:
        try:
            website_uuid = client.data_object.create(
                data_object={"websiteAddress": website_address},
                class_name="Website",
            )
        except Exception as e:
            print("Error during Website creation:", e)
            continue  # Skip this row
        created_websites[website_address] = website_uuid

    # 2. ScrapedSession (one per website/timestamp pair), linked to its Website
    #    as soon as it is created.
    if session_key in created_sessions:
        scraped_session_uuid = created_sessions[session_key]
    else:
        try:
            scraped_session_uuid = client.data_object.create(
                data_object={"timestamp": data['timestamp']},
                class_name="ScrapedSession",
            )
        except Exception as e:
            print("Error during ScrapedSession creation:", e)
            continue  # Skip this row
        created_sessions[session_key] = scraped_session_uuid

        try:
            client.data_object.reference.add(
                from_uuid=website_uuid,
                from_property_name="scrapedSessions",
                to_uuid=scraped_session_uuid,
                from_class_name="Website",
                to_class_name="ScrapedSession",
            )
        except Exception as e:
            # Best effort: keep loading pages even if this link could not be made.
            print("Error during adding reference:", e)

    # 3. ScrapedPage (one per CSV row).
    try:
        scraped_page_uuid = client.data_object.create(
            data_object={
                "pageURL": data['page'],
                "textContent": data['text'],
            },
            class_name="ScrapedPage",
        )
    except Exception as e:
        print("Error during ScrapedPage creation:", e)
        continue  # Skip this row

    # 4. Link the page to its session.
    try:
        client.data_object.reference.add(
            from_uuid=scraped_session_uuid,
            from_property_name="scrapedPages",
            to_uuid=scraped_page_uuid,
            from_class_name="ScrapedSession",
            to_class_name="ScrapedPage",
        )
    except Exception as e:
        print("Error during adding reference:", e)


print("Data was loaded into Weaviate.")


Schema was created.
Error during ScrapedSession creation: Expecting value: line 1 column 1 (char 0)
Error during ScrapedSession creation: Expecting value: line 1 column 1 (char 0)
Error during ScrapedSession creation: Expecting value: line 1 column 1 (char 0)
Error during ScrapedSession creation: Expecting value: line 1 column 1 (char 0)
Error during ScrapedSession creation: Expecting value: line 1 column 1 (char 0)
Error during ScrapedSession creation: Expecting value: line 1 column 1 (char 0)
Error during ScrapedSession creation: Expecting value: line 1 column 1 (char 0)
Error during ScrapedSession creation: Expecting value: line 1 column 1 (char 0)
Error during ScrapedSession creation: Expecting value: line 1 column 1 (char 0)
Error during ScrapedSession creation: Expecting value: line 1 column 1 (char 0)
Error during ScrapedSession creation: Expecting value: line 1 column 1 (char 0)
Error during ScrapedSession creation: Expecting value: line 1 column 1 (char 0)
Error during Scraped

KeyboardInterrupt: 

In [3]:
#!pip install weaviate-client

Collecting weaviate-client
  Obtaining dependency information for weaviate-client from https://files.pythonhosted.org/packages/59/8f/44d164ed990f7c6faf28125925160af9004595020aeaaf01e94462e3bf8e/weaviate_client-3.24.1-py3-none-any.whl.metadata
  Downloading weaviate_client-3.24.1-py3-none-any.whl.metadata (3.3 kB)
Collecting validators<1.0.0,>=0.21.2 (from weaviate-client)
  Obtaining dependency information for validators<1.0.0,>=0.21.2 from https://files.pythonhosted.org/packages/3a/0c/785d317eea99c3739821718f118c70537639aa43f96bfa1d83a71f68eaf6/validators-0.22.0-py3-none-any.whl.metadata
  Downloading validators-0.22.0-py3-none-any.whl.metadata (4.7 kB)
Collecting authlib<2.0.0,>=1.2.1 (from weaviate-client)
  Obtaining dependency information for authlib<2.0.0,>=1.2.1 from https://files.pythonhosted.org/packages/81/6e/f4522542322c7f53783da5f65464a7dee137c687111624d2ac733e2a1b98/Authlib-1.2.1-py2.py3-none-any.whl.metadata
  Downloading Authlib-1.2.1-py2.py3-none-any.whl.metadata (3.8 k