# Add scraped data from GCS bucket to Weaviate

###### Run this notebook to add **new** scraped data for websites in our GCS bucket. Files that already exist in Weaviate will be skipped.

In [2]:
# Install packages
!pip install weaviate-client
!pip install llama-index

Collecting weaviate-client
  Downloading weaviate_client-3.25.3-py3-none-any.whl (120 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/120.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━[0m [32m61.4/120.3 kB[0m [31m1.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m120.3/120.3 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Collecting validators<1.0.0,>=0.21.2 (from weaviate-client)
  Downloading validators-0.22.0-py3-none-any.whl (26 kB)
Collecting authlib<2.0.0,>=1.2.1 (from weaviate-client)
  Downloading Authlib-1.2.1-py2.py3-none-any.whl (215 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m215.3/215.3 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: validators, authlib, weaviate-client
Successfully installed authlib-1.2.1 validators-0.22.0 weaviate-client-3.25.3
Collecting llama-index
  Downloadin

In [6]:
# Import libraries
import os
import json
from google.cloud import storage
from google.oauth2 import service_account
from datetime import datetime, timezone
from weaviate import Client
import pandas as pd
from datetime import datetime, timezone
from weaviate import Client
from weaviate.exceptions import UnexpectedStatusCodeException
from google.cloud import storage

# Additional imports for the llama_index module
from llama_index import Document
from llama_index.node_parser import SimpleNodeParser
from llama_index.vector_stores import WeaviateVectorStore
from llama_index import VectorStoreIndex, StorageContext
from llama_index.storage.storage_context import StorageContext
from io import BytesIO
from llama_index.vector_stores.types import ExactMatchFilter, MetadataFilters

# Define the Weaviate IP address
WEAVIATE_IP_ADDRESS = "34.42.138.162"

# The OpenAI API key must be supplied through the OPENAI_API_KEY environment
# variable; it must never be written into the notebook source.
# NOTE(review): a key was previously hardcoded on this line, so it should be
# treated as leaked and revoked.
if not os.environ.get("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Export it in the environment before "
        "starting the kernel instead of hardcoding it in the notebook."
    )

In [7]:
def authenticate_with_service_account(key_path):
    """
    Build Google Cloud credentials from a service account key file.

    Parameters:
    - key_path (str): Location of the service account key file.

    Returns:
    - credentials (google.auth.credentials.Credentials): Credentials created
      from the key file and restricted to the cloud-platform scope.
    """
    # The only scope requested when the credentials are created
    cloud_platform_scopes = ["https://www.googleapis.com/auth/cloud-platform"]
    return service_account.Credentials.from_service_account_file(
        key_path, scopes=cloud_platform_scopes
    )

def text_chunk_exists(client, website_address, timestamp):
    """
    Check if a Pages object with a specific website address and timestamp already exists in Weaviate.

    Parameters:
    - client (weaviate.Client): The Weaviate client object to interact with the Weaviate instance.
    - website_address (str): The website address to match exactly.
    - timestamp (str): The timestamp to match exactly.

    Returns:
    - bool: True if at least one matching Pages object is returned, False otherwise
      (including when the response carries no Pages list).

    Raises:
    - RuntimeError: If the GraphQL response contains an "errors" entry, so a failed
      query is never mistaken for "not present".
    """
    # json.dumps yields a quoted literal with quotes/backslashes/control characters
    # escaped, so a value taken from a file name cannot break out of the string and
    # alter the query. str() keeps the original behaviour for non-string input.
    address_literal = json.dumps(str(website_address), ensure_ascii=False)
    timestamp_literal = json.dumps(str(timestamp), ensure_ascii=False)

    # GraphQL query to retrieve Pages based on website address and timestamp
    # NOTE(review): the operands use different value types (valueString vs
    # valueText); kept exactly as before - confirm against the Pages schema.
    query = f"""
    {{
      Get {{
        Pages (where: {{
            operator: And
            operands: [{{
                path: ["websiteAddress"],
                operator: Equal,
                valueString: {address_literal}
            }}, {{
                path: ["timestamp"],
                operator: Equal,
                valueText: {timestamp_literal}
            }}]
        }}) {{
          __typename
        }}
      }}
    }}
    """

    result = client.query.raw(query)

    # Surface query failures explicitly instead of failing later on a missing key
    if result.get("errors"):
        raise RuntimeError(f"Weaviate query for Pages failed: {result['errors']}")

    # Walk the response defensively: any level may be missing or null
    pages = ((result.get("data") or {}).get("Get") or {}).get("Pages")
    return bool(pages)


def extract_website_and_timestamp(filename, prefix="data/"):
    """
    Extract website address and timestamp from a filename.

    The filename is split on '_'. The first part (with a leading `prefix`
    removed, if present) is the website address; the second part, cut at the
    first '.csv', is the timestamp. Any further '_'-separated parts are ignored.

    Parameters:
    - filename (str): The input filename, e.g. "data/<website>_<timestamp>.csv".
    - prefix (str): Leading folder prefix to strip from the website address.
      Defaults to "data/".

    Returns:
    - website_address (str): Extracted website address, or None if the filename
      has fewer than two '_'-separated parts.
    - timestamp (str): Extracted timestamp, or None in the same case.
    """
    filename_parts = filename.split('_')
    if len(filename_parts) < 2:
        # Handle the case where the filename doesn't match the expected structure
        return None, None

    website_address = filename_parts[0]
    # Strip the prefix only when it is really there; slicing unconditionally
    # would chop the first characters off names that have no folder prefix.
    if prefix and website_address.startswith(prefix):
        website_address = website_address[len(prefix):]

    timestamp = filename_parts[1].split('.csv')[0]
    return website_address, timestamp

In [11]:
# Name of the GCS bucket to read from
bucket_name = "ac215_scraper_bucket"

# Service account credentials used for GCS access
credentials = authenticate_with_service_account(
    key_path="key/sample_data/rag-detective-389f2d6f87a9.json"
)

In [None]:
def _documents_from_blob(blob, website_address, timestamp):
    """Download one CSV blob and build a llama_index Document per row that has text."""
    # Read the CSV straight from the downloaded bytes via an in-memory buffer
    df = pd.read_csv(BytesIO(blob.download_as_bytes()))
    print(df.head())

    # pandas reads empty cells as NaN; rows without text have nothing to embed
    df = df.dropna(subset=["text"])

    documents = []
    for text, key in zip(df["text"], df["key"]):
        document = Document(
            text=text,
            metadata={
                'websiteAddress': website_address,
                'timestamp': timestamp
            }
        )
        # The CSV 'key' column becomes the document id
        document.doc_id = key
        documents.append(document)
    return documents


# Set up Google Cloud Storage client and bucket
storage_client = storage.Client(credentials=credentials)
bucket = storage_client.bucket(bucket_name)

# List all files in the bucket
files = bucket.list_blobs()

# Set up Weaviate client
client = Client(url="http://" + WEAVIATE_IP_ADDRESS + ":8080")

# The parser settings are the same for every file, so build it once
parser = SimpleNodeParser.from_defaults(chunk_size=1024, chunk_overlap=20)

# Iterate through each file in the bucket
for blob in files:
    csv_file = os.path.basename(blob.name)
    print(csv_file)

    # Extract website_address and timestamp
    website_address, timestamp = extract_website_and_timestamp(blob.name)

    # Names that do not parse (e.g. a bare folder placeholder) are not scraped
    # CSV files; skip them instead of querying and downloading them
    if website_address is None or timestamp is None:
        print(f"Skipping {blob.name}: name does not match <website>_<timestamp>.csv")
        continue

    # Print the extracted values
    print(f"Website Address: {website_address}")
    print(f"Timestamp: {timestamp}")

    # Check whether this website/timestamp pair is already stored in Weaviate
    already_in_weaviate = text_chunk_exists(client, website_address, timestamp)
    print(already_in_weaviate)
    if already_in_weaviate:
        continue

    # If file is not in Weaviate, add it!
    print(f"Adding {csv_file} to Weaviate!")

    # Assemble the documents and split them into nodes
    documents = _documents_from_blob(blob, website_address, timestamp)
    nodes = parser.get_nodes_from_documents(documents)

    # construct vector store
    vector_store = WeaviateVectorStore(weaviate_client=client, index_name="Pages", text_key="text")
    # setting up the storage for the embeddings
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    # set up the index
    index = VectorStoreIndex(nodes, storage_context=storage_context)

    print(f"Added {csv_file} to Weaviate!")