In [15]:
from google.cloud import bigquery
import google.auth

from neo4j import GraphDatabase

import os
import json
from dotenv import dotenv_values
import time

from main import NodeImportBenchmark, NodeQueryBenchmark

from base.operations import NoSQLKnowledgeGraph
from databases.firestore_kg import FirestoreKG
from databases.n4j import AuraKG
from datamodel.data_model import NodeData, EdgeData, CommunityData

#### Setting env and global variables

In [16]:
# Load the notebook configuration from the project-level .env file.
secrets = dotenv_values("../.env")

# Every key this notebook reads from `secrets`. Later cells wrap most of these in
# str(), so a key that is present without a value (None) would silently turn into
# the literal string "None"; fail fast here with a readable message instead.
required_keys = (
    "GCP_CREDENTIAL_FILE",
    "GCP_PROJECT_ID",
    "WIKIDATA_FS_DB",
    "NODE_COLL_ID",
    "EDGES_COLL_ID",
    "COMM_COLL_ID",
    "NEO4J_URI",
    "NEO4J_USERNAME",
    "NEO4J_PASSWORD",
)
missing_keys = [key for key in required_keys if not secrets.get(key)]
if missing_keys:
    raise KeyError(
        f"Missing or empty settings in ../.env: {', '.join(missing_keys)}")

# load_credentials_from_file returns a (credentials, project_id) pair; only the
# credentials are used. The second element is bound to a named throwaway rather
# than `_`, which IPython uses for the last cell output.
credentials, _credentials_project_id = google.auth.load_credentials_from_file(
    secrets["GCP_CREDENTIAL_FILE"])

## Firestore Knowledge Graph vs. AuraDB (Neo4J) latency comparison

#### Define Knowledge Graph DB Interface Options

In [17]:
# --- Firestore-backed knowledge graph -------------------------------------
# The credential file path is handed over exactly as read from the .env mapping;
# every other setting is coerced to str before being passed on.
firestore_credential_file = secrets["GCP_CREDENTIAL_FILE"]
firestore_env_keys = {
    "gcp_project_id": "GCP_PROJECT_ID",
    "firestore_db_id": "WIKIDATA_FS_DB",
    "node_collection_id": "NODE_COLL_ID",
    "edges_collection_id": "EDGES_COLL_ID",
    "community_collection_id": "COMM_COLL_ID",
}
firestore_kwargs = {
    arg_name: str(secrets[env_key])
    for arg_name, env_key in firestore_env_keys.items()
}
fskg = FirestoreKG(gcp_credential_file=firestore_credential_file,
                   **firestore_kwargs)

# --- AuraDB (Neo4j) knowledge graph ---------------------------------------
# Connection URI plus a (username, password) auth tuple, all coerced to str.
neo4j_uri = str(secrets["NEO4J_URI"])
neo4j_auth = (str(secrets["NEO4J_USERNAME"]), str(secrets["NEO4J_PASSWORD"]))
aura_kg = AuraKG(uri=neo4j_uri, auth=neo4j_auth)

### Importing Nodes Comparison

#### Fetch graph data from BigQuery

In [18]:
# Total number of rows to import across all Cloud Run tasks.
import_lim = 100
# Fixed starting row inside the source table.
# NOTE(review): the reason for 300500 is not visible here; presumably it skips
# rows imported by earlier runs. Confirm.
base_offset = 300500

# Cloud Run job sharding: default to a single task when run outside Cloud Run.
task_index = int(os.getenv('CLOUD_RUN_TASK_INDEX', '0'))
task_count = int(os.getenv('CLOUD_RUN_TASK_COUNT', '1'))

# Split import_lim across the tasks without dropping the remainder: the first
# `extra_rows` tasks each take one additional row, and every offset is shifted
# by the extra rows handed to the tasks before it.
base_rows, extra_rows = divmod(import_lim, task_count)
rows_per_task = base_rows + (1 if task_index < extra_rows else 0)
offset = task_index * base_rows + min(task_index, extra_rows) + base_offset

print(
    f'$$$$ Task Index {task_index}, Task Count {task_count}, Offset {offset}')

# Fetch Node data from BigQuery
client = bigquery.Client(project=str(
    secrets["GCP_PROJECT_ID"]), credentials=credentials)

# NOTE(review): LIMIT/OFFSET is used without ORDER BY, so the row order (and
# therefore which rows each task receives) is not guaranteed to be stable.
# Add an ORDER BY on a stable key once the right column is confirmed.
query_job = client.query(
    f"SELECT * FROM poerschmann-sem-search.wikidata_kg.entity_doc_alias_joined LIMIT {rows_per_task} OFFSET {offset}")

$$$$ Task Index 0, Task Count 1, Offset 300500


#### Run Node Import Benchmark

In [19]:
# Build the import benchmark with the Firestore graph as option 1 and the Aura
# graph as option 2, then run it over the rows of the BigQuery job.
add_nodes_testing = NodeImportBenchmark(
    benchmark_name="Node Import",
    import_lim=import_lim,
    option_1=fskg,
    option_2=aura_kg,
)
add_nodes_testing(records=query_job)

$$$$ Starting Benchmark Firestore vs. Aura $$$$
Error adding node Q20811165 with Firestore: Error: Node with node_uid 'Q20811165' already exists.
Connection established.
Created 1 nodes with if Q20811165 in 2 ms.
Success adding node Q20811165 with Aura
Error adding node Q7804119 with Firestore: Error: Node with node_uid 'Q7804119' already exists.
Connection established.
Created 1 nodes with if Q7804119 in 4 ms.
Success adding node Q7804119 with Aura
Error adding node Q1834743 with Firestore: Error: Node with node_uid 'Q1834743' already exists.
Connection established.
Created 1 nodes with if Q1834743 in 2 ms.
Success adding node Q1834743 with Aura
Error adding node Q7945886 with Firestore: Error: Node with node_uid 'Q7945886' already exists.
Connection established.
Created 1 nodes with if Q7945886 in 4 ms.
Success adding node Q7945886 with Aura
Success adding node Q7967580 with Firestore
Connection established.
Created 1 nodes with if Q7967580 in 2 ms.
Success adding node Q7967580 with 

### Querying Nodes Comparison

In [20]:
# Total number of rows to query across all Cloud Run tasks.
import_lim = 100

# Cloud Run job sharding: default to a single task when run outside Cloud Run.
task_index = int(os.getenv('CLOUD_RUN_TASK_INDEX', '0'))
task_count = int(os.getenv('CLOUD_RUN_TASK_COUNT', '1'))

# Split import_lim across the tasks without dropping the remainder: the first
# `extra_rows` tasks each take one additional row, and every offset is shifted
# by the extra rows handed to the tasks before it.
base_rows, extra_rows = divmod(import_lim, task_count)
rows_per_task = base_rows + (1 if task_index < extra_rows else 0)
offset = task_index * base_rows + min(task_index, extra_rows)

print(
    f'$$$$ Task Index {task_index}, Task Count {task_count}, Offset {offset}')

# Fetch Node data from BigQuery
client = bigquery.Client(project=str(
    secrets["GCP_PROJECT_ID"]), credentials=credentials)

# NOTE(review): LIMIT/OFFSET is used without ORDER BY, so the row order (and
# therefore which rows each task receives) is not guaranteed to be stable.
# Add an ORDER BY on a stable key once the right column is confirmed.
query_job = client.query(
    f"SELECT * FROM poerschmann-sem-search.wikidata_kg.firestore_kg_relations LIMIT {rows_per_task} OFFSET {offset}")

$$$$ Task Index 0, Task Count 1, Offset 0


In [21]:
# Build the query benchmark with the Firestore graph as option 1 and the Aura
# graph as option 2, then run it over the rows of the BigQuery job.
query_nodes_testing = NodeQueryBenchmark(
    benchmark_name="Node Query",
    import_lim=import_lim,
    option_1=fskg,
    option_2=aura_kg,
)
query_nodes_testing(records=query_job)

$$$$ Starting Benchmark Firestore vs. Aura $$$$
Success fetching node data Q408 with Firestore
Success fetching node data Q408 with Aura
Success fetching node data Q15094515 with Firestore
Error fetching node data Q15094515 with Aura: 'Error: No node found with node_uid: Q15094515'
Success fetching node data Q44754 with Firestore
Success fetching node data Q44754 with Aura
Success fetching node data Q42497 with Firestore
Error fetching node data Q42497 with Aura: 'Error: No node found with node_uid: Q42497'
Success fetching node data Q1199 with Firestore
Error fetching node data Q1199 with Aura: 'Error: No node found with node_uid: Q1199'
Success fetching node data Q5092 with Firestore
Error fetching node data Q5092 with Aura: 'Error: No node found with node_uid: Q5092'
Success fetching node data Q485708 with Firestore
Error fetching node data Q485708 with Aura: 'Error: No node found with node_uid: Q485708'
Success fetching node data Q2488890 with Firestore
Success fetching node data Q

### Querying deeply nested structures comparison

Challenge 1: Finding friends of friends of "Q901" (2nd degree, directed)

In [22]:
# One entry per neighbour of Q901: that neighbour's own `edges_from` list.
f0f_list = []

node_data = fskg.get_node(node_uid='Q901')

# Follow each uid in the start node's `edges_from`, one get_node call per neighbour.
for neighbor_uid in node_data.edges_from:
    neigh_node = fskg.get_node(node_uid=neighbor_uid)
    f0f_list.append(neigh_node.edges_from)

# Total number of second-hop entries (duplicates included). Summing the lengths
# gives the same count as concatenating all lists first, without the repeated
# list copies that sum(f0f_list, []) performs.
sum(len(second_hop_uids) for second_hop_uids in f0f_list)

113

f0f_list

In [23]:
# Two-hop neighbourhood of the start node over relationships of any type.
# The node uid is passed as a query parameter instead of being written into the
# Cypher text.
# NOTE(review): the pattern has no arrowheads, so it matches relationships in
# both directions, while the challenge is described as "directed". Confirm.
records, summary, keys = aura_kg.driver.execute_query(
    "MATCH (n)-[]-()-[]-(result) WHERE n.node_uid = $node_uid RETURN result",
    parameters_={"node_uid": "Q901"},
)

# One record per matched path, so the same node may be counted more than once.
len(records)

53

In [24]:
# Two-hop neighbourhood of the start node, restricted to :DIRECTED relationships.
# The node uid is passed as a query parameter instead of being written into the
# Cypher text.
# NOTE(review): the pattern has no arrowheads, so it matches relationships in
# both directions, while the challenge is described as "directed". Confirm.
records, summary, keys = aura_kg.driver.execute_query(
        """MATCH (n)-[:DIRECTED]-()-[:DIRECTED]-(result)
            WHERE n.node_uid = $node_uid
            RETURN result""",
        parameters_={"node_uid": "Q901"},
)

# One record per matched path, so the same node may be counted more than once.
len(records)

53

In [25]:
# Same two-hop query written as a variable-length pattern of exactly two
# :DIRECTED relationships. The node uid is passed as a query parameter instead
# of being written into the Cypher text.
# NOTE(review): the pattern has no arrowheads, so it matches relationships in
# both directions, while the challenge is described as "directed". Confirm.
records, summary, keys = aura_kg.driver.execute_query(
        """MATCH (n)-[:DIRECTED*2]-(result)
            WHERE n.node_uid = $node_uid
            RETURN result""",
        parameters_={"node_uid": "Q901"},
)

# One record per matched path, so the same node may be counted more than once.
len(records)

53

Challenge 2: Finding friends of friends of friends of "Q901" (3rd degree, undirected)

In [26]:
# Two entries per second-hop node: its `edges_from` list and its `edges_to` list.
f0fof_list = []

node_data = fskg.get_node(node_uid='Q901')

# Walk both edge lists at every hop; each hop is one get_node call per uid.
for first_hop_uid in node_data.edges_from + node_data.edges_to:
    neigh_node = fskg.get_node(node_uid=first_hop_uid)
    for second_hop_uid in neigh_node.edges_from + neigh_node.edges_to:
        neigh_node2 = fskg.get_node(node_uid=second_hop_uid)
        f0fof_list.append(neigh_node2.edges_from)
        f0fof_list.append(neigh_node2.edges_to)

# Total number of third-hop entries (duplicates included). Summing the lengths
# gives the same count as concatenating all lists first, without the repeated
# list copies that sum(f0fof_list, []) performs.
sum(len(third_hop_uids) for third_hop_uids in f0fof_list)

51947

In [27]:
# Open a dedicated driver for this query; the `with` block closes it afterwards.
with GraphDatabase.driver(uri=aura_kg.uri, auth=aura_kg.auth) as driver:
    driver.verify_connectivity()

    # Use a parameter for node_uid in the Cypher query
    # (three undirected hops over relationships of any type).
    records, summary, keys = driver.execute_query(
        "MATCH (n)-[]-()-[]-()-[]-(result) WHERE n.node_uid = $node_uid RETURN result",
        parameters_={"node_uid": "Q901"},
    )

# One record per matched path, so the same node may be counted more than once.
len(records)

### Running Community Identification Comparison

TBD: this comparison has not been added yet.