## Setup Webpage Index

In [11]:
import os
from typing import Optional

import typesense
from dotenv import load_dotenv

In [10]:
# Call python-dotenv's loader before reading configuration from os.environ.
# NOTE(review): presumably this picks up a local .env file — confirm its location.
load_dotenv()

# API key handed to the Typesense client; indexing os.environ raises KeyError
# when 'typesense_key' is not set, so a missing key fails here rather than later.
TYPESENSE_KEY = os.environ['typesense_key']

In [6]:
# Typesense client pointed at a single local node.
# For Typesense Cloud use host xxx.a1.typesense.net, port 443 and protocol https.
client = typesense.Client(
    dict(
        nodes=[
            dict(host='localhost', port='8108', protocol='http'),
        ],
        api_key=TYPESENSE_KEY,
        connection_timeout_seconds=2,
    )
)

In [37]:
# Definition of the 'webpage' collection: 'id' and 'url' are required string
# fields, every other field is optional.
#
# Kept for reference from the schema-update workflow: to drop existing fields
# through collections['webpage'].update(...), list them as
#   {'name': <field>, 'drop': True}
# for title, url, aka, site, site_type and entity_type instead of the typed
# entries below.
schema = dict(
    name='webpage',
    fields=[
        dict(name='id', type='string'),
        dict(name='title', type='string', optional=True),
        dict(name='url', type='string'),
        dict(name='aka', type='string[]', optional=True),
        dict(name='site', type='string', optional=True),
        dict(name='site_type', type='string', optional=True),
        dict(name='entity_type', type='string[]', optional=True),
    ],
)

In [38]:
# Create the collection described by `schema` on the Typesense server.
# NOTE(review): re-running this cell once the collection exists presumably
# fails — confirm against the Typesense API before a Restart & Run All.
res = client.collections.create(schema)
# Alternative for an existing collection: send the schema as an update instead.
# res = client.collections['webpage'].update(schema)

## Indexing Data from Neo4j

In [27]:
from neo4j import GraphDatabase
from pathlib import Path
import pandas as pd

In [12]:
# Neo4j connection settings, all read from the process environment.
# A missing variable raises KeyError on the first absent key.
neo4j_uri, neo4j_username, neo4j_password, neo4j_dbname = (
    os.environ[env_key]
    for env_key in (
        'neo4j_uri',
        'neo4j_username',
        'neo4j_password',
        'neo4j_dbname',
    )
)

In [13]:
class Neo4jGraph:
    """Small wrapper around a Neo4j driver that runs queries against one database.

    The driver is opened in ``__init__`` and stays open until ``close()`` is
    called. The instance can also be used as a context manager, which closes
    the driver on exit.
    """

    def __init__(self, neo4j_uri:str, neo4j_username:str, neo4j_password:str, db:str)->None:
        """Store the connection settings and open the driver.

        Args:
            neo4j_uri: Connection URI passed to ``GraphDatabase.driver``.
            neo4j_username: User name used for authentication.
            neo4j_password: Password used for authentication.
            db: Name of the database every session is opened against.
        """
        self.uri  = neo4j_uri
        self.auth = (neo4j_username, neo4j_password)
        self.db = db
        self.driver = GraphDatabase.driver(self.uri, auth=self.auth)

    def query(self, query:str, params:Optional[dict]=None) -> list:
        """Run a Cypher query and return all resulting records.

        Args:
            query: Cypher text to execute.
            params: Query parameters; ``None`` (the default) means no parameters.

        Returns:
            A list with every record produced by the query.
        """
        # A fresh dict per call: a ``{}`` default would be one shared object
        # that any mutation leaks into every later call.
        if params is None:
            params = {}
        with self.driver.session(database=self.db) as session:
            result = session.run(query, params)
            # Materialise the records while the session is still open.
            return list(result)

    def close(self) -> None:
        """Close the underlying driver and release its connections."""
        self.driver.close()

    def __enter__(self) -> "Neo4jGraph":
        """Return the instance so it can be used in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        """Close the driver when the ``with`` block ends; exceptions propagate."""
        self.close()

In [14]:
# Open the Neo4j connection with the settings read from the environment.
graph = Neo4jGraph(
    neo4j_uri=neo4j_uri,
    neo4j_username=neo4j_username,
    neo4j_password=neo4j_password,
    db=neo4j_dbname,
)

In [22]:
# Target collection and the number of records requested per Neo4j query.
SEARCH_INDEX_NAME = "webpage"
BATCH_SIZE = 10_000

# Cypher scripts: one counts the records, the other retrieves a page of them.
count_script_path = Path("cypher-query/ser-webpage-count.cypher")
load_script_path = Path("cypher-query/ser-webpage-retrieve.cypher")

In [40]:
# Read both Cypher scripts into memory as text.
with load_script_path.open('r') as fp:
    cypher_load = fp.read()

with count_script_path.open('r') as fp:
    cypher_count = fp.read()

In [41]:
# Run the count script; `res` is the list of records returned by Neo4jGraph.query.
res = graph.query(cypher_count)
# The total is read from the first record under the 'count_' key.
# NOTE(review): presumably the count script aliases its result as count_ — confirm;
# an empty result would raise IndexError here.
entity_count = res[0].get('count_')
print(entity_count)

8776


In [44]:
# Page through the Neo4j records BATCH_SIZE at a time and upsert each page
# into the search index.
#
# The per-document import results of every batch are accumulated in
# `index_res`. Assigning inside the loop would keep only the final batch and
# leave the name undefined when entity_count is 0.
index_res = []
for skip_count in range(0, entity_count, BATCH_SIZE):
    res = graph.query(cypher_load, {'BATCH_SIZE':BATCH_SIZE, 'skip_count': skip_count})
    # Convert each record to a plain dict before handing it to Typesense.
    index_input_data = [doc.data() for doc in res]
    batch_res = client.collections[SEARCH_INDEX_NAME].documents.import_(index_input_data, {'action': 'upsert'})
    index_res.extend(batch_res)

In [45]:
# Tally the import results held in `index_res` by their value, so any entry
# other than {'success': True} shows up as its own row.
pd.Series(index_res).value_counts()

{'success': True}    8776
Name: count, dtype: int64

In [47]:
# Probe document: besides id, title and url it carries two keys, 'label' and
# 'test_field', that are not declared in the collection schema.
data_violate = dict(
    id='https://localhost/test',
    title='data violate',
    url='https://localhost/test',
    label=['Data'],
    test_field=False,
)

In [48]:
# Upsert the probe document to see how the index treats keys the schema does
# not declare; the recorded output below reports success for it.
client.collections[SEARCH_INDEX_NAME].documents.import_([data_violate], {'action': 'upsert'})

[{'success': True}]