In [1]:
import os
import sys

# put the parent directory on the import search path before the krixik import below
sys.path.append('..')

from dotenv import load_dotenv

# read the .env file so the two dev credentials can be fetched with os.getenv
load_dotenv()
TEST_DUMMY_API_KEY_DEV = os.getenv('TEST_DUMMY_API_KEY_DEV')
TEST_DUMMY_API_URL_DEV = os.getenv('TEST_DUMMY_API_URL_DEV')

from krixik import krixik

# authenticate this session against the dev endpoint
krixik.init(
    api_key=TEST_DUMMY_API_KEY_DEV,
    api_url=TEST_DUMMY_API_URL_DEV,
)

# small helper for readable, indented JSON output in notebook cells
import json
def json_print(data):
    """Print `data` to stdout as JSON text indented by two spaces."""
    formatted = json.dumps(data, indent=2)
    print(formatted)
    
# autoreload mode 2: modules are reloaded before each cell executes, so edits
# to imported source files take effect without restarting the kernel
%load_ext autoreload
%autoreload 2 

SUCCESS: You are now authenticated.


# 1.  Just the OCR module

- removed `easy-ocr` models (x2) as they didn't play well with others

In [2]:
from krixik.pipeline_builder.module import Module
from krixik.pipeline_builder.pipeline import CreatePipeline

# single-module pipeline: ocr only
module_1 = Module(name='ocr')

# assemble the pipeline and write its config to disk
pipeline = CreatePipeline(
    name='ocr-pipeline-1',
    module_chain=[module_1],
)
pipeline.save('pipeline_configs/ocr-pipeline-1.yaml')

In [3]:
# load the saved ocr-only pipeline config and run it on the sample image
my_pipeline = krixik.load_pipeline(
    config_path="pipeline_configs/ocr-pipeline-1.yaml"
)
test_file = "data/seal.png"

# modules is deliberately passed empty: the "hydrated input modules" log line
# printed by process shows the model/params that get filled in for each module
output = my_pipeline.process(
    local_file_path=test_file,
    expire_time=180,  # seconds before the file is removed from the account
    modules={},
)

INFO: hydrated input modules: {'ocr': {'model': 'tesseract-en', 'params': {}}}
INFO: symbolic_directory_path was not set by user - setting to default of /etc
INFO: file_name was not set by user - setting to random file name: krixik_generated_nxtdehcijr.png
INFO: wait_for_process is set to True.
INFO: file will expire and be removed from you account in 180 seconds, at Tue Apr  9 15:58:32 2024 UTC
INFO: ocr-pipeline-1 file process and input processing started...
INFO: metadata can be updated using the .update api.
INFO: This file's process_id is: 9dcdad53-5176-9329-0dcc-c86c9ab13831
INFO: File process and processing status:
SUCCESS: module 1 (of 1) - ocr processing complete.
SUCCESS: pipeline process complete.
SUCCESS: process output downloaded


In [4]:
# pretty-print the response returned by my_pipeline.process
json_print(output)

{
  "status_code": 200,
  "request_id": "72282688-93a0-4f73-a0f7-912e2b8b5152",
  "file_id": "268a2212-13ca-4be7-9b41-737a3858ac3e",
  "message": "SUCCESS - output fetched for file_id 268a2212-13ca-4be7-9b41-737a3858ac3e.",
  "process_output": [
    {
      "text": "The Seventh Seal\n\nThe night had brought little relief from the heat, and at dawn a hot gust of\nwind blows across the colorless sea. The KNIGHT, Antonius Block, lies\nprostrate on some spruce branches spread over the fine sand. His eyes are\nwide-open and bloodshot from lack of sleep.\n\nNearby his squire JONS is snoring loudly. He has fallen asleep where he\ncollapsed, at the edge of the forest among the wind-gnarled fir trees. His\nopen mouth gapes towards the dawn, and unearthly sounds come from his throat.\nAt the sudden gust of wind, the horses stir, stretching their parched muzzles\ntowards the sea. They are as thin and worn as their masters.\n",
      "detections": [
        {
          "left": 10,
          "top":

# 2.  OCR with vector search

In [3]:
from krixik.pipeline_builder.module import Module
from krixik.pipeline_builder.pipeline import CreatePipeline

# one Module per stage, created in chain order: ocr -> json-to-txt -> parser -> text-embedder -> vector-search
module_1, module_2, module_3, module_4, module_5 = (
    Module(name=stage_name)
    for stage_name in ("ocr", "json-to-txt", "parser", "text-embedder", "vector-search")
)

# assemble the five-stage pipeline and write its config to disk
pipeline = CreatePipeline(
    name='ocr-pipeline-2',
    module_chain=[module_1, module_2, module_3, module_4, module_5],
)
pipeline.save('pipeline_configs/ocr-pipeline-2.yaml')

In [13]:
# load the saved ocr + vector-search pipeline config and run it on the sample image
my_pipeline = krixik.load_pipeline(
    config_path="pipeline_configs/ocr-pipeline-2.yaml"
)
test_file = "data/seal.png"

# modules is deliberately passed empty: the "hydrated input modules" log line
# printed by process shows the model/params that get filled in for each module
output = my_pipeline.process(
    local_file_path=test_file,
    expire_time=60 * 10,  # seconds before the file is removed from the account
    modules={},
)

INFO: hydrated input modules: {'ocr': {'model': 'tesseract-en', 'params': {}}, 'json-to-txt': {'model': 'base', 'params': {}}, 'parser': {'model': 'fixed', 'params': {'chunk_size': 10, 'overlap_size': 2}}, 'text-embedder': {'model': 'multi-qa-MiniLM-L6-cos-v1', 'params': {'quantize': True}}, 'vector-search': {'model': 'faiss', 'params': {}}}
INFO: symbolic_directory_path was not set by user - setting to default of /etc
INFO: file_name was not set by user - setting to random file name: krixik_generated_douxtinjdy.png
INFO: wait_for_process is set to True.
INFO: file will expire and be removed from you account in 600 seconds, at Fri Apr 12 19:36:30 2024 UTC


INFO: ocr-pipeline-2 file process and input processing started...
INFO: metadata can be updated using the .update api.
INFO: This file's process_id is: cf71f50a-ffa9-532b-adc2-e1054828fc73
INFO: File process and processing status:
SUCCESS: module 1 (of 5) - ocr processing complete.
SUCCESS: module 2 (of 5) - json-to-txt processing complete.
SUCCESS: module 3 (of 5) - parser processing complete.
SUCCESS: module 4 (of 5) - text-embedder processing complete.
SUCCESS: module 5 (of 5) - vector-search processing complete.
SUCCESS: pipeline process complete.
SUCCESS: process output downloaded


In [14]:
# semantic query that paraphrases the OCR'd text ("little relief from the heat")
# NOTE(review): '/*' presumably matches every symbolic directory - confirm
output = my_pipeline.vector_search(
    query="some respite from the temperature",
    symbolic_directory_paths=['/*'],
)
json_print(output)

{
  "status_code": 200,
  "request_id": "5aebd2ae-01bd-48a6-998e-79ee5b118754",
  "message": "Successfully queried 1 user file.",
  "items": [
    {
      "file_id": "d0034dd8-566e-4543-9da4-2dcd4203ca19",
      "file_metadata": {
        "file_name": "krixik_generated_douxtinjdy.png",
        "symbolic_directory_path": "/etc",
        "file_tags": [],
        "num_vectors": 14,
        "created_at": "2024-04-13 02:26:33",
        "last_updated": "2024-04-13 02:26:33"
      },
      "search_results": [
        {
          "snippet": "relief from the heat, and at dawn a hot gust",
          "line_numbers": [
            3
          ],
          "distance": 0.311
        },
        {
          "snippet": "hot gust of wind blows across the colorless sea. The",
          "line_numbers": [
            3,
            4
          ],
          "distance": 0.343
        },
        {
          "snippet": "horses stir, stretching their parched muzzles towards the sea. They",
          "line_numbe

In [15]:
from krixik.pipeline_builder.module import Module
from krixik.pipeline_builder.pipeline import CreatePipeline

# one Module per stage, created in chain order: ocr -> json-to-txt -> keyword-search
module_1, module_2, module_3 = (
    Module(name=stage_name)
    for stage_name in ("ocr", "json-to-txt", "keyword-search")
)

# assemble the three-stage pipeline and write its config to disk
pipeline = CreatePipeline(
    name='ocr-pipeline-3',
    module_chain=[module_1, module_2, module_3],
)
pipeline.save('pipeline_configs/ocr-pipeline-3.yaml')

In [16]:
# load the saved ocr + keyword-search pipeline config and run it on the sample image
my_pipeline = krixik.load_pipeline(
    config_path="pipeline_configs/ocr-pipeline-3.yaml"
)
test_file = "data/seal.png"

# modules is deliberately passed empty: the "hydrated input modules" log line
# printed by process shows the model/params that get filled in for each module
output = my_pipeline.process(
    local_file_path=test_file,
    expire_time=60 * 3,  # seconds before the file is removed from the account
    modules={},
)

INFO: hydrated input modules: {'ocr': {'model': 'tesseract-en', 'params': {}}, 'json-to-txt': {'model': 'base', 'params': {}}, 'keyword-search': {'model': 'base', 'params': {}}}
INFO: symbolic_directory_path was not set by user - setting to default of /etc
INFO: file_name was not set by user - setting to random file name: krixik_generated_bnmovhxcfy.png
INFO: wait_for_process is set to True.
INFO: file will expire and be removed from you account in 180 seconds, at Fri Apr 12 19:32:22 2024 UTC
INFO: ocr-pipeline-3 file process and input processing started...
INFO: metadata can be updated using the .update api.
INFO: This file's process_id is: 19253bb7-275f-4702-7f3d-48c8542d0702
INFO: File process and processing status:
SUCCESS: module 1 (of 3) - ocr processing complete.
SUCCESS: module 2 (of 3) - json-to-txt processing complete.
SUCCESS: module 3 (of 3) - keyword-search processing complete.
SUCCESS: pipeline process complete.
SUCCESS: process output downloaded


In [18]:
# keyword query copied verbatim from a phrase in the OCR'd text
# NOTE(review): '/*' presumably matches every symbolic directory - confirm
output = my_pipeline.keyword_search(
    query="relief from the heat, and at dawn a hot gust",
    symbolic_directory_paths=['/*'],
)
json_print(output)

{
  "status_code": 200,
  "request_id": "ceb7352f-e22a-4719-b37c-2ff9b6c2c48b",
  "message": "Successfully queried 1 user file.",
    {
        "from",
        "the",
        "and",
        "at",
        "a"
      ]
    }
  ],
  "items": [
    {
      "file_id": "ba53177d-38fa-4486-9567-0661a5d662ab",
      "file_metadata": {
        "file_name": "krixik_generated_bnmovhxcfy.png",
        "symbolic_directory_path": "/etc",
        "file_tags": [],
        "num_lines": 12,
        "created_at": "2024-04-13 02:29:23",
        "last_updated": "2024-04-13 02:29:23"
      },
      "search_results": [
        {
          "keyword": "relief",
          "line_number": 3,
          "keyword_number": 6
        },
        {
          "keyword": "heat",
          "line_number": 3,
          "keyword_number": 9
        },
        {
          "keyword": "dawn",
          "line_number": 3,
          "keyword_number": 12
        },
        {
          "keyword": "hot",
          "line_number": 3,
    