In [1]:
import sys 
sys.path.append('..')
from dotenv import load_dotenv
import os
load_dotenv()

# Dev API credentials are read from the environment (populated by load_dotenv above).
TEST_DUMMY_API_KEY_DEV = os.environ.get('TEST_DUMMY_API_KEY_DEV')
TEST_DUMMY_API_URL_DEV = os.environ.get('TEST_DUMMY_API_URL_DEV')

from tests.krixik import text_files_path, audio_files_path, json_files_path, image_files_path

from krixik import krixik

# Initialise the krixik client with the dev key and endpoint.
krixik.init(
    api_key=TEST_DUMMY_API_KEY_DEV,
    api_url=TEST_DUMMY_API_URL_DEV,
)


import json
def json_print(data):
    """Pretty-print a JSON-serialisable object with a two-space indent.

    Args:
        data: Any object accepted by ``json.dumps`` (dict, list, str, ...).

    Returns:
        None. The formatted text is written to stdout.
    """
    # ensure_ascii=False keeps non-ASCII characters (e.g. accented letters in
    # translated text) readable instead of emitting \uXXXX escape sequences.
    print(json.dumps(data, indent=2, ensure_ascii=False))

%load_ext autoreload
%autoreload 2 

SUCCESS: You are now authenticated.


# Pipeline configuration examples using translate

A simple set of experiments with modular pipelines - translate is the base here.

Default model used:

```english --> spanish translation```

We will use the small file whose contents are printed out below.

In [7]:
# Load the sample input: valid.json from the directory given by json_files_path.
import json

# Pass the encoding explicitly so the read does not depend on the platform's
# locale default (JSON files are expected to be UTF-8).
with open(json_files_path + 'valid.json', encoding='utf-8') as f:
    data = json.load(f)

# json.dumps with an indent gives a readable pretty-print inside the notebook.
print(json.dumps(data, indent=2))

[
  {
    "snippet": "I love this movie and i would watch it again and again!"
  },
  {
    "snippet": "Operating profit totaled EUR 9.4 mn , down from EUR 11.7 mn in 2004 ."
  }
]


## 1.  Original single module 

Simple pipeline - one module: translate.

Our pipeline:

```translate```

In [14]:
from krixik.pipeline_builder.module import Module
from krixik.pipeline_builder.pipeline import CreatePipeline

# A single-module chain: translate only.
module_1 = Module(name='translate')
pipeline = CreatePipeline(
    name='translate-pipeline-1',
    module_chain=[module_1],
)

# Write the pipeline configuration out as YAML.
pipeline.save('pipeline_configs/translate-pipeline-1.yaml')

In [15]:
# Sample input file and the pipeline built from the saved YAML config.
test_file = json_files_path + "valid.json"
my_pipeline = krixik.load_pipeline(config_path="pipeline_configs/translate-pipeline-1.yaml")

# Process the file, naming the translate model explicitly; expire_time is 180 seconds.
output = my_pipeline.process(
    local_file_path=test_file,
    expire_time=180,
    modules={"translate": {"model": "opus-mt-en-es"}},
)

INFO: symbolic_directory_path was not set by user - setting to default of /etc
INFO: file_name was not set by user - setting to random file name: krixik_generated_lbdfxmqlro.json
INFO: wait_for_process is set to True.
INFO: file will expire and be removed from you account in 180 seconds, at Tue Apr  9 14:53:50 2024 UTC


INFO: translate-pipeline-1 file process and input processing started...
INFO: metadata can be updated using the .update api.
INFO: This file's process_id is: 867598c1-01d7-1f76-24c0-533588191051
INFO: File process and processing status:
SUCCESS: module 1 (of 1) - translate processing complete.
SUCCESS: pipeline process complete.
SUCCESS: process output downloaded


Note: the output contains the translated text only.  Each translated snippet is provided in the same order as its source snippet in the input.

In [6]:
import json

# ensure_ascii=False keeps accented characters in the translated text readable
# instead of emitting them as \uXXXX escape sequences.
print(json.dumps(output, indent=2, ensure_ascii=False))

{
  "request_id": "319b4551-7895-445b-8cc6-df25b93354a3",
  "file_id": "6c76280c-05c3-40c9-abd1-813723d91266",
  "message": "SUCCESS - output fetched for file_id 6c76280c-05c3-40c9-abd1-813723d91266.",
  "process_output": [
    {
      "translated_snippet": "Me encanta esta pelcula y la vea una y otra vez!"
    },
    {
      "translated_snippet": "Los beneficios de explotacin ascendieron a 9,4 millones de euros , frente a 11,7 millones de euros en 2004 ."
    }
  ]
}


In [2]:
# Reusable pretty-print helper for the JSON responses shown in the rest of the notebook.
import json
def json_print(data):
    """Pretty-print a JSON-serialisable object with a two-space indent.

    Args:
        data: Any object accepted by ``json.dumps`` (dict, list, str, ...).

    Returns:
        None. The formatted text is written to stdout.
    """
    # ensure_ascii=False keeps non-ASCII characters (e.g. accented letters in
    # translated text) readable instead of emitting \uXXXX escape sequences.
    print(json.dumps(data, indent=2, ensure_ascii=False))

## 2.  Add vector search

Let's add two more modules to make the translated text searchable.  The pipeline:

```translate --> text-embedder --> vector-search```

Remember: we're making the *translated text* searchable.

In [3]:
from krixik.pipeline_builder.module import Module
from krixik.pipeline_builder.pipeline import CreatePipeline

# Three-module chain: translate --> text-embedder --> vector-search.
module_1 = Module(name="translate")
module_2 = Module(name="text-embedder")
module_3 = Module(name="vector-search")

pipeline = CreatePipeline(
    name='translate-pipeline-2',
    module_chain=[module_1, module_2, module_3],
)

# Write the pipeline configuration out as YAML.
pipeline.save('pipeline_configs/translate-pipeline-2.yaml')

In [4]:
my_pipeline = krixik.load_pipeline(config_path="pipeline_configs/translate-pipeline-2.yaml")

# Resolve the sample file through json_files_path, as the first section does,
# instead of a path relative to the current working directory.
test_file = json_files_path + "valid.json"

# Only the translate model is named here; the "hydrated input modules" log line
# printed by process() shows the models used for the remaining modules.
# expire_time is given in seconds (60*25 = 25 minutes).
output = my_pipeline.process(local_file_path=test_file,
                             expire_time=60*25,
                             modules={"translate": {"model": "opus-mt-en-es"}})

INFO: hydrated input modules: {'translate': {'model': 'opus-mt-en-es', 'params': {}}, 'text-embedder': {'model': 'multi-qa-MiniLM-L6-cos-v1', 'params': {'quantize': True}}, 'vector-search': {'model': 'faiss', 'params': {}}}
INFO: symbolic_directory_path was not set by user - setting to default of /etc
INFO: file_name was not set by user - setting to random file name: krixik_generated_khajjfnxvj.json
INFO: wait_for_process is set to True.
INFO: file will expire and be removed from you account in 1500 seconds, at Tue Apr  9 15:36:49 2024 UTC


INFO: translate-pipeline-2 file process and input processing started...
INFO: metadata can be updated using the .update api.
INFO: This file's process_id is: 09ab1de0-5eff-6885-c029-715078e72654
INFO: File process and processing status:
SUCCESS: module 1 (of 3) - translate processing complete.
SUCCESS: module 2 (of 3) - text-embedder processing complete.
SUCCESS: module 3 (of 3) - vector-search processing complete.
SUCCESS: pipeline process complete.


In [5]:
# Pretty-print the response returned by the process call above.
json_print(output)

{
  "file_id": "b54f7484-f502-4627-82c9-f6fc23c136dd",
  "process_id": "cf940ef5-0205-b4cd-3d85-9f866f09628b",
  "file_name": "krixik_generated_gfcaiycmyh.json",
  "symbolic_directory_path": "/etc",
  "file_tags": null,
  "file_description": null
}


In [6]:
# Run a vector search for a Spanish phrase, passing '/*' as the only symbolic directory path.
output = my_pipeline.vector_search(
    query="Me encanta",
    symbolic_directory_paths=['/*'],
)

# Show the search response.
json_print(output)

{
  "status_code": 200,
  "request_id": "5f694651-b2ff-4d31-add9-2e1267e06fd1",
  "message": "Successfully queried 1 user file.",
  "items": [
    {
      "file_id": "b54f7484-f502-4627-82c9-f6fc23c136dd",
      "file_metadata": {
        "file_name": "krixik_generated_gfcaiycmyh.json",
        "symbolic_directory_path": "/etc",
        "file_tags": [],
        "num_vectors": 2,
        "created_at": "2024-04-09 22:04:07",
        "last_updated": "2024-04-09 22:04:07"
      },
      "search_results": [
        {
          "snippet": "Me encanta esta pelcula y la vea una y otra vez!",
          "line_numbers": [
            1
          ],
          "distance": 0.287
        },
        {
          "snippet": "El beneficio de explotacin ascendi a 9,4 millones EUR, frente a 11,7 millones EUR en 2004.",
          "line_numbers": [
            2
          ],
          "distance": 0.394
        }
      ]
    }
  ]
}


## 3.  Add keyword search instead

To add keyword search we need to use `json-to-txt`, since the output of `translate` is not a set of snippets (as it is for vector search).

So our pipeline will look like

```translate --> json-to-txt --> keyword-search```

In [8]:
from krixik.pipeline_builder.module import Module
from krixik.pipeline_builder.pipeline import CreatePipeline

# Three-module chain: translate --> json-to-txt --> keyword-search.
module_1 = Module(name="translate")
module_2 = Module(name="json-to-txt")
module_3 = Module(name="keyword-search")

pipeline = CreatePipeline(
    name='translate-pipeline-3',
    module_chain=[module_1, module_2, module_3],
)

# Write the pipeline configuration out as YAML.
pipeline.save('pipeline_configs/translate-pipeline-3.yaml')

In [2]:
my_pipeline = krixik.load_pipeline(config_path="pipeline_configs/translate-pipeline-3.yaml")

# Resolve the sample file through json_files_path, as the first section does,
# instead of a path relative to the current working directory.
test_file = json_files_path + "valid.json"

# Only the translate model is named here; the "hydrated input modules" log line
# printed by process() shows the models used for the remaining modules.
# expire_time is given in seconds (60*5 = 5 minutes).
output = my_pipeline.process(local_file_path=test_file,
                             expire_time=60*5,
                             modules={"translate": {"model": "opus-mt-en-es"}})

INFO: hydrated input modules: {'translate': {'model': 'opus-mt-en-es', 'params': {}}, 'json-to-txt': {'model': 'base', 'params': {}}, 'keyword-search': {'model': 'base', 'params': {}}}
INFO: symbolic_directory_path was not set by user - setting to default of /etc
INFO: file_name was not set by user - setting to random file name: krixik_generated_njmlpltkuh.json
INFO: wait_for_process is set to True.
INFO: file will expire and be removed from you account in 300 seconds, at Tue Apr  9 15:04:38 2024 UTC


INFO: translate-pipeline-3 file process and input processing started...
INFO: metadata can be updated using the .update api.
INFO: This file's process_id is: 3543f974-73fe-a292-69da-7a8bf996d833
INFO: File process and processing status:
SUCCESS: module 1 (of 3) - translate processing complete.
SUCCESS: module 2 (of 3) - json-to-txt processing complete.
SUCCESS: module 3 (of 3) - keyword-search processing complete.
SUCCESS: pipeline process complete.
SUCCESS: process output downloaded


In [6]:
# Run a keyword search with two query terms, passing '/*' as the only symbolic directory path.
output = my_pipeline.keyword_search(
    query="encanta pelcula",
    symbolic_directory_paths=['/*'],
)

# Show the search response.
json_print(output)

{
  "status_code": 200,
  "request_id": "0cc84bea-bd71-4437-9c0c-48813577f3ff",
  "message": "Successfully queried 1 user file.",
  "items": [
    {
      "file_id": "2549b442-b5a5-47b1-8692-746c7bfa9478",
      "file_metadata": {
        "file_name": "krixik_generated_njmlpltkuh.json",
        "symbolic_directory_path": "/etc",
        "file_tags": [],
        "num_lines": 2,
        "created_at": "2024-04-09 21:59:40",
        "last_updated": "2024-04-09 21:59:40"
      },
      "search_results": [
        {
          "keyword": "encanta",
          "line_number": 1,
          "keyword_number": 2
        },
        {
          "keyword": "pelcula",
          "line_number": 1,
          "keyword_number": 4
        }
      ]
    }
  ]
}
