In [1]:
import os
from dotenv import load_dotenv
from pathlib import Path
import rich 

# Load the Elasticsearch connection settings from the .env file two
# directories above the notebook, then read them from the environment.
dotenv_path = Path('../../.env')
load_dotenv(dotenv_path=dotenv_path)

ELASTIC_PORT = os.environ.get("ELASTIC_PORT", None)
ELASTIC_USERNAME = os.environ.get("ELASTIC_USERNAME", None)
ELASTIC_PASSWORD = os.environ.get("ELASTIC_PASSWORD", None)

# Stop immediately, naming the missing variable, if any setting is absent.
assert ELASTIC_PORT is not None, "ELASTIC_PORT is not set"
assert ELASTIC_USERNAME is not None, "ELASTIC_USERNAME is not set"
assert ELASTIC_PASSWORD is not None, "ELASTIC_PASSWORD is not set"

# NOTE(review): absolute, machine-specific data root — adjust when running elsewhere.
root = Path('/home/lsc/2024/')
config = {
    # Global config
    "HOST": "0.0.0.0",
    "PORT": ELASTIC_PORT,  # still a string here, exactly as read from the environment
    "USERNAME": ELASTIC_USERNAME,
    "PASSWORD": ELASTIC_PASSWORD,
    "INDEX": None,  # filled in by a later cell before the processor is created
    "RETURN_SIZE": 10,
    "CACHE_DIR": ".cache/",
    "DIMENSION": 2,
}

# Show the configuration with the password masked: cell outputs are stored in
# the notebook file, so printing `config` directly would persist the credential.
rich.print({**config, "PASSWORD": "********"})

## Prepare DB

In [2]:
import json

# Load the cleaned metadata records that will be indexed further down.
# The encoding is given explicitly because open() otherwise falls back to the
# locale's preferred encoding, which would make decoding machine-dependent.
with open(root / 'datahub/metadata/clean_metadata.json', 'r', encoding='utf-8') as f:
    metadata = json.load(f)

## Create index in Elasticsearch

In [3]:
from pysearch.elastic import ElasticProcessor
from pysearch.utils.time import nlp2datetime

In [4]:
# Point the shared config at the target index, then build the processor
# with a result window of one million.
config.update(INDEX='lsc24')
proc = ElasticProcessor(config, max_result_window=1_000_000)

Connected to Elasticsearch node


In [8]:
# Display the index names the processor currently reports.
proc.available_indices()

dict_keys(['test_index_lsc', 'test_index'])

In [9]:
# Index mapping handed to the processor when the metadata is indexed.
# Every (field, type) pair below becomes one entry under "properties", in the
# order listed; the single date field additionally carries its date format.
mapping = {
    "mappings": {
        "properties": {
            field: (
                {"type": kind, "format": "basic_date"}
                if kind == "date"
                else {"type": kind}
            )
            for field, kind in (
                ("image_name", "text"),
                ("timestamp", "date"),
                ("tags", "text"),
                ("categories", "text"),
                ("semantic_time:__new_timezone", "text"),
                ("semantic_time:__weekday", "text"),
                ("semantic_time:__year", "text"),
                ("semantic_time:__month", "text"),
                ("semantic_time:__semantic_time", "text"),
                ("music:__album", "text"),
                ("music:__artist", "text"),
                ("music:__song", "text"),
                ("time:__utc_time", "text"),
                ("time:__local_time", "text"),
                ("time:__minute_id", "text"),
                ("activity:__movement", "text"),
                ("activity:__movement_prob", "float"),
                ("activity:__stop", "boolean"),
                ("location:__lat", "float"),
                ("location:__lng", "float"),
                ("location:__semantic_name", "text"),
                ("location:__original_name", "text"),
                ("location:__parent", "text"),
                ("location:__city", "text"),
                ("location:__country", "text"),
                ("visual_concepts:__OCR", "text"),
                ("visual_concepts:__Caption", "text"),
                ("visual_concepts:__CaptionScore", "float"),
            )
        }
    }
}

In [10]:
# Hand the loaded metadata and the mapping to the processor for indexing.
# The captured progress bar below shows 723,329 items processed in ~54 s.
# NOTE(review): presumably this also creates the index with `mapping` — confirm
# in ElasticProcessor.index_json.
proc.index_json(metadata, mapping)

100%|██████████| 723329/723329 [00:54<00:00, 13297.41it/s]


In [11]:
# Pretty-print whatever the processor's info() call returns.
rich.print(proc.info())

In [18]:
# Parse the reference day used by the closest-day searches further down.
# NOTE(review): the captured output carries a time of day (16:21:52) although
# the input string has none — presumably taken from the clock at call time,
# which would make this value vary between runs; confirm in nlp2datetime.
date = nlp2datetime('12-13-2019')
date

datetime.datetime(2019, 12, 13, 16, 21, 52)

In [13]:
def check_duplicate_response(responses):
    """Assert that no two hits in ``responses`` share the same ``'_id'``.

    ``responses`` is iterated once and each item is indexed with ``'_id'``.
    Returns ``None``; raises ``AssertionError("Duplicate response")`` when at
    least one id occurs more than once.
    """
    seen_ids = []
    for entry in responses:
        seen_ids.append(entry['_id'])
    # A set drops repeats, so a size mismatch means some id appeared twice.
    assert len(set(seen_ids)) == len(seen_ids), "Duplicate response"

In [14]:
# Look up a single document by its id and pretty-print the result.
# `rich` is already imported in the notebook's first cell, so it is not
# re-imported here.
rich.print(proc.get_document_by_id(['20200509_133031_000']))

Function get_document_by_id elapsed time: 0:00:00.068057


In [15]:
# Duplicate-id check on a text query with a mandatory term on the month field.
# NOTE(review): the sample hit shown later in this notebook stores the month as
# a name ('December'), so '9' may match nothing and let this check pass
# vacuously — confirm the intended query value.
check_duplicate_response(
    proc.compose_pipeline(
        {
            'text': {
                'fields': ['semantic_time:__month'],
                'must': '9',
                'should': None,
            }
        }
    )
)

Function run elapsed time: 0:00:00.000003


In [19]:
# Duplicate-id check on the closest-day text search for 'indoor', using the
# previously parsed `date` as the reference timestamp.
check_duplicate_response(
    proc.search_text_closestday_pipeline(
        'indoor',
        [],
        timefield='timestamp',
        timestamp=date,
        filter=None,
    )
)

Function run elapsed time: 0:00:00.000005


In [20]:
# Same closest-day search as the duplicate check, this time displaying the raw hits.
proc.search_text_closestday_pipeline(
    'indoor',
    [],
    timefield='timestamp',
    timestamp=date,
    filter=None,
)

Function run elapsed time: 0:00:00.000005


[{'_index': 'lsc24',
  '_id': '20191219_201339_000',
  '_score': 11.58763,
  '_source': {'image_name': '20191219_201339_000.jpg',
   'timestamp': '20191219',
   'tags': 'text,indoor,person,scene,shop',
   'categories': 'Hardware Store',
   'semantic_time:__new_timezone': 'Europe/Dublin',
   'semantic_time:__weekday': 'Thursday',
   'semantic_time:__year': '2019',
   'semantic_time:__month': 'December',
   'semantic_time:__semantic_time': 'early evening',
   'music:__album': None,
   'music:__artist': None,
   'music:__song': None,
   'time:__utc_time': '2019-12-19 20:13:00',
   'time:__local_time': '2019-12-19 20:13:00',
   'time:__minute_id': '20191219_2013',
   'activity:__movement': 'Inside',
   'activity:__movement_prob': 0.9602422118,
   'activity:__stop': True,
   'location:__lat': 53.3945656,
   'location:__lng': -6.1909035,
   'location:__semantic_name': "Woodie's",
   'location:__original_name': "Woodie's",
   'location:__parent': None,
   'location:__city': 'Dublin, Ireland, 

In [21]:
# Duplicate-id check on a text query with a mandatory term on the year field.
check_duplicate_response(
    proc.compose_pipeline(
        {
            'text': {
                'fields': ['semantic_time:__year'],
                'must': '2019',
                'should': None,
            }
        }
    )
)

Function run elapsed time: 0:00:00.000004


In [22]:
# Count the hits returned for the year query when 10001 results are requested.
len(
    proc.compose_pipeline(
        {
            'text': {
                'fields': ['semantic_time:__year'],
                'must': '2019',
                'should': None,
            }
        },
        topk=10001,
    )
)

Function run elapsed time: 0:00:00.000004


10001

In [23]:
# Pretty-print the first hit for an optional ('should') term on the month field.
rich.print(
    proc.compose_pipeline(
        {
            'text': {
                'fields': ['semantic_time:__month'],
                'must': None,
                'should': 'december',
            }
        }
    )[0]
)

Function run elapsed time: 0:00:00.000003


In [27]:
# Pretty-print the first hit for a mandatory phrase searched across several text fields.
rich.print(
    proc.compose_pipeline(
        {
            'text': {
                'fields': [
                    'location:__semantic_name',
                    'visual_concepts:__OCR',
                    'visual_concepts:__Caption',
                    'location:__country',
                    'tags',
                ],
                'must': "Zeus Conference",
                'should': None,
            }
        }
    )[0]
)

Function run elapsed time: 0:00:00.000004


In [29]:
# Pretty-print the first hit for the same phrase and fields, supplied as an
# optional ('should') term instead of a mandatory one.
rich.print(
    proc.compose_pipeline(
        {
            'text': {
                'fields': [
                    'location:__semantic_name',
                    'visual_concepts:__OCR',
                    'visual_concepts:__Caption',
                    'location:__country',
                    'tags',
                ],
                'must': None,
                'should': "Zeus Conference",
            }
        }
    )[0]
)


Function run elapsed time: 0:00:00.000003
