In [1]:
import requests
import os
from datetime import datetime
from dotenv import load_dotenv
from pathlib import Path
import numpy as np
import pandas as pd 

# Load the local `.env` file before reading the connection settings from the environment.
dotenv_path = Path('.env')
load_dotenv(dotenv_path=dotenv_path)

# `.get` already yields None for a missing variable, which the checks below rely on.
ELASTIC_PORT = os.environ.get("ELASTIC_PORT")
ELASTIC_USERNAME = os.environ.get("ELASTIC_USERNAME")
ELASTIC_PASSWORD = os.environ.get("ELASTIC_PASSWORD")

# Stop here, with a message naming the variable, when a required setting is absent.
assert ELASTIC_PORT is not None, "ELASTIC_PORT is not set"
assert ELASTIC_USERNAME is not None, "ELASTIC_USERNAME is not set"
assert ELASTIC_PASSWORD is not None, "ELASTIC_PASSWORD is not set"

# Global config handed to the Elasticsearch processor.
# NOTE(review): "0.0.0.0" is normally a bind address rather than a connect target;
# confirm this is the intended host.
config = dict(
    HOST="0.0.0.0",
    PORT=ELASTIC_PORT,  # kept as the raw environment string
    USERNAME=ELASTIC_USERNAME,
    PASSWORD=ELASTIC_PASSWORD,
    INDEX=None,  # assigned later, before the processor is constructed
    RETURN_SIZE=10,
    CACHE_DIR=".cache/",
    DIMENSION=2,
)

## Prepare DB

In [2]:
import pandas as pd 
from tqdm import tqdm 
# Read the metadata CSV; this frame is what gets indexed into Elasticsearch below.
df = pd.read_csv(filepath_or_buffer='vbs22_meta.csv')

In [3]:
# Preview the first five rows to check the columns that were loaded.
df.head(n=5)

Unnamed: 0,path,filename,dataset,video,shot,score,yolo_concept,color,ocr
0,VBS2022/keyframes/00001/shot00001_1_RKF.png,shot00001_1_RKF,V3C,1,1,0,"{'person': 5, 'bicycle': 2, 'cell phone': 1}","['gray', 'black']",['']
1,VBS2022/keyframes/00001/shot00001_2_RKF.png,shot00001_2_RKF,V3C,1,2,0,{'person': 4},"['pink', 'purple', 'skin-color']",['']
2,VBS2022/keyframes/00001/shot00001_3_RKF.png,shot00001_3_RKF,V3C,1,3,0,{},"['pink', 'skin-color', 'gray']",['']
3,VBS2022/keyframes/00001/shot00001_4_RKF.png,shot00001_4_RKF,V3C,1,4,0,"{'person': 5, 'bicycle': 5}","['pink', 'purple', 'white']",['']
4,VBS2022/keyframes/00001/shot00001_5_RKF.png,shot00001_5_RKF,V3C,1,5,0,"{'person': 1, 'bicycle': 3}","['gray', 'black', 'blue']",['']


## Create index in Elasticsearch

In [4]:
from pysearch.elastic import ElasticProcessor
from pysearch.utils.time import nlp2datetime

In [5]:
# Choose the target index, then connect to Elasticsearch with the completed config.
config.update(INDEX='vbs24_db')
proc = ElasticProcessor(config)

Connected to Elasticsearch node


In [14]:
# List the indices on the node, leaving out the dot-prefixed ones.
[
    index_name
    for index_name in proc.available_indices()
    if not index_name.startswith('.')
]



['lsc24', 'test_index_lsc', 'vbs24_db', 'test_index']

In [7]:
# Disabled (commented out). NOTE(review): presumably removes the named index — confirm before
# enabling; the name here ('vbs22_db') differs from the index this notebook uses ('vbs24_db').
# proc.kill('vbs22_db')

In [8]:
# Index mapping passed to `proc.index_dataframe`.
# `index` is required for pysearch; a `date` field ({"type": "date", "format": "basic_date"})
# is also noted as required for pysearch but is left out of this mapping.
# NOTE(review): the DataFrame preview shows columns `video` and `shot`, while this mapping
# declares `id`, `video_id` and `shot_id` — confirm the field names are intended.
df_structure = {
    "mappings": {
        "properties": {
            "index": {"type": "text"},
            "id": {"type": "integer"},
            # Every remaining field is plain text; each gets its own dict object.
            **{
                field_name: {"type": "text"}
                for field_name in (
                    "path",
                    "video_id",
                    "shot_id",
                    "yolo_concept",
                    "color",
                    "ocr",
                )
            },
        }
    }
}

In [9]:
# pysearch requires an `index` column (see the mapping cell); reuse the keyframe filename for it.
# Re-running this assignment is harmless: it rewrites the same values.
df['index'] = df['filename']

# Bulk-index every row of the frame with the mapping above. The recorded run processed
# about 2.5 million rows in roughly five minutes, so expect this cell to be slow.
# (A bare `df.head()` used to sit before this call; only a cell's last expression is
# displayed, so it had no effect and was dropped.)
proc.index_dataframe(df, df_structure)

100%|██████████| 2508110/2508110 [05:08<00:00, 8127.13it/s] 


In [10]:
import rich
# Pretty-print whatever `proc.info()` reports.
# NOTE(review): presumably a summary of the connected node/index — no output is recorded here.
rich.print(proc.info())

In [11]:
# Free-text query with default settings. In the recorded output the hits match "airplane"
# in either the `ocr` text or the `yolo_concept` field.
proc.search("airplane")

[Pysearch] Function run elapsed time: 0:00:00.000006


[{'_index': 'vbs24_db',
  '_id': 'shot10437_67_RKF',
  '_score': 13.235781,
  '_source': {'path': 'VBS2022/keyframes/10437/shot10437_67_RKF.png',
   'filename': 'shot10437_67_RKF',
   'dataset': 'V3C',
   'video': 10437,
   'shot': 67,
   'score': 0,
   'yolo_concept': "{'person': 1}",
   'color': "['skin-color', 'black', 'purple', 'orange', 'white']",
   'ocr': "['You put on airplane mode and you take that s*#%!']"}},
 {'_index': 'vbs24_db',
  '_id': 'shot00478_28_RKF',
  '_score': 5.8676715,
  '_source': {'path': 'VBS2022/keyframes/00478/shot00478_28_RKF.png',
   'filename': 'shot00478_28_RKF',
   'dataset': 'V3C',
   'video': 478,
   'shot': 28,
   'score': 0,
   'yolo_concept': "{'airplane': 2}",
   'color': "['gray', 'black']",
   'ocr': "['']"}},
 {'_index': 'vbs24_db',
  '_id': 'shot00478_37_RKF',
  '_score': 5.8676715,
  '_source': {'path': 'VBS2022/keyframes/00478/shot00478_37_RKF.png',
   'filename': 'shot00478_37_RKF',
   'dataset': 'V3C',
   'video': 478,
   'shot': 37,
   

In [12]:
# Same query, with `filter` given two document ids; the recorded output contains exactly
# those two documents.
# NOTE(review): `top_k` presumably caps the number of hits returned — confirm against pysearch.
proc.search("airplane", top_k=10, filter=["shot00627_273_RKF", "shot00608_141_RKF"])

[Pysearch] Function run elapsed time: 0:00:00.000007


[{'_index': 'vbs24_db',
  '_id': 'shot00608_141_RKF',
  '_score': 6.8676715,
  '_source': {'path': 'VBS2022/keyframes/00608/shot00608_141_RKF.png',
   'filename': 'shot00608_141_RKF',
   'dataset': 'V3C',
   'video': 608,
   'shot': 141,
   'score': 0,
   'yolo_concept': "{'airplane': 1}",
   'color': "['white', 'purple', 'blue', 'pink']",
   'ocr': "['']"}},
 {'_index': 'vbs24_db',
  '_id': 'shot00627_273_RKF',
  '_score': 6.8676715,
  '_source': {'path': 'VBS2022/keyframes/00627/shot00627_273_RKF.png',
   'filename': 'shot00627_273_RKF',
   'dataset': 'V3C',
   'video': 627,
   'shot': 273,
   'score': 0,
   'yolo_concept': "{'airplane': 1}",
   'color': "['white', 'green', 'gray']",
   'ocr': "['']"}}]

In [13]:
# Look up one document by its id (passed as a one-element list) and pretty-print the result.
rich.print(proc.get_document_by_id(['shot00627_273_RKF']))

[Pysearch] Function get_document_by_id elapsed time: 0:00:00.022315
