In [7]:
##### connect to the elasticsearch server
import json
from pprint import pprint
import os
import time
import pandas as pd 
from datetime import datetime

import pathlib
from dotenv import load_dotenv
from elasticsearch import Elasticsearch

from DataProfile_JSON_db import *
from minio_utils import *
from classes import *

# Load variables from a local .env file into the process environment (python-dotenv).
load_dotenv()

##### dummy files to upload to minio and elasticsearch
# Root folder of the example inputs; the per-file-type sub-folder is appended where the files are listed.
path_to_main_input = "./examples/dummy_data"    
# NOTE(review): the two string arguments look like connection credentials (user, password) —
# confirm against classes.ESearch. If they are, read them from the environment populated by
# load_dotenv() above instead of hard-coding them in the notebook.
my_es = ESearch("elastic", "genov4")


In [8]:
##### prepare data to be added to database
file_type = "bam"
file_suffix = ".{}".format(file_type)

# List every file carrying the requested extension under <path_to_main_input>/<file_type>/.
input_files = list(pathlib.Path(os.path.join(path_to_main_input, file_type)).glob("*{}".format(file_suffix)))

# One row per file; the constant columns below are broadcast to every row.
input_metadata = pd.DataFrame(data = [item.name for item in input_files], columns = ["FileName"])
input_metadata["FileType"] = file_type
input_metadata["Labcode"] = input_metadata["FileName"].apply(lambda x: x.replace(file_suffix, ""))
# Build the path from the configured input root and the real file name. Rebuilding the file name
# from Labcode breaks when the extension string also occurs in the middle of a name, because
# str.replace removes every occurrence.
input_metadata["path"] = ["{}/{}/{}".format(path_to_main_input, file_type, file_name) for file_name in input_metadata["FileName"].values]
input_metadata["project"] = "ECD"
input_metadata["sub_project"] = "ECD_read_based"
input_metadata["Date"] = datetime.now()
input_metadata["pipeline"] = "bismark_wgbs"
input_metadata = input_metadata.set_index("path")
# Mapping {path: {column: value}}: one metadata record per file, keyed by the file path.
input_metadata_dict = input_metadata.to_dict(orient = "index") # the input metadata is ready to be added to the database elasticsearch

##### load the data profile database
db = DataProfileDB("ALL_DATA_PROFILES.json")

##### define the minio bucket to upload the files
bucket_name = "bam"

##### choose the data profile name, example: choose "dev2"
profile_name = "dev2"

##### check if the index is ready in the elasticsearch database, if not, create new.
# `a in b == False` is a chained comparison, i.e. `(a in b) and (b == False)`, which is never
# true for a collection of index names; `not in` expresses the intended membership test.
if profile_name not in my_es.all_indices:
    my_es.create_index(index_name = profile_name, metadata_profile = db.get_record(profile_name)[profile_name])

# Example sample: the second entry of the metadata dict. This raises IndexError when fewer than
# two files were found in the input folder.
path_to_file = list(input_metadata_dict.keys())[1]
sample_metadata = input_metadata_dict[path_to_file]
    
def insert_data_to_database(path_to_file, sample_metadata, profile_name, my_es, bucket_name, minio_credentials):
    """
    Register one sample: index its metadata in Elasticsearch, then upload the file to MinIO.

    Args:
        path_to_file (str): Location of the file to upload.
        sample_metadata (dict): Metadata describing the file; its "FileName"
            entry is used as the MinIO object name.
        profile_name (str): Elasticsearch index that receives the metadata document.
        my_es: Object exposing ``insert_document(index=..., document=...)``.
        bucket_name (str): MinIO bucket that receives the file.
        minio_credentials: Forwarded unchanged to ``upload_file_with_metadata``.

    Returns:
        None
    """
    # Resolve the object name up front so a missing "FileName" key fails
    # before anything is written to either store.
    upload_arguments = dict(
        bucket_name=bucket_name,
        object_name=sample_metadata["FileName"],
        file_to_upload=path_to_file,
        metadata=sample_metadata,
        minio_credentials=minio_credentials,
        verbose=False,
    )

    ##### step 1: the metadata document goes to the elasticsearch index
    my_es.insert_document(document=sample_metadata, index=profile_name)

    ##### step 2: the file itself, tagged with the same metadata, goes to MINIO
    upload_file_with_metadata(**upload_arguments)


In [9]:
# Bare expression: shows the repr of the ESearch handle created in the setup cell.
my_es

<classes.ESearch at 0x1183b01f0>