In [1]:
import json
from pprint import pprint
import os
import time
import pandas as pd 
from datetime import datetime
import pathlib

from dotenv import load_dotenv
from elasticsearch import Elasticsearch

import minio
from minio.error import S3Error
from minio.commonconfig import ENABLED
from minio.versioningconfig import VersioningConfig

from RDSBucket_class import *
from data_profiles import *

import warnings
warnings.filterwarnings("ignore")
##### input args
# Credential files handed to the MinIO bucket wrapper and to the Elasticsearch wrapper.
es_credential = "es_credential.json"
minio_credential = "credentials.crc1382.json"

# Input locations. The ECD data folder is separated by data type (bam, cov, features, ...).
path_to_main_input = "./examples/dummy_data"
path_to_ecd_data = "./examples/dummy_from_real"
path_to_metadata_dir = "./ECD_metadata"

#####-------------------------------------------------------#####
##### Preprocessing data to match data profile
#####-------------------------------------------------------#####
# For every sub-project, collect the files under
# <path_to_ecd_data>/<filetype>/<sub_project>/ and build one metadata record
# per file, keyed by the file path.
filetype = "bam"
project = "ECD"

# The metadata sheet does not depend on the sub-project, so it is read once
# here instead of once per loop iteration.
metadata = pd.read_excel(os.path.join(path_to_metadata_dir, "metadata_WGS_20240606.xlsx"))

# SampleID -> Label lookup table. When a SampleID occurs more than once the
# first row wins, which is the row the former `[...].values[0]` lookup returned.
label_by_sample = metadata.drop_duplicates(subset = "SampleID").set_index("SampleID")["Label"]

#####-------------------------------------------------------#####
##### initialize the RDSBucket and ESearch classes
#####-------------------------------------------------------#####
# A single ESearch instance is shared by all sub-projects (it was previously
# re-created on every iteration). Later cells use `es`.
es = ESearch(es_credential = es_credential)

for sub_project in ["ECD_WGS_hg19", "ECD_WGS_hg38"]:
    data_dir = pathlib.Path(path_to_ecd_data, filetype, sub_project)
    # glob() yields files in filesystem order; sort so that re-runs produce the
    # records in the same order.
    all_files = sorted(data_dir.glob("*.{}".format(filetype)))
    # Path.name is the last path component on every platform, unlike splitting
    # the string on "/".
    file_names = [item.name for item in all_files]

    inputdf = pd.DataFrame(data = [str(item) for item in all_files], columns = ["path"])
    # File names look like "<n>-<Labcode>_<...>.<filetype>": take what sits
    # between the first "-" and the first "_".
    inputdf["Labcode"] = [name.split("_")[0].split("-")[1] for name in file_names]
    inputdf["SequencingID"] = inputdf["Labcode"]
    inputdf["FileName"] = file_names
    inputdf["FileType"] = filetype
    inputdf["Date"] = datetime.now().strftime("%Y-%m-%d")
    inputdf["pipeline"] = sub_project
    inputdf["project"] = project
    inputdf["sub_project"] = "_".join(sub_project.split("_")[0:2])
    inputdf["ref_genome"] = sub_project.split("_")[-1]
    inputdf["depth"] = "low"

    # Fail with the offending Labcodes spelled out instead of an opaque
    # IndexError when a file has no row in the metadata sheet.
    unknown = inputdf.loc[~inputdf["Labcode"].isin(label_by_sample.index), "Labcode"]
    if not unknown.empty:
        raise KeyError("No metadata row (SampleID) found for Labcode(s): {}".format(sorted(unknown.unique())))
    inputdf["cancer_label"] = inputdf["Labcode"].map(label_by_sample)

    # NOTE: `inputdf`, `input_metadata` and `input_metadata_dict` are overwritten
    # on every iteration; after the loop they describe the last sub-project only.
    input_metadata = inputdf.set_index("path")
    input_metadata_dict = input_metadata.to_dict(orient = "index") # the input metadata is ready to be added to the database elasticsearch

    # Upload step, currently disabled. Un-comment to create the bucket for this
    # sub-project and upload every file together with its metadata record.
    # bamBucket = RDSBucket( 
    #                     minio_credential = minio_credential, 
    #                     bucketName = "{}-{}".format(project.lower().replace("_", "-"), 
    #                                                 sub_project.lower().replace("_", "-")),
    #                     PROFILE_NAME = "wgsbam",
    #                     DATA_PROFILES = DATA_PROFILES,
    #                     es_credential = es_credential, 
    #                     versioning = True, 
    #                     verbose = False)
    # bamBucket.initBucket()

    # for path in input_metadata_dict.keys():
    #     file_metadata = input_metadata_dict[path]
    #     bamBucket.upload_file_to_bucket(path_to_file = path, 
    #                                     object_name= file_metadata["FileName"], 
    #                                     file_metadata = file_metadata)



In [3]:
# Build the RDSBucket handle for the hg19 WGS BAM sub-project.
filetype = "bam"
project = "ECD"
sub_project = "ECD_WGS_hg19"

# The bucket name is the lower-cased project and sub-project with underscores
# replaced by hyphens, joined by a hyphen ("ecd-ecd-wgs-hg19" here).
bamBucket = RDSBucket(
    minio_credential = minio_credential,
    bucketName = "-".join(part.lower().replace("_", "-") for part in (project, sub_project)),
    PROFILE_NAME = "wgsbam",
    DATA_PROFILES = DATA_PROFILES,
    es_credential = es_credential,
    versioning = True,
    verbose = False,
)

# Display what the ESearch wrapper lists for the "wgsbam" profile.
es.list_all_data_from_a_profile("wgsbam")

Unnamed: 0,Labcode,SequencingID,FileName,FileType,Date,pipeline,project,sub_project,ref_genome,depth,cancer_label,bucket,versionID
0,ZK0AAAB41NB,ZK0AAAB41NB,8-ZK0AAAB41NB_S95075-S97075.sorted.bam,bam,2024-07-08,ECD_WGS_hg19,ECD,ECD_WGS,hg19,low,Control,ecd-ecd-wgs-hg19,c9c92473-0804-40a4-9cf6-6636b07bec88
1,ZLBE405NB,ZLBE405NB,24-ZLBE405NB_S95016-S97016.sorted.bam,bam,2024-07-08,ECD_WGS_hg19,ECD,ECD_WGS,hg19,low,Esophageal,ecd-ecd-wgs-hg19,1abd63d4-e396-4fc3-a450-12e03d3fb2b0
2,ZK0CAAA67NB,ZK0CAAA67NB,3-ZK0CAAA67NB_S95035-S97035.sorted.bam,bam,2024-07-08,ECD_WGS_hg19,ECD,ECD_WGS,hg19,low,Control,ecd-ecd-wgs-hg19,87089a55-7c6a-42fd-964d-6b2c7d59c01b
3,ZK0CAAA81NB,ZK0CAAA81NB,22-ZK0CAAA81NB_S95042-S97042.sorted.bam,bam,2024-07-08,ECD_WGS_hg19,ECD,ECD_WGS,hg19,low,Control,ecd-ecd-wgs-hg19,797888e9-3d2a-4ee1-a68c-cb77605083ea
4,ZMB183NB,ZMB183NB,9-ZMB183NB_S95063-S97063.sorted.bam,bam,2024-07-08,ECD_WGS_hg19,ECD,ECD_WGS,hg19,low,Breast,ecd-ecd-wgs-hg19,996c1437-0619-4c08-91d5-e8e420e1ad9f
5,ZMDGAAB63NB,ZMDGAAB63NB,56-ZMDGAAB63NB_S95001-S97001.sorted.bam,bam,2024-07-08,ECD_WGS_hg19,ECD,ECD_WGS,hg19,low,Endometrial,ecd-ecd-wgs-hg19,e93d202f-ed8f-42f7-8138-b4a2189d25c7
6,ZK0AAAB10NB,ZK0AAAB10NB,21-ZK0AAAB10NB_S95056-S97056.sorted.bam,bam,2024-07-08,ECD_WGS_hg19,ECD,ECD_WGS,hg19,low,Control,ecd-ecd-wgs-hg19,807e4e25-894e-472e-aa06-ebd7cbe2d82f
7,ZMH040NB,ZMH040NB,11-ZMH040NB_S95011-S97011.sorted.bam,bam,2024-07-08,ECD_WGS_hg19,ECD,ECD_WGS,hg19,low,Liver,ecd-ecd-wgs-hg19,526ee9cd-c5e5-4dc7-8732-d3f49b9ce40d
8,ZMG144NB,ZMG144NB,16-ZMG144NB_S95045-S97045.sorted.bam,bam,2024-07-08,ECD_WGS_hg19,ECD,ECD_WGS,hg19,low,Gastric,ecd-ecd-wgs-hg19,5d6327c2-e7bd-4166-976f-87503cc26e0d
9,ZLBE408NB,ZLBE408NB,10-ZLBE408NB_S95002-S97002.sorted.bam,bam,2024-07-08,ECD_WGS_hg19,ECD,ECD_WGS,hg19,low,Esophageal,ecd-ecd-wgs-hg19,298cd6df-a4c3-41d2-9f20-6947fea665db


In [8]:

# Query the "wgsbam" index for documents whose `pipeline` field matches
# ECD_WGS_hg19 and tabulate the `_source` of every hit.
search_query = {"query": {"match": {"pipeline": "ECD_WGS_hg19"}}}

# `size` is the maximum number of hits requested from this single search call.
response = es.es.search(body = search_query, index = "wgsbam", size = 1000)

search_resdf = pd.DataFrame([hit["_source"] for hit in response["hits"]["hits"]])

search_resdf

Unnamed: 0,Labcode,SequencingID,FileName,FileType,Date,pipeline,project,sub_project,ref_genome,depth,cancer_label,bucket,versionID
0,ZK0AAAB41NB,ZK0AAAB41NB,8-ZK0AAAB41NB_S95075-S97075.sorted.bam,bam,2024-07-08,ECD_WGS_hg19,ECD,ECD_WGS,hg19,low,Control,ecd-ecd-wgs-hg19,c9c92473-0804-40a4-9cf6-6636b07bec88
1,ZLBE405NB,ZLBE405NB,24-ZLBE405NB_S95016-S97016.sorted.bam,bam,2024-07-08,ECD_WGS_hg19,ECD,ECD_WGS,hg19,low,Esophageal,ecd-ecd-wgs-hg19,1abd63d4-e396-4fc3-a450-12e03d3fb2b0
2,ZK0CAAA67NB,ZK0CAAA67NB,3-ZK0CAAA67NB_S95035-S97035.sorted.bam,bam,2024-07-08,ECD_WGS_hg19,ECD,ECD_WGS,hg19,low,Control,ecd-ecd-wgs-hg19,87089a55-7c6a-42fd-964d-6b2c7d59c01b
3,ZK0CAAA81NB,ZK0CAAA81NB,22-ZK0CAAA81NB_S95042-S97042.sorted.bam,bam,2024-07-08,ECD_WGS_hg19,ECD,ECD_WGS,hg19,low,Control,ecd-ecd-wgs-hg19,797888e9-3d2a-4ee1-a68c-cb77605083ea
4,ZMB183NB,ZMB183NB,9-ZMB183NB_S95063-S97063.sorted.bam,bam,2024-07-08,ECD_WGS_hg19,ECD,ECD_WGS,hg19,low,Breast,ecd-ecd-wgs-hg19,996c1437-0619-4c08-91d5-e8e420e1ad9f
...,...,...,...,...,...,...,...,...,...,...,...,...,...
544,ZMB126NB,ZMB126NB,15-ZMB126NB_S95055-S97055.sorted.bam,bam,2024-07-08,ECD_WGS_hg19,ECD,ECD_WGS,hg19,low,Breast,ecd-ecd-wgs-hg19,e93c5266-f6bc-49cf-90d9-831b28d8dd69
545,ZK0CAAA39NB,ZK0CAAA39NB,7-ZK0CAAA39NB_S95028-S97028.sorted.bam,bam,2024-07-08,ECD_WGS_hg19,ECD,ECD_WGS,hg19,low,Control,ecd-ecd-wgs-hg19,3264a65d-9331-4fc0-aa61-ab4c6ea50860
546,ZYCAB14NB,ZYCAB14NB,15-ZYCAB14NB_S95071-S97071.sorted.bam,bam,2024-07-08,ECD_WGS_hg19,ECD,ECD_WGS,hg19,low,Ovarian,ecd-ecd-wgs-hg19,ca2ae0d4-11b0-4fc9-9d01-201a25be3eac
547,ZLBE304NB,ZLBE304NB,14-ZLBE304NB_S95006-S97006.sorted.bam,bam,2024-07-08,ECD_WGS_hg19,ECD,ECD_WGS,hg19,low,Head and Neck,ecd-ecd-wgs-hg19,b4e73efa-0663-406e-8a07-03a9a8afc407


In [36]:
# Connect directly to the Elasticsearch node with an API key and display the
# cluster information as a connectivity check.
#
# SECURITY: the API key used to be hard-coded in this cell. It is now read from
# the environment (optionally populated from a local, untracked `.env` file by
# python-dotenv). The key that was previously committed should be treated as
# leaked and revoked on the cluster.
#
# `os`, `load_dotenv` and `Elasticsearch` are imported in the notebook's first cell.
load_dotenv()

es_api_key = os.environ.get("ES_API_KEY")
if not es_api_key:
    raise RuntimeError("Set the ES_API_KEY environment variable (or add it to .env) before running this cell.")

client = Elasticsearch(
    os.environ.get("ES_URL", "http://localhost:9200/"),  # same default endpoint as before
    api_key = es_api_key,  # API-key authentication (the `api_key` option, not `http_auth`)
)
client.info()

ObjectApiResponse({'name': '825b565ddcac', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'VpHbVt1cQJqT0oWMwuJ4Yg', 'version': {'number': '8.14.2', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '2afe7caceec8a26ff53817e5ed88235e90592a1b', 'build_date': '2024-07-01T22:06:58.515911606Z', 'build_snapshot': False, 'lucene_version': '9.10.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [41]:
# Look up a single Labcode in the "wgsbam" index through the `q` parameter.
search_response = client.search(q = "ZK0AAAB41NB", index = "wgsbam")["hits"]["hits"]
# Keep only the `_source` payload of each hit.
data = [record["_source"] for record in search_response]
data

[{'Labcode': 'ZK0AAAB41NB',
  'SequencingID': 'ZK0AAAB41NB',
  'FileName': '8-ZK0AAAB41NB_S95075-S97075.sorted.bam',
  'FileType': 'bam',
  'Date': '2024-07-08',
  'pipeline': 'ECD_WGS_hg19',
  'project': 'ECD',
  'sub_project': 'ECD_WGS',
  'ref_genome': 'hg19',
  'depth': 'low',
  'cancer_label': 'Control',
  'bucket': 'ecd-ecd-wgs-hg19',
  'versionID': 'c9c92473-0804-40a4-9cf6-6636b07bec88'},
 {'Labcode': 'ZK0AAAB41NB',
  'SequencingID': 'ZK0AAAB41NB',
  'FileName': '8-ZK0AAAB41NB_S95075-S97075.sorted.bam',
  'FileType': 'bam',
  'Date': '2024-07-08',
  'pipeline': 'ECD_WGS_hg38',
  'project': 'ECD',
  'sub_project': 'ECD_WGS',
  'ref_genome': 'hg38',
  'depth': 'low',
  'cancer_label': 'Control',
  'bucket': 'ecd-ecd-wgs-hg38',
  'versionID': '07a9bb72-88a2-4892-9b6c-bdb961e89a93'}]