In [18]:
import json
from pprint import pprint
import os
import time
import pandas as pd 
from datetime import datetime
import pathlib

from dotenv import load_dotenv
from elasticsearch import Elasticsearch

import minio
from minio.error import S3Error
from minio.commonconfig import ENABLED
from minio.versioningconfig import VersioningConfig

from RDSBucket_class import *
from data_profiles import *

import warnings
# Ignore all warnings from here on.
warnings.simplefilter("ignore")

##### input args
# Credential file names handed to the MinIO (RDSBucket) and Elasticsearch (ESearch) wrappers.
minio_credential = "credentials.crc1382.json"
es_credential = "es_credential.json"

# Input locations.
path_to_main_input = "./examples/dummy_data"
# Data folder, separated by data type (bam, cov, features, ...).
path_to_ecd_data = "./examples/dummy_from_real"
# Folder holding the metadata spreadsheets.
path_to_metadata_dir = "./ECD_metadata"

#####-------------------------------------------------------#####
##### Preprocessing data to match data profile
#####-------------------------------------------------------#####
# For every sub-project: collect the files of type `filetype` found on disk and
# build one metadata record per file, keyed by the file path. After the loop,
# `inputdf`, `input_metadata` and `input_metadata_dict` hold the values of the
# LAST sub-project in the list.
filetype = "bam"
project = "ECD"

# Loop-invariant inputs: read the spreadsheet and take the date once, not once
# per sub-project.
metadata = pd.read_excel(os.path.join(path_to_metadata_dir, "metadata_WGS_20240606.xlsx"))
# SampleID -> Label lookup. keep="first" mirrors "take the first matching row".
label_by_sample = (
    metadata.drop_duplicates(subset = "SampleID", keep = "first")
            .set_index("SampleID")["Label"]
)
run_date = datetime.now().strftime("%Y-%m-%d")

#####-------------------------------------------------------#####
##### initialize the ESearch class
#####-------------------------------------------------------#####
# Created once; it does not depend on the sub-project.
es = ESearch(es_credential = es_credential)

for sub_project in ["ECD_WGS_hg19", "ECD_WGS_hg38"]:
    # Sorted so that the row order is reproducible between runs (glob order is not).
    all_files = sorted(
        pathlib.Path(path_to_ecd_data, filetype, sub_project).glob("*.{}".format(filetype))
    )

    # File names look like "<n>-<Labcode>_<...>.sorted.bam": the Labcode is the
    # part after the first "-" of the chunk that precedes the first "_".
    # `Path.name` is used instead of splitting on "/" so this works with any
    # path separator.
    inputdf = pd.DataFrame({
        "path": [str(item) for item in all_files],
        "Labcode": [item.name.split("_")[0].split("-")[1] for item in all_files],
    })
    inputdf["SequencingID"] = inputdf["Labcode"]
    inputdf["FileName"] = [item.name for item in all_files]
    inputdf["FileType"] = filetype
    inputdf["Date"] = run_date
    inputdf["pipeline"] = sub_project
    inputdf["project"] = project
    inputdf["sub_project"] = "_".join(sub_project.split("_")[0:2])   # e.g. "ECD_WGS"
    inputdf["ref_genome"] = sub_project.split("_")[-1]               # e.g. "hg19"
    inputdf["depth"] = "low"

    # Fail with an explicit message when a file has no metadata row, instead of
    # a bare IndexError from indexing an empty selection.
    unknown_labcodes = sorted(set(inputdf["Labcode"]) - set(label_by_sample.index))
    if unknown_labcodes:
        raise ValueError(
            "No metadata row (SampleID) found for Labcode(s): {}".format(", ".join(unknown_labcodes))
        )
    inputdf["cancer_label"] = inputdf["Labcode"].map(label_by_sample)

    input_metadata = inputdf.set_index("path")
    input_metadata_dict = input_metadata.to_dict(orient = "index") # the input metadata is ready to be added to the database elasticsearch

    #####-------------------------------------------------------#####
    ##### initialize the RDSBucket class and upload (currently disabled)
    #####-------------------------------------------------------#####
    # NOTE(review): the upload step below is commented out — presumably because
    # the files were already uploaded; confirm before re-enabling.
    # bamBucket = RDSBucket( 
    #                     minio_credential = minio_credential, 
    #                     bucketName = "{}-{}".format(project.lower().replace("_", "-"), 
    #                                                 sub_project.lower().replace("_", "-")),
    #                     PROFILE_NAME = "wgsbam",
    #                     DATA_PROFILES = DATA_PROFILES,
    #                     es_credential = es_credential, 
    #                     versioning = True, 
    #                     verbose = False)
    # bamBucket.initBucket()

    # for path in input_metadata_dict.keys():
    #     file_metadata = input_metadata_dict[path]
    #     bamBucket.upload_file_to_bucket(path_to_file = path, 
    #                                     object_name= file_metadata["FileName"], 
    #                                     file_metadata = file_metadata)



In [19]:
# List the buckets through the client object held by `bamBucket`.
# NOTE(review): in the cells visible here `bamBucket` is only assigned in a
# later cell (its construction in the previous cell is commented out), so this
# line raises NameError when the notebook is run top to bottom on a fresh
# kernel — TODO move it below the cell that creates `bamBucket`.
bamBucket.minio_client.list_buckets()

[Bucket('ecd-ecd-wgs-hg19'), Bucket('ecd-ecd-wgs-hg38')]

In [20]:
# Build the RDSBucket handle for a single sub-project (hg19).
filetype = "bam"
project = "ECD"
sub_project = "ECD_WGS_hg19"

bamBucket = RDSBucket(
    minio_credential = minio_credential,
    # "<project>-<sub_project>", lower-cased and with "_" replaced by "-"
    bucketName = "-".join(
        part.lower().replace("_", "-") for part in (project, sub_project)
    ),
    PROFILE_NAME = "wgsbam",
    DATA_PROFILES = DATA_PROFILES,
    es_credential = es_credential,
    versioning = True,
    verbose = False,
)

# Kept as the last expression so the records returned for the "wgsbam" profile
# are rendered as the cell output. `es` comes from an earlier cell.
es.list_all_data_from_a_profile("wgsbam")

Unnamed: 0,Labcode,SequencingID,FileName,FileType,Date,pipeline,project,sub_project,ref_genome,depth,cancer_label,bucket,versionID
0,ZMDGAAA92NB,ZMDGAAA92NB,41-ZMDGAAA92NB_S95003-S97003.sorted.bam,bam,2024-07-10,ECD_WGS_hg19,ECD,ECD_WGS,hg19,low,Esophageal,ecd-ecd-wgs-hg19,d77c878f-c9ee-4eaf-9a46-c549812a1e93
1,ZK0AAAB61NB,ZK0AAAB61NB,13-ZK0AAAB61NB_S95039-S97039.sorted.bam,bam,2024-07-10,ECD_WGS_hg19,ECD,ECD_WGS,hg19,low,Control,ecd-ecd-wgs-hg19,703d700e-b502-4b7d-8b82-3514c1d53ef8
2,ZMG136NB,ZMG136NB,15-ZMG136NB_S95094-S97094.sorted.bam,bam,2024-07-10,ECD_WGS_hg19,ECD,ECD_WGS,hg19,low,Gastric,ecd-ecd-wgs-hg19,8a32694a-8861-4729-9028-068319e3c3a7
3,ZLBE117NB,ZLBE117NB,11-ZLBE117NB_S95076-S97076.sorted.bam,bam,2024-07-10,ECD_WGS_hg19,ECD,ECD_WGS,hg19,low,Pancreatic,ecd-ecd-wgs-hg19,14d62523-d09e-4652-88b5-844fbc18a7f4
4,ZK0DAAA29NB,ZK0DAAA29NB,1-ZK0DAAA29NB_S95001-S97001.sorted.bam,bam,2024-07-10,ECD_WGS_hg19,ECD,ECD_WGS,hg19,low,Control,ecd-ecd-wgs-hg19,6edc7fca-df1e-4a25-8a39-043239f533ce
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,K0AAAA09NB,K0AAAA09NB,7-K0AAAA09NB_S95007-S97007.sorted.bam,bam,2024-07-10,ECD_WGS_hg38,ECD,ECD_WGS,hg38,low,Control,ecd-ecd-wgs-hg38,74546ad6-5f3d-438d-8f70-c4872108e7dc
996,ZMC079NB,ZMC079NB,10-ZMC079NB_S95053-S97053.sorted.bam,bam,2024-07-10,ECD_WGS_hg38,ECD,ECD_WGS,hg38,low,CRC,ecd-ecd-wgs-hg38,87ccaba5-48c8-450a-ad9f-50929220649a
997,ZK0CAAA55NB,ZK0CAAA55NB,7-ZK0CAAA55NB_S95063-S97063.sorted.bam,bam,2024-07-10,ECD_WGS_hg38,ECD,ECD_WGS,hg38,low,Control,ecd-ecd-wgs-hg38,127fe3c9-43a3-4913-a4aa-b874f9e70fac
998,ZK0DAAA12NB,ZK0DAAA12NB,43-ZK0DAAA12NB_S95069-S97069.sorted.bam,bam,2024-07-10,ECD_WGS_hg38,ECD,ECD_WGS,hg38,low,Control,ecd-ecd-wgs-hg38,02031193-4b30-4d40-8edb-93a6cb6d5b1d


In [31]:

# Query body: match documents whose "pipeline" field is "ECD_WGS_hg19".
search_query = {"query": {"match": {"pipeline": "ECD_WGS_hg19"}}}

# Ask for up to 1000 hits from the "wgsbam" index.
response = es.es.search(
    index = "wgsbam",
    body = search_query,
    size = 1000,
)

# One table row per hit, taken from the hit's "_source" entry.
search_resdf = pd.DataFrame(
    [hit["_source"] for hit in response["hits"]["hits"]]
)

In [32]:
# Show the frame built from the search hits as the cell output.
search_resdf

Unnamed: 0,Labcode,SequencingID,FileName,FileType,Date,pipeline,project,sub_project,ref_genome,depth,cancer_label,bucket,versionID
0,ZMDGAAA92NB,ZMDGAAA92NB,41-ZMDGAAA92NB_S95003-S97003.sorted.bam,bam,2024-07-10,ECD_WGS_hg19,ECD,ECD_WGS,hg19,low,Esophageal,ecd-ecd-wgs-hg19,d77c878f-c9ee-4eaf-9a46-c549812a1e93
1,ZK0AAAB61NB,ZK0AAAB61NB,13-ZK0AAAB61NB_S95039-S97039.sorted.bam,bam,2024-07-10,ECD_WGS_hg19,ECD,ECD_WGS,hg19,low,Control,ecd-ecd-wgs-hg19,703d700e-b502-4b7d-8b82-3514c1d53ef8
2,ZMG136NB,ZMG136NB,15-ZMG136NB_S95094-S97094.sorted.bam,bam,2024-07-10,ECD_WGS_hg19,ECD,ECD_WGS,hg19,low,Gastric,ecd-ecd-wgs-hg19,8a32694a-8861-4729-9028-068319e3c3a7
3,ZLBE117NB,ZLBE117NB,11-ZLBE117NB_S95076-S97076.sorted.bam,bam,2024-07-10,ECD_WGS_hg19,ECD,ECD_WGS,hg19,low,Pancreatic,ecd-ecd-wgs-hg19,14d62523-d09e-4652-88b5-844fbc18a7f4
4,ZK0DAAA29NB,ZK0DAAA29NB,1-ZK0DAAA29NB_S95001-S97001.sorted.bam,bam,2024-07-10,ECD_WGS_hg19,ECD,ECD_WGS,hg19,low,Control,ecd-ecd-wgs-hg19,6edc7fca-df1e-4a25-8a39-043239f533ce
...,...,...,...,...,...,...,...,...,...,...,...,...,...
544,ZK0CAAA76NB,ZK0CAAA76NB,7-ZK0CAAA76NB_S95039-S97039.sorted.bam,bam,2024-07-10,ECD_WGS_hg19,ECD,ECD_WGS,hg19,low,Control,ecd-ecd-wgs-hg19,dc0e0e64-d530-463b-affb-87268322a3f2
545,ZK0CAAA05NB,ZK0CAAA05NB,20-ZK0CAAA05NB_S95068-S97068.sorted.bam,bam,2024-07-10,ECD_WGS_hg19,ECD,ECD_WGS,hg19,low,Control,ecd-ecd-wgs-hg19,67580576-e96d-438f-af00-bc51f60b44df
546,ZMC039NB,ZMC039NB,9-ZMC039NB_S95045-S97045.sorted.bam,bam,2024-07-10,ECD_WGS_hg19,ECD,ECD_WGS,hg19,low,CRC,ecd-ecd-wgs-hg19,e9e7e6f0-e944-4f4d-883e-a6f21758ca3f
547,ZK0AAAD20NB,ZK0AAAD20NB,5-ZK0AAAD20NB_S95009-S97009.sorted.bam,bam,2024-07-10,ECD_WGS_hg19,ECD,ECD_WGS,hg19,low,Control,ecd-ecd-wgs-hg19,c3c24de5-b535-497f-a539-d33614eba8dc
