In [1]:
import json
from pprint import pprint
import os
import time
import pandas as pd 
from datetime import datetime
import pathlib

from dotenv import load_dotenv
from elasticsearch import Elasticsearch

import minio
from minio.error import S3Error
from minio.commonconfig import ENABLED
from minio.versioningconfig import VersioningConfig

from RDSBucket_class import *
from data_profiles import *

import warnings
warnings.filterwarnings("ignore")
##### input args
# NOTE(review): path_to_main_input is not referenced anywhere in the visible
# part of this cell — confirm it is still needed.
path_to_main_input = "./examples/dummy_data"    
# JSON credential file name handed to RDSBucket as `minio_credential`.
minio_credential = "credentials.macstudio.json"
# JSON credential file name handed to both ESearch and RDSBucket as `es_credential`.
es_credential = "es_credential.json"

# Path to the data folder, separated by data type (bam, cov, features, ...).
# Input files are globbed from <path_to_ecd_data>/<filetype>/<sub_project>/.
path_to_ecd_data = "./examples/dummy_from_real"
# Directory that holds the metadata workbook "metadata_WGS_20240606.xlsx".
path_to_metadata_dir = "./ECD_metadata"

#####-------------------------------------------------------#####
##### Preprocessing data to match data profile
#####-------------------------------------------------------#####
# For each ECD WGS sub-project: collect the BAM files on disk, build one
# metadata record per file (keyed by file path), then create the matching
# MinIO bucket and upload every file together with its metadata record.
filetype = "bam"
project = "ECD"


def _labcode_from_filename(file_name):
    """Return the lab code embedded in a data file name.

    The lab code is the second "-"-separated token of the part of the name
    that precedes the first "_" (e.g. "X-ABC123_S1.bam" -> "ABC123").

    Raises:
        ValueError: if the name prefix contains no "-" separator.
    """
    prefix = file_name.split("_")[0]
    tokens = prefix.split("-")
    if len(tokens) < 2:
        raise ValueError(
            "Cannot extract Labcode from file name {!r}: expected "
            "'<prefix>-<labcode>_...'".format(file_name)
        )
    return tokens[1]


# The metadata workbook does not depend on the sub-project, so it is read once
# here instead of once per loop iteration.
metadata = pd.read_excel(os.path.join(path_to_metadata_dir, "metadata_WGS_20240606.xlsx"))

# SampleID -> Label lookup table. Keeping the first row per SampleID matches
# the previous per-file selection, which took the first matching row.
label_by_sample = (
    metadata.drop_duplicates(subset="SampleID", keep="first")
    .set_index("SampleID")["Label"]
    .to_dict()
)

# One date stamp for the whole run, so both sub-projects get the same value
# even if the run crosses midnight.
upload_date = datetime.now().strftime("%Y-%m-%d")

for sub_project in ["ECD_WGS_hg19", "ECD_WGS_hg38"]:
    # Sorted so that the processing/upload order is deterministic
    # (glob order is otherwise arbitrary).
    data_dir = pathlib.Path(os.path.join(path_to_ecd_data, filetype, sub_project))
    all_files = sorted(data_dir.glob("*.{}".format(filetype)))

    inputdf = pd.DataFrame(data = [str(item) for item in all_files], columns = ["path"])
    # Path.name is used instead of splitting on "/" so that the file name is
    # extracted correctly regardless of the platform's path separator.
    inputdf["Labcode"] = [_labcode_from_filename(item.name) for item in all_files]
    inputdf["SequencingID"] = inputdf["Labcode"]
    inputdf["FileName"] = [item.name for item in all_files]
    inputdf["FileType"] = filetype
    inputdf["Date"] = upload_date
    inputdf["pipeline"] = sub_project
    inputdf["project"] = project
    inputdf["sub_project"] = "_".join(sub_project.split("_")[0:2])  # e.g. "ECD_WGS"
    inputdf["ref_genome"] = sub_project.split("_")[-1]              # e.g. "hg19"
    inputdf["depth"] = "low"

    # Fail early, before any bucket is touched for this sub-project, and name
    # every sample that has no metadata row (previously this surfaced as a
    # bare IndexError on the first missing sample).
    missing_labcodes = sorted(
        code for code in inputdf["Labcode"].unique() if code not in label_by_sample
    )
    if missing_labcodes:
        raise ValueError(
            "No 'SampleID' row in the metadata for Labcode(s) {} (sub-project {})".format(
                missing_labcodes, sub_project
            )
        )
    inputdf["cancer_label"] = inputdf["Labcode"].map(label_by_sample)

    input_metadata = inputdf.set_index("path")
    input_metadata_dict = input_metadata.to_dict(orient = "index") # the input metadata is ready to be added to the database elasticsearch

    #####-------------------------------------------------------#####
    ##### initialize the RDSBucket and ESearch classes
    #####-------------------------------------------------------#####
    # NOTE(review): `es` is not used in the visible part of this cell; it is
    # kept here in case the ESearch constructor has side effects or later
    # cells rely on it — confirm and hoist/remove if not.
    es = ESearch(es_credential = es_credential)

    # Bucket name is "<project>-<sub_project>", lower-cased with "_" replaced
    # by "-", e.g. "ecd-ecd-wgs-hg19".
    bamBucket = RDSBucket( 
                        minio_credential = minio_credential, 
                        bucketName = "{}-{}".format(project.lower().replace("_", "-"), 
                                                    sub_project.lower().replace("_", "-")),
                        PROFILE_NAME = "wgsbam",
                        DATA_PROFILES = DATA_PROFILES,
                        es_credential = es_credential, 
                        versioning = True, 
                        verbose = False)
    bamBucket.initBucket()

    # Each file is stored under its bare file name, with its metadata record.
    for path, file_metadata in input_metadata_dict.items():
        bamBucket.upload_file_to_bucket(path_to_file = path, 
                                        object_name= file_metadata["FileName"], 
                                        file_metadata = file_metadata)



Bucket 'ecd-ecd-wgs-hg19' created successfully.
Bucket 'ecd-ecd-wgs-hg38' created successfully.
