In [2]:
import pandas as pd
import os 
import pathlib 
import glob
from datetime import datetime
from tqdm import tqdm

from dotenv import load_dotenv
from elasticsearch import Elasticsearch

import minio
from minio.error import S3Error
from minio.commonconfig import ENABLED
from minio.versioningconfig import VersioningConfig

from RDSBucket_class import *
from data_profiles import *

import warnings
warnings.filterwarnings("ignore")

#####----------------------------------------------------------------#####
##### preprocessing metadata for bam file
#####----------------------------------------------------------------#####

path_to_save_prep_metadata = "/Users/hieunguyen/src/DVC_system/examples/dummy_from_real/prep_metadata"
os.system("mkdir -p {}".format(path_to_save_prep_metadata))

path_to_main_input = "./examples/dummy_data"    
minio_credential = "credentials.mb.json"
es_credential = "es_credential.json"

# define path to data folder, seperated by data type (bam, cov, features, ...)
path_to_ecd_data = "./examples/dummy_from_real"
path_to_metadata_dir = "./ECD_metadata"

#####-------------------------------------------------------#####
##### Preprocessing data to match data profile
#####-------------------------------------------------------#####
filetype = "bam"
project = "ECD"
for sub_project in ["ECD_WGS_hg19", "ECD_WGS_hg38"]:
    if os.path.isfile(os.path.join(path_to_save_prep_metadata, "WGS_metadata_bam_files_{}.csv".format(sub_project))) == False:
        all_files = [item for item in pathlib.Path(os.path.join(path_to_ecd_data, filetype, sub_project)).glob("*.{}".format(filetype))]
        metadata = pd.read_excel(os.path.join(path_to_metadata_dir, "metadata_WGS_20240606.xlsx"))

        inputdf = pd.DataFrame(data = [str(item) for item in all_files], columns = ["path"])
        inputdf["Labcode"] = inputdf["path"].apply(lambda x: x.split("/")[-1].split("_")[0].split("-")[1])
        inputdf["SequencingID"] = inputdf["Labcode"]
        inputdf["FileName"] = inputdf["path"].apply(lambda x: x.split("/")[-1])
        inputdf["FileType"] = filetype
        inputdf["Date"] = datetime.now().strftime("%Y-%m-%d")
        inputdf["pipeline"] = sub_project
        inputdf["project"] = project
        inputdf["sub_project"] = "_".join(sub_project.split("_")[0:2])
        inputdf["ref_genome"] = sub_project.split("_")[-1]
        inputdf["depth"] = "low"
        inputdf["cancer_label"] = inputdf["Labcode"].apply(lambda x: metadata[metadata["SampleID"] == x]["Label"].values[0])
        inputdf.to_csv(os.path.join(path_to_save_prep_metadata, "WGS_metadata_bam_files_{}.csv".format(sub_project)), index = False)
    else:
        inputdf = pd.read_csv(os.path.join(path_to_save_prep_metadata, "WGS_metadata_bam_files_{}.csv".format(sub_project)))

    input_metadata = inputdf.set_index("path")
    input_metadata_dict = input_metadata.to_dict(orient = "index") # the input metadata is ready to be added to the database elasticsearch
    bamBucket = RDSBucket( 
                        minio_credential = minio_credential, 
                        bucketName = "wgsbam",
                        PROFILE_NAME = "wgsbam",
                        DATA_PROFILES = DATA_PROFILES,
                        es_credential = es_credential, 
                        versioning = True, 
                        verbose = False)
    bamBucket.initBucket()

    for path in input_metadata_dict.keys():
        file_metadata = input_metadata_dict[path]
        bamBucket.upload_file_to_bucket(path_to_file = path, 
                                        object_name= file_metadata["FileName"], 
                                        file_metadata = file_metadata, update_version = True)


Bucket 'wgsbam' created successfully.
Bucket 'wgsbam' already exists. Cannot create bucket with the same name. Please choose another name
