In [1]:
import json
from pprint import pprint
import os
import time
import pandas as pd 
from datetime import datetime
import pathlib

from dotenv import load_dotenv
from elasticsearch import Elasticsearch

import minio
from minio.error import S3Error
from minio.commonconfig import ENABLED
from minio.versioningconfig import VersioningConfig

from RDSBucket_class import *
from data_profiles import *

import warnings
warnings.filterwarnings("ignore")
##### input args
path_to_main_input = "./examples/dummy_data"
minio_credential = "credentials.macstudio.json"
es_credential = "es_credential.json"

file_type = "bam"

##### collect the input files: <path_to_main_input>/<file_type>/*.<file_type>
input_dir = os.path.join(path_to_main_input, file_type)
input_files = list(pathlib.Path(input_dir).glob("*.{}".format(file_type)))

##### build the metadata table, one row per input file
input_metadata = pd.DataFrame(data = [item.name for item in input_files], columns = ["FileName"])
# Path.stem removes only the final extension; str.replace would also strip
# any ".<file_type>" occurring in the middle of the file name.
input_metadata["Labcode"] = [item.stem for item in input_files]
input_metadata["SequencingID"] = input_metadata["Labcode"]
input_metadata["FileType"] = file_type
# A single timestamp (local time, naive) is shared by every row of this batch.
input_metadata["Date"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
input_metadata["pipeline"] = "WGBS_Bismark"
input_metadata["project"] = "ECD"
input_metadata["sub_project"] = "read-based"
input_metadata["ref_genome"] = "hg19"
# Derive each path from the configured input directory and the real file name,
# so the keys stay valid when path_to_main_input is changed.
input_metadata["path"] = [os.path.join(input_dir, item.name) for item in input_files]
input_metadata = input_metadata.set_index("path")
# {path: {column: value}} -- one metadata record per file, keyed by its path
input_metadata_dict = input_metadata.to_dict(orient = "index")

##### open the Elasticsearch handle used for the file metadata
# NOTE(review): ESearch is not imported by name in this cell; presumably it is
# provided by one of the star imports above -- confirm.
es = ESearch(es_credential=es_credential)

#####-----------------------------------------------------------------------#####
##### Example usage of the RDSBucket class
#####-----------------------------------------------------------------------#####
# ##### Create a bucket named "bam1" using the data profile "bamfile"
# bamBucket = RDSBucket( 
#                       minio_credential = minio_credential, 
#                       bucketName = "bam1", 
#                       PROFILE_NAME = "bamfile",
#                       DATA_PROFILES = DATA_PROFILES,
#                       es_credential = es_credential, 
#                       versioning = True, 
#                       verbose = False)
# bamBucket.initBucket()

# for path in input_metadata_dict.keys():
#     file_metadata = input_metadata_dict[path]
#     bamBucket.upload_file_to_bucket(path_to_file = path, 
#                                     object_name= file_metadata["FileName"], 
#                                     file_metadata = file_metadata)



Unnamed: 0,FileName,Labcode,SequencingID,FileType,Date,pipeline,project,sub_project,ref_genome,bucket,versionID
0,Iog543.bam,Iog543,Iog543,bam,2024-07-08 21:16:57,WGBS_Bismark,ECD,read-based,hg19,bam1,a2d2f536-ff72-42ca-a552-eb7a20bd4d2f
1,tsw228.bam,tsw228,tsw228,bam,2024-07-08 21:16:57,WGBS_Bismark,ECD,read-based,hg19,bam1,afb4eabb-4d79-437c-9f88-8535c9624c31
2,niz036.bam,niz036,niz036,bam,2024-07-08 21:16:57,WGBS_Bismark,ECD,read-based,hg19,bam1,f31cdf78-6cf8-4652-aef1-836b9076814b
3,oyZ749.bam,oyZ749,oyZ749,bam,2024-07-08 21:16:57,WGBS_Bismark,ECD,read-based,hg19,bam1,6c70266b-ed84-4fba-86a7-9b1ed4bdb0ab
4,iZN770.bam,iZN770,iZN770,bam,2024-07-08 21:16:57,WGBS_Bismark,ECD,read-based,hg19,bam1,8dae8f4a-641c-4b96-8366-ba1c9e2d505c
...,...,...,...,...,...,...,...,...,...,...,...
195,hSF108.bam,hSF108,hSF108,bam,2024-07-08 21:16:57,WGBS_Bismark,ECD,read-based,hg19,bam2,b4d4757e-e2c1-48c9-a0c9-02c3b3fab81f
196,Yas662.bam,Yas662,Yas662,bam,2024-07-08 21:16:57,WGBS_Bismark,ECD,read-based,hg19,bam2,38db5ea4-ba9c-4588-88ad-2be43e3652d0
197,AiN768.bam,AiN768,AiN768,bam,2024-07-08 21:16:57,WGBS_Bismark,ECD,read-based,hg19,bam2,77685be3-d7dc-4294-b996-74709372e09a
198,qeA128.bam,qeA128,qeA128,bam,2024-07-08 21:16:57,WGBS_Bismark,ECD,read-based,hg19,bam2,f833dd64-5ec5-4e0b-84e2-6c5cafe0ed50
