In [None]:
import boto3
import pyarrow.parquet as pq


In [6]:
URI = "s3://bbio-data-platform/prod/glue/target_intelligence/open_targets_evidence/part-00001-tid-6793543178480712938-a8559181-3f54-432c-9901-c16dc93f4338-412-1-c000.snappy.parquet"

In [7]:
schema=pq.read_schema(URI)

In [9]:
"run_date" in schema.names

False

In [12]:
# Initialize Glue client
glue_client = boto3.client('glue')

# Get table description
try:
    response = glue_client.get_table(
        DatabaseName='target_intelligence',  # replace with your database name
        Name='open_targets_evidence'        # table name from your URI
    )
    
    # Print table details
    table_info = response['Table']
    # print(f"Table Name: {table_info['Name']}")
    # print(f"Database: {table_info['DatabaseName']}")
    print(f"Description: {table_info.get('Description', 'No description available')}")
    # print("\nColumns:")
    # for col in table_info['StorageDescriptor']['Columns']:
    #     print(f"- {col['Name']}: {col['Type']}")
        
except glue_client.exceptions.EntityNotFoundException:
    print("Table not found")
except Exception as e:
    print(f"Error: {str(e)}")

Description: No description available


In [13]:
print(f"Partitioned: {'PartitionKeys' in table_info}")

Partitioned: True


In [15]:
table_info

{'Name': 'open_targets_evidence',
 'DatabaseName': 'target_intelligence',
 'Owner': 'root',
 'CreateTime': datetime.datetime(2024, 5, 30, 13, 7, 19, tzinfo=tzlocal()),
 'UpdateTime': datetime.datetime(2024, 5, 30, 13, 7, 19, tzinfo=tzlocal()),
 'LastAccessTime': datetime.datetime(1969, 12, 31, 16, 0, tzinfo=tzlocal()),
 'Retention': 0,
 'StorageDescriptor': {'Columns': [{'Name': 'datasource_id', 'Type': 'string'},
   {'Name': 'target_id', 'Type': 'string'},
   {'Name': 'ancestry', 'Type': 'string'},
   {'Name': 'ancestry_id', 'Type': 'string'},
   {'Name': 'beta', 'Type': 'double'},
   {'Name': 'beta_confidence_interval_lower', 'Type': 'double'},
   {'Name': 'beta_confidence_interval_upper', 'Type': 'double'},
   {'Name': 'biological_model_allelic_composition', 'Type': 'string'},
   {'Name': 'biological_model_genetic_background', 'Type': 'string'},
   {'Name': 'biological_model_id', 'Type': 'string'},
   {'Name': 'biomarker_name', 'Type': 'string'},
   {'Name': 'cell_type', 'Type': 'st

In [16]:
## Explore s3 buckets

s3_client = boto3.client('s3')
response = s3_client.list_buckets()

# Print bucket names
print("Available buckets:")
for bucket in response['Buckets']:
    print(f"- {bucket['Name']}")

Available buckets:
- bbio-admera-data-transfer
- bbio-adpkd-ncbi-data-delivery
- bbio-compgen-infra
- bbio-data-platform
- bbio-databricks-npv
- bbio-databricks-uc-metastore
- bbio-databricks-workspace-lambdazipsbucket-1hwnnj0y8rlon
- bbio-dls-data-transfer
- bbio-functional-genomics
- bbio-igv-app
- bbio-igv-app2
- bbio-nashbio-data
- bbio-nextflow
- bbio-nextflow-bbgt-aav-ngs-long-read-seq
- bbio-nextflow-bbgt-capsid-evolution
- bbio-pacbio-data-transfer
- bbio-pacbio-smrtlink-data
- bbio-pkd2-taiwan
- bbio-workflow-results
- bbiodbricks
- core-metaflow-s3-v1
- databricks-ukbiobank
- metaflows35ay3bs8e


In [17]:
paginator = s3_client.get_paginator('list_objects_v2')

In [18]:
paginator

<botocore.client.S3.Paginator.ListObjectsV2 at 0x11dd9eba0>

In [20]:
def list_dirs(bucket_name, prefix=''):
    """List all 'directories' in the bucket under specified prefix"""
    s3_client = boto3.client('s3')
    paginator = s3_client.get_paginator('list_objects_v2')
    
    # Use delimiter to emulate directory structure
    pages = paginator.paginate(
        Bucket=bucket_name,
        Prefix=prefix,
        Delimiter='/'
    )
    
    print(f"\nDirectories in {bucket_name}/{prefix}:")
    print("-" * 50)
    
    for page in pages:
        # Common prefixes are the "directories"
        for prefix in page.get('CommonPrefixes', []):
            print(f"└── {prefix['Prefix']}")

# Example usage - list top-level "directories"
list_dirs('bbio-data-platform')

# To go deeper into a specific path:
list_dirs('bbio-data-platform', 'dev/')


Directories in bbio-data-platform/:
--------------------------------------------------
└── dev/
└── prod/
└── raw/
└── tmp/

Directories in bbio-data-platform/dev/:
--------------------------------------------------
└── dev/airtable_archive/
└── dev/alpha_missense/
└── dev/athena/
└── dev/bbio-gt/
└── dev/citeline_trials_data/
└── dev/data_driven_target_disease_prioritization/
└── dev/dls_processed/
└── dev/edgar/
└── dev/evidgen/
└── dev/gene_id_mapping/
└── dev/genoox/
└── dev/geuvadis/
└── dev/global_data/
└── dev/gtex/
└── dev/human_genetics_dashboard/
└── dev/incidence_estimator/
└── dev/k8s_mnt/
└── dev/knowledge_graphs/
└── dev/manual_mappings/
└── dev/mesh_terms/
└── dev/nf-core--rnaseq/
└── dev/onc_dashboard_data/
└── dev/onc_main_data_repo/
└── dev/open-targets-analysis/
└── dev/orphanet/
└── dev/pacbio-smrtlink-data-processed/
└── dev/pops_features/
└── dev/reactome_database/
└── dev/rna_seq/
└── dev/scrnaseq/
└── dev/summary-statistics/
└── dev/target_id/
└── dev/users/
└─

In [21]:
# create personal dev dir

def create_dev_directory(bucket_name, dev_path):
    """
    Create a development directory in S3
    Note: S3 doesn't actually create directories, but creates a zero-byte object with 
    a trailing slash to simulate a directory
    """
    s3_client = boto3.client('s3')
    
    # Ensure path ends with trailing slash
    if not dev_path.endswith('/'):
        dev_path += '/'
        
    try:
        # Create empty object with trailing slash to simulate directory
        s3_client.put_object(
            Bucket=bucket_name,
            Key=dev_path,
            Body=''
        )
        print(f"Created directory: s3://{bucket_name}/{dev_path}")
        
        # Verify it exists by listing it
        list_dirs(bucket_name, 'dev/')
        
    except Exception as e:
        print(f"Error creating directory: {str(e)}")

# Create your dev directory
create_dev_directory('bbio-data-platform', 'dev/jcheng')

Created directory: s3://bbio-data-platform/dev/jcheng/

Directories in bbio-data-platform/dev/:
--------------------------------------------------
└── dev/airtable_archive/
└── dev/alpha_missense/
└── dev/athena/
└── dev/bbio-gt/
└── dev/citeline_trials_data/
└── dev/data_driven_target_disease_prioritization/
└── dev/dls_processed/
└── dev/edgar/
└── dev/evidgen/
└── dev/gene_id_mapping/
└── dev/genoox/
└── dev/geuvadis/
└── dev/global_data/
└── dev/gtex/
└── dev/human_genetics_dashboard/
└── dev/incidence_estimator/
└── dev/jcheng/
└── dev/k8s_mnt/
└── dev/knowledge_graphs/
└── dev/manual_mappings/
└── dev/mesh_terms/
└── dev/nf-core--rnaseq/
└── dev/onc_dashboard_data/
└── dev/onc_main_data_repo/
└── dev/open-targets-analysis/
└── dev/orphanet/
└── dev/pacbio-smrtlink-data-processed/
└── dev/pops_features/
└── dev/reactome_database/
└── dev/rna_seq/
└── dev/scrnaseq/
└── dev/summary-statistics/
└── dev/target_id/
└── dev/users/
└── dev/validation/


In [22]:
# Create organized subdirectories for different purposes
sandbox_dirs = [
    'dev/jcheng/tables/',           # For table development
    'dev/jcheng/temp/',            # For temporary data
    'dev/jcheng/experiments/',     # For experimental features
    'dev/jcheng/test_data/'       # For test datasets
]

for dir_path in sandbox_dirs:
    create_dev_directory('bbio-data-platform', dir_path)

Created directory: s3://bbio-data-platform/dev/jcheng/tables/

Directories in bbio-data-platform/dev/:
--------------------------------------------------
└── dev/airtable_archive/
└── dev/alpha_missense/
└── dev/athena/
└── dev/bbio-gt/
└── dev/citeline_trials_data/
└── dev/data_driven_target_disease_prioritization/
└── dev/dls_processed/
└── dev/edgar/
└── dev/evidgen/
└── dev/gene_id_mapping/
└── dev/genoox/
└── dev/geuvadis/
└── dev/global_data/
└── dev/gtex/
└── dev/human_genetics_dashboard/
└── dev/incidence_estimator/
└── dev/jcheng/
└── dev/k8s_mnt/
└── dev/knowledge_graphs/
└── dev/manual_mappings/
└── dev/mesh_terms/
└── dev/nf-core--rnaseq/
└── dev/onc_dashboard_data/
└── dev/onc_main_data_repo/
└── dev/open-targets-analysis/
└── dev/orphanet/
└── dev/pacbio-smrtlink-data-processed/
└── dev/pops_features/
└── dev/reactome_database/
└── dev/rna_seq/
└── dev/scrnaseq/
└── dev/summary-statistics/
└── dev/target_id/
└── dev/users/
└── dev/validation/
Created directory: s3://bbio-

In [23]:
# Example: Writing a test table to your sandbox
import pandas as pd

# Create sample data
test_df = pd.DataFrame({
    'id': range(1, 5),
    'value': ['a', 'b', 'c', 'd']
})

# Save to your sandbox (parquet format recommended for AWS)
output_path = 's3://bbio-data-platform/dev/jcheng/tables/test_table.parquet'
test_df.to_parquet(output_path)