In [8]:
# Location of the Enformer pipeline `scripts/` folder.
# Set the ENFORMER_PIPELINE_SCRIPTS environment variable to run this notebook
# against a different checkout; otherwise the original local path is used.
import os

SCRIPTS = os.environ.get(
    "ENFORMER_PIPELINE_SCRIPTS",
    "/Users/sabrinami/Github/shared_folder/enformer_pipeline/scripts/",
)
# Helper modules live in the `modules/` sub-folder of SCRIPTS. The trailing ""
# keeps the trailing path separator that the original string concatenation had.
MODULES = os.path.join(SCRIPTS, "modules", "")

Collecting cyvcf2
  Downloading cyvcf2-0.30.22-cp39-cp39-macosx_10_9_x86_64.whl (3.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting coloredlogs (from cyvcf2)
  Using cached coloredlogs-15.0.1-py2.py3-none-any.whl (46 kB)
Collecting humanfriendly>=9.1 (from coloredlogs->cyvcf2)
  Downloading humanfriendly-10.0-py2.py3-none-any.whl (86 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.8/86.8 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: humanfriendly, coloredlogs, cyvcf2
Successfully installed coloredlogs-15.0.1 cyvcf2-0.30.22 humanfriendly-10.0
Note: you may need to restart the kernel to use updated packages.


---
title: "Running the Enformer Pipeline locally"
description: "We picked two regions and ran Enformer on the reference genome and on the personalized genomes of two individuals (HG00096 and HG00097)"
author: "Sabrina Mi"
date: 7/25/23
---

In [2]:
import os, sys, json, re
import pandas as pd # for manipulating dataframes
import time
##import parsl ## local runs, no parsl
from datetime import date


# Locations and folders used by the pipeline code below.
# `script_path` is the pipeline's scripts folder; `enformer_predict` joins it with
# 'modules' to locate predictUtils_one.py and predictUtils_two.py.
# whereis_script = os.path.dirname(__file__) #os.path.dirname(sys.argv[0]) # or os.path.dirname(__file__)
script_path = SCRIPTS
# batch_utils_path = os.path.join(script_path, 'modules')
# sys.path.append(batch_utils_path)

## MODULES should be the location of the modules
# It must be on sys.path before the `import loggerUtils` / `import directives` lines that follow.
sys.path.append(MODULES)

import loggerUtils
import directives


In [3]:

def enformer_predict(parameters):
    """Run the Enformer prediction pipeline described by a JSON config file.

    Parameters
    ----------
    parameters : str
        Path to the JSON configuration file. A relative path is resolved
        against the current working directory.

    Configuration keys read
    -----------------------
    prediction_data_name, prediction_id, date, sub_dir, project_dir,
    interval_list_file, predictions_log_dir, write_log.logdir, n_regions,
    batch_regions, use_parsl, parsl_parameters, sequence_source,
    exclude_regions, reverse_complement, metadata_dir, output_dir and, when
    `sequence_source` is 'personalized', individuals, vcf_files
    (`folder` + `files`), batch_individuals and n_individuals.

    Side effects
    ------------
    * Creates the metadata, output, job-log and prediction-log folders.
    * Executes `modules/predictUtils_one.py` into the module globals.
    * Calls the prediction function once per (sample batch, chromosome,
      region batch).
    * Writes a `.summary` log, two CSVs listing successful / unsuccessful
      predictions, and an `aggregation_config_*.json` file.

    Returns
    -------
    None

    Raises
    ------
    Exception
        If `sub_dir` is not a boolean or `batch_individuals` is not an int.
    ValueError
        If `sequence_source` is not supported, or `n_individuals` is neither
        -1 nor a positive integer when individuals are read from a file.
    TypeError
        If `individuals` is neither a list nor a string.
    """
    params_path = parameters
    if not os.path.isabs(params_path):
        params_path = os.path.abspath(params_path)

    # path to the second-stage script; handed to the prediction function as `p_two`
    p_two = os.path.join(script_path, 'modules', 'predictUtils_two.py')

    # only the JSON read needs the file handle; everything else happens afterwards
    with open(params_path) as f:
        parameters = json.load(f)

    prediction_data_name = parameters['prediction_data_name']
    prediction_id = parameters['prediction_id']
    run_date = parameters['date'] if parameters['date'] is not None else date.today().strftime("%Y-%m-%d")

    run_folder = f'{prediction_data_name}_{prediction_id}'
    if parameters['sub_dir'] == True:
        project_dir = os.path.join(parameters['project_dir'], 'predictions_folder', run_folder, f'predictions_{run_date}')
    elif parameters['sub_dir'] == False:
        project_dir = os.path.join(parameters['project_dir'], run_folder, f'predictions_{run_date}')
    else:
        raise Exception('ERROR - `sub_dir` argument must be a boolean, either true or false')

    interval_list_file = parameters['interval_list_file']
    predictions_log_dir = os.path.join(project_dir, parameters['predictions_log_dir'])
    job_log_dir = os.path.join(project_dir, parameters['write_log']['logdir'])
    batch_regions = int(parameters['batch_regions'])
    use_parsl = parameters['use_parsl']
    parsl_parameters = parameters['parsl_parameters']
    sequence_source = parameters['sequence_source']
    exclude_regions = parameters["exclude_regions"]
    reverse_complement = parameters["reverse_complement"]
    metadata_dir = parameters['metadata_dir']
    output_dir = os.path.join(project_dir, parameters['output_dir'])

    # -1 means "use every region"; any other value is used as the slice end below.
    # Converting first means a numeric string such as "5" is honoured instead of
    # silently falling back to all regions.
    n_regions = int(parameters["n_regions"])
    if n_regions == -1:
        n_regions = None

    # fail early, with a clear message, on a source the rest of the function cannot handle
    if sequence_source not in ('personalized', 'reference', 'random'):
        raise ValueError(f'ERROR - `sequence_source` must be one of personalized, reference or random. You supplied {sequence_source}')

    os.makedirs(metadata_dir, exist_ok=True)
    os.makedirs(output_dir, exist_ok=True)

    # personalized parameters (left as None for every other sequence source)
    individuals = None
    vcf_files_dict = None
    batch_individuals = None
    n_individuals = None
    if sequence_source == 'personalized':
        print(f'INFO - Sequence source is {sequence_source}. Using a reference genome + vcf files.')
        individuals = parameters['individuals']
        vcf_files_dict = parameters['vcf_files']
        batch_individuals = parameters["batch_individuals"]
        n_individuals = int(parameters['n_individuals'])
    elif sequence_source == 'reference':
        print(f'INFO - Sequence source is {sequence_source}. Using a reference genome.')

    if reverse_complement:
        print(f'INFO - Predicting on reverse complements too')

    # modify parsl parameters to add the working directory
    parsl_parameters['working_dir'] = project_dir

    os.makedirs(job_log_dir, exist_ok=True)

    # set parsl directives
    if use_parsl:
        directives.parsl_directives(use_parsl, parsl_parameters)

    # importing this module does not work; best to execute it here
    # NOTE(review): this runs the file's source with `exec` in the module globals;
    # only point `script_path` at a trusted checkout.
    predict_utils_one = os.path.join(script_path, 'modules', 'predictUtils_one.py')
    with open(predict_utils_one) as source_file:
        exec(source_file.read(), globals(), globals())

    # decorate the prediction function with or without parsl
    prediction_fxn = return_prediction_function(use_parsl)

    # determine what individuals to predict on and all that
    if sequence_source == 'personalized':
        if isinstance(individuals, list):
            id_list = individuals
        elif isinstance(individuals, str):
            if os.path.isfile(individuals):
                # a file of IDs: first column, optionally truncated to `n_individuals`
                all_ids = pd.read_table(individuals, header=None)[0].tolist()
                if n_individuals == -1:
                    id_list = all_ids
                elif n_individuals > 0:
                    id_list = all_ids[0:n_individuals]
                else:
                    raise ValueError(f'ERROR - `n_individuals` must be -1 (all individuals) or a positive integer. You supplied {n_individuals}')
            else:
                # not a file on disk: treat the string as a single individual ID
                id_list = [individuals]
        else:
            raise TypeError(f'ERROR - `individuals` must be a list of IDs, a file of IDs or a single ID. You supplied a {type(individuals).__name__}')
        print(f'INFO - Found {len(id_list)} individuals to predict on')
    elif sequence_source == 'reference':
        id_list = [prediction_data_name]
        print(f'INFO - Found one reference set named {id_list[0]} to predict on')
    else:  # 'random'
        id_list = [prediction_data_name]
        print(f'INFO - Prediction will be on a randomly generated set')

    # set log files to be put in a folder and touch the log files per sample
    prediction_logfiles_folder = predictions_log_dir
    os.makedirs(prediction_logfiles_folder, exist_ok=True)

    # list of intervals to be predicted on
    regions_table = pd.read_table(interval_list_file, sep=' ', header=None).dropna(axis=0)
    list_of_regions = regions_table[0].tolist()[0:n_regions]  # a list of queries
    print(f'INFO - Found {len(list_of_regions)} regions to be split into batches with at most {batch_regions} regions in each batch.')

    # chromosomes present in the regions, in order of first appearance (deterministic)
    chromosomes = list(dict.fromkeys(r.split('_')[0] for r in list_of_regions))

    # should some regions be excluded?
    exclude_file = None
    if exclude_regions == True:
        # search for the invalid_queries.csv file
        exclude_file = os.path.join(job_log_dir, 'invalid_queries.csv')
        if os.path.isfile(exclude_file):
            excluded_regions = set(pd.read_csv(exclude_file)['region'].tolist())
            print(f'INFO - Found regions to be excluded from the input regions.')
            list_of_regions = [r for r in list_of_regions if r not in excluded_regions]
            print(f'INFO - Updated number of regions to predict on is {len(list_of_regions)}')
        else:
            print(f'INFO - No regions to exclude yet. You either did not supply a file, this is the first run, or there are truly no regions to exclude')

    # batch the samples too
    # if you have 1000 individuals, it may be too much
    max_unbatched_samples = 5
    if len(id_list) > max_unbatched_samples:
        if batch_individuals is None:
            print(f'INFO - You have multiple individuals/samples and have not supplied how to batch them. For efficient use of resources, use the `batch_individuals` argument.')
            # without a batch size, fall back to a single batch instead of failing later
            sample_batches = [id_list]
        elif isinstance(batch_individuals, int):
            sample_batches = list(generate_batch_n_elems(id_list, n = batch_individuals))
            print(f'INFO - There are more than {max_unbatched_samples} individuals. Predictions will be done for every {batch_individuals} individuals.')
        else:
            raise Exception(f'ERROR - argument `batch_individuals` is not an int type. You supplied a {type(batch_individuals).__name__}')
    else:
        sample_batches = [id_list]  # put the list in a list
        print(f'INFO - Found {len(id_list)} sample(s) i.e. {sample_batches}. No need to batch.')

    # group the regions per chromosome once; the grouping does not depend on the sample batch
    regions_by_chromosome = {
        chromosome: [r for r in list_of_regions if r.startswith(f"{chromosome}_")]
        for chromosome in chromosomes
    }

    # to make this fast, pass multiple regions to one parsl app
    sample_app_futures = []
    for sample_list in sample_batches:
        for chromosome in chromosomes:
            chr_list_of_regions = regions_by_chromosome[chromosome]
            if not chr_list_of_regions:
                print(f'WARNING - {chromosome} sites are not available.')
                continue

            # only personalized runs need a VCF; reference and random runs pass None
            chr_vcf_file = None
            if sequence_source == 'personalized':
                # use only the chromosomes that have been made available in the config file vcf params
                if chromosome not in vcf_files_dict['files']:
                    print(f'WARNING - No VCF file is configured for {chromosome}. Skipping its regions.')
                    continue
                chr_vcf_file = os.path.join(vcf_files_dict['folder'], vcf_files_dict['files'][chromosome])

            # I want many regions to be put in a parsl app
            if len(chr_list_of_regions) > batch_regions:
                region_batches = generate_batch_n_elems(chr_list_of_regions, n=batch_regions)
            else:
                region_batches = [chr_list_of_regions]

            for batch_num, region_list in enumerate(region_batches):
                sample_app_futures.append(prediction_fxn(batch_regions=list(region_list), samples=list(sample_list), path_to_vcf = chr_vcf_file, batch_num = batch_num, script_path=script_path, output_dir=output_dir, prediction_logfiles_folder=prediction_logfiles_folder, sequence_source=sequence_source, tmp_config_path=params_path, p_two=p_two))

    if use_parsl == True:
        print(f'INFO - Executing parsl futures for {len(sample_app_futures)} parsl apps')
        # block until every app has finished
        for future in sample_app_futures:
            future.result()
        print(f'INFO - Finished predictions for all')
    elif use_parsl == False:
        print(f'INFO - Finished predictions for: {sample_app_futures} ...')

    # just so I don't have to deal with having too many resources, I can request a small amount of resource
    check_fxn = return_check_function(use_parsl)
    SUMMARY_FILE = os.path.join(job_log_dir, f'{prediction_data_name}_{prediction_id}_{run_date}.summary')
    summary_exec = []
    for sample in id_list:
        # only samples that produced a log file are checked
        if os.path.isfile(os.path.join(prediction_logfiles_folder, f"{sample}_log.csv")):
            summary_exec.append(check_fxn(sample=sample, predictions_folder=output_dir, log_folder=prediction_logfiles_folder, interval_list_file=interval_list_file, exclude_csv=exclude_file, sequence_source=sequence_source))

    if use_parsl:
        summary_exec = [q.result() for q in summary_exec]
        # NOTE(review): `parsl` is not imported in this notebook's import cell;
        # presumably the exec'd predictUtils_one.py brings it into globals - confirm.
        parsl.clear()  # end parsl

    for qr in summary_exec:
        loggerUtils.write_logger(log_msg_type=qr['logtype'], logfile=SUMMARY_FILE, message=qr['logmessage'])

    # regex the summary file and save the failed ones e.t.c to csv
    # --- there is a better way to do this but for now, this will do
    warning_pattern = r"^\[WARNING.*For\s(\w+|\d+).*"
    success_pattern = r"^\[INFO.*For\s(\w+|\d+).*"
    if os.path.isfile(SUMMARY_FILE):
        with open(SUMMARY_FILE, 'r') as f:
            lines = set(f.readlines())
    else:
        # nothing was summarised (e.g. no sample log files were found)
        print(f'WARNING - {SUMMARY_FILE} was not created; no predictions were summarised.')
        lines = set()

    # one regex pass per pattern; sorted so the CSV row order is reproducible
    warning_matches = (re.search(warning_pattern, line) for line in lines)
    warning_result = sorted({m.group(1) for m in warning_matches if m is not None})
    success_matches = (re.search(success_pattern, line) for line in lines)
    success_result = sorted({m.group(1) for m in success_matches if m is not None})

    pd.DataFrame(warning_result).to_csv(os.path.join(metadata_dir, f'{prediction_data_name}_{prediction_id}_{run_date}.unsuccessful_predictions.csv'), index=False, header=False)
    pd.DataFrame(success_result).to_csv(os.path.join(metadata_dir, f'{prediction_data_name}_{prediction_id}_{run_date}.successful_predictions.csv'), index=False, header=False)

    print(f'INFO - Check {SUMMARY_FILE} for a summary of the entire run.')
    print(f'INFO - Check `{metadata_dir}` for successful and unsucessful predictions.')

    # == After predictions are complete, a json file will be written out to help with aggregation
    print(f'INFO - Writing `aggregation_config_{prediction_data_name}_{prediction_id}.json` file to {metadata_dir}')
    agg_dt = {
        'predictions_folder': project_dir,
        'enformer_prediction_path': f'{output_dir}',
        'prediction_logfiles_folder': prediction_logfiles_folder,
        'prediction_data_name': prediction_data_name,
        'sequence_source': sequence_source,
        'run_date': run_date,
        'prediction_id': prediction_id,
        'individuals': None if sequence_source in ['reference', 'random'] else individuals,
        'n_individuals': n_individuals if sequence_source == 'personalized' else None,
    }

    with open(f'{metadata_dir}/aggregation_config_{prediction_data_name}_{prediction_id}.json', mode='w') as wj:
        json.dump(agg_dt, wj)

In [9]:
# Run the pipeline with the `run_locally.json` configuration; the relative path is
# resolved against the current working directory inside `enformer_predict`.
enformer_predict('run_locally.json')

INFO - Sequence source is reference. Using a reference genome.
INFO - Found one reference set named reference_enformer_minimal to predict on
INFO - Found 2 regions to be split into batches with at most 5 regions in each batch.
INFO - No regions to exclude yet. You either did not supply a file, this is the first run, or there are truly no regions to exclude
INFO - There seem to be just one sample i.e. [['reference_enformer_minimal']]. No need to batch.
Using this config file: /Users/sabrinami/Github/shared_folder/enformer_pipeline/run_locally.json


[INFO: 07/24/2023 10:43:13 AM] [CACHE] (fasta) [CacheInfo(hits=1, misses=1, maxsize=5, currsize=1) for chr1_65419_71585]
INFO:cache_log:[CACHE] (fasta) [CacheInfo(hits=1, misses=1, maxsize=5, currsize=1) for chr1_65419_71585]
[INFO: 07/24/2023 10:43:58 AM] [CACHE] (model) at batch 0: [CacheInfo(hits=1, misses=1, maxsize=5, currsize=1)]
[INFO: 07/24/2023 10:43:58 AM] [CACHE] (model) at batch 0: [CacheInfo(hits=1, misses=1, maxsize=5, currsize=1)]
INFO:cache_log:[CACHE] (model) at batch 0: [CacheInfo(hits=1, misses=1, maxsize=5, currsize=1)]
[INFO: 07/24/2023 10:43:58 AM] [CACHE] (fasta) [CacheInfo(hits=1, misses=1, maxsize=5, currsize=1) for chr1_450740_451678]
[INFO: 07/24/2023 10:43:58 AM] [CACHE] (fasta) [CacheInfo(hits=1, misses=1, maxsize=5, currsize=1) for chr1_450740_451678]
[INFO: 07/24/2023 10:43:58 AM] [CACHE] (fasta) [CacheInfo(hits=1, misses=1, maxsize=5, currsize=1) for chr1_450740_451678]
INFO:cache_log:[CACHE] (fasta) [CacheInfo(hits=1, misses=1, maxsize=5, currsize=1) fo

Sample reference_enformer_minimal chr1_65419_71585 haplotype0 predictions are of the correct shape:  (896, 5313)
Sample reference_enformer_minimal chr1_65419_71585 haplotypes predictions have been saved.
Sample reference_enformer_minimal chr1_65419_71585 haplotypes predictions have been logged.


[INFO: 07/24/2023 10:44:42 AM] [CACHE] (model) at batch 0: [CacheInfo(hits=1, misses=1, maxsize=5, currsize=1)]
[INFO: 07/24/2023 10:44:42 AM] [CACHE] (model) at batch 0: [CacheInfo(hits=1, misses=1, maxsize=5, currsize=1)]
[INFO: 07/24/2023 10:44:42 AM] [CACHE] (model) at batch 0: [CacheInfo(hits=1, misses=1, maxsize=5, currsize=1)]
[INFO: 07/24/2023 10:44:42 AM] [CACHE] (model) at batch 0: [CacheInfo(hits=1, misses=1, maxsize=5, currsize=1)]
INFO:cache_log:[CACHE] (model) at batch 0: [CacheInfo(hits=1, misses=1, maxsize=5, currsize=1)]


Sample reference_enformer_minimal chr1_450740_451678 haplotype0 predictions are of the correct shape:  (896, 5313)
Sample reference_enformer_minimal chr1_450740_451678 haplotypes predictions have been saved.
Sample reference_enformer_minimal chr1_450740_451678 haplotypes predictions have been logged.
[INFO] (time) to predict on batch 0 is 90.45835176800006
INFO - Finished predictions for: [[0, 0]] ...




INFO - Check /Users/sabrinami/Desktop/2022-23/tutorials/enformer_pipeline_test/predictions_folder/reference_enformer_minimal_some_regions/predictions_2023-07-24/job_logs/reference_enformer_minimal_some_regions_2023-07-24.summary for a summary of the entire run.
INFO - Check `/Users/sabrinami/Github/shared_folder/enformer_pipeline/metadata` for successful and unsucessful predictions.
INFO - Writing `aggregation_config_reference_enformer_minimal_some_regions.json` file to /Users/sabrinami/Github/shared_folder/enformer_pipeline/metadata


In [11]:
import h5py

# One of the prediction files written by the reference run above
file_path="/Users/sabrinami/Desktop/2022-23/tutorials/enformer_pipeline_test/predictions_folder/reference_enformer_minimal_some_regions/predictions_2023-07-24/enformer_predictions/reference_enformer_minimal/haplotype0/chr1_65419_71585_predictions.h5"

# Show the top-level names stored in the file
with h5py.File(file_path, "r") as h5_file:
    print("Groups and Datasets in the HDF5 file:")
    for key in h5_file.keys():
        print(key)

Groups and Datasets in the HDF5 file:
chr1_65419_71585


In [12]:
# The dataset is named after the region it holds predictions for
dataset_name = "chr1_65419_71585"

# `[()]` reads the whole dataset into an in-memory NumPy array,
# so `data` remains usable after the file is closed
with h5py.File(file_path, "r") as h5_file:
    data = h5_file[dataset_name][()]

In [13]:
# Shape first, then the (abbreviated) array contents, each on its own line
print(data.shape, data, sep="\n")

(896, 5313)
[[0.5291589  0.5204935  0.57351285 ... 0.00935043 0.01473221 0.01198051]
 [0.65397567 0.6207656  0.7431343  ... 0.01434602 0.02783546 0.02128439]
 [0.5326947  0.5510576  0.767584   ... 0.01038613 0.03195343 0.0286333 ]
 ...
 [0.00713983 0.0077101  0.00634603 ... 0.00115762 0.01039891 0.00428226]
 [0.00585703 0.00676019 0.00490488 ... 0.0047263  0.0377146  0.01976843]
 [0.00355603 0.00391064 0.00305343 ... 0.00085077 0.00762693 0.00313292]]


In [14]:
# Run the pipeline again with the `run_local_personalized.json` configuration; the
# relative path is resolved against the current working directory inside `enformer_predict`.
enformer_predict('run_local_personalized.json')

INFO - Sequence source is personalized. Using a reference genome + vcf files.
INFO - Found 2 individuals to predict on
INFO - Found 2 regions to be split into batches with at most 5 regions in each batch.
INFO - No regions to exclude yet. You either did not supply a file, this is the first run, or there are truly no regions to exclude
INFO - There seem to be just one sample i.e. [['HG00096', 'HG00097']]. No need to batch.
Using this config file: /Users/sabrinami/Github/shared_folder/enformer_pipeline/run_local_personalized.json


[INFO: 07/24/2023 03:25:05 PM] [CACHE] (fasta) [CacheInfo(hits=1, misses=2, maxsize=5, currsize=2) for chr1_65419_71585]
[INFO: 07/24/2023 03:25:05 PM] [CACHE] (fasta) [CacheInfo(hits=1, misses=2, maxsize=5, currsize=2) for chr1_65419_71585]
[INFO: 07/24/2023 03:25:05 PM] [CACHE] (fasta) [CacheInfo(hits=1, misses=2, maxsize=5, currsize=2) for chr1_65419_71585]
[INFO: 07/24/2023 03:25:05 PM] [CACHE] (fasta) [CacheInfo(hits=1, misses=2, maxsize=5, currsize=2) for chr1_65419_71585]
[INFO: 07/24/2023 03:25:05 PM] [CACHE] (fasta) [CacheInfo(hits=1, misses=2, maxsize=5, currsize=2) for chr1_65419_71585]
INFO:cache_log:[CACHE] (fasta) [CacheInfo(hits=1, misses=2, maxsize=5, currsize=2) for chr1_65419_71585]
  variants_dictionary['positions'] = tuple(variant.POS for variant in cyvcf2_object(query))
[E::hts_parse_region] Coordinates must be > 0


Sample HG00096 chr1_65419_71585 haplotype1 predictions are of the correct shape:  (896, 5313)
Sample HG00096 chr1_65419_71585 haplotype2 predictions are of the correct shape:  (896, 5313)
Sample HG00096 chr1_65419_71585 haplotypes predictions have been saved.
Sample HG00096 chr1_65419_71585 haplotypes predictions have been logged.


[INFO: 07/24/2023 03:34:49 PM] [CACHE] (model) at batch 0: [CacheInfo(hits=1, misses=2, maxsize=5, currsize=2)]
[INFO: 07/24/2023 03:34:49 PM] [CACHE] (model) at batch 0: [CacheInfo(hits=1, misses=2, maxsize=5, currsize=2)]
[INFO: 07/24/2023 03:34:49 PM] [CACHE] (model) at batch 0: [CacheInfo(hits=1, misses=2, maxsize=5, currsize=2)]
[INFO: 07/24/2023 03:34:49 PM] [CACHE] (model) at batch 0: [CacheInfo(hits=1, misses=2, maxsize=5, currsize=2)]
[INFO: 07/24/2023 03:34:49 PM] [CACHE] (model) at batch 0: [CacheInfo(hits=1, misses=2, maxsize=5, currsize=2)]
[INFO: 07/24/2023 03:34:49 PM] [CACHE] (model) at batch 0: [CacheInfo(hits=1, misses=2, maxsize=5, currsize=2)]
INFO:cache_log:[CACHE] (model) at batch 0: [CacheInfo(hits=1, misses=2, maxsize=5, currsize=2)]
[INFO: 07/24/2023 03:34:50 PM] [CACHE] (fasta) [CacheInfo(hits=1, misses=2, maxsize=5, currsize=2) for chr1_450740_451678]
[INFO: 07/24/2023 03:34:50 PM] [CACHE] (fasta) [CacheInfo(hits=1, misses=2, maxsize=5, currsize=2) for chr1_4

Sample HG00097 chr1_65419_71585 haplotype1 predictions are of the correct shape:  (896, 5313)
Sample HG00097 chr1_65419_71585 haplotype2 predictions are of the correct shape:  (896, 5313)
Sample HG00097 chr1_65419_71585 haplotypes predictions have been saved.
Sample HG00097 chr1_65419_71585 haplotypes predictions have been logged.
Sample HG00096 chr1_450740_451678 haplotype1 predictions are of the correct shape:  (896, 5313)
Sample HG00096 chr1_450740_451678 haplotype2 predictions are of the correct shape:  (896, 5313)
Sample HG00096 chr1_450740_451678 haplotypes predictions have been saved.
Sample HG00096 chr1_450740_451678 haplotypes predictions have been logged.


[INFO: 07/24/2023 03:38:37 PM] [CACHE] (model) at batch 0: [CacheInfo(hits=1, misses=2, maxsize=5, currsize=2)]
[INFO: 07/24/2023 03:38:37 PM] [CACHE] (model) at batch 0: [CacheInfo(hits=1, misses=2, maxsize=5, currsize=2)]
[INFO: 07/24/2023 03:38:37 PM] [CACHE] (model) at batch 0: [CacheInfo(hits=1, misses=2, maxsize=5, currsize=2)]
[INFO: 07/24/2023 03:38:37 PM] [CACHE] (model) at batch 0: [CacheInfo(hits=1, misses=2, maxsize=5, currsize=2)]
[INFO: 07/24/2023 03:38:37 PM] [CACHE] (model) at batch 0: [CacheInfo(hits=1, misses=2, maxsize=5, currsize=2)]
[INFO: 07/24/2023 03:38:37 PM] [CACHE] (model) at batch 0: [CacheInfo(hits=1, misses=2, maxsize=5, currsize=2)]
[INFO: 07/24/2023 03:38:37 PM] [CACHE] (model) at batch 0: [CacheInfo(hits=1, misses=2, maxsize=5, currsize=2)]
[INFO: 07/24/2023 03:38:37 PM] [CACHE] (model) at batch 0: [CacheInfo(hits=1, misses=2, maxsize=5, currsize=2)]
INFO:cache_log:[CACHE] (model) at batch 0: [CacheInfo(hits=1, misses=2, maxsize=5, currsize=2)]


Sample HG00097 chr1_450740_451678 haplotype1 predictions are of the correct shape:  (896, 5313)
Sample HG00097 chr1_450740_451678 haplotype2 predictions are of the correct shape:  (896, 5313)
Sample HG00097 chr1_450740_451678 haplotypes predictions have been saved.
Sample HG00097 chr1_450740_451678 haplotypes predictions have been logged.
[INFO] (time) to predict on batch 0 is 1503.2652588659985
INFO - Finished predictions for: [[0, 0, 0, 0]] ...




INFO - Check /Users/sabrinami/Desktop/2022-23/tutorials/enformer_pipeline_test/predictions_folder/personalized_enformer_minimal_some_regions/predictions_2023-07-24/job_logs/personalized_enformer_minimal_some_regions_2023-07-24.summary for a summary of the entire run.
INFO - Check `/Users/sabrinami/Desktop/2022-23/tutorials/enformer_pipeline/metadata` for successful and unsucessful predictions.
INFO - Writing `aggregation_config_personalized_enformer_minimal_some_regions.json` file to /Users/sabrinami/Desktop/2022-23/tutorials/enformer_pipeline/metadata


In [17]:
import h5py

# One of the prediction files written by the personalized run above (HG00096, haplotype1)
file_path="/Users/sabrinami/Desktop/2022-23/tutorials/enformer_pipeline_test/predictions_folder/personalized_enformer_minimal_some_regions/predictions_2023-07-24/enformer_predictions/HG00096/haplotype1/chr1_65419_71585_predictions.h5"

# Show the top-level names stored in the file
with h5py.File(file_path, "r") as h5_file:
    print("Groups and Datasets in the HDF5 file:")
    for key in h5_file.keys():
        print(key)


Groups and Datasets in the HDF5 file:
chr1_65419_71585


In [20]:
# The dataset is named after the region it holds predictions for
dataset_name = "chr1_65419_71585"

# `[()]` reads the whole dataset into an in-memory NumPy array,
# so `data` remains usable after the file is closed
with h5py.File(file_path, "r") as h5_file:
    data = h5_file[dataset_name][()]

# Shape first, then the (abbreviated) array contents, each on its own line
print(data.shape, data, sep="\n")

(896, 5313)
[[0.5291589  0.5204935  0.57351285 ... 0.00935043 0.01473221 0.01198051]
 [0.65397567 0.6207656  0.7431343  ... 0.01434602 0.02783546 0.02128439]
 [0.5326947  0.5510576  0.767584   ... 0.01038613 0.03195343 0.0286333 ]
 ...
 [0.00713983 0.0077101  0.00634603 ... 0.00115762 0.01039891 0.00428226]
 [0.00585703 0.00676019 0.00490488 ... 0.0047263  0.0377146  0.01976843]
 [0.00355603 0.00391064 0.00305343 ... 0.00085077 0.00762693 0.00313292]]
