In [None]:
import os
import json

In [None]:
# Create list with each line containing old path followed by new path.
# Each input line is expected to be a gsutil copy command; removing the command
# prefix and the "*" wildcard leaves "old_path new_path".
lines = []
with open("reformat_buckets.txt", "r") as f:
    for line in f:
        line = line.replace("gsutil -u broad-ctsa -m cp -r ", "").replace("*", "")
        # Skip blank/whitespace-only lines: they carry no path pair and would
        # otherwise fail later when the line is split into old and new paths.
        if not line.strip():
            continue
        lines.append(line)

In [None]:
# Create dict mapping current urls to new urls


def _mappings_for_bucket(copy_lines, old_prefix=None, new_prefix=None):
    """Parse "old_path new_path" lines into an ``{old_url: new_url}`` dict.

    Parameters
    ----------
    copy_lines : iterable of str
        Lines holding an old path and a new path separated by whitespace.
    old_prefix, new_prefix : str, optional
        When ``old_prefix`` is given, every occurrence of it in a line is
        replaced by ``new_prefix`` before parsing, so the same input can be
        re-targeted at another bucket.

    Returns
    -------
    dict
        Old path -> new path, both with trailing "/" removed.

    Raises
    ------
    ValueError
        If a non-blank line has fewer than two whitespace-separated fields.
    """
    mappings = {}
    for raw_line in copy_lines:
        if old_prefix is not None:
            raw_line = raw_line.replace(old_prefix, new_prefix)
        # split() with no argument tolerates repeated spaces/tabs and the
        # trailing newline, none of which should produce empty fields.
        fields = [field.rstrip("/") for field in raw_line.split()]
        if not fields:
            # Blank line: nothing to map.
            continue
        if len(fields) < 2:
            raise ValueError(f"Expected 'old_path new_path', got: {raw_line!r}")
        mappings[fields[0]] = fields[1]
    return mappings


new_mappings = {}

# GCS
# Mappings from old_name: new_name for hail-datasets-us (paths as written in the input)
new_mappings.update(_mappings_for_bucket(lines))
# Mappings from old_name: new_name for hail-datasets-eu
new_mappings.update(
    _mappings_for_bucket(lines, "hail-datasets-us/", "hail-datasets-eu/")
)
# AWS
# Mappings from old_name: new_name for hail-datasets-us-east-1
new_mappings.update(
    _mappings_for_bucket(lines, "gs://hail-datasets-us/", "s3://hail-datasets-us-east-1/")
)

with open("reformat_buckets_mappings.json", "w") as f:
    json.dump(new_mappings, f, sort_keys=True, ensure_ascii=False, indent=2)

In [None]:
# Load config file
datasets_json_path = os.path.abspath("../../hail/python/hail/experimental/datasets.json")
with open(datasets_json_path) as f:
    datasets_json = json.load(f)

# Suffix carried by GTEx dataset names after renaming.
gtex_suffix = "_all_snp_gene_associations"
# (cloud, regions) pairs whose urls are rewritten when present in the config.
cloud_regions = (("aws", ("us",)), ("gcp", ("us", "eu")))

# Update urls for all datasets according to new mappings.
# Iterate over a snapshot of the names: GTEx entries are re-keyed inside the
# loop, which would otherwise change the dict while it is being iterated.
for name in list(datasets_json):
    for version in datasets_json[name]["versions"]:
        urls = version["url"]
        for cloud, regions in cloud_regions:
            if cloud not in urls:
                continue
            for region in regions:
                # A missing region yields None, which is never a mapping key,
                # so absent regions are simply left alone.
                old_url = urls[cloud].get(region)
                if old_url in new_mappings:
                    urls[cloud][region] = new_mappings[old_url]
    # Update GTEx names while we're at it.
    # Names that already end with the suffix are skipped: this cell overwrites
    # the config file in place, so re-running it would otherwise re-split an
    # already-renamed key and produce a mangled name.
    is_gtex_qtl = "GTEx_eQTL" in name or "GTEx_sQTL" in name
    if is_gtex_qtl and not name.endswith(gtex_suffix):
        name_parts = name.split("_", 3)
        qtl = name_parts[1]
        tissue = name_parts[-1]
        updated_name = f"GTEx_{qtl}_{tissue}{gtex_suffix}"
        datasets_json[updated_name] = datasets_json.pop(name)

# Write new entries to config file
with open(datasets_json_path, "w") as f:
    json.dump(datasets_json, f, sort_keys=True, ensure_ascii=False, indent=2)

In [None]:
import os
import json
import hail as hl
hl.init(spark_conf={"spark.hadoop.fs.s3a.aws.credentials.provider":
                        "org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider"})


def _read_dataset(url):
    """Print ``url`` and open it with the Hail reader chosen by its extension.

    ``.ht`` urls are read as tables, ``.mt`` urls as matrix tables, and
    anything else as a BlockMatrix. The object returned by the reader is
    passed back to the caller; any exception raised by the reader propagates.
    """
    print(url)
    if url.endswith(".ht"):
        return hl.read_table(url)
    if url.endswith(".mt"):
        return hl.read_matrix_table(url)
    return hl.linalg.BlockMatrix.read(url)


# Test that we can load datasets from GCS and AWS
datasets_json_path = os.path.abspath("../../hail/python/hail/experimental/datasets.json")
with open(datasets_json_path) as f:
    datasets_json = json.load(f)

for name in list(datasets_json):
    print(name)
    for version in datasets_json[name]["versions"]:
        urls = version["url"]
        if "gcp" in urls:
            for region in ("us", "eu"):
                if region in urls["gcp"]:
                    _read_dataset(urls["gcp"][region])
        # Guard on the region the same way as for GCP, so an AWS entry without
        # a "us" url is skipped rather than raising KeyError.
        if "aws" in urls and "us" in urls["aws"]:
            # The url is rewritten from the s3:// scheme to s3a:// before reading.
            _read_dataset(urls["aws"]["us"].replace("s3://", "s3a://"))