In [None]:
import os
import json
import hail as hl
# Initialize Hail, passing Hadoop GCS connector settings for requester-pays
# buckets (mode "AUTO", project id "broad-ctsa") through to Spark.
hl.init(
    spark_conf={
        "spark.hadoop.fs.gs.requester.pays.mode": "AUTO",
        "spark.hadoop.fs.gs.requester.pays.project.id": "broad-ctsa",
    }
)

The workflow I ended up using to add these GIANT datasets didn't fit nicely into the old extract/load setup, so I added this notebook directory.

For this particular case, the files were small enough that I was exploring things locally, and it ended up being easiest to just get things done here. You can run the cells for height/bmi/whr independently to generate the tables and entries for `datasets.json`.

It just assumes that you have the [GIANT consortium data files](https://portals.broadinstitute.org/collaboration/giant/index.php/GIANT_consortium_data_files) downloaded to a `tmp` folder in your home directory, e.g. for height, `~/tmp/giant_height_exome_summary/` would contain all the `.txt` files for height.

In [None]:
# Load the existing datasets config; the cells below add their newly generated
# entries to `datasets` and write the result back to this same path.
datasets_path = os.path.abspath("../../hail/python/hail/experimental/datasets.json")
with open(datasets_path, "r") as f:
    datasets = json.load(f)

# Short population code -> wording used in the "description" of each
# datasets.json entry
json_populations = {
    "AFR": "African/African-American",
    "EUR": "European",
    "EAS": "East Asian",
    "AMR": "Latino/Admixed American",
    "SAS": "South Asian",
    "ALL": "all",
}

### Height

Iterate through GIANT height text files to write Hail Tables to GCS and generate entries to insert into datasets.json:

In [None]:
# For GIANT 2018 Exome Array Summary Statistics - Height
#
# For each population: import the GIANT height text file, build a Hail Table
# keyed by (locus, alleles), write it to the "us" and "eu" GCS buckets, and add
# a matching entry to `datasets`. datasets.json is rewritten once at the end.
populations = ["AA", "EA", "EAS", "HA", "SA", "All"]
renamed_populations = ["AFR", "EUR", "EAS", "AMR", "SAS", "ALL"]

# Map population name to maf and exac_maf field names
maf = dict(
    zip(populations, ["AFR_MAF", "EUR_MAF", "EAS_MAF", "AMR_MAF", "SAS_MAF", "GMAF"])
)
exac_maf = dict(
    zip(populations, ["ExAC_AFR_MAF", "ExAC_NFE_MAF", "ExAC_EAS_MAF", "ExAC_AMR_MAF", "ExAC_SAS_MAF", "ExAC_MAF"])
)

# Re-map population names for file name of Hail Table
output_name = dict(zip(populations, renamed_populations))

# Same for every population, so set once instead of on every iteration
version = "2018"
build = "GRCh37"

for population in populations:
    print(population)

    input_file = os.path.expanduser("~") + f"/tmp/giant_height_exome_summary/height_{population}_add_SV.txt"
    name = f"giant_height_exome_{output_name[population]}"

    # Raw string for the delimiter regex: "\s" is not a valid Python escape
    # sequence, so the non-raw form triggers an invalid-escape warning.
    # beta/se/Pvalue are pinned to float64 rather than left to type imputation.
    ht = hl.import_table(input_file,
                         impute=True,
                         missing=["-", "NA", "Inf"],
                         delimiter=r"\s+",
                         types={"beta": hl.tfloat64,
                                "se": hl.tfloat64,
                                "Pvalue": hl.tfloat64})

    # The MAF columns are parsed as comma-separated "key:value" pairs into a
    # dict of key -> float (presumably allele -> frequency; confirm against the
    # GIANT file documentation).
    ht2 = ht.annotate(locus=hl.locus(ht.CHR, ht.POS, reference_genome=build),
                      alleles=[ht.REF, ht.ALT],
                      temp_maf=hl.dict(
                          ht[maf[population]].split(",").map(
                              lambda x: (x.split(":")[0], hl.float(x.split(":")[1]))
                          )
                      ),
                      temp_exac_maf=hl.dict(
                          ht[exac_maf[population]].split(",").map(
                              lambda x: (x.split(":")[0], hl.float(x.split(":")[1]))
                          )
                      )
                     )

    # Keep only the output fields, then give them their final lower-case names
    ht2 = ht2.select("locus", "alleles", "SNPNAME", "temp_maf", "temp_exac_maf", "beta", "se", "Pvalue")
    ht2 = ht2.rename({"temp_maf": maf[population].lower(),
                      "temp_exac_maf": exac_maf[population].lower(),
                      "SNPNAME": "snp_name",
                      "Pvalue": "pvalue"})
    ht2 = ht2.key_by("locus", "alleles")

    # Row and partition counts are stored in the table's global metadata
    n_rows = ht2.count()
    n_partitions = ht2.n_partitions()

    ht2 = ht2.annotate_globals(metadata=hl.struct(name=name,
                                                  version=version,
                                                  reference_genome=build,
                                                  n_rows=n_rows,
                                                  n_partitions=n_partitions))

    # Write one copy of the table per regional bucket
    for region in ["us", "eu"]:
        output_file = f"gs://hail-datasets-{region}/{name}_{version}_{build}.ht"
        ht2.write(output_file, overwrite=True)

    json_entry = {
        "annotation_db": {
            "key_properties": [
                "unique"
            ]
        },
        "description": f"GIANT (Genetic Investigation of ANthropometric Traits): "
                       f"height exome array summary statistics Hail Table for {json_populations[output_name[population]]} "
                       f"population(s).",
        "url": "https://portals.broadinstitute.org/collaboration/giant/index.php/GIANT_consortium_data_files",
        "versions": [
            {
                "reference_genome": build,
                "url": {
                    "aws": {
                        "us": f"s3://hail-datasets-us-east-1/{name}_{version}_{build}.ht"
                    },
                    "gcp": {
                        "us": f"gs://hail-datasets-us/{name}_{version}_{build}.ht",
                        "eu": f"gs://hail-datasets-eu/{name}_{version}_{build}.ht"
                    }
                },
                "version": version
            }
        ]
    }
    datasets[name] = json_entry

# Write new entries back to datasets.json config:
with open(datasets_path, "w") as f:
    json.dump(datasets, f, sort_keys=True, ensure_ascii=False, indent=2)

### BMI

Iterate through GIANT body mass index (BMI) text files to write Hail Tables to GCS and generate entries to insert into datasets.json:

In [None]:
# For GIANT 2018 Exome Array Summary Statistics - BMI
#
# For each population: import the GIANT BMI text file, build a Hail Table
# keyed by (locus, alleles), write it to the "us" and "eu" GCS buckets, and add
# a matching entry to `datasets`. datasets.json is rewritten once at the end.
populations = ["African_American", "European", "Eastern_Asian", "Hispanic_American", "South_Asian", "All_ancestry"]
renamed_populations = ["AFR", "EUR", "EAS", "AMR", "SAS", "ALL"]

# Map population name to maf and exac_maf field names
maf = dict(
    zip(populations, ["AFR_MAF", "EUR_MAF", "EAS_MAF", "AMR_MAF", "SAS_MAF", "GMAF"])
)
exac_maf = dict(
    zip(populations, ["ExAC_AFR_MAF", "ExAC_NFE_MAF", "ExAC_EAS_MAF", "ExAC_AMR_MAF", "ExAC_SAS_MAF", "ExAC_MAF"])
)

# Re-map population names for file name of Hail Table
output_name = dict(zip(populations, renamed_populations))

# Same for every population, so set once instead of on every iteration
version = "2018"
build = "GRCh37"

for population in populations:
    print(population)

    input_file = os.path.expanduser("~") + f"/tmp/giant_bmi_exome_summary/BMI_{population}.fmt.gzip.txt"
    name = f"giant_bmi_exome_{output_name[population]}"

    # Raw string for the delimiter regex: "\s" is not a valid Python escape
    # sequence, so the non-raw form triggers an invalid-escape warning.
    # beta/se/Pvalue are pinned to float64 rather than left to type imputation.
    ht = hl.import_table(input_file,
                         impute=True,
                         missing=["-", "NA", "Inf"],
                         delimiter=r"\s+",
                         types={"beta": hl.tfloat64,
                                "se": hl.tfloat64,
                                "Pvalue": hl.tfloat64})

    # The MAF columns are parsed as comma-separated "key:value" pairs into a
    # dict of key -> float (presumably allele -> frequency; confirm against the
    # GIANT file documentation).
    # snp_name/pvalue are not annotated here: the select + rename below produce
    # them from SNPNAME/Pvalue, so annotating them as well would be dead work.
    ht2 = ht.annotate(locus=hl.locus(ht.CHR, ht.POS, reference_genome=build),
                      alleles=[ht.REF, ht.ALT],
                      temp_maf=hl.dict(
                          ht[maf[population]].split(",").map(
                              lambda x: (x.split(":")[0], hl.float(x.split(":")[1]))
                          )
                      ),
                      temp_exac_maf=hl.dict(
                          ht[exac_maf[population]].split(",").map(
                              lambda x: (x.split(":")[0], hl.float(x.split(":")[1]))
                          )
                      )
                     )

    # Keep only the output fields, then give them their final lower-case names
    ht2 = ht2.select("locus", "alleles", "SNPNAME", "temp_maf", "temp_exac_maf", "beta", "se", "Pvalue")
    ht2 = ht2.rename({"temp_maf": maf[population].lower(),
                      "temp_exac_maf": exac_maf[population].lower(),
                      "SNPNAME": "snp_name",
                      "Pvalue": "pvalue"})
    ht2 = ht2.key_by("locus", "alleles")

    # Row and partition counts are stored in the table's global metadata
    n_rows = ht2.count()
    n_partitions = ht2.n_partitions()

    ht2 = ht2.annotate_globals(metadata=hl.struct(name=name,
                                                  version=version,
                                                  reference_genome=build,
                                                  n_rows=n_rows,
                                                  n_partitions=n_partitions))

    # Write one copy of the table per regional bucket
    for region in ["us", "eu"]:
        output_file = f"gs://hail-datasets-{region}/{name}_{version}_{build}.ht"
        ht2.write(output_file, overwrite=True)

    json_entry = {
        "annotation_db": {
            "key_properties": [
                "unique"
            ]
        },
        "description": f"GIANT (Genetic Investigation of ANthropometric Traits): "
                       f"body mass index (BMI) exome array summary statistics Hail Table "
                       f"for {json_populations[output_name[population]]} population(s).",
        "url": "https://portals.broadinstitute.org/collaboration/giant/index.php/GIANT_consortium_data_files",
        "versions": [
            {
                "reference_genome": build,
                "url": {
                    "aws": {
                        "us": f"s3://hail-datasets-us-east-1/{name}_{version}_{build}.ht"
                    },
                    "gcp": {
                        "us": f"gs://hail-datasets-us/{name}_{version}_{build}.ht",
                        "eu": f"gs://hail-datasets-eu/{name}_{version}_{build}.ht"
                    }
                },
                "version": version
            }
        ]
    }
    datasets[name] = json_entry

# Write new entries back to datasets.json config:
with open(datasets_path, "w") as f:
    json.dump(datasets, f, sort_keys=True, ensure_ascii=False, indent=2)

### WHR

Iterate through GIANT waist-hip ratio (WHR, adjusted for BMI) text files to write Hail Tables to GCS and generate entries to insert into datasets.json:

In [None]:
# For GIANT 2018 Exome Array Summary Statistics - WHR
#
# For each (sex, population, model) combination: import the GIANT WHRadjBMI
# text file, build a Hail Table keyed by (locus, alleles), write it to the "us"
# and "eu" GCS buckets, and add a matching entry to `datasets`. datasets.json
# is rewritten once at the end.
sexes = ["C", "M", "W"]
populations = ["All", "Eur"]
renamed_populations = ["ALL", "EUR"]
models = ["Rec", "Add"]

# Generate valid combinations for existing filenames, e.g. ('C', 'All', 'Rec')
combinations = [(sex, population, model)
                for sex in sexes
                for population in populations
                for model in models]

# Map population name to maf and exac_maf field names
maf = dict(zip(populations, ["gmaf", "eur_maf"]))
exac_maf = dict(zip(populations, ["exac_maf", "exac_nfe_maf"]))

# Re-map population names for file name of Hail Table
output_name = dict(zip(populations, renamed_populations))

# Same for every combination, so set once instead of on every iteration
version = "2018"
build = "GRCh37"

for triplet in combinations:
    print(triplet)
    sex, population, model = triplet

    input_file = os.path.expanduser("~") + f"/tmp/giant_whr_exome_summary/PublicRelease.WHRadjBMI.{sex}.{population}.{model}.txt"
    name = f"giant_whr_exome_{sex}_{output_name[population]}_{model}"

    # beta/se/pvalue are pinned to float64 rather than left to type imputation;
    # empty strings are treated as missing in addition to "-", "NA" and "Inf".
    ht = hl.import_table(input_file,
                         impute=True,
                         missing=["-", "NA", "Inf", ""],
                         types={"beta": hl.tfloat64,
                                "se": hl.tfloat64,
                                "pvalue": hl.tfloat64})

    # The locus is taken from the first two ":"-separated parts of `markername`
    # (contig, then position).
    # The MAF columns are parsed as comma-separated "key:value" pairs into a
    # dict of key -> float (presumably allele -> frequency; confirm against the
    # GIANT file documentation). They are staged under temp_* names because
    # their final names are the same as the source column names.
    ht2 = ht.annotate(locus=hl.locus(ht.markername.split(":")[0],
                                     hl.int(ht.markername.split(":")[1]),
                                     reference_genome=build),
                      alleles=[ht.ref, ht.alt],
                      temp_maf=hl.dict(
                          ht[maf[population]].split(",").map(
                              lambda x: (x.split(":")[0], hl.float(x.split(":")[1]))
                          )
                      ),
                      temp_exac_maf=hl.dict(
                          ht[exac_maf[population]].split(",").map(
                              lambda x: (x.split(":")[0], hl.float(x.split(":")[1]))
                          )
                      )
                     )

    # Keep only the output fields (dropping the raw MAF string columns), then
    # move the parsed dicts onto the original column names
    ht2 = ht2.select("locus", "alleles", "snpname", "temp_maf", "temp_exac_maf", "beta", "se", "pvalue", "n")
    ht2 = ht2.rename({"n": "sample_size",
                      "snpname": "snp_name",
                      "temp_maf": maf[population],
                      "temp_exac_maf": exac_maf[population]})
    ht2 = ht2.key_by("locus", "alleles")

    # Row and partition counts are stored in the table's global metadata
    n_rows = ht2.count()
    n_partitions = ht2.n_partitions()

    ht2 = ht2.annotate_globals(metadata=hl.struct(name=name,
                                                  version=version,
                                                  reference_genome=build,
                                                  n_rows=n_rows,
                                                  n_partitions=n_partitions))

    # Write one copy of the table per regional bucket
    for region in ["us", "eu"]:
        output_file = f"gs://hail-datasets-{region}/{name}_{version}_{build}.ht"
        ht2.write(output_file, overwrite=True)

    json_entry = {
        "annotation_db": {
            "key_properties": [
                "unique"
            ]
        },
        "description": "GIANT (Genetic Investigation of ANthropometric Traits): "
                       "waist-hip ratio (WHR) adjusted for BMI exome array summary statistics Hail Table. "
                       "Note the following abbreviations used in filenames: C-combined sexes, M-men, W-women, "
                       "ALL-all ancestries, EUR-european descent only, Add-additive genetic model, "
                       "and Rec-recessive genetic model.",
        "url": "https://portals.broadinstitute.org/collaboration/giant/index.php/GIANT_consortium_data_files",
        "versions": [
            {
                "reference_genome": build,
                "url": {
                    "aws": {
                        "us": f"s3://hail-datasets-us-east-1/{name}_{version}_{build}.ht"
                    },
                    "gcp": {
                        "us": f"gs://hail-datasets-us/{name}_{version}_{build}.ht",
                        "eu": f"gs://hail-datasets-eu/{name}_{version}_{build}.ht"
                    }
                },
                "version": version
            }
        ]
    }
    datasets[name] = json_entry

# Write new entries back to datasets.json config:
with open(datasets_path, "w") as f:
    json.dump(datasets, f, sort_keys=True, ensure_ascii=False, indent=2)

### Docs

Generate .rst files for dataset schemas:

In [None]:
import textwrap

output_dir = os.path.abspath("../../hail/python/hail/docs/datasets/schemas")
datasets_path = os.path.abspath("../../hail/python/hail/experimental/datasets.json")
with open(datasets_path, "r") as f:
    datasets = json.load(f)

# reStructuredText skeleton shared by every generated schema page
template = """.. _{dataset}:

{dataset}
{underline1}

*  **Versions:** {versions}
*  **Reference genome builds:** {ref_genomes}
*  **Type:** :class:`{class}`

Schema ({version0}, {ref_genome0})
{underline2}

.. code-block:: text

{schema}

"""

# Only the GIANT datasets get a schema page generated here
names = [name for name in datasets if "giant" in name]
for name in names:
    entries = datasets[name]["versions"]

    # Distinct versions / reference genomes in sorted order; [None] when empty
    versions = sorted({entry["version"] for entry in entries}) or [None]
    reference_genomes = sorted({entry["reference_genome"] for entry in entries}) or [None]

    print(name)
    print(versions[0])
    print(reference_genomes[0])

    # Exactly one entry must match the first (version, reference genome) pair
    matching_paths = [
        entry["url"]["gcp"]["us"]
        for entry in entries
        if entry["version"] == versions[0] and entry["reference_genome"] == reference_genomes[0]
    ]
    assert len(matching_paths) == 1
    path = matching_paths[0]

    # Capture the schema description as a string; it is indented into the
    # page's code block below
    table = hl.methods.read_table(path)
    description = table.describe(handler=str)

    # Paths ending in ".ht" are documented as Tables, anything else as MatrixTables
    table_class = "hail.Table" if path.endswith(".ht") else "hail.MatrixTable"

    # The "~" underline must be as long as the rendered "Schema (...)" heading
    schema_heading = f"Schema ({versions[0]}, {reference_genomes[0]})"
    context = {
        "dataset": name,
        "underline1": len(name) * "=",
        "version0": versions[0],
        "ref_genome0": reference_genomes[0],
        "versions": ", ".join(str(version) for version in versions),
        "ref_genomes": ", ".join(str(reference_genome) for reference_genome in reference_genomes),
        "underline2": len(schema_heading) * "~",
        "schema": textwrap.indent(description, "    "),
        "class": table_class,
    }
    with open(f"{output_dir}/{name}.rst", "w") as f:
        f.write(template.format(**context))