In [None]:
import os
import subprocess
import json
import hail as hl
# Spark/Hadoop GCS connector settings: requester-pays mode "AUTO", with the
# "broad-ctsa" project id supplied for requester-pays access.
requester_pays_conf = {
    "spark.hadoop.fs.gs.requester.pays.mode": "AUTO",
    "spark.hadoop.fs.gs.requester.pays.project.id": "broad-ctsa",
}
hl.init(spark_conf=requester_pays_conf)

### GTEx v8 eQTL tissue-specific all SNP gene associations

The files in `gs://gtex-resources/GTEx_Analysis_v8_QTLs/GTEx_Analysis_v8_eQTL_all_associations/` are gzipped, so we need to re-compress them with bgzip and copy them over to `gs://hail-datasets-tmp`. First, I generated one text file listing the input paths and another listing the desired output paths.

In [None]:
# Generate list of all eQTL all association files in gs://gtex-resources.
# check=True raises CalledProcessError if the listing fails, rather than silently
# writing empty path files that the later steps would then happily consume.
list_eqtl_files_gz = subprocess.run(["gsutil",
                                     "-u",
                                     "broad-ctsa",
                                     "ls",
                                     "gs://gtex-resources/GTEx_Analysis_v8_QTLs/GTEx_Analysis_v8_eQTL_all_associations/"],
                                    stdout=subprocess.PIPE,
                                    check=True)
# Keep only gzipped objects: every listed path is later piped through `gzip -d`,
# so any other entry in the listing would just make that step fail.
eqtl_files_gz = [path for path in list_eqtl_files_gz.stdout.decode("utf-8").split()
                 if path.endswith(".gz")]

# Write eQTL file paths to text for input
with open("gtex_eQTL_paths_in.txt", "w") as f:
    f.writelines(f"{eqtl_file}\n" for eqtl_file in eqtl_files_gz)

# Change bucket to "gs://hail-datasets-tmp" and filename extension to ".bgz" and write to another text file for output
with open("gtex_eQTL_paths_out.txt", "w") as f:
    for eqtl_file in eqtl_files_gz:
        # Replace only the leading bucket and only the trailing ".gz" suffix, so a
        # ".gz" appearing elsewhere in the path is left untouched.
        relocated = eqtl_file.replace("gs://gtex-resources", "gs://hail-datasets-tmp", 1)
        eqtl_file_out = relocated[:-len(".gz")] + ".bgz"
        f.write(f"{eqtl_file_out}\n")

After generating the text files above, I ran the following to re-compress the files with bgzip, so that we can read them in and create Hail Tables.

```
paste gtex_eQTL_paths_in.txt gtex_eQTL_paths_out.txt |
while read -r infile outfile;
do
  # Stream each object: download, gunzip, re-compress with bgzip, upload from stdin.
  # Paths are quoted so they are not word-split or glob-expanded by the shell.
  gsutil -u broad-ctsa cat "$infile" |
  gzip -d |
  bgzip -c |
  gsutil cp - "$outfile"
done
```

Now we can generate the Hail Tables (do this on a cluster):

#### Create GTEx v8 Hail Tables

In [None]:
# Read the list of .bgz files in gs://hail-datasets-tmp from the output-paths file.
# The list is bound to `eqtl_files_bgz`, the name the loop below iterates over;
# blank lines are skipped so they are never passed to hl.import_table.
with open("gtex_eQTL_paths_out.txt") as f:
    eqtl_files_bgz = [line.strip() for line in f if line.strip()]

# Same for every tissue, so set once rather than on each iteration.
version = "v8"
build = "GRCh38"

for eqtl_file in eqtl_files_bgz:
    print(eqtl_file)
    ht = hl.import_table(eqtl_file,
                         force_bgz=True,
                         types={"gene_id": hl.tstr,
                                "variant_id": hl.tstr,
                                "tss_distance": hl.tint32,
                                "ma_samples": hl.tint32,
                                "ma_count": hl.tint32,
                                "maf": hl.tfloat64,
                                "pval_nominal": hl.tfloat64,
                                "slope": hl.tfloat64,
                                "slope_se": hl.tfloat64})

    # Table name uses the part of the file name before its first "."; take the
    # basename first so a "." in a directory component cannot truncate the name.
    name = "GTEx_eQTL_allpairs_" + eqtl_file.split("/")[-1].split(".")[0]

    # The first four "_"-separated fields of variant_id are used as
    # contig, position, and the two alleles, in that order.
    variant_fields = ht.variant_id.split("_")
    ht2 = ht.annotate(locus=hl.locus(variant_fields[0],
                                     hl.int(variant_fields[1]),
                                     reference_genome=build),
                      alleles=[variant_fields[2], variant_fields[3]])
    ht2 = ht2.select("locus", "alleles", "gene_id", "variant_id", "tss_distance",
                     "ma_samples", "ma_count", "maf", "pval_nominal", "slope", "slope_se")
    ht2 = ht2.key_by("locus", "alleles")

    # Computed up front so they can be stored in the table's global metadata
    n_rows = ht2.count()
    n_partitions = ht2.n_partitions()

    ht2 = ht2.annotate_globals(metadata=hl.struct(name=name,
                                                  version=version,
                                                  reference_genome=build,
                                                  n_rows=n_rows,
                                                  n_partitions=n_partitions))

    for region in ["us"]:
        output_file = f"gs://hail-datasets-{region}/{name}_{version}_{build}.ht"
        # overwrite=False: refuse to clobber a table that already exists at this path
        ht2.write(output_file, overwrite=False)

    print(f"Wrote {name} to Hail Table.\n")

#### Add entries for new Hail Tables to config

Now we can create entries in `datasets.json` for the new tables:

In [None]:
# Open our datasets config file so we can add our new entries.
# JSON is read and written explicitly as UTF-8: the dump below uses
# ensure_ascii=False, so the locale's default encoding must not be relied on.
datasets_path = os.path.abspath("../../hail/python/hail/experimental/datasets.json")
with open(datasets_path, "r", encoding="utf-8") as f:
    datasets = json.load(f)

# Get list of GTEx eQTL tables in hail-datasets-us.
# check=True raises if the listing fails, so the config file is not rewritten
# on the basis of an empty or partial listing.
list_datasets = subprocess.run(["gsutil", "-u", "broad-ctsa", "ls", "gs://hail-datasets-us"],
                               stdout=subprocess.PIPE,
                               check=True)
all_datasets = list_datasets.stdout.decode('utf-8').split()
# Drop the trailing "/" from each matching listing entry
tables = [x.rstrip("/") for x in all_datasets if "GTEx_eQTL_allpairs_" in x]

for table in tables:
    # Derive the EU and AWS URLs from the US GCS URL by swapping the bucket name
    # (and, for AWS, the leading "gs" scheme for "s3").
    gs_us_url = table
    gs_eu_url = table.replace("hail-datasets-us", "hail-datasets-eu")
    aws_url = table.replace("gs", "s3", 1).replace("hail-datasets-us", "hail-datasets-us-east-1")

    full_table_name = table.split("/")[-1]

    # Table names have the form "GTEx_eQTL_allpairs_<tissue>_<version>_<build>.ht"
    build = full_table_name.split("_")[-1].replace(".ht", "")
    version = full_table_name.split("_")[-2]
    tissue_name = full_table_name.replace("GTEx_eQTL_allpairs_", "").replace(f"_{version}_{build}.ht", "")

    json_entry = {
        "annotation_db": {
            "key_properties": []
        },
        "description": f"GTEx: {tissue_name} eQTL tissue-specific all SNP gene "
                       f"associations Hail Table. All variant-gene cis-eQTL associations "
                       f"tested in each tissue (including non-significant associations).",
        "url": "https://gtexportal.org/home/datasets",
        "versions": [
            {
                "reference_genome": build,
                "url": {
                    "aws": {
                        "us": aws_url
                    },
                    "gcp": {
                        "us": gs_us_url,
                        "eu": gs_eu_url
                    }
                },
                "version": version
            }
        ]
    }
    # Adds the entry, or replaces an existing one with the same dataset name
    datasets[f"GTEx_eQTL_allpairs_{tissue_name}"] = json_entry

# Write new entries back to datasets.json config:
with open(datasets_path, "w", encoding="utf-8") as f:
    json.dump(datasets, f, sort_keys=True, ensure_ascii=False, indent=2)

#### Create schemas for docs for new Hail Tables

In [None]:
import textwrap

output_dir = os.path.abspath("../../hail/python/hail/docs/datasets/schemas")
datasets_path = os.path.abspath("../../hail/python/hail/experimental/datasets.json")
with open(datasets_path, "r") as f:
    datasets = json.load(f)

# reStructuredText skeleton for one dataset page; the placeholders are filled in
# per dataset inside the loop below.
template = """.. _{dataset}:

{dataset}
{underline1}

*  **Versions:** {versions}
*  **Reference genome builds:** {ref_genomes}
*  **Type:** :class:`{class}`

Schema ({version0}, {ref_genome0})
{underline2}

.. code-block:: text

{schema}

"""

# Only the GTEx eQTL all-pairs datasets get a schema page here
names = [key for key in datasets if "GTEx_eQTL_allpairs_" in key]
for name in names:
    entries = datasets[name]["versions"]
    # Distinct versions / builds in sorted order, falling back to [None] when empty
    versions = sorted({entry["version"] for entry in entries}) or [None]
    reference_genomes = sorted({entry["reference_genome"] for entry in entries}) or [None]

    print(name)
    print(versions[0])
    print(reference_genomes[0] + "\n")

    # The schema is taken from the table for the first version and first build;
    # exactly one entry is expected to match that pair.
    path = [entry["url"]["gcp"]["us"]
            for entry in entries
            if entry["version"] == versions[0]
            and entry["reference_genome"] == reference_genomes[0]]
    assert len(path) == 1
    path = path[0]

    table = hl.methods.read_table(path)
    # Capture the describe() text and drop trailing whitespace from every line
    description = "\n".join(line.rstrip()
                            for line in table.describe(handler=lambda x: str(x)).split("\n"))

    table_class = "hail.Table" if path.endswith(".ht") else "hail.MatrixTable"

    # Same text the template renders as the schema heading; used to size its underline
    schema_heading = f"Schema ({versions[0]}, {reference_genomes[0]})"
    context = {
        "dataset": name,
        "underline1": "=" * len(name),
        "version0": versions[0],
        "ref_genome0": reference_genomes[0],
        "versions": ", ".join(map(str, versions)),
        "ref_genomes": ", ".join(map(str, reference_genomes)),
        "underline2": "~" * len(schema_heading),
        "schema": textwrap.indent(description, "    "),
        "class": table_class,
    }
    with open(f"{output_dir}/{name}.rst", "w") as f:
        f.write(template.format(**context).strip())