In [None]:
import hail as hl
# Initialize Hail with its default settings before any table or matrix table is built.
hl.init()

NYGC 30x HighCov samples Hail Table:

In [None]:
# Import the sample pedigree/population file. Columns are separated by runs of
# whitespace and column types are imputed from the data.
ht_samples = hl.import_table(
    "gs://hail-datasets-tmp/1000_Genomes_NYGC_30x/1000_Genomes_NYGC_30x_samples_ped_population.txt.bgz",
    delimiter=r"\s+",  # raw string: "\s" is not a valid escape in a normal Python string literal
    impute=True
)

# Recode the fields used downstream:
#   * a FatherID / MotherID of "0" becomes missing,
#   * Sex code 1 becomes "male"; every other defined code becomes "female".
ht_samples = ht_samples.annotate(
    FatherID=hl.if_else(ht_samples.FatherID == "0",
                        hl.missing(hl.tstr),
                        ht_samples.FatherID),
    MotherID=hl.if_else(ht_samples.MotherID == "0",
                        hl.missing(hl.tstr),
                        ht_samples.MotherID),
    Sex=hl.if_else(ht_samples.Sex == 1, "male", "female")
)
ht_samples = ht_samples.key_by("SampleID")

n_rows = ht_samples.count()
n_partitions = ht_samples.n_partitions()

ht_samples = ht_samples.annotate_globals(
    metadata=hl.struct(
        name="1000_Genomes_HighCov_samples",
        n_rows=n_rows,
        n_partitions=n_partitions)
)

# Write to the same location the unphased cells later read the sample table
# from, so the notebook works when run top to bottom on a fresh kernel.
samples_path = "gs://hail-datasets-us/1000_Genomes/NYGC_30x/samples.ht"
ht_samples.write(samples_path, overwrite=False)
ht_samples = hl.read_table(samples_path)
ht_samples.describe()

### Phased genotypes

Creating MTs for the phased data is straightforward, as multiallelic variants were split during phasing.

#### Autosomes (phased):

In [None]:
# Import the phased chr1-chr22 VCFs as a single MatrixTable.
mt = hl.import_vcf(
    "gs://hail-datasets-tmp/1000_Genomes_NYGC_30x/1000_Genomes_NYGC_30x_phased_chr{1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22}_GRCh38.vcf.bgz",
    reference_genome="GRCh38"
)

# Dimensions recorded in the dataset metadata below.
n_rows, n_cols = mt.count()
n_partitions = mt.n_partitions()

mt = mt.annotate_globals(
    metadata=hl.struct(
        name="1000_Genomes_HighCov_autosomes",
        reference_genome="GRCh38",
        n_rows=n_rows,
        n_cols=n_cols,
        n_partitions=n_partitions
    )
)

# Names of the INFO fields typed as arrays. The field dtype is inspected
# directly instead of searching the expression's repr for the word "array".
known_keys = [name for name, field in mt.info.items()
              if isinstance(field.dtype, hl.tarray)]

# Extract value from INFO array fields (all arrays are length 1)
mt = mt.annotate_rows(
    info=mt.info.annotate(
        **{k: hl.or_missing(hl.is_defined(mt.info[k]),
                            mt.info[k][0])
           for k in known_keys}
    )
)

mt = mt.checkpoint(
    "gs://hail-datasets-tmp/1000_Genomes_NYGC_30x/checkpoints/autosomes_phased_GRCh38.mt",
    overwrite=False,
    _read_if_exists=True
)

# Attach the sample annotations (ht_samples is keyed by SampleID) and add QC fields.
mt = mt.annotate_cols(**ht_samples[mt.s])
mt = hl.sample_qc(mt)
mt = hl.variant_qc(mt)

# Write the final dataset, then read it back to show the stored schema.
mt.write("gs://hail-datasets-us/1000_Genomes/NYGC_30x/GRCh38/autosomes_phased.mt", overwrite=False)
mt = hl.read_matrix_table("gs://hail-datasets-us/1000_Genomes/NYGC_30x/GRCh38/autosomes_phased.mt")
mt.describe()

#### ChrX (phased):

In [None]:
# Import the phased chrX VCF as a MatrixTable.
mt = hl.import_vcf(
    "gs://hail-datasets-tmp/1000_Genomes_NYGC_30x/1000_Genomes_NYGC_30x_phased_chrX_GRCh38.vcf.bgz",
    reference_genome="GRCh38"
)

# Dimensions recorded in the dataset metadata below.
n_rows, n_cols = mt.count()
n_partitions = mt.n_partitions()

mt = mt.annotate_globals(
    metadata=hl.struct(
        name="1000_Genomes_HighCov_chrX",
        reference_genome="GRCh38",
        n_rows=n_rows,
        n_cols=n_cols,
        n_partitions=n_partitions
    )
)

# Names of the INFO fields typed as arrays. The field dtype is inspected
# directly instead of searching the expression's repr for the word "array".
known_keys = [name for name, field in mt.info.items()
              if isinstance(field.dtype, hl.tarray)]

# Extract appropriate value from INFO array fields (all arrays are length 1)
mt = mt.annotate_rows(
    info=mt.info.annotate(
        **{k: hl.or_missing(hl.is_defined(mt.info[k]),
                            mt.info[k][0])
           for k in known_keys}
    )
)

mt = mt.checkpoint(
    "gs://hail-datasets-tmp/1000_Genomes_NYGC_30x/checkpoints/chrX_phased_GRCh38.mt",
    overwrite=False,
    _read_if_exists=True
)

# Attach the sample annotations (ht_samples is keyed by SampleID) and add QC fields.
mt = mt.annotate_cols(**ht_samples[mt.s])
mt = hl.sample_qc(mt)
mt = hl.variant_qc(mt)

# Write the final dataset, then read it back to show the stored schema.
mt.write("gs://hail-datasets-us/1000_Genomes/NYGC_30x/GRCh38/chrX_phased.mt", overwrite=False)
mt = hl.read_matrix_table("gs://hail-datasets-us/1000_Genomes/NYGC_30x/GRCh38/chrX_phased.mt")
mt.describe()

### Unphased genotypes

#### Autosomes (unphased):

Import chr1-chr22 VCF to `MatrixTable` and checkpoint:

In [None]:
# Load the unphased chr1-chr22 VCFs, allowing missing elements inside array fields.
vcf_path = (
    "gs://hail-datasets-tmp/1000_Genomes_NYGC_30x/1000_Genomes_NYGC_30x_"
    "chr{1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22}_"
    "GRCh38.vcf.bgz"
)
mt = hl.import_vcf(
    vcf_path,
    reference_genome="GRCh38",
    array_elements_required=False,
)

# A PL array holding any missing element is replaced by a missing PL as a whole.
pl_has_missing = mt.PL.contains(hl.missing(hl.tint32))
mt = mt.annotate_entries(
    PL=hl.if_else(pl_has_missing, hl.missing(mt.PL.dtype), mt.PL)
)

# Persist the imported data so the following cells can start from this checkpoint.
mt = mt.checkpoint(
    "gs://hail-datasets-tmp/1000_Genomes_NYGC_30x/checkpoints/autosomes_unphased_GRCh38_imported_vcf.mt",
    overwrite=False,
    _read_if_exists=True,
)

Separate biallelic and multiallelic variants, split multiallelic variants with `split_multi_hts`, and then `union_rows` the split multiallelic MT back to the biallelic MT. 

For multiallelic variants we will just set `PL` to be missing, to avoid running into index out of bounds errors in `split_multi_hts`.

In [None]:
# All checkpoints of this stage share one path prefix and differ only in suffix.
ckpt_prefix = "gs://hail-datasets-tmp/1000_Genomes_NYGC_30x/checkpoints/autosomes_unphased_GRCh38"

mt = hl.read_matrix_table(f"{ckpt_prefix}_imported_vcf.mt")

n_alleles = hl.len(mt.alleles)
pl_type = mt.PL.dtype

# Biallelic rows: add a_index / was_split by hand so the row schema agrees with
# the output of split_multi_hts when the two tables are unioned below.
bi = mt.filter_rows(n_alleles == 2).annotate_rows(a_index=1, was_split=False)
bi = bi.checkpoint(
    f"{ckpt_prefix}_biallelic.mt",
    overwrite=False,
    _read_if_exists=True,
)

# Multiallelic rows: PL is set to missing before splitting.
multi = mt.filter_rows(n_alleles > 2).annotate_entries(PL=hl.missing(pl_type))
multi = multi.checkpoint(
    f"{ckpt_prefix}_multiallelic.mt",
    overwrite=False,
    _read_if_exists=True,
)

split = hl.split_multi_hts(multi, keep_star=True, permit_shuffle=True)
split = split.checkpoint(
    f"{ckpt_prefix}_multiallelic_split.mt",
    overwrite=False,
    _read_if_exists=True,
)

# Recombine the split multiallelic rows with the biallelic rows.
unioned = split.union_rows(bi)
unioned = unioned.checkpoint(
    f"{ckpt_prefix}_unioned.mt",
    overwrite=False,
    _read_if_exists=True,
)

# Shuffle into 12000 partitions and checkpoint the repartitioned table.
unioned = unioned.repartition(12000, shuffle=True)
unioned = unioned.checkpoint(
    f"{ckpt_prefix}_unioned_repart.mt",
    overwrite=False,
    _read_if_exists=True,
)

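After splitting multiallelic variants, each split row still carries the `INFO` arrays of its original site, so we use `a_index` (1-based, hence `a_index - 1` as the array index) to extract the value belonging to that row's alternate allele.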

Then annotate globals with metadata, annotate columns with sample relationships, perform `sample_qc` and `variant_qc`, and write final MT to `hail-datasets-us`.

In [None]:
unioned = hl.read_matrix_table(
    "gs://hail-datasets-tmp/1000_Genomes_NYGC_30x/checkpoints/autosomes_unphased_GRCh38_unioned_repart.mt"
)

# Names of the INFO fields typed as arrays. The field dtype is inspected
# directly instead of searching the expression's repr for the word "array".
known_keys = [name for name, field in unioned.info.items()
              if isinstance(field.dtype, hl.tarray)]

# Replace every array INFO field with its element at position a_index - 1
# (a_index is 1-based; a missing array stays missing).
# NOTE(review): this indexes *all* array INFO fields by alternate allele --
# confirm none of them is laid out differently (e.g. includes the reference allele).
mt = unioned.annotate_rows(
    info=unioned.info.annotate(
        **{k: hl.or_missing(hl.is_defined(unioned.info[k]),
                            unioned.info[k][unioned.a_index - 1])
           for k in known_keys}
    )
)

# Dimensions recorded in the dataset metadata below.
n_rows, n_cols = mt.count()
n_partitions = mt.n_partitions()

mt = mt.annotate_globals(
    metadata=hl.struct(
        name="1000_Genomes_HighCov_autosomes",
        reference_genome="GRCh38",
        n_rows=n_rows,
        n_cols=n_cols,
        n_partitions=n_partitions
    )
)

# Attach the sample annotations (looked up by sample ID) and add QC fields.
ht_samples = hl.read_table("gs://hail-datasets-us/1000_Genomes/NYGC_30x/samples.ht")
mt = mt.annotate_cols(**ht_samples[mt.s])
mt = hl.sample_qc(mt)
mt = hl.variant_qc(mt)

# Write the final dataset, then read it back to show the stored schema.
mt.write("gs://hail-datasets-us/1000_Genomes/NYGC_30x/GRCh38/autosomes_unphased.mt", overwrite=False)
mt = hl.read_matrix_table("gs://hail-datasets-us/1000_Genomes/NYGC_30x/GRCh38/autosomes_unphased.mt")
mt.describe()

#### ChrX (unphased):

Import chrX VCF to `MatrixTable` and checkpoint:

In [None]:
# Load the unphased chrX VCF, allowing missing elements inside array fields.
vcf_path = "gs://hail-datasets-tmp/1000_Genomes_NYGC_30x/1000_Genomes_NYGC_30x_chrX_GRCh38.vcf.bgz"
mt = hl.import_vcf(
    vcf_path,
    reference_genome="GRCh38",
    array_elements_required=False,
)

# A PL array holding any missing element is replaced by a missing PL as a whole.
pl_has_missing = mt.PL.contains(hl.missing(hl.tint32))
mt = mt.annotate_entries(
    PL=hl.if_else(pl_has_missing, hl.missing(mt.PL.dtype), mt.PL)
)

# Persist the imported data so the following cells can start from this checkpoint.
mt = mt.checkpoint(
    "gs://hail-datasets-tmp/1000_Genomes_NYGC_30x/checkpoints/chrX_unphased_GRCh38_imported_vcf.mt",
    overwrite=False,
    _read_if_exists=True,
)

Separate biallelic and multiallelic variants, split multiallelic variants with `split_multi_hts`, and then `union_rows` the split multiallelic MT back to the biallelic MT. 

For multiallelic variants we will just set `PL` to be missing, to avoid running into index out of bounds errors in `split_multi_hts`.

In [None]:
# All checkpoints of this stage share one path prefix and differ only in suffix.
ckpt_prefix = "gs://hail-datasets-tmp/1000_Genomes_NYGC_30x/checkpoints/chrX_unphased_GRCh38"

mt = hl.read_matrix_table(f"{ckpt_prefix}_imported_vcf.mt")

n_alleles = hl.len(mt.alleles)
pl_type = mt.PL.dtype

# Biallelic rows: add a_index / was_split by hand so the row schema agrees with
# the output of split_multi_hts when the two tables are unioned below.
bi = mt.filter_rows(n_alleles == 2).annotate_rows(a_index=1, was_split=False)
bi = bi.checkpoint(
    f"{ckpt_prefix}_biallelic.mt",
    overwrite=False,
    _read_if_exists=True,
)

# Multiallelic rows: PL is set to missing before splitting.
multi = mt.filter_rows(n_alleles > 2).annotate_entries(PL=hl.missing(pl_type))
multi = multi.checkpoint(
    f"{ckpt_prefix}_multiallelic.mt",
    overwrite=False,
    _read_if_exists=True,
)

split = hl.split_multi_hts(multi, keep_star=True, permit_shuffle=True)
split = split.checkpoint(
    f"{ckpt_prefix}_multiallelic_split.mt",
    overwrite=False,
    _read_if_exists=True,
)

# Recombine the split multiallelic rows with the biallelic rows.
unioned = split.union_rows(bi)
unioned = unioned.checkpoint(
    f"{ckpt_prefix}_unioned.mt",
    overwrite=False,
    _read_if_exists=True,
)

# Shuffle into 512 partitions and checkpoint the repartitioned table.
unioned = unioned.repartition(512, shuffle=True)
unioned = unioned.checkpoint(
    f"{ckpt_prefix}_unioned_repart.mt",
    overwrite=False,
    _read_if_exists=True,
)

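After splitting multiallelic variants, each split row still carries the `INFO` arrays of its original site, so we use `a_index` (1-based, hence `a_index - 1` as the array index) to extract the value belonging to that row's alternate allele.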

Then annotate globals with metadata, annotate columns with sample relationships, perform `sample_qc` and `variant_qc`, and write final MT to `hail-datasets-us`.

In [None]:
unioned = hl.read_matrix_table(
    "gs://hail-datasets-tmp/1000_Genomes_NYGC_30x/checkpoints/chrX_unphased_GRCh38_unioned_repart.mt"
)

# Names of the INFO fields typed as arrays. The field dtype is inspected
# directly instead of searching the expression's repr for the word "array".
known_keys = [name for name, field in unioned.info.items()
              if isinstance(field.dtype, hl.tarray)]

# Replace every array INFO field with its element at position a_index - 1
# (a_index is 1-based; a missing array stays missing).
# NOTE(review): this indexes *all* array INFO fields by alternate allele --
# confirm none of them is laid out differently (e.g. includes the reference allele).
mt = unioned.annotate_rows(
    info=unioned.info.annotate(
        **{k: hl.or_missing(hl.is_defined(unioned.info[k]),
                            unioned.info[k][unioned.a_index - 1])
           for k in known_keys}
    )
)

# Dimensions recorded in the dataset metadata below.
n_rows, n_cols = mt.count()
n_partitions = mt.n_partitions()

mt = mt.annotate_globals(
    metadata=hl.struct(
        name="1000_Genomes_HighCov_chrX",
        reference_genome="GRCh38",
        n_rows=n_rows,
        n_cols=n_cols,
        n_partitions=n_partitions
    )
)

# Attach the sample annotations (looked up by sample ID) and add QC fields.
ht_samples = hl.read_table("gs://hail-datasets-us/1000_Genomes/NYGC_30x/samples.ht")
mt = mt.annotate_cols(**ht_samples[mt.s])
mt = hl.sample_qc(mt)
mt = hl.variant_qc(mt)

# Write the final dataset, then read it back to show the stored schema.
mt.write("gs://hail-datasets-us/1000_Genomes/NYGC_30x/GRCh38/chrX_unphased.mt", overwrite=False)
mt = hl.read_matrix_table("gs://hail-datasets-us/1000_Genomes/NYGC_30x/GRCh38/chrX_unphased.mt")
mt.describe()

#### ChrY (unphased):

Import chrY VCF to `MatrixTable` and checkpoint:

In [None]:
# Load the unphased chrY VCF, allowing missing elements inside array fields.
vcf_path = "gs://hail-datasets-tmp/1000_Genomes_NYGC_30x/1000_Genomes_NYGC_30x_chrY_GRCh38.vcf.bgz"
mt = hl.import_vcf(
    vcf_path,
    reference_genome="GRCh38",
    array_elements_required=False,
)

# A PL array holding any missing element is replaced by a missing PL as a whole.
pl_has_missing = mt.PL.contains(hl.missing(hl.tint32))
mt = mt.annotate_entries(
    PL=hl.if_else(pl_has_missing, hl.missing(mt.PL.dtype), mt.PL)
)

# Persist the imported data so the following cells can start from this checkpoint.
mt = mt.checkpoint(
    "gs://hail-datasets-tmp/1000_Genomes_NYGC_30x/checkpoints/chrY_unphased_GRCh38_imported_vcf.mt",
    overwrite=False,
    _read_if_exists=True,
)

Separate biallelic and multiallelic variants, split multiallelic variants with `split_multi_hts`, and then `union_rows` the split multiallelic MT back to the biallelic MT. 

For multiallelic variants we will just set `PL` to be missing, to avoid running into index out of bounds errors in `split_multi_hts`.

In [None]:
# All checkpoints of this stage share one path prefix and differ only in suffix.
ckpt_prefix = "gs://hail-datasets-tmp/1000_Genomes_NYGC_30x/checkpoints/chrY_unphased_GRCh38"

mt = hl.read_matrix_table(f"{ckpt_prefix}_imported_vcf.mt")

n_alleles = hl.len(mt.alleles)
pl_type = mt.PL.dtype

# Biallelic rows: add a_index / was_split by hand so the row schema agrees with
# the output of split_multi_hts when the two tables are unioned below.
bi = mt.filter_rows(n_alleles == 2).annotate_rows(a_index=1, was_split=False)
bi = bi.checkpoint(
    f"{ckpt_prefix}_biallelic.mt",
    overwrite=False,
    _read_if_exists=True,
)

# Multiallelic rows: PL is set to missing before splitting.
multi = mt.filter_rows(n_alleles > 2).annotate_entries(PL=hl.missing(pl_type))
multi = multi.checkpoint(
    f"{ckpt_prefix}_multiallelic.mt",
    overwrite=False,
    _read_if_exists=True,
)

split = hl.split_multi_hts(multi, keep_star=True, permit_shuffle=True)
split = split.checkpoint(
    f"{ckpt_prefix}_multiallelic_split.mt",
    overwrite=False,
    _read_if_exists=True,
)

# Recombine the split multiallelic rows with the biallelic rows.
unioned = split.union_rows(bi)
unioned = unioned.checkpoint(
    f"{ckpt_prefix}_unioned.mt",
    overwrite=False,
    _read_if_exists=True,
)

# Shuffle into 8 partitions and checkpoint the repartitioned table.
unioned = unioned.repartition(8, shuffle=True)
unioned = unioned.checkpoint(
    f"{ckpt_prefix}_unioned_repart.mt",
    overwrite=False,
    _read_if_exists=True,
)

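After splitting multiallelic variants, each split row still carries the `INFO` arrays of its original site, so we use `a_index` (1-based, hence `a_index - 1` as the array index) to extract the value belonging to that row's alternate allele.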

Then annotate globals with metadata, annotate columns with sample relationships, perform `sample_qc` and `variant_qc`, and write final MT to `hail-datasets-us`.

In [None]:
unioned = hl.read_matrix_table(
    "gs://hail-datasets-tmp/1000_Genomes_NYGC_30x/checkpoints/chrY_unphased_GRCh38_unioned_repart.mt"
)

# Names of the INFO fields typed as arrays. The field dtype is inspected
# directly instead of searching the expression's repr for the word "array".
known_keys = [name for name, field in unioned.info.items()
              if isinstance(field.dtype, hl.tarray)]

# Replace every array INFO field with its element at position a_index - 1
# (a_index is 1-based; a missing array stays missing).
# NOTE(review): this indexes *all* array INFO fields by alternate allele --
# confirm none of them is laid out differently (e.g. includes the reference allele).
mt = unioned.annotate_rows(
    info=unioned.info.annotate(
        **{k: hl.or_missing(hl.is_defined(unioned.info[k]),
                            unioned.info[k][unioned.a_index - 1])
           for k in known_keys}
    )
)

# Dimensions recorded in the dataset metadata below.
n_rows, n_cols = mt.count()
n_partitions = mt.n_partitions()

mt = mt.annotate_globals(
    metadata=hl.struct(
        name="1000_Genomes_HighCov_chrY",
        reference_genome="GRCh38",
        n_rows=n_rows,
        n_cols=n_cols,
        n_partitions=n_partitions
    )
)

# Attach the sample annotations (looked up by sample ID) and add QC fields.
ht_samples = hl.read_table("gs://hail-datasets-us/1000_Genomes/NYGC_30x/samples.ht")
mt = mt.annotate_cols(**ht_samples[mt.s])
mt = hl.sample_qc(mt)
mt = hl.variant_qc(mt)

# Write the final dataset, then read it back to show the stored schema.
mt.write("gs://hail-datasets-us/1000_Genomes/NYGC_30x/GRCh38/chrY_unphased.mt", overwrite=False)
mt = hl.read_matrix_table("gs://hail-datasets-us/1000_Genomes/NYGC_30x/GRCh38/chrY_unphased.mt")
mt.describe()

### Create/update schemas

In [None]:
import json
import os
import textwrap

# Generate one reStructuredText schema page per "1000_Genomes_HighCov" dataset
# listed in datasets.json, using the schema printed by Hail's describe().
output_dir = os.path.abspath("../../hail/python/hail/docs/datasets/schemas")
datasets_path = os.path.abspath("../../hail/python/hail/experimental/datasets.json")
with open(datasets_path, "r") as f:
    datasets = json.load(f)

# Page template; it does not depend on the dataset, so it is defined once.
template = """.. _{dataset}:

{dataset}
{underline1}

*  **Versions:** {versions}
*  **Reference genome builds:** {ref_genomes}
*  **Type:** :class:`{class}`

Schema ({version0}, {ref_genome0})
{underline2}

.. code-block:: text

{schema}

"""

for name in [name for name in datasets if "1000_Genomes_HighCov" in name]:
    entries = datasets[name]["versions"]
    versions = sorted(set(dataset["version"] for dataset in entries))
    if not versions:
        versions = [None]
    reference_genomes = sorted(set(dataset["reference_genome"] for dataset in entries))
    if not reference_genomes:
        reference_genomes = [None]

    print(name)
    # Create schemas for unphased versions, since phased entries only have GT.
    # Datasets without an unphased version fall back to their last listed
    # version, so single-version datasets no longer raise an IndexError.
    unphased = [version for version in versions
                if version is not None and "unphased" in version]
    v = unphased[0] if unphased else versions[-1]
    print(v)
    # The reference genome may be None; format it instead of concatenating.
    print(f"{reference_genomes[0]}\n")

    matches = [dataset["url"]["gcp"]["us"]
               for dataset in entries
               if dataset["version"] == v
               and dataset["reference_genome"] == reference_genomes[0]]
    assert len(matches) == 1, f"expected exactly one URL for {name}, found {len(matches)}"
    path = matches[0]
    if path.endswith(".ht"):
        table = hl.read_table(path)
        table_class = "hail.Table"
    else:
        table = hl.read_matrix_table(path)
        table_class = "hail.MatrixTable"

    # Capture the describe() text and drop trailing whitespace from each line.
    description = table.describe(handler=lambda x: str(x)).split("\n")
    description = "\n".join([line.rstrip() for line in description])

    schema_heading = f"Schema ({v}, {reference_genomes[0]})"
    context = {
        "dataset": name,
        "underline1": len(name) * "=",
        "version0": v,
        "ref_genome0": reference_genomes[0],
        "versions": ", ".join([str(version) for version in versions]),
        "ref_genomes": ", ".join([str(reference_genome) for reference_genome in reference_genomes]),
        # RST underlines must be as long as the heading they belong to.
        "underline2": len(schema_heading) * "~",
        "schema": textwrap.indent(description, "    "),
        "class": table_class
    }
    with open(os.path.join(output_dir, f"{name}.rst"), "w") as f:
        f.write(template.format(**context).strip())