### CADD

Use this notebook to create the CADD Hail Tables once the raw data from https://cadd.gs.washington.edu/ have been downloaded and extracted with Hail Batch (see `datasets/extract/extract_CADD.py`).

In [None]:
# Hail is the only dependency of this notebook; every later cell uses the `hl` alias.
import hail as hl
# Start Hail with its default settings (no arguments are passed).
hl.init()

In [None]:
# Source bucket holding the extracted TSVs and destination bucket for the
# finished Hail Tables.
input_root = "gs://hail-datasets-tmp"
output_root = "gs://hail-datasets-us"

# Dataset identity; these values are used to build both the input and output
# paths and are recorded in each table's global metadata.
name = "CADD"
version = "v1.6"
builds = ["GRCh37", "GRCh38"]

# Build one keyed Hail Table per reference genome.
for build in builds:
    tsv_path = f"{input_root}/{name}/{name}_{version}_{build}.tsv.bgz"
    ht_path = f"{output_root}/{name}_{version}_{build}.ht"

    # Parse the position and both score columns as numbers while importing.
    ht = hl.import_table(
        tsv_path,
        min_partitions=2048,
        types={
            "position": hl.tint,
            "raw_score": hl.tfloat,
            "PHRED_score": hl.tfloat,
        },
    )

    # The chromosome column is used as-is for GRCh37; for any other build it
    # is prefixed with "chr" before the locus is constructed.
    contig = ht.chromosome if build == "GRCh37" else "chr" + ht.chromosome
    ht = ht.annotate(
        locus=hl.locus(contig, ht.position, build),
        alleles=[ht.ref, ht.alt],
    )

    # Keep only the key fields and the two scores, then key by variant.
    ht = (
        ht.select("locus", "alleles", "raw_score", "PHRED_score")
        .key_by("locus", "alleles")
    )

    # Row and partition counts are taken before writing so they can be stored
    # in the table's globals.
    n_rows = ht.count()
    n_partitions = ht.n_partitions()

    metadata = hl.struct(
        name=name,
        version=version,
        reference_genome=build,
        n_rows=n_rows,
        n_partitions=n_partitions,
    )
    ht = ht.annotate_globals(metadata=metadata)

    # Write the table, then read it back and print its schema.
    ht.write(ht_path)
    ht = hl.read_table(ht_path)
    ht.describe()

In [None]:
# Load the GRCh37 table from its hard-coded GCS path and inspect it:
# schema first, then the evaluated global metadata, then a preview of the rows.
ht37 = hl.read_table("gs://hail-datasets-us/CADD_v1.6_GRCh37.ht")
ht37.describe()
metadata37 = hl.eval(ht37.metadata)
print(f"GRCh37: {metadata37!s}")
ht37.show()

In [None]:
# Load the GRCh38 table from its hard-coded GCS path and inspect it:
# schema first, then the evaluated global metadata, then a preview of the rows.
ht38 = hl.read_table("gs://hail-datasets-us/CADD_v1.6_GRCh38.ht")
ht38.describe()
metadata38 = hl.eval(ht38.metadata)
print(f"GRCh38: {metadata38!s}")
ht38.show()