# GENCODE
https://www.gencodegenes.org


In [None]:
import hail as hl
# Spark/Hadoop settings for the GCS connector's requester-pays mode: the
# listed buckets are accessed in "CUSTOM" mode with the given project id.
requester_pays_conf = {
    "spark.hadoop.fs.gs.requester.pays.mode": "CUSTOM",
    "spark.hadoop.fs.gs.requester.pays.project.id": "broad-ctsa",
    "spark.hadoop.fs.gs.requester.pays.buckets": "hail-datasets-tmp,hail-datasets-us,hail-datasets-eu",
}
hl.init(spark_conf=requester_pays_conf)

## Create GENCODE v35 annotation Hail Table:
**GENCODE Release 35 (GRCh38.p13):**
https://www.gencodegenes.org/human/release_35.html

**Comprehensive gene annotation GTF file (on reference chromosomes only):**
https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_35/gencode.v35.annotation.gtf.gz

**To download comprehensive gene annotation GTF file used to create Hail Table:**
Run `extract_gencode_v35_annotation_gtf.sh` to download the GTF file to the `hail-datasets-tmp` bucket.

In [None]:
# Set to True to rebuild the checkpoint; when False an existing checkpoint is read back instead.
overwrite = False

# Import the GENCODE v35 GTF against GRCh38, skipping contigs the reference does not know.
gtf_path = "gs://hail-datasets-tmp/GENCODE/gencode.v35.annotation.gtf.bgz"
ht = hl.experimental.import_gtf(
    gtf_path,
    reference_genome="GRCh38",
    skip_invalid_contigs=True,
    min_partitions=4,
)

# Remember the imported field order so it can be restored after the annotations below.
original_fields = list(ht.row_value)

# Stray semicolons can survive the GTF import in string fields, e.g.
# set(ht.transcript_support_level.collect()) == {'3', '2', '3;', '2;', '1'}
# when it should be {'1', '2', '3'}. Strip them from every field except the
# two that are excluded here as non-string ("score" and "frame").
non_string_fields = {"score", "frame"}
semicolon_free = {}
for name in original_fields:
    if name not in non_string_fields:
        semicolon_free[name] = ht[name].replace(";", "")
ht = ht.annotate(**semicolon_free)

# With the semicolons gone, these two fields can be converted to int32.
as_int32 = {name: hl.int32(ht[name]) for name in ("level", "exon_number")}
ht = ht.annotate(**as_int32)

# Put the fields back in their imported order, then checkpoint and inspect the result.
ht = ht.select(*original_fields)
checkpoint_path = "gs://hail-datasets-tmp/GENCODE/v35/GRCh38/annotation.ht"
ht = ht.checkpoint(checkpoint_path, overwrite=overwrite, _read_if_exists=not overwrite)
ht.describe()
ht.show()

In [None]:
# Load the checkpointed table and write a copy to each hail-datasets GCS bucket (US, then EU).
ht = hl.read_table("gs://hail-datasets-tmp/GENCODE/v35/GRCh38/annotation.ht")
gcs_destinations = (
    "gs://hail-datasets-us/GENCODE/v35/GRCh38/annotation.ht",
    "gs://hail-datasets-eu/GENCODE/v35/GRCh38/annotation.ht",
)
for destination in gcs_destinations:
    ht.write(destination)

In [None]:
# Load the checkpointed table and write a copy to the hail-datasets S3 bucket.
# Reading/writing S3 requires authentication properties to be set in spark_conf.
checkpoint_path = "gs://hail-datasets-tmp/GENCODE/v35/GRCh38/annotation.ht"
s3_destination = "s3a://hail-datasets-us-east-1/GENCODE/v35/GRCh38/annotation.ht"
ht = hl.read_table(checkpoint_path)
ht.write(s3_destination)
