In [1]:
# Notebook setup: display helpers, full-width layout and the Hail session.
import IPython.core.display as di  # Example: di.display_html('<h3>%s:</h3>' % str, raw=True)
from IPython.display import HTML, display
import hail as hl

# This line will hide code by default when the notebook is exported as HTML
# di.display_html('<script>jQuery(function() {if (jQuery("body.notebook_app").length == 0) { jQuery(".input_area").toggle(); jQuery(".prompt").toggle();}});</script>', raw=True)

# Stretch the notebook container to the full width of the browser window.
display(HTML("<style>.container { width:100% !important; }</style>"))

# Start Hail with its temporary files placed under the scratch directory.
hl.init(tmp_dir='/net/scratch/people/plggosborcz')

Running on Apache Spark version 2.4.3
SparkUI available at http://p1427.prometheus:4041
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.30-2ae07d872f43
LOGGING: writing to /net/archive/groups/plggneuromol/GTS-analysis/analysis/hail-20201116-1330-0.2.30-2ae07d872f43.log


In [2]:
# Re-applies the full-width container style (same call as in the first cell).
display(HTML("<style>.container { width:100% !important; }</style>"))

In [3]:
# Standard library
import statistics as stat
from functools import reduce
from itertools import chain
from pprint import pprint

# Third-party
import numpy as np
import pandas as pd
from bokeh.layouts import gridplot
from hail.plot import show

# Route plots created through hail.plot to the notebook output.
hl.plot.output_notebook()

In [4]:
# Note: this import rebinds `show`, which an earlier cell imported from hail.plot.
from bokeh.plotting import figure, show, output_notebook
# Send Bokeh output to the notebook.
output_notebook()

In [28]:
import bokeh.models.mappers

## CADD annotation

In [None]:
# Load the cohort MatrixTable ('GTS-with-pheno.mt'); this is the table that gets CADD-annotated below.
mt = hl.read_matrix_table('/net/archive/groups/plggneuromol/GTS-analysis/data/GTS-with-pheno.mt')

In [None]:
# Import the CADD whole-genome SNV file as a headerless, tab-separated table
# (columns are auto-named f0, f1, ...; lines starting with '#' are skipped).
# Later cells combine f0..f3 into a variant string and use f5 as the score.
cd = hl.import_table(
    '/net/archive/groups/plggneuromol/GTS-analysis/cadd/whole_genome_SNVs.tsv.gz',
    no_header=True,
    comment='#',
    delimiter='\t',
    force_bgz=True,
    types={
        'f0': 'tstr',
        'f1': 'tint',
        'f2': 'tstr',
        'f3': 'tstr',
        'f4': 'tfloat',
        'f5': 'tfloat',
    },
)

In [None]:
# Reduce the CADD table to two fields:
#   variants   - parsed GRCh38 variant built from 'chr<f0>:<f1>:<f2>:<f3>'
#   cadd_score - the value of column f5
cd = cd.select(
    variants=hl.parse_variant(
        'chr' + hl.str(cd.f0) + ':' + hl.str(cd.f1) + ':' + hl.str(cd.f2) + ':' + hl.str(cd.f3),
        reference_genome='GRCh38',
    ),
    cadd_score=cd.f5,
)

In [None]:
# Reload the CADD table from disk, replacing the `cd` built in the cells above.
# NOTE(review): no visible cell writes 'cadd-all.mt' - presumably `cd` was written
# to this path in a step that is not shown here; confirm.
# The path ends in .mt but it is opened as a Table.
cd = hl.read_table('/net/archive/groups/plggneuromol/GTS-analysis/data/cadd-all.mt')

In [None]:
# Autosomes chr1..chr22; every other contig is handled as a single "other" group.
contigs = ['chr' + str(number) for number in range(1, 23)]

In [None]:
# Write one CADD table per autosome, each keyed by (locus, alleles) taken from
# the parsed `variants` struct.
for c in contigs:
    on_this_contig = cd.variants.locus.contig == c
    cd_chr = cd.filter(on_this_contig)
    cd_chr = cd_chr.key_by(cd_chr.variants.locus, cd_chr.variants.alleles)
    cd_chr.write('/net/archive/groups/plggneuromol/GTS-analysis/data/cadd-per-chr/' + c + '_cadd.mt')

# Everything that is not on one of the listed contigs goes into a single table.
on_listed_contig = hl.array(contigs).contains(cd.variants.locus.contig)
cd_chr = cd.filter(on_listed_contig, keep=False)
cd_chr = cd_chr.key_by(cd_chr.variants.locus, cd_chr.variants.alleles)
cd_chr.write('/net/archive/groups/plggneuromol/GTS-analysis/data/cadd-per-chr/other_chroms_cadd.mt')

In [None]:
# Do the same for the MatrixTable: annotate it with CADD one contig at a time.
for c in contigs:
    # Per-contig CADD table (note: this rebinds `cd`).
    cd = hl.read_table('/net/archive/groups/plggneuromol/GTS-analysis/data/cadd-per-chr/' + c + '_cadd.mt')

    # Rows of this contig, with the matching CADD row attached as struct `cadd`.
    mt_chr = mt.filter_rows(mt.locus.contig == c)
    mt_chr = mt_chr.annotate_rows(cadd=cd[mt_chr.row_key])

    mt_chr.write('/net/archive/groups/plggneuromol/GTS-analysis/data/mt-annotated-per-chr/' + c + '-anno.mt')

In [None]:
# Same CADD annotation for all variants whose contig is not in `contigs`.
cd = hl.read_table('/net/archive/groups/plggneuromol/GTS-analysis/data/cadd-per-chr/other_chroms_cadd.mt')

locus_on_listed_contig = hl.array(contigs).contains(mt.locus.contig)
mt_chr = mt.filter_rows(locus_on_listed_contig, keep=False)
mt_chr = mt_chr.annotate_rows(cadd=cd[mt_chr.row_key])

mt_chr.write('/net/archive/groups/plggneuromol/GTS-analysis/data/mt-annotated-per-chr/other_chroms-anno.mt')

In [None]:
#now read all the matrices and join
# The directory is listed with the standard library instead of the `!ls` shell
# escape, so this cell is plain Python and does not depend on IPython or on the
# shell's output format. Hidden entries are skipped, as `ls` would skip them.
import os

mts = sorted(
    entry
    for entry in os.listdir('/net/archive/groups/plggneuromol/GTS-analysis/data/mt-annotated-per-chr')
    if not entry.startswith('.')
)

In [None]:
# Open every per-contig annotated MatrixTable named in `mts`.
mt_chr = [
    hl.read_matrix_table('/net/archive/groups/plggneuromol/GTS-analysis/data/mt-annotated-per-chr/' + m)
    for m in mts
]

In [None]:
# Stack the per-contig MatrixTables back into a single one (row-wise union).
mt = hl.MatrixTable.union_rows(*mt_chr)

In [None]:
# Replace the joined `cadd` struct with just its `cadd_score` value.
mt = mt.transmute_rows(cadd = mt.cadd.cadd_score)

In [None]:
# Checkpoint: persist the CADD-annotated MatrixTable.
mt.write('/net/archive/groups/plggneuromol/GTS-analysis/data/GTS-pheno-cadd.mt')

In [None]:
# Continue from the on-disk checkpoint written by the previous cell.
mt = hl.read_matrix_table('/net/archive/groups/plggneuromol/GTS-analysis/data/GTS-pheno-cadd.mt')

## Join with gnomAD

In [None]:
# Load the gnomAD data. Although the path ends in .ht, it is read as a MatrixTable
# (the following cells use column-level operations on it).
gd = hl.read_matrix_table('/net/archive/groups/plggneuromol/GTS-analysis/gnomad/gnomad-ready-to-join.ht')

In [None]:
# Adjust the gnomAD column schema: build a string sample id `s` of the form
# '<col_idx>gnomad' and a `phenotypes` struct whose fields all hold the
# placeholder string 'gnomad'.
gd = gd.annotate_cols(
    s=hl.str(hl.str(gd.col_idx) + 'gnomad'),
    phenotypes=hl.struct(**{
        pheno_field: hl.str('gnomad')
        for pheno_field in [
            'family',
            'sex',
            'kinship',
            'disease',
            'phenotype',
            'add_pheno',
            'heavy_tics',
        ]
    }),
)
                      

In [None]:
# Key the gnomAD columns by the new sample id `s`.
gd = gd.key_cols_by(gd.s)

In [None]:
# Remove the `col_idx` column field (its value is already embedded in `s`).
gd = gd.drop(gd.col_idx)

In [None]:
# Append the gnomAD columns to the cohort columns, using an outer join on rows.
mt = mt.union_cols(gd, row_join_type='outer')

In [None]:
# Filter out rows that contain only reference calls:
# a row is kept when at least one entry has a non-reference genotype.

mt = mt.filter_rows(hl.agg.any(mt.GT.is_non_ref()))

In [None]:
# Checkpoint: persist the combined cohort + gnomAD MatrixTable.
mt.write('/net/archive/groups/plggneuromol/GTS-analysis/data/GTS-gnomad.mt')

### Perform PCA on subsetted genotypes:

In [8]:
# Load the combined cohort + gnomAD MatrixTable from disk.
mt = hl.read_matrix_table('/net/archive/groups/plggneuromol/GTS-analysis/data/GTS-gnomad.mt')

In [9]:
# Randomly keep about 0.1% of the rows as input for the PCA. The seed is fixed so
# that the sampled variants - and therefore the PCA computed from them - are the
# same after a kernel restart.
subset = mt.sample_rows(0.001, seed=42)

In [10]:
# (rows, columns) of the PCA subset.
subset.count()

(11731, 370)

In [11]:
# HWE-normalized PCA on the subset genotypes. `pcs` is used below for its
# per-sample `scores`; the third return value is discarded.
eigenvalues, pcs, _ = hl.hwe_normalized_pca(subset.GT)

2020-11-16 13:34:16 Hail: INFO: hwe_normalized_pca: running PCA using 11651 variants.
2020-11-16 13:34:29 Hail: INFO: pca: running PCA with 10 components...


In [12]:
# Attach each sample's PC scores to the full MatrixTable, looked up by sample id.
mt = mt.annotate_cols(scores = pcs[mt.s].scores)

In [56]:
# Scatter plot of the first two principal components, labelled by the
# `family` phenotype field.
p = hl.plot.scatter(
    mt.scores[0],
    mt.scores[1],
    label=mt.phenotypes.family,
    title='PCA',
    xlabel='PC1',
    ylabel='PC2',
)

In [57]:
# Render the PCA scatter plot.
show(p)

## annotate mt with nearest_genes_20kb



In [None]:
# Annotate every variant with the symbols of all genes whose transcript span,
# widened by 20 kb on each side, covers the variant's locus.
genes = hl.read_table('/net/archive/groups/plggneuromol/GTS-analysis/vcf_preprocessing/genecode_v32.ht')
# Only contigs known to GRCh38 can be turned into locus intervals.
genes = genes.filter(hl.is_valid_contig(genes['hg38.knownGene.chrom'], reference_genome='GRCh38'))

contig_len = hl.contig_length(genes['hg38.knownGene.chrom'], reference_genome='GRCh38')

# Window start, clamped to the first base of the contig. `<=` (rather than `<`)
# matters when txStart is exactly 20000: txStart - 20000 would be 0, which is
# not a valid 1-based locus position.
start = hl.cond(genes['hg38.knownGene.txStart'] <= 20000, 1, genes['hg38.knownGene.txStart'] - 20000)
# Window end, clamped to the contig length.
stop = hl.cond(contig_len - genes['hg38.knownGene.txEnd'] < 20000,
               contig_len,
               genes['hg38.knownGene.txEnd'] + 20000)

# Replace the fields used above with a single `interval` field and key by it,
# so loci can be looked up against the windows.
genes = genes.transmute(interval =
                        hl.locus_interval(genes['hg38.knownGene.chrom'],
                                          start,
                                          stop,
                                          reference_genome='GRCh38'))

genes = genes.key_by(genes.interval)

# all_matches=True returns every overlapping gene; going through a set removes
# duplicate symbols before storing them as an array.
mt = mt.annotate_rows(nearest_genes_20kb = hl.array(hl.set(genes.index(mt.locus, all_matches=True)['hg38.kgXref.geneSymbol'])))

## Make the simulated controls gender equal

PAR coordinates in GRCh38: http://m.ensembl.org/info/genome/genebuild/human_PARS.html

In [None]:
# chrX pseudoautosomal regions on GRCh38, each as a [start, end] pair.
par1, par2 = [10001, 2781479], [155701383, 156030895]

In [None]:
# GRCh38 reference genome object (used in the next cell to look up a contig length).
rg = hl.get_reference('GRCh38')

In [None]:
# Length of chrX in GRCh38.
rg.lengths['chrX']

In [None]:
# chrX intervals treated as non-PAR, as [start, end] pairs: the bases before
# PAR1, and the stretch between the end of PAR1 and the start of PAR2.
nonpars = [[1,10000],[2781480,155701382]]

### First, look at the sex distribution in our samples

In [None]:
# Collect the sample ids of `mt` into a local Python list.
samples = mt.s.collect()

In [None]:
# Sample ids at list positions 185..254 are treated as the gnomAD females.
# NOTE(review): hard-coded offsets - this assumes the first 185 collected ids are
# cohort samples and that the first 70 gnomAD ids are female; confirm.
gnomad_females = samples[185:(185+70)] 

In [None]:
# All remaining sample ids (list position 255 onward).
# NOTE(review): the name is `gnomads_males` (extra "s"), unlike `gnomad_females`,
# and it is not referenced in the visible cells.
gnomads_males = samples[(185+70):]

In [None]:
# Overwrite phenotypes.sex for gnomAD samples only: 'F' when the sample id is
# listed in `gnomad_females`, otherwise 'M'. All other samples keep their
# existing value.
sex_for_gnomad = hl.cond(hl.array(gnomad_females).contains(mt.s), 'F', 'M')
sex_updated = hl.cond(mt.s.contains('gnomad'), sex_for_gnomad, mt.phenotypes['sex'])
mt = mt.transmute_cols(phenotypes=mt.phenotypes.annotate(sex=sex_updated))

In [None]:
# Count how many samples carry each value of phenotypes.sex.
mt.aggregate_cols(hl.agg.counter(mt.phenotypes['sex']))

In [None]:
# On chrX, samples whose sex is not 'F' get their genotype in the non-PAR
# intervals rewritten as a homozygous call of the first allele; every other
# genotype is left unchanged.
#
# Fix: the previous nested conditions compared the position against the wrong
# interval bounds, so positions 2-9999 stayed diploid while the whole PAR1
# region was collapsed to homozygous. Non-PAR membership is now tested directly
# against the inclusive [start, end] pairs listed in `nonpars`.
in_nonpar = reduce(
    lambda left, right: left | right,
    [(mt.locus.position >= lo) & (mt.locus.position <= hi) for lo, hi in nonpars],
)

mt = mt.annotate_entries(GT = hl.cond(
                         mt.locus.contig == 'chrX',
                         hl.cond(
                            mt.phenotypes['sex'] == 'F',
                            mt.GT,  # females keep their diploid calls
                            hl.cond(
                                in_nonpar,
                                hl.call(mt.GT[0], mt.GT[0]),  # duplicate the first allele
                                mt.GT)),  # PAR, and anything outside `nonpars`
                         mt.GT))

In [None]:
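# Checkpoint write, left disabled. The next code cell reads this path, so the table must already exist on disk; uncomment to regenerate it.
#mt.write('/net/archive/groups/plggneuromol/GTS-analysis/data/GTS-gnomad-sex.mt')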

prepare matrix tables

In [None]:
# Load the 'GTS-gnomad-sex.mt' checkpoint from disk.
mt = hl.read_matrix_table('/net/archive/groups/plggneuromol/GTS-analysis/data/GTS-gnomad-sex.mt')

In [None]:
# Keep only variants with a CADD score above zero.
mt = mt.filter_rows(mt.cadd > 0) # this is so I don't do too many tests

In [None]:
# Randomly keep about 0.2% of the rows. The seed is fixed so that the same
# variants are selected every time this cell is re-run.
mt_subset = mt.sample_rows(0.002, seed=1)

In [None]:
# (rows, columns) of the CADD-filtered table.
mt.count()

In [None]:
# (rows, columns) of the random row subset.
mt_subset.count()

In [None]:
def _add_category(table):
    """Add a boolean column field `category`: False for gnomAD samples, otherwise disease == 'YES'."""
    return table.annotate_cols(
        category=hl.cond(table.s.contains('gnomad'), False, (table.phenotypes.disease == 'YES')))


def _keep_gnomad_and_dot_family_cases(table):
    """Keep gnomAD samples plus samples with family '.' and disease 'YES'."""
    return table.filter_cols(
        (table.s.contains('gnomad'))
        | ((table.phenotypes.family == '.') & (table.phenotypes.disease == 'YES')))


# Apply the same labelling and sample selection to the full table and the subset.
mt = _keep_gnomad_and_dot_family_cases(_add_category(mt))
mt_subset = _keep_gnomad_and_dot_family_cases(_add_category(mt_subset))
samples = mt.s.collect()

# first 70 gnomad samples are females
# NOTE(review): the slice bounds below are hard-coded list positions into
# `samples`; confirm they still select the intended samples.
to_keep = samples[:44] + samples[110:144]

# filter out excessive gnomads
mt = mt.filter_cols(hl.array(to_keep).contains(mt.s))
mt_subset = mt_subset.filter_cols(hl.array(to_keep).contains(mt_subset.s))

In [None]:
# Build `mt_test` from the same checkpoint: CADD > 0 variants, the boolean
# `category` label, and only samples whose family is not '.' and whose
# disease is 'YES' or 'NO'.
mt_test = hl.read_matrix_table(
    '/net/archive/groups/plggneuromol/GTS-analysis/data/GTS-gnomad-sex.mt'
)
mt_test = mt_test.filter_rows(mt_test.cadd > 0)
mt_test = mt_test.annotate_cols(
    category=hl.cond(
        mt_test.s.contains('gnomad'),
        False,
        (mt_test.phenotypes.disease == 'YES'),
    )
)
# Drop samples with family '.'.
mt_test = mt_test.filter_cols(mt_test.phenotypes.family == '.', keep=False)
# Keep samples with a definite disease label.
mt_test = mt_test.filter_cols(
    (mt_test.phenotypes.disease == 'YES') | (mt_test.phenotypes.disease == 'NO')
)

In [None]:
# generate new alternative data subsets:
# NOTE(review): 'mt-for-skat.mt' is written by a cell further down in this
# notebook, so that cell has to have been run before this one.
mt = hl.read_matrix_table('/net/archive/groups/plggneuromol/GTS-analysis/data/mt-for-skat.mt')


# Random ~0.2% row sample with a fixed seed, so that re-running the cell
# reproduces the same alternative subset.
mt_subset_2 = mt.sample_rows(0.002, seed=2)
mt_subset_2.write('/net/archive/groups/plggneuromol/GTS-analysis/data/mt-subset-2.mt')

In [7]:
# Another random ~0.2% row sample. The fixed seed makes the subset reproducible
# when the cell is re-run.
mt_subset_3 = mt.sample_rows(0.002, seed=3)
mt_subset_3.write('/net/archive/groups/plggneuromol/GTS-analysis/data/mt-subset-3.mt')

2020-10-29 11:20:54 Hail: INFO: wrote matrix table with 12721 rows and 78 columns in 6622 partitions to /net/archive/groups/plggneuromol/GTS-analysis/data/mt-subset-3.mt


In [None]:
# Persist the three analysis tables.
# NOTE(review): a cell above reads 'mt-for-skat.mt' into `mt`, so in top-to-bottom
# order this would write `mt` back to the path it was read from - the execution
# order of these cells looks non-linear; confirm.
for table, out_path in (
    (mt, '/net/archive/groups/plggneuromol/GTS-analysis/data/mt-for-skat.mt'),
    (mt_subset, '/net/archive/groups/plggneuromol/GTS-analysis/data/mt-subset.mt'),
    (mt_test, '/net/archive/groups/plggneuromol/GTS-analysis/data/mt-test.mt'),
):
    table.write(out_path)