In [1]:
from IPython.display import display
from IPython.display import HTML
import IPython.core.display as di # Example: di.display_html('<h3>%s:</h3>' % str, raw=True)

#This line will hide code by default when the notebook is exported as HTML
#di.display_html('<script>jQuery(function() {if (jQuery("body.notebook_app").length == 0) { jQuery(".input_area").toggle(); jQuery(".prompt").toggle();}});</script>', raw=True)

display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
import hail as hl
hl.init(tmp_dir='/net/scratch/people/plggosborcz')

Running on Apache Spark version 2.4.3
SparkUI available at http://p0916.prometheus:4040
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.62-84fa81b9ea3d
LOGGING: writing to /net/archive/groups/plggneuromol/sportsmen-wgs/analysis/imdik-zekanowski-sportwgs/hail-20210701-1811-0.2.62-84fa81b9ea3d.log


In [3]:
from hail.plot import show
from pprint import pprint
from bokeh.layouts import gridplot
hl.plot.output_notebook()


import numpy as np
import pandas as pd
from functools import reduce
from itertools import chain

from bokeh.plotting import output_notebook, show, figure
from bokeh.palettes import viridis

output_notebook()

## filtering analysis

In [4]:
mt = hl.read_matrix_table('/net/archive/groups/plggneuromol/sportsmen-wgs/mts/joint-unrelated-corrected.mt')

In [17]:
mt = mt.filter_rows(mt.gnomad_v3.v3_nfe.AF < 0.05)

In [18]:
mt.write('/net/archive/groups/plggneuromol/sportsmen-wgs/mts/sporstmen-005.mt')

2021-04-23 15:33:11 Hail: INFO: wrote matrix table with 4925533 rows and 147 columns in 3061 partitions to /net/archive/groups/plggneuromol/sportsmen-wgs/mts/sporstmen-005.mt
    Total size: 3.38 GiB
    * Rows/entries: 3.38 GiB
    * Columns: 648.00 B
    * Globals: 11.00 B
    * Smallest partition: 0 rows (20.00 B)
    * Largest partition:  4095 rows (3.74 MiB)


In [4]:
mt = hl.read_matrix_table('/net/archive/groups/plggneuromol/sportsmen-wgs/mts/sporstmen-005.mt')

In [20]:
genes = hl.read_table('/net/archive/groups/plggneuromol/GTS-analysis/analysis/vcf_preprocessing/genecode_v32.ht')

genes = genes.filter(hl.is_valid_contig(genes['hg38.knownGene.chrom'], reference_genome='GRCh38'))

hpo = hl.import_table('/net/archive/groups/plggneuromol/GTS-analysis/analysis/vcf_preprocessing/hpo.tsv', impute = True, no_header=True)

start = genes['hg38.knownGene.txStart']
stop =  genes['hg38.knownGene.txEnd']

genes = genes.transmute(interval = 
                        hl.locus_interval(genes['hg38.knownGene.chrom'], 
                                          start,
                                          stop,
                                          reference_genome='GRCh38', includes_start=False))

genes = genes.key_by(genes['hg38.kgXref.geneSymbol'])

hpo = hpo.key_by(hpo.f0)

genes = genes.annotate(hpo = hpo.index(genes['hg38.kgXref.geneSymbol'], all_matches = True)['f1'])

genes = genes.key_by(genes.interval)

mt = mt.annotate_rows(within_gene = hl.array(hl.set(genes.index(mt.locus, all_matches=True)['hg38.kgXref.geneSymbol'])))
mt = mt.annotate_rows(hpo = hl.array(hl.set(genes.index(mt.locus, all_matches=True)['hpo'])))
  
    
mt.write('/net/archive/groups/plggneuromol/sportsmen-wgs/mts/sporstmen-005-genes.mt')

2021-04-23 15:33:11 Hail: INFO: Reading table to impute column types
2021-04-23 15:33:12 Hail: INFO: Finished type imputation
  Loading field 'f0' as type str (imputed)
  Loading field 'f1' as type str (imputed)
2021-04-23 15:33:13 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-04-23 15:33:14 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-04-23 15:33:15 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-04-23 15:33:15 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-04-23 15:33:16 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-04-23 15:33:17 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-04-23 15:34:32 Hail: INFO: wrote matrix table with 4925533 rows and 147 columns in 3061 partitions to /net/archive/groups/plggneuromol/sportsmen-wgs/mts/sporstmen-005-genes.mt
    Total size: 3.49 GiB
    * Rows/entries: 3.49 GiB
    * Columns: 648.00 B
    * Globals: 11.00 B
    * Smallest partition: 0 rows

In [6]:
mt = hl.read_matrix_table('/net/archive/groups/plggneuromol/sportsmen-wgs/mts/sporstmen-005-genes.mt')

In [22]:
mt.count()

(4925533, 147)

In [23]:
mt = mt.filter_rows(mt.within_gene == hl.empty_array(hl.tstr), keep = False)

In [24]:
mt.count()

(3191612, 147)

In [25]:
mt.write('/net/archive/groups/plggneuromol/sportsmen-wgs/mts/sporstmen-005-intragenic.mt')

2021-04-23 15:35:23 Hail: INFO: wrote matrix table with 3191612 rows and 147 columns in 3061 partitions to /net/archive/groups/plggneuromol/sportsmen-wgs/mts/sporstmen-005-intragenic.mt
    Total size: 2.39 GiB
    * Rows/entries: 2.39 GiB
    * Columns: 648.00 B
    * Globals: 11.00 B
    * Smallest partition: 0 rows (20.00 B)
    * Largest partition:  3841 rows (4.51 MiB)


In [66]:
mt = hl.read_matrix_table('/net/archive/groups/plggneuromol/sportsmen-wgs/mts/sporstmen-005-intragenic.mt')

In [67]:
mt.count()

(3191612, 147)

## lets further filter variants - higher quality qenotypes only

In [28]:
rpmk = hl.read_table('/net/archive/groups/plggneuromol/sportsmen-wgs/analysis/vcf_preprocessing/repeatmasker_all.ht')

contig_len = hl.contig_length(rpmk.interval.end.contig, reference_genome='GRCh38')
end_pos = rpmk.interval.end.position
start_pos = rpmk.interval.start.position
contig = rpmk.interval.start.contig

rpmk = rpmk.annotate(larger_int_start = hl.if_else(start_pos == 1,
                                                   hl.locus(contig, start_pos, reference_genome='GRCh38'),
                                                   hl.locus(contig, start_pos-1, reference_genome='GRCh38')),
                     larger_int_end = hl.if_else(end_pos == contig_len,
                                                 hl.locus(contig, end_pos, reference_genome='GRCh38'),
                                                 hl.locus(contig, end_pos+1, reference_genome='GRCh38')),
                     larger_int_start_2 = hl.if_else((start_pos == 1) | (start_pos == 2),
                                                   hl.locus(contig, start_pos, reference_genome='GRCh38'),
                                                   hl.locus(contig, start_pos-2, reference_genome='GRCh38')),
                     larger_int_end_2 = hl.if_else((end_pos == contig_len) | (end_pos == (contig_len-1)),
                                                 hl.locus(contig, end_pos, reference_genome='GRCh38'),
                                                 hl.locus(contig, end_pos+2, reference_genome='GRCh38')))

In [25]:
rpmk.write('/net/archive/groups/plggneuromol/sportsmen-wgs/analysis/vcf_preprocessing/repeatmasker_anno.ht')

2021-03-30 20:23:23 Hail: INFO: wrote table with 5520017 rows in 48 partitions to /net/archive/groups/plggneuromol/sportsmen-wgs/analysis/vcf_preprocessing/repeatmasker_anno.ht
    Total size: 214.57 MiB
    * Rows: 214.57 MiB
    * Globals: 11.00 B
    * Smallest partition: 51769 rows (2.42 MiB)
    * Largest partition:  153107 rows (5.90 MiB)


In [None]:
#for some reason a loop doesn't work here
rpmk = hl.read_table('/net/archive/groups/plggneuromol/sportsmen-wgs/analysis/vcf_preprocessing/repeatmasker_anno.ht')
temp = rpmk.key_by(rpmk['larger_int_start'])
mt = mt.filter_rows(hl.is_defined(temp[mt.locus]), keep = False)
mt.checkpoint('/net/archive/groups/plggneuromol/sportsmen-wgs/mts/sporstmen-005-intragenic-filtered0.mt')
    
rpmk = hl.read_table('/net/archive/groups/plggneuromol/sportsmen-wgs/analysis/vcf_preprocessing/repeatmasker_anno.ht')
temp = rpmk.key_by(rpmk['larger_int_end'])
mt = mt.filter_rows(hl.is_defined(temp[mt.locus]), keep = False)
mt.checkpoint('/net/archive/groups/plggneuromol/sportsmen-wgs/mts/sporstmen-005-intragenic-filtered1.mt')

rpmk = hl.read_table('/net/archive/groups/plggneuromol/sportsmen-wgs/analysis/vcf_preprocessing/repeatmasker_anno.ht')
temp = rpmk.key_by(rpmk['larger_int_start_2'])
mt = mt.filter_rows(hl.is_defined(temp[mt.locus]), keep = False)
mt.checkpoint('/net/archive/groups/plggneuromol/sportsmen-wgs/mts/sporstmen-005-intragenic-filtered2.mt')



2021-04-23 15:35:26 Hail: INFO: Coerced sorted dataset
2021-04-23 15:36:08 Hail: INFO: wrote matrix table with 3191612 rows and 147 columns in 3061 partitions to /net/archive/groups/plggneuromol/sportsmen-wgs/mts/sporstmen-005-intragenic-filtered0.mt
    Total size: 3.19 GiB
    * Rows/entries: 3.19 GiB
    * Columns: 669.00 B
    * Globals: 11.00 B
    * Smallest partition: 0 rows (20.00 B)
    * Largest partition:  3841 rows (5.60 MiB)
2021-04-23 15:36:11 Hail: INFO: Coerced sorted dataset
2021-04-23 15:36:13 Hail: INFO: Ordering unsorted dataset with network shuffle


In [None]:
mt = hl.read_matrix_table('/net/archive/groups/plggneuromol/sportsmen-wgs/mts/sporstmen-005-intragenic-filtered2.mt')

In [None]:
rpmk = hl.read_table('/net/archive/groups/plggneuromol/sportsmen-wgs/analysis/vcf_preprocessing/repeatmasker_anno.ht')
temp = rpmk.key_by(rpmk['larger_int_end_2'])
mt = mt.filter_rows(hl.is_defined(temp[mt.locus]), keep = False)
mt.checkpoint('/net/archive/groups/plggneuromol/sportsmen-wgs/mts/sporstmen-005-intragenic-filtered.mt')

    

In [68]:
mt = hl.read_matrix_table('/net/archive/groups/plggneuromol/sportsmen-wgs/mts/sporstmen-005-intragenic-filtered.mt')

In [None]:
mt = hl.variant_qc(mt)

In [None]:
mt = mt.annotate_cols(group = hl.if_else(mt.s.contains("B"), "sport", "control"))

In [None]:
mt = mt.filter_rows(mt.variant_qc.dp_stats.mean > 3)

In [None]:
mt = mt.annotate_rows(controls_non_ref = hl.agg.filter((mt.group == 'control'), hl.agg.count_where(mt.GT.is_non_ref())),
                      sports_non_ref = hl.agg.filter((mt.group == 'sport'), hl.agg.count_where(mt.GT.is_non_ref())))
mt = mt.annotate_rows(controls_ref = hl.agg.filter((mt.group == 'control'), hl.agg.count_where(mt.GT.is_hom_ref())),
                      sports_ref = hl.agg.filter((mt.group == 'sport'), hl.agg.count_where(mt.GT.is_hom_ref())))


In [None]:
mt.write('/net/archive/groups/plggneuromol/sportsmen-wgs/mts/sporstmen-005-intragenic-filtered-better.mt')

## analysis of all protein coding genes

In [5]:
mt = hl.read_matrix_table('/net/archive/groups/plggneuromol/sportsmen-wgs/mts/sporstmen-005-intragenic-filtered-better.mt')

In [6]:
mt.count()

(3154875, 147)

In [7]:
mt = mt.filter_rows((mt.controls_non_ref == 0)|(mt.sports_non_ref == 0), keep=False)

In [8]:
go_genes = hl.import_table('/net/archive/groups/plggneuromol/GTS-analysis/analysis/gts_gene_lists/human-genes-with-GO-and-symbols')

2021-07-01 17:32:11 Hail: INFO: Reading table without type imputation
  Loading field 'Gene stable ID' as type str (not specified)
  Loading field 'UniProtKB Gene Name symbol' as type str (not specified)


In [9]:
all_genes = go_genes['UniProtKB Gene Name symbol'].collect()

In [10]:
len(all_genes)

23804

In [11]:
mt = mt.filter_rows(hl.any(lambda x: hl.literal(all_genes).contains(x), mt.within_gene))

In [12]:
mt = mt.filter_rows(hl.is_snp(mt.alleles[0], mt.alleles[1]))

In [13]:
to_test = mt.rows()
to_test = to_test.select(to_test.controls_non_ref, to_test.sports_non_ref, to_test.controls_ref, to_test.sports_ref)

to_test.write('/net/archive/groups/plggneuromol/sportsmen-wgs/mts/to_test_005.ht')

FatalError: HailException: file already exists: /net/archive/groups/plggneuromol/sportsmen-wgs/mts/to_test_005.ht

Java stack trace:
is.hail.utils.HailException: file already exists: /net/archive/groups/plggneuromol/sportsmen-wgs/mts/to_test_005.ht
	at is.hail.utils.ErrorHandling$class.fatal(ErrorHandling.scala:11)
	at is.hail.utils.package$.fatal(package.scala:77)
	at is.hail.expr.ir.TableNativeWriter.apply(TableWriter.scala:93)
	at is.hail.expr.ir.Interpret$.run(Interpret.scala:825)
	at is.hail.expr.ir.Interpret$.alreadyLowered(Interpret.scala:53)
	at is.hail.expr.ir.InterpretNonCompilable$.interpretAndCoerce$1(InterpretNonCompilable.scala:16)
	at is.hail.expr.ir.InterpretNonCompilable$.is$hail$expr$ir$InterpretNonCompilable$$rewrite$1(InterpretNonCompilable.scala:53)
	at is.hail.expr.ir.InterpretNonCompilable$.apply(InterpretNonCompilable.scala:58)
	at is.hail.expr.ir.lowering.InterpretNonCompilablePass$.transform(LoweringPass.scala:67)
	at is.hail.expr.ir.lowering.LoweringPass$$anonfun$apply$3$$anonfun$1.apply(LoweringPass.scala:15)
	at is.hail.expr.ir.lowering.LoweringPass$$anonfun$apply$3$$anonfun$1.apply(LoweringPass.scala:15)
	at is.hail.utils.ExecutionTimer.time(ExecutionTimer.scala:81)
	at is.hail.expr.ir.lowering.LoweringPass$$anonfun$apply$3.apply(LoweringPass.scala:15)
	at is.hail.expr.ir.lowering.LoweringPass$$anonfun$apply$3.apply(LoweringPass.scala:13)
	at is.hail.utils.ExecutionTimer.time(ExecutionTimer.scala:81)
	at is.hail.expr.ir.lowering.LoweringPass$class.apply(LoweringPass.scala:13)
	at is.hail.expr.ir.lowering.InterpretNonCompilablePass$.apply(LoweringPass.scala:62)
	at is.hail.expr.ir.lowering.LoweringPipeline$$anonfun$apply$1.apply(LoweringPipeline.scala:14)
	at is.hail.expr.ir.lowering.LoweringPipeline$$anonfun$apply$1.apply(LoweringPipeline.scala:12)
	at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
	at scala.collection.mutable.WrappedArray.foreach(WrappedArray.scala:35)
	at is.hail.expr.ir.lowering.LoweringPipeline.apply(LoweringPipeline.scala:12)
	at is.hail.expr.ir.CompileAndEvaluate$._apply(CompileAndEvaluate.scala:28)
	at is.hail.backend.spark.SparkBackend.is$hail$backend$spark$SparkBackend$$_execute(SparkBackend.scala:360)
	at is.hail.backend.spark.SparkBackend$$anonfun$execute$1.apply(SparkBackend.scala:344)
	at is.hail.backend.spark.SparkBackend$$anonfun$execute$1.apply(SparkBackend.scala:341)
	at is.hail.expr.ir.ExecuteContext$$anonfun$scoped$1.apply(ExecuteContext.scala:25)
	at is.hail.expr.ir.ExecuteContext$$anonfun$scoped$1.apply(ExecuteContext.scala:23)
	at is.hail.utils.package$.using(package.scala:618)
	at is.hail.annotations.RegionPool$.scoped(RegionPool.scala:13)
	at is.hail.expr.ir.ExecuteContext$.scoped(ExecuteContext.scala:23)
	at is.hail.backend.spark.SparkBackend.withExecuteContext(SparkBackend.scala:254)
	at is.hail.backend.spark.SparkBackend.execute(SparkBackend.scala:341)
	at is.hail.backend.spark.SparkBackend$$anonfun$7.apply(SparkBackend.scala:385)
	at is.hail.backend.spark.SparkBackend$$anonfun$7.apply(SparkBackend.scala:383)
	at is.hail.utils.ExecutionTimer$.time(ExecutionTimer.scala:52)
	at is.hail.backend.spark.SparkBackend.executeJSON(SparkBackend.scala:383)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:748)



Hail version: 0.2.62-84fa81b9ea3d
Error summary: HailException: file already exists: /net/archive/groups/plggneuromol/sportsmen-wgs/mts/to_test_005.ht

In [8]:
to_test = hl.read_table('/net/archive/groups/plggneuromol/sportsmen-wgs/mts/to_test_005.ht')

In [9]:
to_test.count()

587838

In [10]:
to_test = to_test.annotate(fisher = hl.fisher_exact_test(hl.int32(to_test.controls_non_ref), 
                                                         hl.int32(to_test.controls_ref),
                                                         hl.int32(to_test.sports_non_ref),
                                                         hl.int32(to_test.sports_ref)))

In [11]:
to_test = to_test.filter(to_test.fisher.p_value < 0.5/to_test.count())

In [13]:
to_test.count()

23

In [14]:
mt = hl.read_matrix_table('/net/archive/groups/plggneuromol/sportsmen-wgs/mts/sporstmen-005-intragenic-filtered-better.mt')

In [15]:
mt_test = mt.filter_rows(hl.is_defined(to_test[mt.row_key]), keep = True)
mt_test = mt_test.annotate_rows(fisher = to_test[mt_test.row_key]['fisher'])

In [16]:
mt_test = mt_test.select_rows(rsid = mt_test.gnomad_v3.rsid,
                                 gnomAD_v3_1_allele_frequency = mt_test.gnomad_v3.v3_nfe.AF,
                                 gnomAD_v3_1_homozygote_count = mt_test.gnomad_v3.v3_nfe.homozygote_count,
                                 predicted_most_severe_consequence = mt_test.gnomad_v3.most_severe_consequence,
                                 gene = mt_test.within_gene,
                                 hpo = mt_test.hpo,
                                 controls_non_ref = mt_test.controls_non_ref,
                                 sports_non_ref = mt_test.sports_non_ref,
                                 controls_ref = mt_test.controls_ref,
                                 sports_ref = mt_test.sports_ref,
                                 percent_controls_non_ref = mt_test.controls_non_ref/47*100,
                                 percent_sports_non_ref = mt_test.sports_non_ref/100*100,
                                 percent_controls_ref = mt_test.controls_ref/46*100,
                                 percent_sports_ref = mt_test.sports_ref/100*100,
                                 percent_sports_genotyped = (mt_test.sports_non_ref + mt_test.sports_ref)/100*100,
                                 percent_controls_genotyped = (mt_test.controls_non_ref + mt_test.controls_ref)/47*100,
                                 fisher = mt_test.fisher.p_value,
                                 )

In [17]:
mt_test = mt_test.annotate_rows(
    samples_het = hl.agg.filter(mt_test.GT.is_het(), hl.agg.collect(mt_test.s)),
    samples_hom_non_ref = hl.agg.filter(mt_test.GT.is_hom_var(), hl.agg.collect(mt_test.s)))

In [18]:
mt_test.count()

(23, 147)

In [19]:
### annotate with frequency in GTS:

gts = hl.read_matrix_table('/net/archive/groups/plggneuromol/sportsmen-wgs/mts/unrelated-gts.mt')

gts = gts.annotate_rows(gts_non_ref = hl.agg.count_where(gts.GT.is_non_ref()),
                        gts_ref = hl.agg.count_where(gts.GT.is_hom_ref()))

gts = gts.rows()

gts = gts.select(gts_non_ref = gts.gts_non_ref,
                 gts_ref = gts.gts_ref,
                 percent_gts_non_ref = gts.gts_non_ref/40*100,
                 percent_gts_ref = gts.gts_ref/40*100,
                 percent_gts_genotyped = (gts.gts_non_ref + gts.gts_ref)/40*100)

mt_test = mt_test.annotate_rows(gts_non_ref = gts.index(mt_test.row_key).gts_non_ref,
                                gts_ref = gts.index(mt_test.row_key).gts_ref,
                                percent_gts_non_ref = gts.index(mt_test.row_key).percent_gts_non_ref,
                                percent_gts_ref = gts.index(mt_test.row_key).percent_gts_ref,
                                percent_gts_genotyped = gts.index(mt_test.row_key).percent_gts_genotyped)



to_export = mt_test.rows()

to_export = to_export.to_pandas()

In [20]:
to_export.columns

Index(['locus.contig', 'locus.position', 'alleles', 'rsid',
       'gnomAD_v3_1_allele_frequency', 'gnomAD_v3_1_homozygote_count',
       'predicted_most_severe_consequence', 'gene', 'hpo', 'controls_non_ref',
       'sports_non_ref', 'controls_ref', 'sports_ref',
       'percent_controls_non_ref', 'percent_sports_non_ref',
       'percent_controls_ref', 'percent_sports_ref',
       'percent_sports_genotyped', 'percent_controls_genotyped', 'fisher',
       'samples_het', 'samples_hom_non_ref', 'gts_non_ref', 'gts_ref',
       'percent_gts_non_ref', 'percent_gts_ref', 'percent_gts_genotyped'],
      dtype='object')

In [21]:
to_export.shape

(23, 27)

In [22]:
to_export

Unnamed: 0,locus.contig,locus.position,alleles,rsid,gnomAD_v3_1_allele_frequency,gnomAD_v3_1_homozygote_count,predicted_most_severe_consequence,gene,hpo,controls_non_ref,...,percent_sports_genotyped,percent_controls_genotyped,fisher,samples_het,samples_hom_non_ref,gts_non_ref,gts_ref,percent_gts_non_ref,percent_gts_ref,percent_gts_genotyped
0,chr1,237515763,"[C, G]",rs1436318549,0.016573,2,intron_variant,[RYR2],"[[HP:0001699:Sudden death, HP:0011663:Right ve...",4,...,89.0,100.0,1.721915e-08,"[B338, B432, B433, B435, B436, B437, B442, B44...","[B102, B156, B439, B444, B448, B450, B453, B45...",3,36,7.5,90.0,97.5
1,chr2,95935226,"[T, C]",rs550694093,0.005094,0,intron_variant,[ANKRD36C],[None],5,...,98.0,100.0,4.146488e-10,"[492, B24, B338, B399, B427, B430, B431, B432,...",[],0,39,0.0,97.5,97.5
2,chr2,114871252,"[G, C]",rs1438743709,0.004593,32,intron_variant,[DPP10],[None],19,...,94.0,100.0,5.655587e-09,"[B462, S_7213, S_7214, S_7227, S_7229, S_7241,...","[S_7237, WGS_37c]",10,28,25.0,70.0,95.0
3,chr3,75686761,"[C, G]",rs186911967,1.6e-05,0,intron_variant,"[LINC00960, ZNF717]",[None],1,...,97.0,100.0,6.353729e-15,"[479, B24, B382, B399, B427, B430, B431, B432,...",[],1,38,2.5,95.0,97.5
4,chr3,75686786,"[C, A]",rs182818770,3.4e-05,0,intron_variant,"[LINC00960, ZNF717]",[None],1,...,94.0,100.0,6.914127e-14,"[479, B156, B24, B427, B431, B432, B433, B435,...",[],1,38,2.5,95.0,97.5
5,chr3,75785301,"[C, T]",rs370090260,0.04171,0,non_coding_transcript_exon_variant,[ZNF717],[None],15,...,93.0,100.0,1.671457e-07,"[B452, S_7212, S_7214, S_7229, S_7237, S_7246,...",[],11,27,27.5,67.5,95.0
6,chr5,170389319,"[G, C]",,0.021391,0,5_prime_UTR_variant,"[KCNIP1, KCNMB1]","[[HP:0000006:Autosomal dominant inheritance, H...",14,...,93.0,100.0,5.948068e-07,"[462, B438, S_7229, S_7241, S_7252, S_7254, S_...",[],5,33,12.5,82.5,95.0
7,chr6,157314313,"[C, G]",,0.044423,0,intron_variant,[TMEM242],[None],27,...,93.0,100.0,4.059197e-13,"[468, 475, 490, B468, B471, B487, S_7212, S_72...",[],16,23,40.0,57.5,97.5
8,chr7,100956443,"[C, A]",rs1235870137,0.025931,0,missense_variant,[MUC3A],[None],28,...,93.0,100.0,2.150403e-10,"[B427, B432, B439, B469, B474, B481, B483, B53...",[],26,14,65.0,35.0,100.0
9,chr7,152388718,"[G, T]",,0.008512,0,intron_variant,[KMT2C],"[[HP:0001263:Global developmental delay, HP:01...",24,...,95.0,100.0,3.640856e-07,"[468, B439, B457, B467, B470, B473, B474, B489...",[],11,28,27.5,70.0,97.5


In [23]:
to_export.to_csv('filtered_variants_sportsmen_05.csv')

### genes enriched in muscle: https://www.proteinatlas.org/search/tissue_category_rna%3Askeletal+muscle%3BTissue+enriched+AND+sort_by%3Atissue+specific+score+AND+show_columns%3Agroupenriched

In [33]:
mt = hl.read_matrix_table('/net/archive/groups/plggneuromol/sportsmen-wgs/mts/sporstmen-005-intragenic-filtered-better.mt')

In [34]:
mt.count()

(3154875, 147)

In [35]:
mt = mt.filter_rows((mt.controls_non_ref == 0)|(mt.sports_non_ref == 0), keep=False)

In [37]:
pwd

'/net/archive/groups/plggneuromol/sportsmen-wgs/analysis/imdik-zekanowski-sportwgs'

In [38]:
muscle_genes = hl.import_table('/net/archive/groups/plggneuromol/sportsmen-wgs/analysis/gene-lists-and-results/tissue_category_rna_skeletal.tsv')

2021-07-01 17:57:04 Hail: INFO: Loading 292 fields. Counts by type:
  str: 292


In [39]:
genes = muscle_genes['Gene'].collect()

In [40]:
len(genes)

907

In [41]:
mt = mt.filter_rows(hl.any(lambda x: hl.literal(genes).contains(x), mt.within_gene))

In [42]:
mt = mt.filter_rows(hl.is_snp(mt.alleles[0], mt.alleles[1]))

In [84]:
to_test = mt.rows()
to_test = to_test.select(to_test.controls_non_ref, to_test.sports_non_ref, to_test.controls_ref, to_test.sports_ref)

to_test.write('/net/archive/groups/plggneuromol/sportsmen-wgs/mts/to_test_muscle_005.ht')

2021-04-23 19:06:16 Hail: INFO: wrote table with 33484 rows in 3061 partitions to /net/archive/groups/plggneuromol/sportsmen-wgs/mts/to_test_muscle_005.ht
    Total size: 712.53 KiB
    * Rows: 712.52 KiB
    * Globals: 11.00 B
    * Smallest partition: 0 rows (21.00 B)
    * Largest partition:  927 rows (15.40 KiB)


In [24]:
to_test = hl.read_table('/net/archive/groups/plggneuromol/sportsmen-wgs/mts/to_test_muscle_005.ht')

In [25]:
to_test.count()

33484

In [26]:
to_test = to_test.annotate(fisher = hl.fisher_exact_test(hl.int32(to_test.controls_non_ref), 
                                                         hl.int32(to_test.controls_ref),
                                                         hl.int32(to_test.sports_non_ref),
                                                         hl.int32(to_test.sports_ref)))

In [46]:
to_test = to_test.filter(to_test.fisher.p_value < 0.3/to_test.count())

In [47]:
to_test.count() #so I skip this

0

### genes elevated in the brain: https://www.proteinatlas.org/search/tissue_category_rna%3Abrain%3BTissue+enriched%2CGroup+enriched%2CTissue+enhanced+AND+sort_by%3Atissue+specific+score+AND+show_columns%3Agroupenriched

In [48]:
mt = hl.read_matrix_table('/net/archive/groups/plggneuromol/sportsmen-wgs/mts/sporstmen-005-intragenic-filtered-better.mt')

In [49]:
mt.count()

(3154875, 147)

In [50]:
mt = mt.filter_rows((mt.controls_non_ref == 0)|(mt.sports_non_ref == 0), keep=False)

In [52]:
brain_genes = hl.import_table('/net/archive/groups/plggneuromol/sportsmen-wgs/analysis/gene-lists-and-results/tissue_category_rna_brain_Tissue.tsv')

2021-07-01 17:58:09 Hail: INFO: Loading 292 fields. Counts by type:
  str: 292


In [None]:
genes = brain_genes['Gene'].collect()

In [None]:
len(genes)

In [None]:
mt = mt.filter_rows(hl.any(lambda x: hl.literal(genes).contains(x), mt.within_gene))

In [None]:
mt = mt.filter_rows(hl.is_snp(mt.alleles[0], mt.alleles[1]))

In [131]:
to_test = mt.rows()
to_test = to_test.select(to_test.controls_non_ref, to_test.sports_non_ref, to_test.controls_ref, to_test.sports_ref)

to_test.write('/net/archive/groups/plggneuromol/sportsmen-wgs/mts/to_test_brain_005.ht')

2021-04-23 19:14:45 Hail: INFO: wrote table with 167346 rows in 3061 partitions to /net/archive/groups/plggneuromol/sportsmen-wgs/mts/to_test_brain_005.ht
    Total size: 3.06 MiB
    * Rows: 3.06 MiB
    * Globals: 11.00 B
    * Smallest partition: 0 rows (21.00 B)
    * Largest partition:  1077 rows (17.62 KiB)


In [27]:
to_test = hl.read_table('/net/archive/groups/plggneuromol/sportsmen-wgs/mts/to_test_brain_005.ht')

In [28]:
to_test.count()

167346

In [29]:
to_test = to_test.annotate(fisher = hl.fisher_exact_test(hl.int32(to_test.controls_non_ref), 
                                                         hl.int32(to_test.controls_ref),
                                                         hl.int32(to_test.sports_non_ref),
                                                         hl.int32(to_test.sports_ref)))

In [30]:
to_test = to_test.filter(to_test.fisher.p_value < 0.5/to_test.count())

In [31]:
to_test.count()

10

In [32]:
mt = hl.read_matrix_table('/net/archive/groups/plggneuromol/sportsmen-wgs/mts/sporstmen-005-intragenic-filtered-better.mt')

In [33]:
mt_test = mt.filter_rows(hl.is_defined(to_test[mt.row_key]), keep = True)
mt_test = mt_test.annotate_rows(fisher = to_test[mt_test.row_key]['fisher'])

In [34]:
mt_test = mt_test.select_rows(rsid = mt_test.gnomad_v3.rsid,
                                 gnomAD_v3_1_allele_frequency = mt_test.gnomad_v3.v3_nfe.AF,
                                 gnomAD_v3_1_homozygote_count = mt_test.gnomad_v3.v3_nfe.homozygote_count,
                                 predicted_most_severe_consequence = mt_test.gnomad_v3.most_severe_consequence,
                                 gene = mt_test.within_gene,
                                 hpo = mt_test.hpo,
                                 controls_non_ref = mt_test.controls_non_ref,
                                 sports_non_ref = mt_test.sports_non_ref,
                                 controls_ref = mt_test.controls_ref,
                                 sports_ref = mt_test.sports_ref,
                                 percent_controls_non_ref = mt_test.controls_non_ref/47*100,
                                 percent_sports_non_ref = mt_test.sports_non_ref/100*100,
                                 percent_controls_ref = mt_test.controls_ref/47*100,
                                 percent_sports_ref = mt_test.sports_ref/100*100,
                                 percent_sports_genotyped = (mt_test.sports_non_ref + mt_test.sports_ref)/100*100,
                                 percent_controls_genotyped = (mt_test.controls_non_ref + mt_test.controls_ref)/47*100,
                                 fisher = mt_test.fisher.p_value,
                                 )

In [35]:
mt_test = mt_test.annotate_rows(
    samples_het = hl.agg.filter(mt_test.GT.is_het(), hl.agg.collect(mt_test.s)),
    samples_hom_non_ref = hl.agg.filter(mt_test.GT.is_hom_var(), hl.agg.collect(mt_test.s)))

In [36]:
mt_test.count()

(10, 147)

In [37]:
### annotate with frequency in GTS:

gts = hl.read_matrix_table('/net/archive/groups/plggneuromol/sportsmen-wgs/mts/unrelated-gts.mt')

gts = gts.annotate_rows(gts_non_ref = hl.agg.count_where(gts.GT.is_non_ref()),
                        gts_ref = hl.agg.count_where(gts.GT.is_hom_ref()))

gts = gts.rows()

gts = gts.select(gts_non_ref = gts.gts_non_ref,
                 gts_ref = gts.gts_ref,
                 percent_gts_non_ref = gts.gts_non_ref/40*100,
                 percent_gts_ref = gts.gts_ref/40*100,
                 percent_gts_genotyped = (gts.gts_non_ref + gts.gts_ref)/40*100)

mt_test = mt_test.annotate_rows(gts_non_ref = gts.index(mt_test.row_key).gts_non_ref,
                                gts_ref = gts.index(mt_test.row_key).gts_ref,
                                percent_gts_non_ref = gts.index(mt_test.row_key).percent_gts_non_ref,
                                percent_gts_ref = gts.index(mt_test.row_key).percent_gts_ref,
                                percent_gts_genotyped = gts.index(mt_test.row_key).percent_gts_genotyped)

In [38]:
to_export = mt_test.rows()

to_export = to_export.to_pandas()

In [39]:
to_export.shape

(10, 27)

In [40]:
to_export

Unnamed: 0,locus.contig,locus.position,alleles,rsid,gnomAD_v3_1_allele_frequency,gnomAD_v3_1_homozygote_count,predicted_most_severe_consequence,gene,hpo,controls_non_ref,...,percent_sports_genotyped,percent_controls_genotyped,fisher,samples_het,samples_hom_non_ref,gts_non_ref,gts_ref,percent_gts_non_ref,percent_gts_ref,percent_gts_genotyped
0,chr2,114871252,"[G, C]",rs1438743709,0.004593,32,intron_variant,[DPP10],[None],19,...,94.0,100.0,5.655587e-09,"[B462, S_7213, S_7214, S_7227, S_7229, S_7241,...","[S_7237, WGS_37c]",10,28,25.0,70.0,95.0
1,chr5,170389319,"[G, C]",,0.021391,0,5_prime_UTR_variant,"[KCNIP1, KCNMB1]","[[HP:0000006:Autosomal dominant inheritance, H...",14,...,93.0,100.0,5.948068e-07,"[462, B438, S_7229, S_7241, S_7252, S_7254, S_...",[],5,33,12.5,82.5,95.0
2,chr17,65139057,"[A, G]",,0.04037,182,intron_variant,[RGS9],"[[HP:0000613:Photophobia, HP:0000505:Visual im...",5,...,34.0,95.744682,3.981673e-09,"[B477, S_7237, WGS_180b]","[B102, B156, B24, B382, B399, B427, B432, B437...",0,37,0.0,92.5,92.5
3,chr20,25773860,"[T, C]",rs201133036,0.010995,0,intron_variant,[FAM182B],[None],3,...,94.0,97.872337,1.585263e-07,"[B102, B338, B382, B399, B430, B439, B441, B44...",[],2,37,5.0,92.5,97.5
4,chr20,25776325,"[A, T]",rs1198672321,0.040598,0,intron_variant,[FAM182B],[None],30,...,94.0,97.872337,1.359509e-07,"[475, 479, 490, 492, B156, B24, B427, B439, B4...",[],30,9,75.0,22.5,97.5
5,chr22,20255642,"[A, C]",,0.000214,0,intron_variant,[RTN4R],"[[HP:0000006:Autosomal dominant inheritance, H...",15,...,93.0,97.872337,1.284009e-07,"[B472, S_7212, S_7213, S_7214, S_7229, S_7236,...",[],0,38,0.0,95.0,95.0
6,chr22,23118472,"[G, C]",rs991832091,0.004757,0,intron_variant,"[GNAZ, RSPH14]",[None],15,...,93.0,97.872337,8.339984e-07,"[468, 479, B24, B338, S_7213, S_7236, S_7245, ...",[],1,36,2.5,90.0,92.5
7,chr22,23118477,"[G, C]",,0.003009,0,intron_variant,"[GNAZ, RSPH14]",[None],14,...,93.0,97.872337,4.674184e-07,"[468, 479, B338, S_7213, S_7236, S_7245, S_724...",[],1,36,2.5,90.0,92.5
8,chrX,91806321,"[C, T]",rs2023875,0.0,0,intron_variant,[PCDH11X],[None],12,...,96.0,97.872337,2.061988e-06,"[490, 492, B156, B24, B338, B382, B399, B427, ...",[],18,20,45.0,50.0,95.0
9,chrX,91806352,"[G, A]",,2.2e-05,0,intron_variant,[PCDH11X],[None],11,...,95.0,97.872337,2.349844e-06,"[490, 492, B156, B24, B338, B382, B427, B430, ...",[],19,18,47.5,45.0,92.5


In [41]:
to_export.to_csv('brain_variants_sportsmen_05.csv')

### genes elevated in lungs: https://www.proteinatlas.org/search/tissue_category_rna%3Alung%3BTissue+enriched%2CGroup+enriched%2CTissue+enhanced+AND+sort_by%3Atissue+specific+score+AND+show_columns%3Agroupenriched

In [None]:
mt = hl.read_matrix_table('/net/archive/groups/plggneuromol/sportsmen-wgs/mts/sporstmen-005-intragenic-filtered-better.mt')

In [None]:
mt.count()

In [None]:
mt = mt.filter_rows((mt.controls_non_ref == 0)|(mt.sports_non_ref == 0), keep=False)

In [None]:
lung_genes = hl.import_table('./gene-lists-and-results/tissue_category_rna_lung_Tissue.tsv')

In [None]:
genes = lung_genes['Gene'].collect()

In [None]:
len(genes)

In [None]:
mt = mt.filter_rows(hl.any(lambda x: hl.literal(genes).contains(x), mt.within_gene))

In [164]:
to_test = mt.rows()
to_test = to_test.select(to_test.controls_non_ref, to_test.sports_non_ref, to_test.controls_ref, to_test.sports_ref)

to_test.write('/net/archive/groups/plggneuromol/sportsmen-wgs/mts/to_test_lung_005.ht')

2021-04-23 19:36:35 Hail: INFO: wrote table with 25923 rows in 3061 partitions to /net/archive/groups/plggneuromol/sportsmen-wgs/mts/to_test_lung_005.ht
    Total size: 538.17 KiB
    * Rows: 538.16 KiB
    * Globals: 11.00 B
    * Smallest partition: 0 rows (21.00 B)
    * Largest partition:  1146 rows (19.38 KiB)


In [72]:
to_test = hl.read_table('/net/archive/groups/plggneuromol/sportsmen-wgs/mts/to_test_lung_005.ht')

In [73]:
to_test.count()

25923

In [74]:
to_test = to_test.annotate(fisher = hl.fisher_exact_test(hl.int32(to_test.controls_non_ref), 
                                                         hl.int32(to_test.controls_ref),
                                                         hl.int32(to_test.sports_non_ref),
                                                         hl.int32(to_test.sports_ref)))

In [75]:
to_test = to_test.filter(to_test.fisher.p_value < 0.3/to_test.count())

In [76]:
to_test.count()

21

In [84]:
mt = hl.read_matrix_table('/net/archive/groups/plggneuromol/sportsmen-wgs/mts/sporstmen-005-intragenic-filtered-better.mt')

In [85]:
mt_test = mt.filter_rows(hl.is_defined(to_test[mt.row_key]), keep = True)
mt_test = mt_test.annotate_rows(fisher = to_test[mt_test.row_key]['fisher'])

In [86]:
mt_test = mt_test.select_rows(rsid = mt_test.gnomad_v3.rsid,
                                 gnomAD_v3_1_allele_frequency = mt_test.gnomad_v3.v3_nfe.AF,
                                 gnomAD_v3_1_homozygote_count = mt_test.gnomad_v3.v3_nfe.homozygote_count,
                                 predicted_most_severe_consequence = mt_test.gnomad_v3.most_severe_consequence,
                                 gene = mt_test.within_gene,
                                 hpo = mt_test.hpo,
                                 controls_non_ref = mt_test.controls_non_ref,
                                 sports_non_ref = mt_test.sports_non_ref,
                                 controls_ref = mt_test.controls_ref,
                                 sports_ref = mt_test.sports_ref,
                                 percent_controls_non_ref = mt_test.controls_non_ref/47*100,
                                 percent_sports_non_ref = mt_test.sports_non_ref/100*100,
                                 percent_controls_ref = mt_test.controls_ref/47*100,
                                 percent_sports_ref = mt_test.sports_ref/100*100,
                                 percent_sports_genotyped = (mt_test.sports_non_ref + mt_test.sports_ref)/100*100,
                                 percent_controls_genotyped = (mt_test.controls_non_ref + mt_test.controls_ref)/47*100,
                                 fisher = mt_test.fisher.p_value,
                                 )

In [87]:
mt_test = mt_test.filter_rows(hl.is_snp(mt_test.alleles[0], mt_test.alleles[1]))

In [88]:
mt_test = mt_test.annotate_rows(
    samples_het = hl.agg.filter(mt_test.GT.is_het(), hl.agg.collect(mt_test.s)),
    samples_hom_non_ref = hl.agg.filter(mt_test.GT.is_hom_var(), hl.agg.collect(mt_test.s)))

In [89]:
mt_test.count()

(12, 147)

In [91]:
### annotate with frequency in GTS:

gts = hl.read_matrix_table('/net/archive/groups/plggneuromol/sportsmen-wgs/mts/unrelated-gts.mt')

gts = gts.annotate_rows(gts_non_ref = hl.agg.count_where(gts.GT.is_non_ref()),
                        gts_ref = hl.agg.count_where(gts.GT.is_hom_ref()))

gts = gts.rows()

gts = gts.select(gts_non_ref = gts.gts_non_ref,
                 gts_ref = gts.gts_ref,
                 percent_gts_non_ref = gts.gts_non_ref/40*100,
                 percent_gts_ref = gts.gts_ref/40*100,
                 percent_gts_genotyped = (gts.gts_non_ref + gts.gts_ref)/40*100)

mt_test = mt_test.annotate_rows(gts_non_ref = gts.index(mt_test.row_key).gts_non_ref,
                                gts_ref = gts.index(mt_test.row_key).gts_ref,
                                percent_gts_non_ref = gts.index(mt_test.row_key).percent_gts_non_ref,
                                percent_gts_ref = gts.index(mt_test.row_key).percent_gts_ref,
                                percent_gts_genotyped = gts.index(mt_test.row_key).percent_gts_genotyped)

In [92]:
to_export = mt_test.rows()

to_export = to_export.to_pandas()

In [93]:
to_export.columns

Index(['locus.contig', 'locus.position', 'alleles', 'rsid',
       'gnomAD_v3_1_allele_frequency', 'gnomAD_v3_1_homozygote_count',
       'predicted_most_severe_consequence', 'gene', 'hpo', 'controls_non_ref',
       'sports_non_ref', 'controls_ref', 'sports_ref',
       'percent_controls_non_ref', 'percent_sports_non_ref',
       'percent_controls_ref', 'percent_sports_ref',
       'percent_sports_genotyped', 'percent_controls_genotyped', 'fisher',
       'samples_het', 'samples_hom_non_ref', 'gts_non_ref', 'gts_ref',
       'percent_gts_non_ref', 'percent_gts_ref', 'percent_gts_genotyped'],
      dtype='object')

In [94]:
to_export.shape

(12, 27)

In [95]:
to_export

Unnamed: 0,locus.contig,locus.position,alleles,rsid,gnomAD_v3_1_allele_frequency,gnomAD_v3_1_homozygote_count,predicted_most_severe_consequence,gene,hpo,controls_non_ref,...,percent_sports_genotyped,percent_controls_genotyped,fisher,samples_het,samples_hom_non_ref,gts_non_ref,gts_ref,percent_gts_non_ref,percent_gts_ref,percent_gts_genotyped
0,chr6,32525558,"[A, C]",,0.027921,0,intron_variant,[HLA-DRB5],[None],20,...,28.0,85.106384,1.80675e-06,"[S_7212, S_7229, S_7246, S_7252, S_7261, S_726...","[S_7245, S_7247, S_7255, S_7294, WGS_163b, WGS...",9,20,22.5,50.0,72.5
1,chr6,32525559,"[T, C]",rs201618469,0.027942,0,intron_variant,[HLA-DRB5],[None],18,...,28.0,85.106384,1.083374e-05,"[S_7212, S_7229, S_7246, S_7252, S_7261, S_726...","[S_7245, S_7247, S_7255, S_7294, WGS_163b, WGS...",9,20,22.5,50.0,72.5
2,chr6,32529071,"[C, T]",rs796985052,0.047066,0,intron_variant,[HLA-DRB5],[None],22,...,27.000002,89.361702,9.174138e-07,"[S_7212, S_7213, S_7229, S_7245, S_7247, S_725...",[],18,21,45.0,52.499996,97.5
3,chr6,32529075,"[C, A]",rs1019319406,0.014959,0,intron_variant,[HLA-DRB5],[None],22,...,27.000002,89.361702,9.174138e-07,"[S_7212, S_7229, S_7236, S_7252, S_7255, S_726...",[],21,18,52.499996,45.0,97.5
4,chr6,32529084,"[T, A]",rs965122936,0.016525,0,intron_variant,[HLA-DRB5],[None],23,...,27.000002,89.361702,5.866651e-07,"[S_7212, S_7229, S_7236, S_7246, S_7252, S_725...",[],20,19,50.0,47.5,97.5
5,chr6,32529099,"[G, A]",,0.015727,0,intron_variant,[HLA-DRB5],[None],24,...,26.0,89.361702,2.52871e-07,"[S_7212, S_7229, S_7236, S_7246, S_7252, S_725...",[],21,18,52.499996,45.0,97.5
6,chr6,32529948,"[A, T]",rs114292540,0.005315,0,intron_variant,[HLA-DRB5],[None],22,...,38.0,93.61702,6.675157e-08,"[S_7212, S_7229, S_7246, S_7261, S_7263, S_727...",[],20,20,50.0,50.0,100.0
7,chr6,32529983,"[A, T]",rs115098928,0.003106,0,intron_variant,[HLA-DRB5],[None],19,...,44.0,93.61702,3.162186e-07,"[S_7212, S_7229, S_7246, S_7252, S_7261, S_726...",[],18,22,45.0,55.0,100.0
8,chr6,32530090,"[C, T]",rs752268643,0.034809,0,intron_variant,[HLA-DRB5],[None],19,...,46.0,93.61702,9.810277e-08,"[S_7212, S_7229, S_7246, S_7252, S_7255, S_726...",[],12,24,30.000002,60.000004,90.0
9,chr6,32530133,"[T, C]",rs139583918,0.025449,0,missense_variant,[HLA-DRB5],[None],19,...,44.0,93.61702,3.162186e-07,"[S_7212, S_7229, S_7246, S_7252, S_7255, S_726...",[],12,25,30.000002,62.5,92.5


## rare variants

In [274]:
mt = hl.read_matrix_table('/net/archive/groups/plggneuromol/sportsmen-wgs/mts/sporstmen-005-intragenic-filtered-better.mt')

In [275]:
mt = mt.filter_rows(hl.agg.any(mt.GT.is_non_ref()))

In [276]:
mt = mt.filter_rows(hl.is_snp(mt.alleles[0], mt.alleles[1]))

In [277]:
mt = mt.annotate_cols(group = hl.cond(mt.s.contains("B"), "sport", "control"))

  """Entry point for launching an IPython kernel.


In [278]:
mt = mt.filter_rows((mt.gnomad_v3.v3_nfe.AF < 0.0001) & (mt.gnomad_v3.v3_nfe.AF > 0))

In [279]:
mt = mt.annotate_rows(controls_non_ref = hl.agg.filter((mt.group == 'control'), hl.agg.count_where(mt.GT.is_non_ref())),
                      sports_non_ref = hl.agg.filter((mt.group == 'sport'), hl.agg.count_where(mt.GT.is_non_ref())))

In [280]:
mt = mt.select_rows(rsid = mt.gnomad_v3.rsid,
                                 gnomAD_v3_1_allele_frequency = mt.gnomad_v3.v3_nfe.AF,
                                 gnomAD_v3_1_homozygote_count = mt.gnomad_v3.v3_nfe.homozygote_count,
                                 predicted_most_severe_consequence = mt.gnomad_v3.most_severe_consequence,
                                 gene = mt.within_gene,
                                 hpo = mt.hpo,
                                 controls_non_ref = mt.controls_non_ref,
                                 sports_non_ref = mt.sports_non_ref)

mt = mt.annotate_rows(
    samples_het = hl.agg.filter(mt.GT.is_het(), hl.agg.collect(mt.s)),
    samples_hom_non_ref = hl.agg.filter(mt.GT.is_hom_var(), hl.agg.collect(mt.s)))

In [281]:
mt = mt.filter_rows(mt.predicted_most_severe_consequence.contains('gain'))

In [282]:
mt = mt.annotate_rows(gts_non_ref = gts.index(mt.row_key).gts_non_ref,
                                gts_ref = gts.index(mt.row_key).gts_ref,
                                percent_gts_non_ref = gts.index(mt.row_key).percent_gts_non_ref,
                                percent_gts_ref = gts.index(mt.row_key).percent_gts_ref,
                                percent_gts_genotyped = gts.index(mt.row_key).percent_gts_genotyped)

In [283]:
mt = mt.filter_rows(mt.sports_non_ref > 1)

In [284]:
mt.count()

(2, 147)

In [285]:
to_export = mt.rows()
to_export = to_export.to_pandas()

In [286]:
to_export.columns

Index(['locus.contig', 'locus.position', 'alleles', 'rsid',
       'gnomAD_v3_1_allele_frequency', 'gnomAD_v3_1_homozygote_count',
       'predicted_most_severe_consequence', 'gene', 'hpo', 'controls_non_ref',
       'sports_non_ref', 'samples_het', 'samples_hom_non_ref', 'gts_non_ref',
       'gts_ref', 'percent_gts_non_ref', 'percent_gts_ref',
       'percent_gts_genotyped'],
      dtype='object')

In [287]:
to_export.to_csv('rare_variants_sportsmen')

In [288]:
to_export

Unnamed: 0,locus.contig,locus.position,alleles,rsid,gnomAD_v3_1_allele_frequency,gnomAD_v3_1_homozygote_count,predicted_most_severe_consequence,gene,hpo,controls_non_ref,sports_non_ref,samples_het,samples_hom_non_ref,gts_non_ref,gts_ref,percent_gts_non_ref,percent_gts_ref,percent_gts_genotyped
0,chr15,89315318,"[C, T]",rs121918164,9.3e-05,0,stop_gained,"[FANCI, POLG]","[[HP:0000072:Hydroureter, HP:0001053:Hypopigme...",0,3,"[B439, B480, B482]",[],0,40,0.0,100.0,100.0
1,chr16,5239977,"[G, T]",rs1472046975,3.2e-05,0,stop_gained,[RBFOX1],[None],2,3,"[B382, B431, B533, S_7213, S_7241]",[],1,38,2.5,95.0,97.5


### variant analysis in TTN (1 - filtering, 2 - fisher)

In [376]:
mt = hl.read_matrix_table('/net/archive/groups/plggneuromol/sportsmen-wgs/mts/sporstmen-005-intragenic-filtered-better.mt')

In [377]:
mt.count()

(3154875, 147)

In [378]:
mt = mt.filter_rows(mt.within_gene.contains('TTN'))

In [379]:
mt = mt.filter_rows((mt.sports_non_ref == 0), keep=False)

In [383]:
mt = mt.filter_rows(mt.gnomad_v3.v3_nfe.AF < 0.005)

In [384]:
mt.count()

(177, 147)

In [385]:
#select impactful variants

mt.aggregate_rows(hl.agg.counter(mt.gnomad_v3.most_severe_consequence))

{'missense_variant': 112,
 'splice_region_variant': 5,
 'inframe_deletion': 3,
 'synonymous_variant': 49,
 '3_prime_UTR_variant': 8}

In [386]:
to_test = mt.rows()
to_test = to_test.select(to_test.controls_non_ref, to_test.sports_non_ref, to_test.controls_ref, to_test.sports_ref)

to_test.write('/net/archive/groups/plggneuromol/sportsmen-wgs/mts/to_test_ttn.ht')

2021-04-23 21:46:08 Hail: INFO: wrote table with 177 rows in 3061 partitions to /net/archive/groups/plggneuromol/sportsmen-wgs/mts/to_test_ttn.ht
    Total size: 65.90 KiB
    * Rows: 65.89 KiB
    * Globals: 11.00 B
    * Smallest partition: 0 rows (21.00 B)
    * Largest partition:  92 rows (1.61 KiB)


In [387]:
to_test = hl.read_table('/net/archive/groups/plggneuromol/sportsmen-wgs/mts/to_test_ttn.ht')

In [388]:
to_test.count()

177

In [389]:
to_test = to_test.annotate(fisher = hl.fisher_exact_test(hl.int32(to_test.controls_non_ref), 
                                                         hl.int32(to_test.controls_ref),
                                                         hl.int32(to_test.sports_non_ref),
                                                         hl.int32(to_test.sports_ref)))

In [390]:
mt = hl.read_matrix_table('/net/archive/groups/plggneuromol/sportsmen-wgs/mts/sporstmen-005-intragenic-filtered-better.mt')

In [391]:
mt_test = mt.filter_rows(hl.is_defined(to_test[mt.row_key]), keep = True)

In [392]:
mt_test = mt_test.annotate_rows(fisher = to_test[mt_test.row_key]['fisher'])

In [393]:
mt_test = mt_test.select_rows(rsid = mt_test.gnomad_v3.rsid,
                                 gnomAD_v3_1_allele_frequency = mt_test.gnomad_v3.v3_nfe.AF,
                                 gnomAD_v3_1_homozygote_count = mt_test.gnomad_v3.v3_nfe.homozygote_count,
                                 predicted_most_severe_consequence = mt_test.gnomad_v3.most_severe_consequence,
                                 gene = mt_test.within_gene,
                                 hpo = mt_test.hpo,
                                 controls_non_ref = mt_test.controls_non_ref,
                                 sports_non_ref = mt_test.sports_non_ref,
                                 controls_ref = mt_test.controls_ref,
                                 sports_ref = mt_test.sports_ref,
                                 percent_controls_non_ref = mt_test.controls_non_ref/47*100,
                                 percent_sports_non_ref = mt_test.sports_non_ref/100*100,
                                 percent_controls_ref = mt_test.controls_ref/47*100,
                                 percent_sports_ref = mt_test.sports_ref/100*100,
                                 percent_sports_genotyped = (mt_test.sports_non_ref + mt_test.sports_ref)/100*100,
                                 percent_controls_genotyped = (mt_test.controls_non_ref + mt_test.controls_ref)/47*100,
                                 fisher = mt_test.fisher.p_value,
                                 )

In [394]:
mt_test = mt_test.filter_rows(hl.is_snp(mt_test.alleles[0], mt_test.alleles[1]))

In [395]:
mt_test = mt_test.annotate_rows(
    samples_het = hl.agg.filter(mt_test.GT.is_het(), hl.agg.collect(mt_test.s)),
    samples_hom_non_ref = hl.agg.filter(mt_test.GT.is_hom_var(), hl.agg.collect(mt_test.s)))

In [396]:
mt_test.count()

(173, 147)

In [397]:
### annotate with frequency in GTS:

gts = hl.read_matrix_table('/net/archive/groups/plggneuromol/sportsmen-wgs/mts/unrelated-gts.mt')

gts = gts.annotate_rows(gts_non_ref = hl.agg.count_where(gts.GT.is_non_ref()),
                        gts_ref = hl.agg.count_where(gts.GT.is_hom_ref()))

gts = gts.rows()

gts = gts.select(gts_non_ref = gts.gts_non_ref,
                 gts_ref = gts.gts_ref,
                 percent_gts_non_ref = gts.gts_non_ref/40*100,
                 percent_gts_ref = gts.gts_ref/40*100,
                 percent_gts_genotyped = (gts.gts_non_ref + gts.gts_ref)/40*100)

mt_test = mt_test.annotate_rows(gts_non_ref = gts.index(mt_test.row_key).gts_non_ref,
                                gts_ref = gts.index(mt_test.row_key).gts_ref,
                                percent_gts_non_ref = gts.index(mt_test.row_key).percent_gts_non_ref,
                                percent_gts_ref = gts.index(mt_test.row_key).percent_gts_ref,
                                percent_gts_genotyped = gts.index(mt_test.row_key).percent_gts_genotyped)

In [398]:
to_export = mt_test.rows()

to_export = to_export.to_pandas()

In [399]:
to_export.columns

Index(['locus.contig', 'locus.position', 'alleles', 'rsid',
       'gnomAD_v3_1_allele_frequency', 'gnomAD_v3_1_homozygote_count',
       'predicted_most_severe_consequence', 'gene', 'hpo', 'controls_non_ref',
       'sports_non_ref', 'controls_ref', 'sports_ref',
       'percent_controls_non_ref', 'percent_sports_non_ref',
       'percent_controls_ref', 'percent_sports_ref',
       'percent_sports_genotyped', 'percent_controls_genotyped', 'fisher',
       'samples_het', 'samples_hom_non_ref', 'gts_non_ref', 'gts_ref',
       'percent_gts_non_ref', 'percent_gts_ref', 'percent_gts_genotyped'],
      dtype='object')

In [400]:
to_export.shape

(173, 27)

In [401]:
to_export

Unnamed: 0,locus.contig,locus.position,alleles,rsid,gnomAD_v3_1_allele_frequency,gnomAD_v3_1_homozygote_count,predicted_most_severe_consequence,gene,hpo,controls_non_ref,...,percent_sports_genotyped,percent_controls_genotyped,fisher,samples_het,samples_hom_non_ref,gts_non_ref,gts_ref,percent_gts_non_ref,percent_gts_ref,percent_gts_genotyped
0,chr2,178525997,"[T, C]",rs72629798,0.003082,0,3_prime_UTR_variant,"[TTN, TTN-AS1]",[[HP:0003687:Centrally nucleated skeletal musc...,0,...,93.0,100.0,0.550771,"[B459, B81]",[],1,38,2.5,95.0,97.5
1,chr2,178526270,"[T, C]",rs72629797,0.000093,0,3_prime_UTR_variant,"[TTN, TTN-AS1]",[[HP:0003687:Centrally nucleated skeletal musc...,0,...,93.0,100.0,1.000000,[B457],[],0,39,0.0,97.5,97.5
2,chr2,178527006,"[G, T]",,0.000774,0,3_prime_UTR_variant,"[TTN, TTN-AS1]",[[HP:0003687:Centrally nucleated skeletal musc...,1,...,93.0,100.0,1.000000,"[B518, WGS_37b]",[],0,40,0.0,100.0,100.0
3,chr2,178527288,"[T, C]",rs55832587,0.001657,0,synonymous_variant,"[TTN, TTN-AS1]",[[HP:0003687:Centrally nucleated skeletal musc...,0,...,93.0,100.0,0.550771,"[B435, B505]",[],0,39,0.0,97.5,97.5
4,chr2,178529113,"[C, T]",rs56324602,0.001642,0,synonymous_variant,"[TTN, TTN-AS1]",[[HP:0003687:Centrally nucleated skeletal musc...,0,...,93.0,100.0,0.550771,"[B435, B505]",[],0,40,0.0,100.0,100.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
168,chr2,178795088,"[C, G]",rs56128843,0.001843,0,missense_variant,[TTN],[[HP:0003687:Centrally nucleated skeletal musc...,0,...,93.0,100.0,0.550771,"[B435, B505]",[],0,39,0.0,97.5,97.5
169,chr2,178800552,"[G, A]",,0.001750,0,synonymous_variant,[TTN],[[HP:0003687:Centrally nucleated skeletal musc...,0,...,93.0,100.0,0.550771,"[B435, B505]",[],0,39,0.0,97.5,97.5
170,chr2,178804577,"[G, A]",rs143623862,0.000062,0,synonymous_variant,[TTN],[[HP:0003687:Centrally nucleated skeletal musc...,0,...,93.0,100.0,1.000000,[B477],[],0,40,0.0,100.0,100.0
171,chr2,178830033,"[C, T]",rs76671760,0.001858,0,3_prime_UTR_variant,"[AC092640.1, CCDC141, TTN]","[[HP:0000771:Gynecomastia, HP:0000054:Micropen...",0,...,93.0,100.0,0.550771,"[B435, B505]",[],0,39,0.0,97.5,97.5


In [403]:
to_export.to_csv('ttn')