In [1]:
import hail as hl
import os

localfs_path = os.environ.get('SCRATCH') + '/hail-temp/'
hl.init(tmp_dir=localfs_path, local_tmpdir=localfs_path, 
        spark_conf={'spark.driver.memory': '15G', 'spark.executor.memory': '30G'})



2023-03-22 17:11:58.844 WARN  NativeCodeLoader:60 - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
Running on Apache Spark version 3.1.3
SparkUI available at http://ac0003:4040
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.109-b71b065e4bb6
LOGGING: writing to /net/pr2/projects/plgrid/plggneuromol/imdik-zekanowski-sportwgs/analysis/variant-analysis-and-exports/hail-20230322-1711-0.2.109-b71b065e4bb6.log


In [71]:
from hail.plot import show
from pprint import pprint
from bokeh.layouts import gridplot
hl.plot.output_notebook()


import numpy as np
import pandas as pd
from functools import reduce
from itertools import chain

from bokeh.plotting import output_notebook, show, figure
from bokeh.palettes import viridis
from scipy import stats
from statsmodels.stats.multitest import fdrcorrection

output_notebook()

### Phenotypes to investigate and their respective phenotypes in genebass:


Many of the phenotypes do not have their exact respective phenotypes, so the closest phenotype was chosen:

panukb phenotype | genebass phenotype
--------------- | ------------------
anti_gout_agent_microtuble_polymerization_inhibitor-both_sexes | none
allopurinol | categorical-20003-both_sexes-1140875408- - Treatment/medication code
immature_reticulocyte_fraction | Immature reticulocyte fraction
both_sexes--platelet_crit | Platelet crit
biobankuk-egfrcreacys-both_sexes--estimated_glomerular_filtration_rate_cystain_c-EUR | none
forced vital capacity best measure | fvc best measure
biobankuk-20002-both_sexes-1309-non_cancer_illness_code_self_reported-EUR | M81 Osteoporosis without pathological fracture

- gene lists were prepared from the genebass results SKAT-O tests based on the previously set genome-wide significance threshold: 2.5 × 10−7
- We also conducted analyses for the 1 x 10-6 and 1 x 10-5 thresholds

### Import and filter genebass:

In [5]:
genebass = hl.read_matrix_table('/net/pr2/projects/plgrid/plggneuromol/resources/genebass-500k/results.mt')

In [6]:
pheno_filters = (
    genebass.coding == "1140875408"
) | (
    genebass.description == "Immature reticulocyte fraction"
) | (
    genebass.description == "Platelet crit"
) | (
    genebass.description == "Forced vital capacity (FVC), Best measure"
) | (
    genebass.description_more.contains('M81')
)  

## First lets analyse genes that were significant for missense and low-confidence lof variants

In [7]:
genebass = genebass.filter_cols(pheno_filters)
genebass = genebass.filter_rows(genebass.annotation == 'missense|LC')

In [8]:
genebass.count()



(19403, 5)

### Get gene lists for different cut offs:

In [9]:
genebass = genebass.annotate_cols(
    below_8 = hl.agg.filter((
        genebass.Pvalue < 1e-08),
        hl.agg.collect_as_set(genebass.gene_symbol)
    ),
    below_7 = hl.agg.filter((
        genebass.Pvalue < 2.5e-07), #this is the significance threshold suggested by genebass
        hl.agg.collect_as_set(genebass.gene_symbol)
    ),
    below_6 = hl.agg.filter((
        genebass.Pvalue < 1e-06),
        hl.agg.collect_as_set(genebass.gene_symbol)
    ),
    below_5 = hl.agg.filter((
        genebass.Pvalue < 1e-05),
        hl.agg.collect_as_set(genebass.gene_symbol)
    )
)

In [11]:
genebass = genebass.select_cols(
    genebass.below_8,
    genebass.below_7,
    genebass.below_6,
    genebass.below_5,
    genebass.description,
    genebass.description_more,
    genebass.coding_description
)

In [12]:
genebass = genebass.annotate_cols(
    len_below_5 = hl.len(genebass.below_5),
    len_below_6 = hl.len(genebass.below_6),
    len_below_7 = hl.len(genebass.below_7),
    len_below_8 = hl.len(genebass.below_8)
)

In [13]:
genebass = genebass.annotate_cols(
    description = hl.coalesce(
        genebass.coding_description,
        genebass.description
    )
)

### The matrix table below has already been filtered and annotated, see preprocessing/joint-with-gts/genotype-and-filter.ipynb

In [3]:
mt = hl.read_matrix_table('/net/pr2/projects/plgrid/plggneuromol/imdik-zekanowski-sportwgs/data/joint/full-healthy.mt')

In [4]:
mt.count()

(2639210, 237)

In [5]:
proper_controls = ['WGS_37b', 'WGS_37c', 'WGS_163d', 'WGS_7120', 'WGS_7142', 'WGS_7143', 'WGS_7152',
 'WGS_7153', 'WGS_85b', 'WGS_147c', 'WGS_180b', 'WGS_185c', 'WGS_6819', 'S_7213', 'S_7227', 'S_7241', 'S_7246', 'S_7254', 'S_7274',
                   'S_7307', '494', '462', '468', '492', '490'] 

In [6]:
len(proper_controls)

25

### Only keep intragenic variants

In [26]:
mt = mt.filter_rows(mt.within_gene == hl.empty_array(hl.tstr), keep = False)

### Remove intervals that have regions that may be attributed to alternative loci

In [27]:
bed = hl.import_bed(
    '/net/pr2/projects/plgrid/plggneuromol/matzieb/projects/imdik-zekanowski-sportwgs/data/prs-data/all-alt-scaffold-placement-GRCh38.p14.bed',
    reference_genome='GRCh38'
)

2023-03-06 10:40:38.405 Hail: INFO: wrote table with 260 rows in 1 partition to /net/ascratch/people/plggosborcz/hail-temp/persist_tableOLmIkK7FAu
2023-03-06 10:40:38.807 Hail: INFO: Reading table without type imputation
  Loading field 'f0' as type str (user-supplied)
  Loading field 'f1' as type int32 (user-supplied)
  Loading field 'f2' as type int32 (user-supplied)
  Loading field 'f3' as type str (user-supplied)
  Loading field 'f4' as type str (user-supplied)
  Loading field 'f5' as type str (not specified)
  Loading field 'f6' as type str (not specified)


In [28]:
mt = mt.filter_rows(hl.is_defined(bed[mt.locus]), keep = False)
mt.write('/net/ascratch/people/plggosborcz/no-alt.mt', overwrite = True)

2023-03-06 10:40:48.556 Hail: INFO: Ordering unsorted dataset with network shuffle
2023-03-06 10:40:49.289 Hail: INFO: Ordering unsorted dataset with network shuffle
2023-03-06 10:44:17.343 Hail: INFO: wrote matrix table with 1682209 rows and 237 columns in 14999 partitions to /net/ascratch/people/plggosborcz/no-alt.mt


In [7]:
mt = hl.read_matrix_table('/net/ascratch/people/plggosborcz/no-alt.mt')

### Only keep proprerly healthy controls

In [8]:
mt.aggregate_cols(hl.agg.counter(mt.group))

{'1kg': 98, 'control': 39, 'sport': 100}

In [31]:
mt = mt.filter_cols(
    (mt.group == '1kg') | (mt.group == 'sport') | (hl.literal(proper_controls)).contains(mt.s)
)

In [32]:
mt.write('/net/ascratch/people/plggosborcz/filtered.mt', overwrite = True)
mt = hl.read_matrix_table('/net/ascratch/people/plggosborcz/filtered.mt')

2023-03-06 10:49:11.467 Hail: INFO: wrote matrix table with 1682209 rows and 220 columns in 14999 partitions to /net/ascratch/people/plggosborcz/filtered.mt


In [33]:
mt.count()

(1682209, 220)

In [None]:
mt.naive_coalesce(200).write('/net/ascratch/people/plggosborcz/repartitioned.mt', overwrite = True)

[Stage 13:=>                                                     (4 + 30) / 200]

In [9]:
mt = hl.read_matrix_table('/net/ascratch/people/plggosborcz/repartitioned.mt')

## calculate per-gene burden in 6 ways:
 - sum of CADD for variants with CADD > 20
 - sum of CADD for variants with CADD > 16
 - sum of CADD for variants with CADD > 10
 - any CADD > 16
 - any CADD > 20
 - any CADD > 30
 - any HC lof

In [None]:
mt = mt.explode_rows(mt.within_gene)
mt.write('/net/ascratch/people/plggosborcz/exploded.mt', overwrite = True)

In [10]:
mt = hl.read_matrix_table('/net/ascratch/people/plggosborcz/exploded.mt')

In [11]:
mt = mt.filter_cols(mt.s != '494')

In [51]:
# edit here, we need 3 more options - continous will be for logistic regression (for phenotype) - 

mt = mt.annotate_entries(
    cadd_10 = hl.if_else(
        (mt.cadd.cadd_score > 10),
        (mt.GT.n_alt_alleles()*mt.cadd.cadd_score),
        hl.int(0)
    ),
    cadd_16 = hl.if_else(
        (mt.cadd.cadd_score > 16),
        (mt.GT.n_alt_alleles()*mt.cadd.cadd_score),
        hl.int(0)
    ),
    cadd_20 = hl.if_else(
        (mt.cadd.cadd_score > 20),
        (mt.GT.n_alt_alleles()*mt.cadd.cadd_score),
        hl.int(0)
    ),
    cadd_30 = hl.if_else(
        (mt.cadd.cadd_score > 30),
        (mt.GT.n_alt_alleles()*mt.cadd.cadd_score),
        hl.int(0)
    ),
    is_lof = hl.if_else(
        mt.vep.vep.transcript_consequences.lof.contains('HC'),
        mt.GT.n_alt_alleles(),
        hl.int(0)
    )                           
)

In [52]:
mt.write('/net/pr2/projects/plgrid/plggneuromol/imdik-zekanowski-sportwgs/data/hail-mts/for_burden.mt', overwrite = True)

2023-03-06 11:45:04.506 Hail: INFO: wrote matrix table with 1914483 rows and 219 columns in 200 partitions to /net/pr2/projects/plgrid/plggneuromol/imdik-zekanowski-sportwgs/data/hail-mts/for_burden.mt


In [12]:
mt = hl.read_matrix_table('/net/pr2/projects/plgrid/plggneuromol/imdik-zekanowski-sportwgs/data/hail-mts/for_burden.mt')

In [63]:
   per_gene = mt.group_rows_by(
    mt.within_gene).aggregate(
    cadd_above_10_sum = hl.agg.sum(mt.cadd_10), # these will be investigated with logistic regression
    cadd_above_20_sum = hl.agg.sum(mt.cadd_20),
    cadd_above_16_sum = hl.agg.sum(mt.cadd_16),
    any_cadd_above_16 = hl.agg.any(mt.cadd_16 > 0), # these will be investigated with fisher exact
    any_cadd_above_20 = hl.agg.any(mt.cadd_20 > 0),
    any_cadd_above_30 = hl.agg.any(mt.cadd_30 > 0),
    any_lof = hl.agg.any(mt.is_lof > 1)   
)

In [64]:
per_gene = per_gene.naive_coalesce(1)
per_gene.write('/net/ascratch/people/plggosborcz/per_gene.mt', overwrite = True)

2023-03-06 15:06:28.267 Hail: INFO: Ordering unsorted dataset with network shuffle
2023-03-06 15:06:48.189 Hail: INFO: wrote matrix table with 36210 rows and 219 columns in 1 partition to /net/ascratch/people/plggosborcz/per_gene.mt


In [13]:
per_gene = hl.read_matrix_table('/net/ascratch/people/plggosborcz/per_gene.mt')

In [76]:
per_gene = per_gene.filter_rows(
    hl.agg.any(
        (
            per_gene.cadd_above_10_sum > 0
        ) | (
            per_gene.cadd_above_20_sum > 0
        ) | (
            per_gene.cadd_above_16_sum > 0
        ) | (
            per_gene.any_cadd_above_16 > 0
        ) | (
            per_gene.any_cadd_above_20 > 0
        ) | (
            per_gene.any_cadd_above_30 > 0
        ) | (
            per_gene.any_lof > 0
        )
    )
)

In [77]:
per_gene.count()

(20565, 219)

### Perform PCA on variant burden across cohorts

In [78]:
eigenvalues_10, scores_10, _ = hl.pca(
    per_gene.cadd_above_10_sum,
    k=2
)

eigenvalues_20, scores_20, _ = hl.pca(
    per_gene.cadd_above_20_sum,
    k=2
)

eigenvalues_16, scores_16, _ = hl.pca(
    per_gene.cadd_above_16_sum,
    k=2
)

eigenvalues_any_16, scores_any_16, _ = hl.pca(
    per_gene.any_cadd_above_16,
    k=2
)

eigenvalues_any_20, scores_any_20, _ = hl.pca(
    per_gene.any_cadd_above_20,
    k=2
)

eigenvalues_any_30, scores_any_30, _ = hl.pca(
    per_gene.any_cadd_above_30,
    k=2
)

eigenvalues_any_lof, scores_any_lof, _ = hl.pca(
    per_gene.any_lof,
    k=2
)

2023-03-06 15:20:53.983 Hail: INFO: pca: running PCA with 2 components...
2023-03-06 15:20:57.264 Hail: INFO: wrote table with 0 rows in 0 partitions to /net/ascratch/people/plggosborcz/hail-temp/persist_tableI8pSu6GrbQ
    Total size: 4.67 KiB
    * Rows: 0.00 B
    * Globals: 4.67 KiB
    * Smallest partition: N/A
    * Largest partition:  N/A
2023-03-06 15:20:57.859 Hail: INFO: pca: running PCA with 2 components...
2023-03-06 15:20:59.205 Hail: INFO: wrote table with 0 rows in 0 partitions to /net/ascratch/people/plggosborcz/hail-temp/persist_tableewe3uZyU6q
    Total size: 4.67 KiB
    * Rows: 0.00 B
    * Globals: 4.67 KiB
    * Smallest partition: N/A
    * Largest partition:  N/A
2023-03-06 15:20:59.768 Hail: INFO: pca: running PCA with 2 components...
2023-03-06 15:21:01.906 Hail: INFO: wrote table with 0 rows in 0 partitions to /net/ascratch/people/plggosborcz/hail-temp/persist_tableVeM4q7Sxaw
    Total size: 4.67 KiB
    * Rows: 0.00 B
    * Globals: 4.67 KiB
    * Smallest p

In [79]:
per_gene = per_gene.annotate_cols(
    pca_10_sum = scores_10[per_gene.s].scores,
    pca_20_sum = scores_20[per_gene.s].scores,
    pca_16_sum = scores_16[per_gene.s].scores,
    pca_16_any = scores_any_16[per_gene.s].scores,
    pca_20_any = scores_any_20[per_gene.s].scores,
    pca_30_any = scores_any_30[per_gene.s].scores,
    pca_lof = scores_any_lof[per_gene.s].scores
)

In [84]:
p = hl.plot.scatter(per_gene.pca_10_sum[0],
                    per_gene.pca_10_sum[1],
                    label=per_gene.group,
                    title='PCA - sum of variants with CADD over 10', xlabel='PC1', ylabel='PC2')
show(p)

  tmp_data = {c: v.values for c, v in _df.iteritems()}


In [85]:
p2 = hl.plot.scatter(per_gene.pca_20_sum[0],
                     per_gene.pca_20_sum[1],
                     label=per_gene.group,
                     title='PCA - sum of variants with CADD over 20',
                     xlabel='PC1',
                     ylabel='PC2')
show(p2)

  tmp_data = {c: v.values for c, v in _df.iteritems()}


In [86]:
p3 = hl.plot.scatter(per_gene.pca_16_sum[0],
                     per_gene.pca_16_sum[1],
                     label=per_gene.group,
                     title='PCA - sum of variants with CADD over 16',
                     xlabel='PC1',
                     ylabel='PC2')
show(p3)

  tmp_data = {c: v.values for c, v in _df.iteritems()}


In [89]:
p4 = hl.plot.scatter(per_gene.pca_16_any[0],
                     per_gene.pca_16_any[1],
                     label=per_gene.group,
                     title='PCA - any variant with CADD over 16',
                     xlabel='PC1',
                     ylabel='PC2')
show(p4) # be carefull with this analysis - dunno whats going on

  tmp_data = {c: v.values for c, v in _df.iteritems()}


In [90]:
p5 = hl.plot.scatter(per_gene.pca_20_any[0],
                     per_gene.pca_20_any[1],
                     label=per_gene.group,
                     title='PCA - any variant with CADD over 20',
                     xlabel='PC1',
                     ylabel='PC2')
show(p5) # be carefull with this analysis - dunno whats going on

  tmp_data = {c: v.values for c, v in _df.iteritems()}


In [93]:
p7 = hl.plot.scatter(per_gene.pca_lof[0],
                     per_gene.pca_lof[1],
                     label=per_gene.group,
                     title='PCA - any variant with lof',
                     xlabel='PC1',
                     ylabel='PC2')
show(p7)

  tmp_data = {c: v.values for c, v in _df.iteritems()}


### Annotate the per gene table with phenotypes:

In [14]:
burden_opts = ['cadd_above_10_sum', 'cadd_above_20_sum',
               'cadd_above_16_sum', 'any_cadd_above_16', 
               'any_cadd_above_20', 'any_cadd_above_30',
               'any_lof']

In [95]:
gb = genebass.cols()
gb = gb.naive_coalesce(1)
gb.write('/net/ascratch/people/plggosborcz/gb.ht', overwrite = True)

2023-03-06 15:39:23.295 Hail: WARN: cols(): Resulting column table is sorted by 'col_key'.
    To preserve matrix table column order, first unkey columns with 'key_cols_by()'
2023-03-06 15:39:35.182 Hail: INFO: Coerced sorted dataset====>(999 + 1) / 1000]
2023-03-06 15:39:35.674 Hail: INFO: wrote table with 5 rows in 1 partition to /net/ascratch/people/plggosborcz/gb.ht


In [15]:
gb = hl.read_table('/net/ascratch/people/plggosborcz/gb.ht')

In [97]:
per_gene = per_gene.key_rows_by(per_gene.within_gene)

In [98]:
options = ['below_8','below_7','below_6','below_5']

for o in options:
    res = gb.explode(gb[o])
    res = res.key_by(res[o])
    
    per_gene = per_gene.annotate_rows(
        **{
        o: res.index(per_gene.row_key, all_matches= True)['description']
        }
    )

In [99]:
per_gene = per_gene.naive_coalesce(1)

In [100]:
per_gene.write('/net/ascratch/people/plggosborcz/per-gene-pheno.ht', overwrite = True)

2023-03-06 15:42:49.238 Hail: INFO: Ordering unsorted dataset with network shuffle
2023-03-06 15:42:49.428 Hail: INFO: Ordering unsorted dataset with network shuffle
2023-03-06 15:42:49.616 Hail: INFO: Ordering unsorted dataset with network shuffle
2023-03-06 15:42:49.792 Hail: INFO: Ordering unsorted dataset with network shuffle
2023-03-06 15:42:57.191 Hail: INFO: wrote matrix table with 20565 rows and 219 columns in 1 partition to /net/ascratch/people/plggosborcz/per-gene-pheno.ht


In [17]:
mt = hl.read_matrix_table('/net/ascratch/people/plggosborcz/per-gene-pheno.ht')

### Look for differences in burden in any individual gene from the below_5 list

In [264]:
mt_single = mt.filter_rows(hl.is_defined(mt.below_5))

In [265]:
mt_single.count()

(95, 219)

In [268]:
mt_single.rows().show(n_rows=100)

within_gene,below_8,below_7,below_6,below_5
str,array<str>,array<str>,array<str>,array<str>
"""ABCG2""",,"[""allopurinol""]","[""allopurinol""]","[""allopurinol""]"
"""AC126283.2""",,,"[""Platelet crit""]","[""Platelet crit""]"
"""ACVRL1""",,,"[""Platelet crit""]","[""Platelet crit""]"
"""ADAMTS1""","[""Forced vital capacity (FVC), Best measure""]","[""Forced vital capacity (FVC), Best measure""]","[""Forced vital capacity (FVC), Best measure""]","[""Forced vital capacity (FVC), Best measure""]"
"""ADAMTS6""","[""Forced vital capacity (FVC), Best measure""]","[""Forced vital capacity (FVC), Best measure""]","[""Forced vital capacity (FVC), Best measure""]","[""Forced vital capacity (FVC), Best measure""]"
"""AHI1""",,,"[""Platelet crit""]","[""Platelet crit""]"
"""AL451136.1""",,,,"[""Immature reticulocyte fraction""]"
"""ALDH16A1""","[""allopurinol""]","[""allopurinol""]","[""allopurinol""]","[""allopurinol""]"
"""AMOTL1""",,,"[""Platelet crit""]","[""Platelet crit""]"
"""AMPD3""",,"[""Immature reticulocyte fraction""]","[""Immature reticulocyte fraction""]","[""Immature reticulocyte fraction""]"


In [260]:
gene_list = list(
    set(
        mt_single.explode_rows(
            mt_single.below_5
        )['within_gene'].collect()
    )
)

In [261]:
mt_single = mt_single.entries()

In [262]:
mt_single = mt_single.select(
    genebass_pheno = mt_single.below_5,
    group = mt_single.group,
    sport_type = mt_single.sport_phenotypes.type,
    cadd_above_10_sum =  mt_single['cadd_above_10_sum'],
    cadd_above_20_sum =  mt_single['cadd_above_20_sum'],
    cadd_above_16_sum =  mt_single['cadd_above_16_sum'],
    any_cadd_above_16 = mt_single['any_cadd_above_16'],
    any_cadd_above_20 = mt_single['any_cadd_above_20'],
    any_lof = mt_single['any_lof'] 
)

In [263]:
mt_single.show()

within_gene,s,genebass_pheno,group,sport_type,cadd_above_10_sum,cadd_above_20_sum,cadd_above_16_sum,any_cadd_above_16,any_cadd_above_20,any_lof
str,str,array<str>,str,str,float64,float64,float64,bool,bool,bool
"""ABCG2""","""462""","[""allopurinol""]","""control""",,40.7,0.0,0.0,False,False,False
"""ABCG2""","""468""","[""allopurinol""]","""control""",,30.7,0.0,0.0,False,False,False
"""ABCG2""","""490""","[""allopurinol""]","""control""",,30.7,0.0,0.0,False,False,False
"""ABCG2""","""492""","[""allopurinol""]","""control""",,44.0,0.0,0.0,False,False,False
"""ABCG2""","""B102""","[""allopurinol""]","""sport""","""speed""",40.7,0.0,0.0,False,False,False
"""ABCG2""","""B156""","[""allopurinol""]","""sport""","""endurance""",47.3,0.0,0.0,False,False,False
"""ABCG2""","""B24""","[""allopurinol""]","""sport""","""speed""",30.4,0.0,0.0,False,False,False
"""ABCG2""","""B338""","[""allopurinol""]","""sport""","""endurance""",44.0,0.0,0.0,False,False,False
"""ABCG2""","""B382""","[""allopurinol""]","""sport""","""endurance""",44.0,0.0,0.0,False,False,False
"""ABCG2""","""B399""","[""allopurinol""]","""sport""","""speed""",40.7,0.0,0.0,False,False,False


In [223]:
mt_single = mt_single.to_pandas()

In [224]:
def ttest(df, groups, group_field, burden_field):
    pval = stats.ttest_ind(
        list(pd.to_numeric(df[df[group_field] == groups[0]][burden_field])),
        list(pd.to_numeric(df[df[group_field] == groups[1]][burden_field]))
    ).pvalue
    return pval

In [225]:
mt_single_grouped = mt_single.groupby('within_gene')

In [226]:
burden_opts = [
    'cadd_above_10_sum',
    'cadd_above_20_sum',
    'cadd_above_16_sum',
    'any_cadd_above_16',
    'any_cadd_above_20',
    'any_lof'  
]

comparisons = [
    [['sport','control'],'group'],
    [['control','1kg'],'group'],
    [['1kg','sport'],'group'],
    [['endurance','speed'],'sport_type']
]

groups = [['sport','group'],
          ['control','group'],
          ['1kg','group'],
          ['endurance','sport_type'],
          ['speed','sport_type']]

In [227]:
out = {
    f'{o}_{groups[0]}_vs_{groups[1]}_p_value': mt_single_grouped.apply(
        lambda df: ttest(df, groups, group_field, o)
    )
    for o in burden_opts
    for groups, group_field in comparisons
    }

  pval = stats.ttest_ind(


In [228]:
stds = {
    f'{o}_{group}_std': mt_single_grouped.apply(
        lambda df: df[df[group_col] == group][o].std()
    )
    for o in burden_opts
    for group, group_col in groups
}

means = {
    f'{o}_{group}_mean': mt_single_grouped.apply(
        lambda df: df[df[group_col] == group][o].mean()
    )
    for o in burden_opts
    for group, group_col in groups
}

In [229]:
out.update(means)

In [230]:
out.update(stds)

In [231]:
df = pd.DataFrame(out)

In [232]:
for o in burden_opts:
    for groups, group_field in comparisons:
        df[
            f'{o}_{groups[0]}_vs_{groups[1]}_fdr'
        ] = fdrcorrection(
            df[
                f'{o}_{groups[0]}_vs_{groups[1]}_p_value'
            ].fillna(1),
            alpha=0.1    
        )[1]

In [251]:
df[df['cadd_above_10_sum_endurance_vs_speed_p_value']<0.05]['cadd_above_10_sum_endurance_vs_speed_fdr']

within_gene
ATXN2      0.271080
CCDC124    0.271080
CRTAC1     0.271080
DDR2       0.451267
GRAMD2A    0.231861
IHH        0.494486
JAK2       0.540567
Name: cadd_above_10_sum_endurance_vs_speed_fdr, dtype: float64

For final results we will export cadd_above_10_sum 

In [248]:
len(df.columns.str.contains('cadd_above_10_sum'))

108

In [249]:
len(df.columns)

108

In [256]:
df.loc[:, df.columns.str.contains('cadd_above_10_sum')].to_csv('burden-results.csv')

In [198]:
mt_single = mt.filter_rows(hl.is_defined(mt.below_6))

In [199]:
mt_single.count()

(75, 219)

In [200]:
gene_list = list(
    set(
        mt_single.explode_rows(
            mt_single.below_6
        )['within_gene'].collect()
    )
)

In [201]:
mt_single = mt_single.entries()

In [202]:
mt_single = mt_single.select(
    genebass_pheno = mt_single.below_6,
    group = mt_single.group,
    sport_type = mt_single.sport_phenotypes.type,
    cadd_above_10_sum =  mt_single['cadd_above_10_sum'],
    cadd_above_20_sum =  mt_single['cadd_above_20_sum'],
    cadd_above_16_sum =  mt_single['cadd_above_16_sum'],
    any_cadd_above_16 = mt_single['any_cadd_above_16'],
    any_cadd_above_20 = mt_single['any_cadd_above_20'],
    any_lof = mt_single['any_lof'] 
)

In [203]:
mt_single = mt_single.to_pandas()

In [204]:
def ttest(df, groups, group_field, burden_field):
    pval = stats.ttest_ind(
        list(pd.to_numeric(df[df[group_field] == groups[0]][burden_field])),
        list(pd.to_numeric(df[df[group_field] == groups[1]][burden_field]))
    ).pvalue
    return pval

In [205]:
mt_single_grouped = mt_single.groupby('within_gene')

In [206]:
burden_opts = [
    'cadd_above_10_sum',
    'cadd_above_20_sum',
    'cadd_above_16_sum',
    'any_cadd_above_16',
    'any_cadd_above_20',
    'any_lof'  
]

comparisons = [
    [['sport','control'],'group'],
    [['control','1kg'],'group'],
    [['1kg','sport'],'group'],
    [['endurance','speed'],'sport_type']
]

groups = [['sport','group'],
          ['control','group'],
          ['1kg','group'],
          ['endurance','sport_type'],
          ['speed','sport_type']]

In [207]:
out = {
    f'{o}_{groups[0]}_vs_{groups[1]}_p_value': mt_single_grouped.apply(
        lambda df: ttest(df, groups, group_field, o)
    )
    for o in burden_opts
    for groups, group_field in comparisons
    }

  pval = stats.ttest_ind(


In [208]:
stds = {
    f'{o}_{group}_std': mt_single_grouped.apply(
        lambda df: df[df[group_col] == group][o].std()
    )
    for o in burden_opts
    for group, group_col in groups
}

means = {
    f'{o}_{group}_mean': mt_single_grouped.apply(
        lambda df: df[df[group_col] == group][o].mean()
    )
    for o in burden_opts
    for group, group_col in groups
}

In [209]:
out.update(means)

In [210]:
out.update(stds)

In [211]:
df = pd.DataFrame(out)

In [212]:
for o in burden_opts:
    for groups, group_field in comparisons:
        df[
            f'{o}_{groups[0]}_vs_{groups[1]}_fdr'
        ] = fdrcorrection(
            df[
                f'{o}_{groups[0]}_vs_{groups[1]}_p_value'
            ].fillna(1),
            alpha=0.1    
        )[1]

In [213]:
df.to_csv('above_6.csv')

In [215]:
df[df['cadd_above_10_sum_endurance_vs_speed_p_value']<0.05]['cadd_above_10_sum_endurance_vs_speed_fdr']

within_gene
ATXN2    0.698933
IHH      0.698933
JAK2     0.698933
Name: cadd_above_10_sum_endurance_vs_speed_fdr, dtype: float64

In [216]:
df[df['cadd_above_10_sum_sport_vs_control_p_value']<0.05]['cadd_above_10_sum_sport_vs_control_fdr']

within_gene
DOCK6    0.627423
JAK2     0.164103
SPHK1    0.188562
WDR6     0.633704
Name: cadd_above_10_sum_sport_vs_control_fdr, dtype: float64

For final results we will export cadd_above_10_sum 