In [1]:
import os
import sys

import hail as hl
import numpy as np
import pandas as pd
from scipy import stats

from IPython.display import display
from IPython.display import HTML

# Let the notebook container use the full browser width.
_full_width_css = "<style>.container { width:100% !important; }</style>"
display(HTML(_full_width_css))

# Start Hail with GRCh38 as the default reference genome; the Spark driver and
# executors are each given 30G, and temporary files go to the relative dir 'temp2'.
_spark_memory_conf = {'spark.driver.memory': '30G', 'spark.executor.memory': '30G'}
hl.init(tmp_dir='temp2', spark_conf=_spark_memory_conf, default_reference='GRCh38')

KeyboardInterrupt: 

## This part starts from the unfiltered per-chromosome matrix tables; sportsmen and healthy unrelated GTS samples are kept

## MCT1
chr1:112,911,847-112,957,593 (chr1:112913924)



In [None]:
# Cut the MCT1 region (chr1:112,911,847-112,957,593) out of the dense chr1 matrix
# table, store it as a 5-partition matrix table, and continue from the stored copy.
#
# Fix: the subset used to be written to the relative path 'mct1.mt' while the next
# statement read it from the absolute preprocessing directory, so the cell only
# worked when the kernel happened to run from that directory. One path is now used
# for both the write and the read (as the NRF2 section already does).
mct1_region_path = '/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/preprocessing/joint-with-gts/mct1.mt'

chr1 = hl.read_matrix_table('/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/data/joint/dense-all_chr1.mt')

mct1 = hl.filter_intervals(chr1, [(hl.parse_locus_interval('chr1:112911847-112957593', reference_genome='GRCh38'))])
mct1 = mct1.naive_coalesce(5)
mct1.write(mct1_region_path)

mt = hl.read_matrix_table(mct1_region_path)

# Locus-level masks: rows found in the RepeatMasker table are removed, and only rows
# found in the gnomAD coverage table are kept.
rpmk = hl.read_table('/net/archive/groups/plggneuromol/imdik-zekanowski-gts/data/external-data/repeatmasker-extended-keyed.ht')
cov = hl.read_table('/net/archive/groups/plggneuromol/imdik-zekanowski-gts/data/external-data/gnomad/gnomad-cov-keyed.ht')

mt = mt.filter_rows(hl.is_defined(rpmk[mt.locus]), keep=False)
mt = mt.filter_rows(hl.is_defined(cov[mt.locus]), keep=True)

# Cohort label derived from the sample id: ids containing 'B' -> 'sport',
# ids containing 'HG' or 'NA' -> '1kg', everything else -> 'GTS'.
cohort_label = hl.if_else(
    mt.s.contains('B'),
    'sport',
    hl.if_else(
        mt.s.contains('HG'),
        '1kg',
        hl.if_else(mt.s.contains('NA'), '1kg', 'GTS'),
    ),
)
mt = mt.annotate_cols(group=cohort_label)

# Per-variant QC aggregates, each keyed by cohort label.
mt = mt.annotate_rows(
    dp_qc=hl.agg.group_by(mt.group, hl.agg.stats(mt.DP)),
    gq_qc=hl.agg.group_by(mt.group, hl.agg.stats(mt.GQ)),
    hwe=hl.agg.group_by(mt.group, hl.agg.hardy_weinberg_test(mt.GT)),
    n_below_dp_3=hl.agg.group_by(mt.group, hl.agg.count_where(mt.DP < 3)),
    n_below_gq_30=hl.agg.group_by(mt.group, hl.agg.count_where(mt.GQ < 30)),
)

# The target SNP is set aside *before* the QC filter below, so it is carried
# forward whether or not it passes QC.
snp1 = mt.filter_rows(mt.locus.position == 112913924)

# Fallbacks used when a cohort has no entry in a QC dict; every fallback value
# satisfies its own threshold, so a missing cohort never removes a variant.
depth_default = hl.struct(mean=6.0, stdev=0.0, min=0.0, max=0.0, n=0, sum=0.0)
gq_default = hl.struct(mean=60.0, stdev=0.0, min=0.0, max=0.0, n=0, sum=0.0)
hwe_default = hl.struct(het_freq_hwe=0.0, p_value=0.5)

# Keep variants with mean DP > 5, mean GQ > 50, HWE p > 0.05, fewer than 3 calls
# with DP < 3 and fewer than 30 calls with GQ < 30 -- in both GTS and sport.
passes_qc = (
    (mt.dp_qc.get('GTS', depth_default).mean > 5)
    & (mt.dp_qc.get('sport', depth_default).mean > 5)
    & (mt.gq_qc.get('GTS', gq_default).mean > 50)
    & (mt.gq_qc.get('sport', gq_default).mean > 50)
    & (mt.hwe.get('GTS', hwe_default).p_value > 0.05)
    & (mt.hwe.get('sport', hwe_default).p_value > 0.05)
    & (mt.n_below_dp_3.get('sport', 0) < 3)
    & (mt.n_below_gq_30.get('sport', 0) < 30)
    & (mt.n_below_dp_3.get('GTS', 0) < 3)
    & (mt.n_below_gq_30.get('GTS', 0) < 30)
)
mt = mt.filter_rows(passes_qc)

### At this stage we have (1) the single target SNP (`snp1`) and (2) the other, QC-filtered variants in `mt`. First we select the unrelated individuals from the GTS cohort:

# Sample ids of the healthy, unrelated GTS individuals that serve as controls.
healthy_unrelated = ['S_7212', 'S_7213','S_7227','S_7255','S_7237','S_7245','S_7246','S_7229','S_7254','WGS_147c','S_7261','S_7263','S_7269','S_7274','S_7294','S_7306','WGS_37b','WGS_37c','WGS_85b','WGS_7118',
'WGS_7120','WGS_7142','WGS_7143','WGS_7152','WGS_7153','WGS_163d','WGS_180b','WGS_6819','WGS_D6813','WGS_D6815','462','468','475','476','477','478','479','482','490','492','494']

# Keep sportsmen (ids containing 'B') plus the listed controls; sample 'B454' is
# excluded explicitly (the reason is not recorded in this notebook).
mt = mt.filter_cols(((mt.s.contains('B')) | hl.literal(healthy_unrelated).contains(mt.s)))
mt = mt.filter_cols(mt.s == 'B454', keep = False)

# Drop variants with no non-reference call among the remaining samples.
mt = mt.filter_rows(hl.agg.any(mt.GT.is_non_ref()))

mt.write('/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/preprocessing/joint-with-gts/mct1-selected-4.mt')

# Apply the same sample selection to the target SNP so both tables share columns.
snp1 = snp1.filter_cols(((snp1.s.contains('B')) | hl.literal(healthy_unrelated).contains(snp1.s)))
snp1 = snp1.filter_cols(snp1.s == 'B454', keep = False)

# Fix: the count was computed mid-cell and thrown away; print it so the sanity
# check (rows, columns of the target-SNP table) is actually visible.
print(snp1.count())

mt = hl.read_matrix_table('/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/preprocessing/joint-with-gts/mct1-selected-4.mt')

# Re-attach the target SNP; if it also passed QC it is now present twice, which the
# later distinct_by_row() call in the annotation step removes.
mt = mt.union_rows(snp1)

# Fix: checkpoint() returns the table backed by the file it just wrote. The result
# was discarded, so downstream cells kept re-running the whole lazy pipeline.
mt = mt.checkpoint('/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/preprocessing/joint-with-gts/mct1-joined.mt')

### annotate matrix table with important info:

In [None]:
# Build a gene table keyed by genomic interval, carrying the gene symbol and the
# list of HPO terms attached to that symbol.
genes = hl.read_table('/net/archive/groups/plggneuromol/imdik-zekanowski-gts/data/external-data/genecode_v32.ht')
# Discard transcripts on contigs that are not part of GRCh38.
genes = genes.filter(hl.is_valid_contig(genes['hg38.knownGene.chrom'], reference_genome='GRCh38'))
# Header-less two-column file: f0 is used as the gene symbol key, f1 as the HPO value.
hpo = hl.import_table('/net/archive/groups/plggneuromol/imdik-zekanowski-gts/data/external-data/hpo.tsv', impute = True, no_header=True)

start = genes['hg38.knownGene.txStart']
stop =  genes['hg38.knownGene.txEnd']

# Fix: hl.locus_interval defaults to includes_end=False, so together with
# includes_start=False the interval was (txStart, txEnd) with both ends excluded.
# includes_start=False already treats txStart as a UCSC 0-based start; under that
# same convention txEnd is the 1-based position of the last transcribed base, so
# the end must be included or that base is never annotated with its gene.
# NOTE(review): assumes the table holds unmodified UCSC knownGene coordinates.
genes = genes.transmute(interval =
                        hl.locus_interval(genes['hg38.knownGene.chrom'],
                                          start,
                                          stop,
                                          reference_genome='GRCh38',
                                          includes_start=False,
                                          includes_end=True))

# Key by symbol only long enough to look up the HPO rows, then key by interval so
# loci can be joined against the table.
genes = genes.key_by(genes['hg38.kgXref.geneSymbol'])

hpo = hpo.key_by(hpo.f0)

genes = genes.annotate(hpo = hpo.index(genes['hg38.kgXref.geneSymbol'], all_matches = True)['f1'])
genes = genes.key_by(genes.interval)

# External annotation resources.
gnomad = hl.read_table('/net/archive/groups/plggneuromol/ifpan-gosborcz-ukb/raw/gnomad/gnomad.genomes.v3.1.1.sites.ht/')
cadd = hl.read_table('/net/archive/groups/plggneuromol/imdik-zekanowski-gts/data/external-data/cadd-full.ht')
vep = hl.read_table('/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/data/external-data/vep38/grch38_context_vep_annotated.ht')
sport_pheno = hl.import_table(
    '/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/data/external-data/sporstmen-pheno.csv',
    impute=True,
    key='sample_id',
    quote="\"",
)
poles = hl.read_matrix_table('/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/data/external-data/polish-genomes/polish-genomes.mt')

# Remove duplicated rows (the target SNP may have been unioned in twice), re-key by
# (locus, alleles) and split multi-allelic sites.
mt = mt.distinct_by_row()
mt = mt.key_rows_by('locus', 'alleles')
mt = hl.split_multi_hts(mt)

mt = mt.annotate_rows(gnomad_v_3_1=gnomad[mt.row_key])

# All gene intervals overlapping the locus; the symbols and the HPO lists are
# de-duplicated through a set and stored as arrays.
overlapping_genes = genes.index(mt.locus, all_matches=True)
mt = mt.annotate_rows(within_gene=hl.array(hl.set(overlapping_genes['hg38.kgXref.geneSymbol'])))
overlapping_genes = genes.index(mt.locus, all_matches=True)
mt = mt.annotate_rows(hpo=hl.array(hl.set(overlapping_genes['hpo'])))

mt = mt.annotate_rows(cadd=cadd[mt.row_key])
mt = mt.annotate_rows(vep=vep[mt.row_key])
# Phenotype record per sample, looked up by sample id.
mt = mt.annotate_cols(sport_phenotypes=sport_pheno[mt.s])
# 'info' struct of the matching row in the Polish-genomes matrix table.
polish_rows = poles.rows()
mt = mt.annotate_rows(polish_af=polish_rows[mt.row_key]['info'])

mt.write('/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/results/hail-mts/mct1.mt')

mt = hl.read_matrix_table('/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/results/hail-mts/mct1.mt')

# Re-label cohorts for the association tests: same rules as the QC stage, except
# that the fall-through label is 'control' instead of 'GTS'.
cohort_label = hl.if_else(
    mt.s.contains('B'),
    'sport',
    hl.if_else(
        mt.s.contains('HG'),
        '1kg',
        hl.if_else(mt.s.contains('NA'), '1kg', 'control'),
    ),
)
mt = mt.annotate_cols(group=cohort_label)

# Genotype class counts per variant, keyed by cohort ...
mt = mt.annotate_rows(
    het_non_refs=hl.agg.group_by(mt.group, hl.agg.count_where(mt.GT.is_het())),
    hom_refs=hl.agg.group_by(mt.group, hl.agg.count_where(mt.GT.is_hom_ref())),
    hom_non_refs=hl.agg.group_by(mt.group, hl.agg.count_where(mt.GT.is_hom_var())),
)
# ... and keyed by sport phenotype type ('endurance' / 'speed' are the keys read below).
mt = mt.annotate_rows(
    het_non_refs_e_vs_s=hl.agg.group_by(mt.sport_phenotypes.type, hl.agg.count_where(mt.GT.is_het())),
    hom_refs_e_vs_s=hl.agg.group_by(mt.sport_phenotypes.type, hl.agg.count_where(mt.GT.is_hom_ref())),
    hom_non_refs_e_vs_s=hl.agg.group_by(mt.sport_phenotypes.type, hl.agg.count_where(mt.GT.is_hom_var())),
)

het_by_group, hom_ref_by_group, hom_var_by_group = mt.het_non_refs, mt.hom_refs, mt.hom_non_refs
het_by_type, hom_ref_by_type, hom_var_by_type = (
    mt.het_non_refs_e_vs_s, mt.hom_refs_e_vs_s, mt.hom_non_refs_e_vs_s)

# Allele counts: heterozygotes contribute one allele, homozygotes two. Cast to
# int32 before they are passed to hl.fisher_exact_test.
sport_AC_non_ref = hl.int32(het_by_group.get('sport', 0) + (hom_var_by_group.get('sport', 0) * 2))
sport_AC_ref = hl.int32(het_by_group.get('sport', 0) + (hom_ref_by_group.get('sport', 0) * 2))
control_AC_non_ref = hl.int32(het_by_group.get('control', 0) + (hom_var_by_group.get('control', 0) * 2))
control_AC_ref = hl.int32(het_by_group.get('control', 0) + (hom_ref_by_group.get('control', 0) * 2))
speed_AC_non_ref = hl.int32(het_by_type.get('speed', 0) + (hom_var_by_type.get('speed', 0) * 2))
speed_AC_ref = hl.int32(het_by_type.get('speed', 0) + (hom_ref_by_type.get('speed', 0) * 2))
endurance_AC_non_ref = hl.int32(het_by_type.get('endurance', 0) + (hom_var_by_type.get('endurance', 0) * 2))
endurance_AC_ref = hl.int32(het_by_type.get('endurance', 0) + (hom_ref_by_type.get('endurance', 0) * 2))

# NOTE(review): index 2 of the gnomAD `freq` array is treated as the NFE stratum by
# the field names used at export (gnomad_v3_nfe_*); confirm the index against the
# gnomAD table's frequency index metadata.
gnomad_freq = mt.gnomad_v_3_1.freq
gnomad_AC_non_ref = gnomad_freq.AC[2]
gnomad_AC_ref = gnomad_freq.AN[2] - gnomad_freq.AC[2]

# NOTE(review): AC[0] presumably assumes the Polish-genomes rows are bi-allelic
# (one alternate allele per row) -- verify against that dataset.
polish_info = mt.polish_af
polish_AC_non_ref = hl.int32(polish_info.AC[0])
polish_AC_ref = hl.int32(polish_info.AN - polish_info.AC[0])

# Fisher exact tests on 2x2 tables of (non-ref, ref) allele counts.
mt = mt.annotate_rows(
    fisher_sport_vs_control=hl.fisher_exact_test(
        sport_AC_non_ref, sport_AC_ref, control_AC_non_ref, control_AC_ref),
    fisher_e_vs_s=hl.fisher_exact_test(
        speed_AC_non_ref, speed_AC_ref, endurance_AC_non_ref, endurance_AC_ref),
    fisher_sport_vs_gnomad=hl.fisher_exact_test(
        sport_AC_non_ref, sport_AC_ref, gnomad_AC_non_ref, gnomad_AC_ref),
    fisher_sport_vs_polish=hl.fisher_exact_test(
        sport_AC_non_ref, sport_AC_ref, polish_AC_non_ref, polish_AC_ref),
)

# Bookkeeping fields added by split_multi_hts are not needed in the export.
mt = mt.drop('a_index', 'was_split')

In [None]:
# Reshape the row annotations into the flat set of fields that gets exported.
# transmute_rows drops every source field referenced on the right-hand side (the QC
# dicts, gnomAD/CADD/VEP structs and genotype-count dicts) unless it is re-assigned
# under the same name.
#
# Fix: the homozygous-variant counts for controls and for speed athletes were
# emitted as `how_var_controls` / `how_var_speed` (typo for `hom_var_*`), breaking
# the naming pattern of the neighbouring hom_var_sport / hom_var_endurance columns.
#
# The `hwe` dict was computed at the QC stage, where the control cohort was still
# labelled 'GTS', hence the 'GTS' key for hwe_controls_p_value.
mt = mt.transmute_rows(DP_stats = mt.dp_qc,
                             GQ_stats = mt.gq_qc, 
                             hwe_controls_p_value = mt.hwe.get('GTS', hl.struct(het_freq_hwe=0.0, p_value=1)).p_value,
                             hwe_sport_p_value = mt.hwe.get('sport', hl.struct(het_freq_hwe=0.0, p_value=1)).p_value,
                             # NOTE(review): index 2 is assumed to be the NFE stratum -- confirm.
                             gnomad_v3_nfe_af = mt.gnomad_v_3_1.freq.AF[2],
                             gnomad_v3_nfe_homozygote_count = mt.gnomad_v_3_1.freq.homozygote_count[2],
                             cadd = mt.cadd.cadd_score,
                             within_gene = mt.within_gene,
                             hpo = mt.hpo,
                             fisher_sport_vs_control  = mt.fisher_sport_vs_control,
                             fisher_e_vs_s = mt.fisher_e_vs_s ,
                             fisher_sport_vs_gnomad  = mt.fisher_sport_vs_gnomad,
                             fisher_sport_vs_polish = mt.fisher_sport_vs_polish,                         
                             het_sport = mt.het_non_refs.get('sport', 0),
                             het_controls = mt.het_non_refs.get('control', 0),
                             hom_ref_sport = mt.hom_refs.get('sport', 0),
                             hom_ref_controls = mt.hom_refs.get('control', 0),
                             hom_var_sport = mt.hom_non_refs.get('sport', 0),
                             hom_var_controls = mt.hom_non_refs.get('control',0),
                             het_endurance = mt.het_non_refs_e_vs_s.get('endurance', 0),
                             het_speed = mt.het_non_refs_e_vs_s.get('speed', 0),
                             hom_ref_endurance = mt.hom_refs_e_vs_s.get('endurance', 0),
                             hom_ref_speed = mt.hom_refs_e_vs_s.get('speed', 0),
                             hom_var_endurance = mt.hom_non_refs_e_vs_s.get('endurance', 0),
                             hom_var_speed = mt.hom_non_refs_e_vs_s.get('speed',0),
                             most_severe_consequence = mt.vep.vep.most_severe_consequence,
                             transcript_consequences = mt.vep.vep.transcript_consequences,
                             intergenic_consequences = mt.vep.vep.intergenic_consequences,
                             motif_feature_consequences = mt.vep.vep.motif_feature_consequences,
                             regulatory_feature_consequences = mt.vep.vep.regulatory_feature_consequences,
                             polish_af = mt.polish_af)

# Only the genotype call is kept per sample.
mt = mt.select_entries(mt.GT)

# From here on `mt` is a Hail Table (one row per variant, with the per-sample
# entry fields turned into columns), even though the output path ends in '.mt'.
mt = mt.make_table()
mt.write('/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/preprocessing/joint-with-gts/mct1-to-export7.mt')

In [67]:
# Reload the wide per-variant export table. It is a Hail Table (written after
# make_table()), hence read_table despite the '.mt' suffix in the path.
mt = hl.read_table('/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/preprocessing/joint-with-gts/mct1-to-export7.mt')

In [None]:
                 # Stray fragment removed. This cell held a pasted copy of the genotype-count
                 # keyword arguments (het_sport ... hom_var_speed) from the transmute_rows call
                 # that builds the export table. On its own it was not valid Python (unexpected
                 # indentation, dangling keyword arguments), and the fields it referenced
                 # (het_non_refs, hom_refs, hom_non_refs and their *_e_vs_s variants) no longer
                 # exist once that transmute has run. The counts are already in the export table.

In [60]:
# Flatten nested struct fields into top-level columns and write the result twice:
# once to the shared results directory and once next to the notebook.
# Note: Table.export writes tab-delimited text unless `delimiter` is passed, so
# these '.csv' files are tab-separated.
to_export = mt.flatten()
for _mct1_export_path in (
    '/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/results/hail-mts/mct1-single.csv',
    'mct1-single.csv',
):
    to_export.export(_mct1_export_path)

2021-12-06 12:22:57 Hail: INFO: merging 6 files totalling 641.5K...
2021-12-06 12:22:57 Hail: INFO: while writing:
    /net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/results/hail-mts/mct1-single.csv
  merge time: 21.888ms
2021-12-06 12:22:57 Hail: INFO: merging 6 files totalling 641.5K...
2021-12-06 12:22:57 Hail: INFO: while writing:
    mct1-single.csv
  merge time: 19.666ms


## burden analysis mct1

In [2]:
# CADD-weighted variant burden per sample for the MCT1 region, compared between
# cohorts and between sport phenotype types.
#
# Changes relative to the previous version of this cell:
#   * intermediate results are passed to display(); as bare expressions in the middle
#     of a cell they were computed and silently discarded (only the last one showed);
#   * the per-sample values are fetched with one collect() instead of three separate
#     jobs, so the three arrays come from the same rows by construction;
#   * `stats` comes from the notebook's top import cell instead of a mid-cell import.
#
# NOTE(review): the annotated MCT1 table is written to .../results/hail-mts/mct1.mt
# earlier in this notebook, while this cell reads .../results/snp-results/mct1.mt --
# confirm that this is the intended copy.
mt = hl.read_matrix_table('/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/results/snp-results/mct1.mt')

# Entry-level burden: (number of alternate alleles) x (CADD score) for non-reference
# calls, 0 otherwise.
mt = mt.annotate_entries(burden_entry = hl.if_else(mt.GT.is_non_ref(), ((mt.GT.n_alt_alleles())*(mt.cadd.cadd_score)), 0))

# Sample-level burden: sum over all variants.
mt = mt.annotate_cols(burden_all = hl.agg.sum(mt.burden_entry))

columns = mt.cols()

# Summary statistics of the burden per cohort and per sport phenotype type.
display(columns.aggregate(hl.agg.group_by(columns.group, hl.agg.stats(columns.burden_all))))

display(columns.aggregate(hl.agg.group_by(columns.sport_phenotypes.type, hl.agg.stats(columns.burden_all))))

# One pass over the column table; the arrays below are row-aligned.
col_records = columns.select(
    'group', 'burden_all', pheno_type=columns.sport_phenotypes.type
).collect()
group = np.array([rec.group for rec in col_records])
pheno = np.array([rec.pheno_type for rec in col_records])
burden_all_test = np.array([rec.burden_all for rec in col_records])

# Two-sample t-tests (scipy's default, equal_var=True): sport vs GTS controls ...
display(stats.ttest_ind(burden_all_test[group == 'sport'], burden_all_test[group == 'GTS']))

# ... and endurance vs speed athletes.
stats.ttest_ind(burden_all_test[pheno == 'endurance'], burden_all_test[pheno == 'speed'])

##  NRF-2
chr15:50,275,389-50,359,306 (chr15:50329637)


In [35]:
# Cut the NRF2 region (chr15:50,275,389-50,359,306) out of the dense chr15 matrix
# table, store it with 5 partitions, and continue from the stored copy.
_nrf2_region_path = '/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/preprocessing/joint-with-gts/nrf.mt'

chr15 = hl.read_matrix_table('/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/data/joint/dense-all_chr15.mt')

_nrf2_interval = hl.parse_locus_interval('chr15:50275389-50359306', reference_genome='GRCh38')
nrf2 = hl.filter_intervals(chr15, [_nrf2_interval]).naive_coalesce(5)
nrf2.write(_nrf2_region_path)

mt = hl.read_matrix_table(_nrf2_region_path)

# Locus-level masks: rows found in the RepeatMasker table are removed; rows are
# kept only if the gnomAD coverage table has over_1 > 0.9 at that locus.
rpmk = hl.read_table('/net/archive/groups/plggneuromol/imdik-zekanowski-gts/data/external-data/repeatmasker-extended-keyed.ht')
cov = hl.read_table('/net/archive/groups/plggneuromol/ifpan-gosborcz-ukb/raw/gnomad/gnomad.genomes.r3.0.1.coverage-repartitioned.ht')

cov = cov.filter(cov.over_1 > 0.9)

mt = mt.filter_rows(hl.is_defined(rpmk[mt.locus]), keep=False)
mt = mt.filter_rows(hl.is_defined(cov[mt.locus]), keep=True)

# Cohort label derived from the sample id: ids containing 'B' -> 'sport',
# ids containing 'HG' or 'NA' -> '1kg', everything else -> 'GTS'.
cohort_label = hl.if_else(
    mt.s.contains('B'),
    'sport',
    hl.if_else(
        mt.s.contains('HG'),
        '1kg',
        hl.if_else(mt.s.contains('NA'), '1kg', 'GTS'),
    ),
)
mt = mt.annotate_cols(group=cohort_label)

# Per-variant QC aggregates, each keyed by cohort label.
mt = mt.annotate_rows(
    dp_qc=hl.agg.group_by(mt.group, hl.agg.stats(mt.DP)),
    gq_qc=hl.agg.group_by(mt.group, hl.agg.stats(mt.GQ)),
    hwe=hl.agg.group_by(mt.group, hl.agg.hardy_weinberg_test(mt.GT)),
    n_below_dp_3=hl.agg.group_by(mt.group, hl.agg.count_where(mt.DP < 3)),
    n_below_gq_30=hl.agg.group_by(mt.group, hl.agg.count_where(mt.GQ < 30)),
)

# The target SNP is set aside *before* the QC filter below, so it is carried
# forward whether or not it passes QC.
snp1 = mt.filter_rows(mt.locus.position == 50329637)

# Fallbacks used when a cohort has no entry in a QC dict; every fallback value
# satisfies its own threshold, so a missing cohort never removes a variant.
depth_default = hl.struct(mean=6.0, stdev=0.0, min=0.0, max=0.0, n=0, sum=0.0)
gq_default = hl.struct(mean=60.0, stdev=0.0, min=0.0, max=0.0, n=0, sum=0.0)
hwe_default = hl.struct(het_freq_hwe=0.0, p_value=0.5)

# Keep variants with mean DP > 5, mean GQ > 50, HWE p > 0.05, fewer than 3 calls
# with DP < 3 and fewer than 30 calls with GQ < 30 -- in both GTS and sport.
passes_qc = (
    (mt.dp_qc.get('GTS', depth_default).mean > 5)
    & (mt.dp_qc.get('sport', depth_default).mean > 5)
    & (mt.gq_qc.get('GTS', gq_default).mean > 50)
    & (mt.gq_qc.get('sport', gq_default).mean > 50)
    & (mt.hwe.get('GTS', hwe_default).p_value > 0.05)
    & (mt.hwe.get('sport', hwe_default).p_value > 0.05)
    & (mt.n_below_dp_3.get('sport', 0) < 3)
    & (mt.n_below_gq_30.get('sport', 0) < 30)
    & (mt.n_below_dp_3.get('GTS', 0) < 3)
    & (mt.n_below_gq_30.get('GTS', 0) < 30)
)
mt = mt.filter_rows(passes_qc)

### At this stage we have (1) the single target SNP (`snp1`) and (2) the other, QC-filtered variants in `mt`. First we select the unrelated individuals from the GTS cohort:

# Sample ids of the healthy, unrelated GTS individuals that serve as controls.
healthy_unrelated = ['S_7212', 'S_7213','S_7227','S_7255','S_7237','S_7245','S_7246','S_7229','S_7254','WGS_147c','S_7261','S_7263','S_7269','S_7274','S_7294','S_7306','WGS_37b','WGS_37c','WGS_85b','WGS_7118',
'WGS_7120','WGS_7142','WGS_7143','WGS_7152','WGS_7153','WGS_163d','WGS_180b','WGS_6819','WGS_D6813','WGS_D6815','462','468','475','476','477','478','479','482','490','492','494']

# Keep sportsmen (ids containing 'B') plus the listed controls; sample 'B454' is
# excluded explicitly (the reason is not recorded in this notebook).
mt = mt.filter_cols(((mt.s.contains('B')) | hl.literal(healthy_unrelated).contains(mt.s)))
mt = mt.filter_cols(mt.s == 'B454', keep = False)

# Drop variants with no non-reference call among the remaining samples.
mt = mt.filter_rows(hl.agg.any(mt.GT.is_non_ref()))

# Fix: checkpoint() returns the table backed by the file it just wrote. The result
# was discarded, so the union below re-ran the whole lazy QC pipeline.
mt = mt.checkpoint('/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/preprocessing/joint-with-gts/nrf2-selected-4.mt')

# Apply the same sample selection to the target SNP so both tables share columns.
snp1 = snp1.filter_cols(((snp1.s.contains('B')) | hl.literal(healthy_unrelated).contains(snp1.s)))
snp1 = snp1.filter_cols(snp1.s == 'B454', keep = False)

# Fix: the count was computed mid-cell and thrown away; print it so the sanity
# check (rows, columns of the target-SNP table) is actually visible.
print(snp1.count())

# Re-attach the target SNP; if it also passed QC it is now present twice, which the
# later distinct_by_row() call in the annotation step removes.
mt = mt.union_rows(snp1)

# Same fix as above: keep the checkpointed table.
mt = mt.checkpoint('/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/preprocessing/joint-with-gts/nrf2-joined-2.mt')

### annotate matrix table with important info:

In [92]:
# Build a gene table keyed by genomic interval, carrying the gene symbol and the
# list of HPO terms attached to that symbol.
genes = hl.read_table('/net/archive/groups/plggneuromol/imdik-zekanowski-gts/data/external-data/genecode_v32.ht')
# Discard transcripts on contigs that are not part of GRCh38.
genes = genes.filter(hl.is_valid_contig(genes['hg38.knownGene.chrom'], reference_genome='GRCh38'))
# Header-less two-column file: f0 is used as the gene symbol key, f1 as the HPO value.
hpo = hl.import_table('/net/archive/groups/plggneuromol/imdik-zekanowski-gts/data/external-data/hpo.tsv', impute = True, no_header=True)

start = genes['hg38.knownGene.txStart']
stop =  genes['hg38.knownGene.txEnd']

# Fix: hl.locus_interval defaults to includes_end=False, so together with
# includes_start=False the interval was (txStart, txEnd) with both ends excluded.
# includes_start=False already treats txStart as a UCSC 0-based start; under that
# same convention txEnd is the 1-based position of the last transcribed base, so
# the end must be included or that base is never annotated with its gene.
# NOTE(review): assumes the table holds unmodified UCSC knownGene coordinates.
genes = genes.transmute(interval =
                        hl.locus_interval(genes['hg38.knownGene.chrom'],
                                          start,
                                          stop,
                                          reference_genome='GRCh38',
                                          includes_start=False,
                                          includes_end=True))

# Key by symbol only long enough to look up the HPO rows, then key by interval so
# loci can be joined against the table.
genes = genes.key_by(genes['hg38.kgXref.geneSymbol'])

hpo = hpo.key_by(hpo.f0)

genes = genes.annotate(hpo = hpo.index(genes['hg38.kgXref.geneSymbol'], all_matches = True)['f1'])
genes = genes.key_by(genes.interval)

# External annotation resources.
gnomad = hl.read_table('/net/archive/groups/plggneuromol/ifpan-gosborcz-ukb/raw/gnomad/gnomad.genomes.v3.1.1.sites.ht/')
cadd = hl.read_table('/net/archive/groups/plggneuromol/imdik-zekanowski-gts/data/external-data/cadd-full.ht')
vep = hl.read_table('/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/data/external-data/vep38/grch38_context_vep_annotated.ht')
sport_pheno = hl.import_table(
    '/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/data/external-data/sporstmen-pheno.csv',
    impute=True,
    key='sample_id',
    quote="\"",
)
poles = hl.read_matrix_table('/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/data/external-data/polish-genomes/polish-genomes.mt')

# Remove duplicated rows (the target SNP may have been unioned in twice), re-key by
# (locus, alleles) and split multi-allelic sites.
mt = mt.distinct_by_row()
mt = mt.key_rows_by('locus', 'alleles')
mt = hl.split_multi_hts(mt)

mt = mt.annotate_rows(gnomad_v_3_1=gnomad[mt.row_key])

# All gene intervals overlapping the locus; the symbols and the HPO lists are
# de-duplicated through a set and stored as arrays.
overlapping_genes = genes.index(mt.locus, all_matches=True)
mt = mt.annotate_rows(within_gene=hl.array(hl.set(overlapping_genes['hg38.kgXref.geneSymbol'])))
overlapping_genes = genes.index(mt.locus, all_matches=True)
mt = mt.annotate_rows(hpo=hl.array(hl.set(overlapping_genes['hpo'])))

mt = mt.annotate_rows(cadd=cadd[mt.row_key])
mt = mt.annotate_rows(vep=vep[mt.row_key])
# Phenotype record per sample, looked up by sample id.
mt = mt.annotate_cols(sport_phenotypes=sport_pheno[mt.s])
# 'info' struct of the matching row in the Polish-genomes matrix table.
polish_rows = poles.rows()
mt = mt.annotate_rows(polish_af=polish_rows[mt.row_key]['info'])

# Persist the annotated table and continue from the stored copy.
_nrf2_annotated_path = '/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/results/hail-mts/nrf2-2.mt'
mt.write(_nrf2_annotated_path)

mt = hl.read_matrix_table(_nrf2_annotated_path)

2021-12-03 23:11:00 Hail: INFO: Reading table to impute column types
2021-12-03 23:11:00 Hail: INFO: Finished type imputation
  Loading field 'f0' as type str (imputed)
  Loading field 'f1' as type str (imputed)
2021-12-03 23:11:01 Hail: INFO: Reading table to impute column types
2021-12-03 23:11:02 Hail: INFO: Finished type imputation
  Loading field 'sport' as type str (imputed)
  Loading field 'type' as type str (imputed)
  Loading field 'age' as type int32 (imputed)
  Loading field 'sample_id' as type str (imputed)


In [14]:
# Dimensions of the annotated NRF2 matrix table as (n_rows, n_cols), i.e.
# (variants, samples).
mt.count()

(180, 142)

In [None]:
# Re-label cohorts for the association tests: same rules as the QC stage, except
# that the fall-through label is 'control' instead of 'GTS'.
cohort_label = hl.if_else(
    mt.s.contains('B'),
    'sport',
    hl.if_else(
        mt.s.contains('HG'),
        '1kg',
        hl.if_else(mt.s.contains('NA'), '1kg', 'control'),
    ),
)
mt = mt.annotate_cols(group=cohort_label)

# Genotype class counts per variant, keyed by cohort ...
mt = mt.annotate_rows(
    het_non_refs=hl.agg.group_by(mt.group, hl.agg.count_where(mt.GT.is_het())),
    hom_refs=hl.agg.group_by(mt.group, hl.agg.count_where(mt.GT.is_hom_ref())),
    hom_non_refs=hl.agg.group_by(mt.group, hl.agg.count_where(mt.GT.is_hom_var())),
)
# ... and keyed by sport phenotype type ('endurance' / 'speed' are the keys read below).
mt = mt.annotate_rows(
    het_non_refs_e_vs_s=hl.agg.group_by(mt.sport_phenotypes.type, hl.agg.count_where(mt.GT.is_het())),
    hom_refs_e_vs_s=hl.agg.group_by(mt.sport_phenotypes.type, hl.agg.count_where(mt.GT.is_hom_ref())),
    hom_non_refs_e_vs_s=hl.agg.group_by(mt.sport_phenotypes.type, hl.agg.count_where(mt.GT.is_hom_var())),
)

het_by_group, hom_ref_by_group, hom_var_by_group = mt.het_non_refs, mt.hom_refs, mt.hom_non_refs
het_by_type, hom_ref_by_type, hom_var_by_type = (
    mt.het_non_refs_e_vs_s, mt.hom_refs_e_vs_s, mt.hom_non_refs_e_vs_s)

# Allele counts: heterozygotes contribute one allele, homozygotes two. Cast to
# int32 before they are passed to hl.fisher_exact_test.
sport_AC_non_ref = hl.int32(het_by_group.get('sport', 0) + (hom_var_by_group.get('sport', 0) * 2))
sport_AC_ref = hl.int32(het_by_group.get('sport', 0) + (hom_ref_by_group.get('sport', 0) * 2))
control_AC_non_ref = hl.int32(het_by_group.get('control', 0) + (hom_var_by_group.get('control', 0) * 2))
control_AC_ref = hl.int32(het_by_group.get('control', 0) + (hom_ref_by_group.get('control', 0) * 2))
speed_AC_non_ref = hl.int32(het_by_type.get('speed', 0) + (hom_var_by_type.get('speed', 0) * 2))
speed_AC_ref = hl.int32(het_by_type.get('speed', 0) + (hom_ref_by_type.get('speed', 0) * 2))
endurance_AC_non_ref = hl.int32(het_by_type.get('endurance', 0) + (hom_var_by_type.get('endurance', 0) * 2))
endurance_AC_ref = hl.int32(het_by_type.get('endurance', 0) + (hom_ref_by_type.get('endurance', 0) * 2))

# NOTE(review): index 2 of the gnomAD `freq` array is treated as the NFE stratum by
# the field names used at export (gnomad_v3_nfe_*); confirm the index against the
# gnomAD table's frequency index metadata.
gnomad_freq = mt.gnomad_v_3_1.freq
gnomad_AC_non_ref = gnomad_freq.AC[2]
gnomad_AC_ref = gnomad_freq.AN[2] - gnomad_freq.AC[2]

# NOTE(review): AC[0] presumably assumes the Polish-genomes rows are bi-allelic
# (one alternate allele per row) -- verify against that dataset.
polish_info = mt.polish_af
polish_AC_non_ref = hl.int32(polish_info.AC[0])
polish_AC_ref = hl.int32(polish_info.AN - polish_info.AC[0])

# Fisher exact tests on 2x2 tables of (non-ref, ref) allele counts.
mt = mt.annotate_rows(
    fisher_sport_vs_control=hl.fisher_exact_test(
        sport_AC_non_ref, sport_AC_ref, control_AC_non_ref, control_AC_ref),
    fisher_e_vs_s=hl.fisher_exact_test(
        speed_AC_non_ref, speed_AC_ref, endurance_AC_non_ref, endurance_AC_ref),
    fisher_sport_vs_gnomad=hl.fisher_exact_test(
        sport_AC_non_ref, sport_AC_ref, gnomad_AC_non_ref, gnomad_AC_ref),
    fisher_sport_vs_polish=hl.fisher_exact_test(
        sport_AC_non_ref, sport_AC_ref, polish_AC_non_ref, polish_AC_ref),
)

# Bookkeeping fields added by split_multi_hts are not needed in the export.
mt = mt.drop('a_index', 'was_split')

# Reshape the row annotations into the flat set of fields that gets exported.
# transmute_rows drops every source field referenced on the right-hand side (the QC
# dicts, gnomAD/CADD/VEP structs and genotype-count dicts) unless it is re-assigned
# under the same name.
#
# Fix: the homozygous-variant counts for controls and for speed athletes were
# emitted as `how_var_controls` / `how_var_speed` (typo for `hom_var_*`), breaking
# the naming pattern of the neighbouring hom_var_sport / hom_var_endurance columns.
#
# The `hwe` dict was computed at the QC stage, where the control cohort was still
# labelled 'GTS', hence the 'GTS' key for hwe_controls_p_value.
mt = mt.transmute_rows(DP_stats = mt.dp_qc,
                             GQ_stats = mt.gq_qc, 
                             hwe_controls_p_value = mt.hwe.get('GTS', hl.struct(het_freq_hwe=0.0, p_value=1)).p_value,
                             hwe_sport_p_value = mt.hwe.get('sport', hl.struct(het_freq_hwe=0.0, p_value=1)).p_value,
                             # NOTE(review): index 2 is assumed to be the NFE stratum -- confirm.
                             gnomad_v3_nfe_af = mt.gnomad_v_3_1.freq.AF[2],
                             gnomad_v3_nfe_homozygote_count = mt.gnomad_v_3_1.freq.homozygote_count[2],
                             cadd = mt.cadd.cadd_score,
                             within_gene = mt.within_gene,
                             hpo = mt.hpo,
                             fisher_sport_vs_control  = mt.fisher_sport_vs_control,
                             fisher_e_vs_s = mt.fisher_e_vs_s ,
                             fisher_sport_vs_gnomad  = mt.fisher_sport_vs_gnomad,
                             fisher_sport_vs_polish = mt.fisher_sport_vs_polish,                         
                             het_sport = mt.het_non_refs.get('sport', 0),
                             het_controls = mt.het_non_refs.get('control', 0),
                             hom_ref_sport = mt.hom_refs.get('sport', 0),
                             hom_ref_controls = mt.hom_refs.get('control', 0),
                             hom_var_sport = mt.hom_non_refs.get('sport', 0),
                             hom_var_controls = mt.hom_non_refs.get('control',0),
                             het_endurance = mt.het_non_refs_e_vs_s.get('endurance', 0),
                             het_speed = mt.het_non_refs_e_vs_s.get('speed', 0),
                             hom_ref_endurance = mt.hom_refs_e_vs_s.get('endurance', 0),
                             hom_ref_speed = mt.hom_refs_e_vs_s.get('speed', 0),
                             hom_var_endurance = mt.hom_non_refs_e_vs_s.get('endurance', 0),
                             hom_var_speed = mt.hom_non_refs_e_vs_s.get('speed',0),
                             most_severe_consequence = mt.vep.vep.most_severe_consequence,
                             transcript_consequences = mt.vep.vep.transcript_consequences,
                             intergenic_consequences = mt.vep.vep.intergenic_consequences,
                             motif_feature_consequences = mt.vep.vep.motif_feature_consequences,
                             regulatory_feature_consequences = mt.vep.vep.regulatory_feature_consequences,
                             polish_af = mt.polish_af)

# Only the genotype call is kept per sample.
mt = mt.select_entries(mt.GT)

# From here on `mt` is a Hail Table (one row per variant, with the per-sample
# entry fields turned into columns), even though the output path ends in '.mt'.
mt = mt.make_table()
mt.write('/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/preprocessing/joint-with-gts/nrf2-to-export-.mt')

In [59]:
# Reload the wide per-variant NRF2 table (a Hail Table, despite the '.mt' suffix).
mt = hl.read_table('/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/preprocessing/joint-with-gts/nrf2-to-export-.mt')

# Flatten nested struct fields into top-level columns and write the result twice:
# once to the shared results directory and once next to the notebook.
# Note: Table.export writes tab-delimited text unless `delimiter` is passed, so
# these '.csv' files are tab-separated.
to_export = mt.flatten()
for _nrf2_export_path in (
    '/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/results/hail-mts/nrf2-single.csv',
    'nrf2-single.csv',
):
    to_export.export(_nrf2_export_path)

2021-12-06 12:22:24 Hail: INFO: merging 6 files totalling 1.6M...
2021-12-06 12:22:24 Hail: INFO: while writing:
    /net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/results/hail-mts/nrf2-single.csv
  merge time: 23.134ms
2021-12-06 12:22:25 Hail: INFO: merging 6 files totalling 1.6M...
2021-12-06 12:22:25 Hail: INFO: while writing:
    nrf2-single.csv
  merge time: 22.982ms


## burden analysis nrf2

In [2]:
# Reload the annotated NRF2 matrix table (the one written by the annotation step),
# which still carries per-sample genotypes, CADD scores and phenotype columns.
mt = hl.read_matrix_table('/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/results/hail-mts/nrf2-2.mt')

In [3]:
# Entry-level burden: (number of alternate alleles) x (CADD score of the variant)
# for non-reference calls, 0 otherwise.
mt = mt.annotate_entries(burden_entry = hl.if_else(mt.GT.is_non_ref(), ((mt.GT.n_alt_alleles())*(mt.cadd.cadd_score)), 0))

In [4]:
# Sample-level burden: sum of the entry-level burden over all variants.
mt = mt.annotate_cols(burden_all = hl.agg.sum(mt.burden_entry))

In [5]:
# Column (per-sample) table; as the Hail warning below states, it is sorted by the
# column key rather than kept in matrix-table column order.
columns = mt.cols()

2021-12-06 10:35:19 Hail: WARN: cols(): Resulting column table is sorted by 'col_key'.
    To preserve matrix table column order, first unkey columns with 'key_cols_by()'


In [16]:
# Summary statistics of the per-sample burden for each cohort label
# ('GTS' and 'sport' in the recorded output).
columns.aggregate(hl.agg.group_by(columns.group, hl.agg.stats(columns.burden_all)))

frozendict({'GTS': Struct(mean=158.22380487804878, stdev=96.92767397299689, min=0.0, max=339.37, n=41, sum=6487.176), 'sport': Struct(mean=170.28698019801982, stdev=99.26955702799451, min=1.726, max=382.05400000000003, n=101, sum=17198.985)})

In [17]:
# Summary statistics of the per-sample burden for each sport phenotype type; samples
# without a `type` value are grouped under the None key (n=41 in the recorded output).
columns.aggregate(hl.agg.group_by(columns.sport_phenotypes.type, hl.agg.stats(columns.burden_all)))

frozendict({'endurance': Struct(mean=172.9305416666667, stdev=101.70455621225425, min=4.116, max=382.05400000000003, n=48, sum=8300.666000000001), 'speed': Struct(mean=167.89281132075473, stdev=96.94936371448567, min=1.726, max=381.068, n=53, sum=8898.319000000001), None: Struct(mean=158.22380487804878, stdev=96.92767397299689, min=0.0, max=339.37, n=41, sum=6487.176)})

In [18]:
# Pull the per-sample labels and burden values into NumPy arrays for SciPy.
# Improvement: one collect() instead of three separate jobs, so the three arrays
# come from the same rows by construction and the table is evaluated only once.
col_records = columns.select(
    'group', 'burden_all', pheno_type=columns.sport_phenotypes.type
).collect()
group = np.array([rec.group for rec in col_records])
# Samples without a phenotype type contribute None here, as before.
pheno = np.array([rec.pheno_type for rec in col_records])
burden_all_test = np.array([rec.burden_all for rec in col_records])

2021-12-05 20:02:10 Hail: INFO: Coerced sorted dataset
2021-12-05 20:02:11 Hail: INFO: Coerced sorted dataset
2021-12-05 20:02:11 Hail: INFO: Coerced sorted dataset


In [19]:
from scipy import stats

In [20]:
# Two-sample t-test of burden, 'sport' vs 'GTS' group (SciPy defaults, i.e. equal variances assumed).
is_sport = group == 'sport'
is_gts = group == 'GTS'
stats.ttest_ind(burden_all_test[is_sport], burden_all_test[is_gts])

Ttest_indResult(statistic=0.6560199036456881, pvalue=0.5128880293578033)

In [21]:
# Two-sample t-test of burden, endurance vs speed athletes (SciPy defaults, i.e. equal variances assumed).
is_endurance = pheno == 'endurance'
is_speed = pheno == 'speed'
stats.ttest_ind(burden_all_test[is_endurance], burden_all_test[is_speed])

Ttest_indResult(statistic=0.25223951377962145, pvalue=0.8013788737234933)

## MYBPC3
chr11:47,331,406-47,352,702 (chr11:47333236)


In [None]:
# Cut the MYBPC3 region (chr11:47,331,406-47,352,702, GRCh38) out of the joint chr11 call set.
chr11 = hl.read_matrix_table('/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/data/joint/dense-all_chr11.mt')

# The region is written and read back through ONE explicit path. Previously the write used
# the relative path 'myb.mt' while the read used this absolute path, which only worked when
# the kernel's working directory happened to be preprocessing/joint-with-gts.
myb_region_path = '/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/preprocessing/joint-with-gts/myb.mt'

myb = hl.filter_intervals(chr11, [(hl.parse_locus_interval('chr11:47331406-47352702', reference_genome='GRCh38'))])
myb = myb.naive_coalesce(5)
myb.write(myb_region_path)

mt = hl.read_matrix_table(myb_region_path)

# Site-level masks: repeat-masked loci and the gnomAD coverage table restricted to over_1 > 0.9.
rpmk = hl.read_table('/net/archive/groups/plggneuromol/imdik-zekanowski-gts/data/external-data/repeatmasker-extended-keyed.ht')
cov = hl.read_table('/net/archive/groups/plggneuromol/ifpan-gosborcz-ukb/raw/gnomad/gnomad.genomes.r3.0.1.coverage.ht')
cov = cov.filter(cov.over_1 > 0.9)

# Keep loci that are absent from the repeat mask and present in the filtered coverage table.
mt = mt.filter_rows(hl.is_missing(rpmk[mt.locus]))
mt = mt.filter_rows(hl.is_defined(cov[mt.locus]))

# Cohort label derived from the sample ID: first matching substring wins, everything else is 'GTS'.
mt = mt.annotate_cols(
    group=(
        hl.case()
        .when(mt.s.contains('B'), 'sport')
        .when(mt.s.contains('HG'), '1kg')
        .when(mt.s.contains('NA'), '1kg')
        .default('GTS')
    )
)

# Per-variant QC metrics, each computed separately per cohort label over all samples
# currently in the matrix table.
mt = mt.annotate_rows(
    dp_qc=hl.agg.group_by(mt.group, hl.agg.stats(mt.DP)),
    gq_qc=hl.agg.group_by(mt.group, hl.agg.stats(mt.GQ)),
    hwe=hl.agg.group_by(mt.group, hl.agg.hardy_weinberg_test(mt.GT)),
    n_below_dp_3=hl.agg.group_by(mt.group, hl.agg.count_where(mt.DP < 3)),
    n_below_gq_30=hl.agg.group_by(mt.group, hl.agg.count_where(mt.GQ < 30)),
)

# Set the SNP of interest aside before the QC filter below, so it is kept whatever its QC metrics.
# NOTE(review): the HFE section extracts snp1 before the repeat/coverage filters as well; here
# those filters have already been applied earlier in this cell - confirm the difference is intended.
snp1 = mt.filter_rows(mt.locus.position == 47333236)

# Fallback values used when a cohort label is absent from a per-group dict; each one is
# chosen so that the corresponding threshold below is passed.
dp_pass_default = hl.struct(mean=6.0, stdev=0.0, min=0.0, max=0.0, n=0, sum=0.0)
gq_pass_default = hl.struct(mean=60.0, stdev=0.0, min=0.0, max=0.0, n=0, sum=0.0)
hwe_pass_default = hl.struct(het_freq_hwe=0.0, p_value=0.5)

# A variant passes only if, in both the GTS and the sport group: mean DP > 5, mean GQ > 50,
# HWE p > 0.05, fewer than 3 genotypes with DP < 3 and fewer than 30 genotypes with GQ < 30.
passes_qc = (
    (mt.dp_qc.get('GTS', dp_pass_default).mean > 5)
    & (mt.dp_qc.get('sport', dp_pass_default).mean > 5)
    & (mt.gq_qc.get('GTS', gq_pass_default).mean > 50)
    & (mt.gq_qc.get('sport', gq_pass_default).mean > 50)
    & (mt.hwe.get('GTS', hwe_pass_default).p_value > 0.05)
    & (mt.hwe.get('sport', hwe_pass_default).p_value > 0.05)
    & (mt.n_below_dp_3.get('sport', 0) < 3)
    & (mt.n_below_gq_30.get('sport', 0) < 30)
    & (mt.n_below_dp_3.get('GTS', 0) < 3)
    & (mt.n_below_gq_30.get('GTS', 0) < 30)
)
mt = mt.filter_rows(passes_qc)

### At this stage we have (1) the single SNP of interest (`snp1`) and (2) the other variants in `mt`. First we select the unrelated individuals from the GTS cohort:

# Sample IDs of the GTS-cohort individuals kept as the comparison group.
healthy_unrelated = ['S_7212', 'S_7213','S_7227','S_7255','S_7237','S_7245','S_7246','S_7229','S_7254','WGS_147c','S_7261','S_7263','S_7269','S_7274','S_7294','S_7306','WGS_37b','WGS_37c','WGS_85b','WGS_7118',
'WGS_7120','WGS_7142','WGS_7143','WGS_7152','WGS_7153','WGS_163d','WGS_180b','WGS_6819','WGS_D6813','WGS_D6815','462','468','475','476','477','478','479','482','490','492','494']

# Keep samples whose ID contains 'B' plus the listed GTS individuals, then drop B454.
# NOTE(review): the reason for excluding B454 is not recorded in this cell.
mt = mt.filter_cols(((mt.s.contains('B')) | hl.literal(healthy_unrelated).contains(mt.s)))
mt = mt.filter_cols(mt.s == 'B454', keep = False)

# Drop variants with no non-reference genotype among the remaining samples.
mt = mt.filter_rows(hl.agg.any(mt.GT.is_non_ref()))

mt.write('/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/preprocessing/joint-with-gts/myb-selected-4.mt')

# Apply the identical sample selection to the SNP of interest so the columns match for union_rows.
snp1 = snp1.filter_cols(((snp1.s.contains('B')) | hl.literal(healthy_unrelated).contains(snp1.s)))
snp1 = snp1.filter_cols(snp1.s == 'B454', keep = False)


mt = hl.read_matrix_table('/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/preprocessing/joint-with-gts/myb-selected-4.mt')
mt = mt.union_rows(snp1)

# checkpoint() writes the table and returns it re-read from disk. Bind that result so the
# annotation steps that follow start from the stored file instead of recomputing the union
# (the return value was previously discarded).
mt = mt.checkpoint('/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/preprocessing/joint-with-gts/myb-joined.mt')

# Build an interval-keyed gene table carrying HPO annotations, used to label variants by gene.
genes = hl.read_table('/net/archive/groups/plggneuromol/imdik-zekanowski-gts/data/external-data/genecode_v32.ht')
# Keep only records whose chromosome name is a valid GRCh38 contig.
genes = genes.filter(hl.is_valid_contig(genes['hg38.knownGene.chrom'], reference_genome='GRCh38'))
# Header-less TSV, so the columns are named f0 and f1.
# presumably f0 = gene symbol and f1 = HPO term - verify against the file
hpo = hl.import_table('/net/archive/groups/plggneuromol/imdik-zekanowski-gts/data/external-data/hpo.tsv', impute = True, no_header=True)

start = genes['hg38.knownGene.txStart']
stop =  genes['hg38.knownGene.txEnd']

# Replace the chrom/txStart/txEnd fields by a single locus interval.
# NOTE(review): includes_start=False with includes_end left at its default gives an interval
# that excludes both endpoints if that default is False; if txEnd is meant to be the last
# transcribed base, that base is not covered - confirm against the coordinate convention.
genes = genes.transmute(interval = 
                        hl.locus_interval(genes['hg38.knownGene.chrom'], 
                                          start,
                                          stop,
                                          reference_genome='GRCh38', includes_start=False))

# Temporarily key by gene symbol so the HPO table (keyed by f0) can be joined on it.
genes = genes.key_by(genes['hg38.kgXref.geneSymbol'])

hpo = hpo.key_by(hpo.f0)

# all_matches=True collects every f1 value for the gene symbol into an array.
genes = genes.annotate(hpo = hpo.index(genes['hg38.kgXref.geneSymbol'], all_matches = True)['f1'])
# Final key: the interval, so variants can be looked up by locus.
genes = genes.key_by(genes.interval)

# External annotation sources joined onto the variants / samples in the next step.
gnomad = hl.read_table('/net/archive/groups/plggneuromol/ifpan-gosborcz-ukb/raw/gnomad/gnomad.genomes.v3.1.1.sites.ht/')
cadd = hl.read_table('/net/archive/groups/plggneuromol/imdik-zekanowski-gts/data/external-data/cadd-full.ht')
vep = hl.read_table('/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/data/external-data/vep38/grch38_context_vep_annotated.ht')
# Athlete phenotype sheet, keyed by sample_id so it can be joined on the column key `s`.
sport_pheno = hl.import_table('/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/data/external-data/sporstmen-pheno.csv', impute = True, key='sample_id', quote ="\"")
# Only the row `info` field of this matrix table is used downstream (as `polish_af`).
poles = hl.read_matrix_table('/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/data/external-data/polish-genomes/polish-genomes.mt')

# De-duplicate rows (the SNP of interest may have been unioned onto an identical row),
# re-key by locus/alleles and split multi-allelic sites.
mt = hl.split_multi_hts(mt.distinct_by_row().key_rows_by('locus', 'alleles'))

# Variant-level annotations: gnomAD, overlapping gene symbols and their HPO lists
# (de-duplicated through a set), CADD and VEP.
mt = mt.annotate_rows(
    gnomad_v_3_1=gnomad[mt.row_key],
    within_gene=hl.array(hl.set(genes.index(mt.locus, all_matches=True)['hg38.kgXref.geneSymbol'])),
    hpo=hl.array(hl.set(genes.index(mt.locus, all_matches=True)['hpo'])),
    cadd=cadd[mt.row_key],
    vep=vep[mt.row_key],
)

# Sample-level phenotype record (missing for samples not in the athlete sheet).
mt = mt.annotate_cols(sport_phenotypes=sport_pheno[mt.s])

# `info` struct of the matching variant in the Polish genomes call set.
mt = mt.annotate_rows(polish_af=poles.rows()[mt.row_key]['info'])

mt.write('/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/results/hail-mts/myb.mt')

mt = hl.read_matrix_table('/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/results/hail-mts/myb.mt')

# Re-label the cohorts for the association tests: the non-athlete, non-1kg samples are now 'control'.
# The dp_qc / gq_qc / hwe row dicts computed earlier still use the old 'GTS' key.
mt = mt.annotate_cols(
    group=(
        hl.case()
        .when(mt.s.contains('B'), 'sport')
        .when(mt.s.contains('HG'), '1kg')
        .when(mt.s.contains('NA'), '1kg')
        .default('control')
    )
)

# Genotype-class counts per variant, split by cohort and by sport type.
mt = mt.annotate_rows(
    het_non_refs=hl.agg.group_by(mt.group, hl.agg.count_where(mt.GT.is_het())),
    hom_refs=hl.agg.group_by(mt.group, hl.agg.count_where(mt.GT.is_hom_ref())),
    hom_non_refs=hl.agg.group_by(mt.group, hl.agg.count_where(mt.GT.is_hom_var())),
    het_non_refs_e_vs_s=hl.agg.group_by(mt.sport_phenotypes.type, hl.agg.count_where(mt.GT.is_het())),
    hom_refs_e_vs_s=hl.agg.group_by(mt.sport_phenotypes.type, hl.agg.count_where(mt.GT.is_hom_ref())),
    hom_non_refs_e_vs_s=hl.agg.group_by(mt.sport_phenotypes.type, hl.agg.count_where(mt.GT.is_hom_var())),
)

# Allele counts: each heterozygote contributes one allele, each homozygote two.
sport_AC_non_ref = hl.int32(
    mt.het_non_refs.get('sport', 0) + (mt.hom_non_refs.get('sport', 0)*2))
sport_AC_ref = hl.int32(
    mt.het_non_refs.get('sport', 0) + (mt.hom_refs.get('sport', 0)*2))
control_AC_non_ref = hl.int32(
    mt.het_non_refs.get('control', 0) + (mt.hom_non_refs.get('control', 0)*2))
control_AC_ref = hl.int32(
    mt.het_non_refs.get('control', 0) + (mt.hom_refs.get('control', 0)*2))
speed_AC_non_ref = hl.int32(
    mt.het_non_refs_e_vs_s.get('speed', 0) + (mt.hom_non_refs_e_vs_s.get('speed', 0)*2))
speed_AC_ref = hl.int32(
    mt.het_non_refs_e_vs_s.get('speed', 0) + (mt.hom_refs_e_vs_s.get('speed', 0)*2))
endurance_AC_non_ref = hl.int32(
    mt.het_non_refs_e_vs_s.get('endurance', 0) + (mt.hom_non_refs_e_vs_s.get('endurance', 0)*2))
endurance_AC_ref = hl.int32(
    mt.het_non_refs_e_vs_s.get('endurance', 0) + (mt.hom_refs_e_vs_s.get('endurance', 0)*2))

# NOTE(review): index 2 of the gnomAD freq array is treated as the NFE population by the
# export field names used later - confirm against the table's freq_index_dict.
gnomad_AC_non_ref = mt.gnomad_v_3_1.freq.AC[2]
gnomad_AC_ref = mt.gnomad_v_3_1.freq.AN[2] - mt.gnomad_v_3_1.freq.AC[2]
polish_AC_non_ref = hl.int32(mt.polish_af.AC[0])
polish_AC_ref = hl.int32(mt.polish_af.AN - mt.polish_af.AC[0])

# 2x2 Fisher exact tests on (alt, ref) allele counts of the two populations being compared.
mt = mt.annotate_rows(
    fisher_sport_vs_control=hl.fisher_exact_test(
        sport_AC_non_ref, sport_AC_ref, control_AC_non_ref, control_AC_ref),
    fisher_e_vs_s=hl.fisher_exact_test(
        speed_AC_non_ref, speed_AC_ref, endurance_AC_non_ref, endurance_AC_ref),
    fisher_sport_vs_gnomad=hl.fisher_exact_test(
        sport_AC_non_ref, sport_AC_ref, gnomad_AC_non_ref, gnomad_AC_ref),
    fisher_sport_vs_polish=hl.fisher_exact_test(
        sport_AC_non_ref, sport_AC_ref, polish_AC_non_ref, polish_AC_ref),
)

# Remove the bookkeeping fields added by split_multi_hts.
mt = mt.drop('a_index', 'was_split')

hwe_missing_default = hl.struct(het_freq_hwe=0.0, p_value=1)
gnomad_freq = mt.gnomad_v_3_1.freq
vep_calls = mt.vep.vep

# Flat set of row fields for the export; transmute_rows drops the source fields they were read from.
# NOTE(review): the HWE values come from the 'GTS'/'sport' dicts computed before the sample
# selection, so they describe all samples carrying that label at that stage.
# NOTE(review): 'how_var_controls' / 'how_var_speed' look like typos for 'hom_var_*'; they are
# kept unchanged because renaming would alter the exported column names.
# NOTE(review): freq index 2 is labelled NFE here - confirm against the gnomAD freq_index_dict.
export_row_fields = {
    'DP_stats': mt.dp_qc,
    'GQ_stats': mt.gq_qc,
    'hwe_controls_p_value': mt.hwe.get('GTS', hwe_missing_default).p_value,
    'hwe_sport_p_value': mt.hwe.get('sport', hwe_missing_default).p_value,
    'gnomad_v3_nfe_af': gnomad_freq.AF[2],
    'gnomad_v3_nfe_homozygote_count': gnomad_freq.homozygote_count[2],
    'cadd': mt.cadd.cadd_score,
    'within_gene': mt.within_gene,
    'hpo': mt.hpo,
    'fisher_sport_vs_control': mt.fisher_sport_vs_control,
    'fisher_e_vs_s': mt.fisher_e_vs_s,
    'fisher_sport_vs_gnomad': mt.fisher_sport_vs_gnomad,
    'fisher_sport_vs_polish': mt.fisher_sport_vs_polish,
    'het_sport': mt.het_non_refs.get('sport', 0),
    'het_controls': mt.het_non_refs.get('control', 0),
    'hom_ref_sport': mt.hom_refs.get('sport', 0),
    'hom_ref_controls': mt.hom_refs.get('control', 0),
    'hom_var_sport': mt.hom_non_refs.get('sport', 0),
    'how_var_controls': mt.hom_non_refs.get('control',0),
    'het_endurance': mt.het_non_refs_e_vs_s.get('endurance', 0),
    'het_speed': mt.het_non_refs_e_vs_s.get('speed', 0),
    'hom_ref_endurance': mt.hom_refs_e_vs_s.get('endurance', 0),
    'hom_ref_speed': mt.hom_refs_e_vs_s.get('speed', 0),
    'hom_var_endurance': mt.hom_non_refs_e_vs_s.get('endurance', 0),
    'how_var_speed': mt.hom_non_refs_e_vs_s.get('speed',0),
    'most_severe_consequence': vep_calls.most_severe_consequence,
    'transcript_consequences': vep_calls.transcript_consequences,
    'intergenic_consequences': vep_calls.intergenic_consequences,
    'motif_feature_consequences': vep_calls.motif_feature_consequences,
    'regulatory_feature_consequences': vep_calls.regulatory_feature_consequences,
    'polish_af': mt.polish_af,
}
mt = mt.transmute_rows(**export_row_fields)

# Keep only the genotype call per entry, then pivot to a wide Table (one set of columns per sample).
mt = mt.select_entries('GT')

# From here on `mt` is a Hail Table (not a MatrixTable), even though the path ends in .mt.
mt = mt.make_table()
mt.write('/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/preprocessing/joint-with-gts/myb-to-export.mt')

In [57]:
# Re-load the wide per-variant Table written by the previous cell (a Table despite the .mt suffix).
mt = hl.read_table('/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/preprocessing/joint-with-gts/myb-to-export.mt')

In [58]:
# Flatten nested structs into top-level columns and export twice: once to the shared
# results directory and once to the current working directory.
# NOTE(review): export() is called without a delimiter argument, so the files are presumably
# tab-separated despite the .csv extension - confirm before parsing them as CSV.
to_export = mt.flatten()
for export_path in (
    '/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/results/hail-mts/myb-single.csv',
    'myb-single.csv',
):
    to_export.export(export_path)

2021-12-06 12:21:41 Hail: INFO: merging 4 files totalling 434.0K...
2021-12-06 12:21:41 Hail: INFO: while writing:
    /net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/results/hail-mts/myb-single.csv
  merge time: 15.812ms
2021-12-06 12:21:42 Hail: INFO: merging 4 files totalling 434.0K...
2021-12-06 12:21:42 Hail: INFO: while writing:
    myb-single.csv
  merge time: 15.486ms


In [72]:
# Burden analysis on the annotated matrix table (as written to results/hail-mts, i.e. with
# the 'GTS' group labels).
mt = hl.read_matrix_table('/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/results/hail-mts/myb.mt')

# Per-genotype burden = alt-allele count x CADD score (0 where is_non_ref() is False),
# summed over variants into one score per sample.
alt_allele_count = mt.GT.n_alt_alleles()
cadd_weight = mt.cadd.cadd_score
mt = mt.annotate_entries(
    burden_entry=hl.if_else(mt.GT.is_non_ref(), alt_allele_count * cadd_weight, 0)
)
mt = mt.annotate_cols(burden_all=hl.agg.sum(mt.burden_entry))

columns = mt.cols()

In [73]:
# Summary statistics of the per-sample burden, split by the `group` label.
burden_summary = hl.agg.stats(columns.burden_all)
columns.aggregate(hl.agg.group_by(columns.group, burden_summary))

frozendict({'GTS': Struct(mean=17.330243902439022, stdev=10.469869032144945, min=0.0, max=44.47500000000001, n=41, sum=710.54), 'sport': Struct(mean=20.57479207920792, stdev=13.945145838668003, min=0.0, max=89.811, n=101, sum=2078.054)})

In [75]:
# Same summary, split by sport type; samples without a phenotype record end up under the None key.
burden_summary = hl.agg.stats(columns.burden_all)
columns.aggregate(hl.agg.group_by(columns.sport_phenotypes.type, burden_summary))

frozendict({'endurance': Struct(mean=20.0169375, stdev=12.786128364700305, min=0.0, max=57.098, n=48, sum=960.813), 'speed': Struct(mean=21.08001886792453, stdev=14.89930124800788, min=0.0, max=89.811, n=53, sum=1117.241), None: Struct(mean=17.330243902439022, stdev=10.469869032144945, min=0.0, max=44.47500000000001, n=41, sum=710.54)})

In [76]:
# Pull the three per-sample fields to the driver as NumPy arrays (one collect per field).
group, pheno, burden_all_test = (
    np.array(field.collect())
    for field in (columns.group, columns.sport_phenotypes.type, columns.burden_all)
)

from scipy import stats

# Two-sample t-test of burden, 'sport' vs 'GTS' group (SciPy defaults, i.e. equal variances assumed).
is_sport = group == 'sport'
is_gts = group == 'GTS'
stats.ttest_ind(burden_all_test[is_sport], burden_all_test[is_gts])

2021-12-06 13:20:48 Hail: INFO: Coerced sorted dataset
2021-12-06 13:20:48 Hail: INFO: Coerced sorted dataset
2021-12-06 13:20:49 Hail: INFO: Coerced sorted dataset


Ttest_indResult(statistic=1.3344378470157399, pvalue=0.18422675688029458)

In [77]:
# Two-sample t-test of burden, endurance vs speed athletes (SciPy defaults, i.e. equal variances assumed).
is_endurance = pheno == 'endurance'
is_speed = pheno == 'speed'
stats.ttest_ind(burden_all_test[is_endurance], burden_all_test[is_speed])

Ttest_indResult(statistic=-0.3790645445539198, pvalue=0.7054517209972033)

## HFE
chr6:26,087,281-26,098,343 (chr6:26090951)

In [45]:
# Cut the HFE region (chr6:26,087,281-26,098,343, GRCh38) out of the joint chr6 call set.
chr6 = hl.read_matrix_table('/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/data/joint/dense-all_chr6.mt')

hfe = hl.filter_intervals(chr6, [(hl.parse_locus_interval('chr6:26087281-26098343', reference_genome='GRCh38'))])
hfe = hfe.naive_coalesce(5)
# Write to the exact absolute path the next cell reads from. The former relative target
# ('hfe-3.mt') only matched it when the kernel's working directory happened to be
# preprocessing/joint-with-gts.
hfe.write('/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/preprocessing/joint-with-gts/hfe-3.mt')

2021-12-06 12:12:42 Hail: INFO: wrote matrix table with 220 rows and 789 columns in 3 partitions to hfe-3.mt
    Total size: 268.43 KiB
    * Rows/entries: 265.21 KiB
    * Columns: 3.22 KiB
    * Globals: 11.00 B
    * Smallest partition: 2 rows (4.95 KiB)
    * Largest partition:  134 rows (155.24 KiB)


In [46]:
mt = hl.read_matrix_table('/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/preprocessing/joint-with-gts/hfe-3.mt')

# Site-level masks (applied in the next cell): repeat-masked loci and the gnomAD coverage
# table restricted to over_1 > 0.9.
rpmk = hl.read_table('/net/archive/groups/plggneuromol/imdik-zekanowski-gts/data/external-data/repeatmasker-extended-keyed.ht')
cov = hl.read_table('/net/archive/groups/plggneuromol/ifpan-gosborcz-ukb/raw/gnomad/gnomad.genomes.r3.0.1.coverage.ht')
cov = cov.filter(cov.over_1 > 0.9)

# Cohort label derived from the sample ID: first matching substring wins, everything else is 'GTS'.
mt = mt.annotate_cols(
    group=(
        hl.case()
        .when(mt.s.contains('B'), 'sport')
        .when(mt.s.contains('HG'), '1kg')
        .when(mt.s.contains('NA'), '1kg')
        .default('GTS')
    )
)

# Per-variant QC metrics, each computed separately per cohort label over all samples
# currently in the matrix table.
mt = mt.annotate_rows(
    dp_qc=hl.agg.group_by(mt.group, hl.agg.stats(mt.DP)),
    gq_qc=hl.agg.group_by(mt.group, hl.agg.stats(mt.GQ)),
    hwe=hl.agg.group_by(mt.group, hl.agg.hardy_weinberg_test(mt.GT)),
    n_below_dp_3=hl.agg.group_by(mt.group, hl.agg.count_where(mt.DP < 3)),
    n_below_gq_30=hl.agg.group_by(mt.group, hl.agg.count_where(mt.GQ < 30)),
)

In [47]:
# Set the SNP of interest aside before any row filter in this cell, so it is kept even if its
# site fails the repeat, coverage or QC filters below.
snp1 = mt.filter_rows(mt.locus.position == 26090951)

# Keep loci that are absent from the repeat mask and present in the filtered coverage table.
mt = mt.filter_rows(hl.is_missing(rpmk[mt.locus]))
mt = mt.filter_rows(hl.is_defined(cov[mt.locus]))

# Fallback values used when a cohort label is absent from a per-group dict; each one is
# chosen so that the corresponding threshold below is passed.
dp_pass_default = hl.struct(mean=6.0, stdev=0.0, min=0.0, max=0.0, n=0, sum=0.0)
gq_pass_default = hl.struct(mean=60.0, stdev=0.0, min=0.0, max=0.0, n=0, sum=0.0)
hwe_pass_default = hl.struct(het_freq_hwe=0.0, p_value=0.5)

# A variant passes only if, in both the GTS and the sport group: mean DP > 5, mean GQ > 50,
# HWE p > 0.05, fewer than 3 genotypes with DP < 3 and fewer than 30 genotypes with GQ < 30.
passes_qc = (
    (mt.dp_qc.get('GTS', dp_pass_default).mean > 5)
    & (mt.dp_qc.get('sport', dp_pass_default).mean > 5)
    & (mt.gq_qc.get('GTS', gq_pass_default).mean > 50)
    & (mt.gq_qc.get('sport', gq_pass_default).mean > 50)
    & (mt.hwe.get('GTS', hwe_pass_default).p_value > 0.05)
    & (mt.hwe.get('sport', hwe_pass_default).p_value > 0.05)
    & (mt.n_below_dp_3.get('sport', 0) < 3)
    & (mt.n_below_gq_30.get('sport', 0) < 30)
    & (mt.n_below_dp_3.get('GTS', 0) < 3)
    & (mt.n_below_gq_30.get('GTS', 0) < 30)
)
mt = mt.filter_rows(passes_qc)

### At this stage we have (1) the single SNP of interest (`snp1`) and (2) the other variants in `mt`. First we select the unrelated individuals from the GTS cohort:

# Sample IDs of the GTS-cohort individuals kept as the comparison group.
healthy_unrelated = ['S_7212', 'S_7213','S_7227','S_7255','S_7237','S_7245','S_7246','S_7229','S_7254','WGS_147c','S_7261','S_7263','S_7269','S_7274','S_7294','S_7306','WGS_37b','WGS_37c','WGS_85b','WGS_7118',
'WGS_7120','WGS_7142','WGS_7143','WGS_7152','WGS_7153','WGS_163d','WGS_180b','WGS_6819','WGS_D6813','WGS_D6815','462','468','475','476','477','478','479','482','490','492','494']

# Keep samples whose ID contains 'B' plus the listed GTS individuals, then drop B454.
# NOTE(review): the reason for excluding B454 is not recorded in this cell.
mt = mt.filter_cols(((mt.s.contains('B')) | hl.literal(healthy_unrelated).contains(mt.s)))
mt = mt.filter_cols(mt.s == 'B454', keep = False)

# Drop variants with no non-reference genotype among the remaining samples.
mt = mt.filter_rows(hl.agg.any(mt.GT.is_non_ref()))

mt.write('/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/preprocessing/joint-with-gts/hfe-selected-5.mt')

# Apply the identical sample selection to the SNP of interest so the columns match for union_rows.
snp1 = snp1.filter_cols(((snp1.s.contains('B')) | hl.literal(healthy_unrelated).contains(snp1.s)))
snp1 = snp1.filter_cols(snp1.s == 'B454', keep = False)


mt = hl.read_matrix_table('/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/preprocessing/joint-with-gts/hfe-selected-5.mt')
mt = mt.union_rows(snp1)

# checkpoint() writes the table and returns it re-read from disk. Bind that result so the
# annotation steps that follow start from the stored file instead of recomputing the union
# (the return value was previously discarded).
mt = mt.checkpoint('/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/preprocessing/joint-with-gts/hfe-joined-5.mt')

# Build an interval-keyed gene table carrying HPO annotations, used to label variants by gene.
genes = hl.read_table('/net/archive/groups/plggneuromol/imdik-zekanowski-gts/data/external-data/genecode_v32.ht')
# Keep only records whose chromosome name is a valid GRCh38 contig.
genes = genes.filter(hl.is_valid_contig(genes['hg38.knownGene.chrom'], reference_genome='GRCh38'))
# Header-less TSV, so the columns are named f0 and f1 (see the type-imputation log below).
# presumably f0 = gene symbol and f1 = HPO term - verify against the file
hpo = hl.import_table('/net/archive/groups/plggneuromol/imdik-zekanowski-gts/data/external-data/hpo.tsv', impute = True, no_header=True)

start = genes['hg38.knownGene.txStart']
stop =  genes['hg38.knownGene.txEnd']

# Replace the chrom/txStart/txEnd fields by a single locus interval.
# NOTE(review): includes_start=False with includes_end left at its default gives an interval
# that excludes both endpoints if that default is False; if txEnd is meant to be the last
# transcribed base, that base is not covered - confirm against the coordinate convention.
genes = genes.transmute(interval = 
                        hl.locus_interval(genes['hg38.knownGene.chrom'], 
                                          start,
                                          stop,
                                          reference_genome='GRCh38', includes_start=False))

# Temporarily key by gene symbol so the HPO table (keyed by f0) can be joined on it.
genes = genes.key_by(genes['hg38.kgXref.geneSymbol'])

hpo = hpo.key_by(hpo.f0)

# all_matches=True collects every f1 value for the gene symbol into an array.
genes = genes.annotate(hpo = hpo.index(genes['hg38.kgXref.geneSymbol'], all_matches = True)['f1'])
# Final key: the interval, so variants can be looked up by locus.
genes = genes.key_by(genes.interval)

# External annotation sources joined onto the variants / samples in the next step.
gnomad = hl.read_table('/net/archive/groups/plggneuromol/ifpan-gosborcz-ukb/raw/gnomad/gnomad.genomes.v3.1.1.sites.ht/')
cadd = hl.read_table('/net/archive/groups/plggneuromol/imdik-zekanowski-gts/data/external-data/cadd-full.ht')
vep = hl.read_table('/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/data/external-data/vep38/grch38_context_vep_annotated.ht')
# Athlete phenotype sheet, keyed by sample_id so it can be joined on the column key `s`.
sport_pheno = hl.import_table('/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/data/external-data/sporstmen-pheno.csv', impute = True, key='sample_id', quote ="\"")
# Only the row `info` field of this matrix table is used downstream (as `polish_af`).
poles = hl.read_matrix_table('/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/data/external-data/polish-genomes/polish-genomes.mt')

# De-duplicate rows (the SNP of interest may have been unioned onto an identical row),
# re-key by locus/alleles and split multi-allelic sites.
mt = hl.split_multi_hts(mt.distinct_by_row().key_rows_by('locus', 'alleles'))

# Variant-level annotations: gnomAD, overlapping gene symbols and their HPO lists
# (de-duplicated through a set), CADD and VEP.
mt = mt.annotate_rows(
    gnomad_v_3_1=gnomad[mt.row_key],
    within_gene=hl.array(hl.set(genes.index(mt.locus, all_matches=True)['hg38.kgXref.geneSymbol'])),
    hpo=hl.array(hl.set(genes.index(mt.locus, all_matches=True)['hpo'])),
    cadd=cadd[mt.row_key],
    vep=vep[mt.row_key],
)

# Sample-level phenotype record (missing for samples not in the athlete sheet).
mt = mt.annotate_cols(sport_phenotypes=sport_pheno[mt.s])

# `info` struct of the matching variant in the Polish genomes call set.
mt = mt.annotate_rows(polish_af=poles.rows()[mt.row_key]['info'])

mt.write('/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/results/hail-mts/hfe-5.mt')

mt = hl.read_matrix_table('/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/results/hail-mts/hfe-5.mt')

# Re-label the cohorts for the association tests: the non-athlete, non-1kg samples are now 'control'.
# The dp_qc / gq_qc / hwe row dicts computed earlier still use the old 'GTS' key.
mt = mt.annotate_cols(
    group=(
        hl.case()
        .when(mt.s.contains('B'), 'sport')
        .when(mt.s.contains('HG'), '1kg')
        .when(mt.s.contains('NA'), '1kg')
        .default('control')
    )
)

# Genotype-class counts per variant, split by cohort and by sport type.
mt = mt.annotate_rows(
    het_non_refs=hl.agg.group_by(mt.group, hl.agg.count_where(mt.GT.is_het())),
    hom_refs=hl.agg.group_by(mt.group, hl.agg.count_where(mt.GT.is_hom_ref())),
    hom_non_refs=hl.agg.group_by(mt.group, hl.agg.count_where(mt.GT.is_hom_var())),
    het_non_refs_e_vs_s=hl.agg.group_by(mt.sport_phenotypes.type, hl.agg.count_where(mt.GT.is_het())),
    hom_refs_e_vs_s=hl.agg.group_by(mt.sport_phenotypes.type, hl.agg.count_where(mt.GT.is_hom_ref())),
    hom_non_refs_e_vs_s=hl.agg.group_by(mt.sport_phenotypes.type, hl.agg.count_where(mt.GT.is_hom_var())),
)

# Allele counts: each heterozygote contributes one allele, each homozygote two.
sport_AC_non_ref = hl.int32(
    mt.het_non_refs.get('sport', 0) + (mt.hom_non_refs.get('sport', 0)*2))
sport_AC_ref = hl.int32(
    mt.het_non_refs.get('sport', 0) + (mt.hom_refs.get('sport', 0)*2))
control_AC_non_ref = hl.int32(
    mt.het_non_refs.get('control', 0) + (mt.hom_non_refs.get('control', 0)*2))
control_AC_ref = hl.int32(
    mt.het_non_refs.get('control', 0) + (mt.hom_refs.get('control', 0)*2))
speed_AC_non_ref = hl.int32(
    mt.het_non_refs_e_vs_s.get('speed', 0) + (mt.hom_non_refs_e_vs_s.get('speed', 0)*2))
speed_AC_ref = hl.int32(
    mt.het_non_refs_e_vs_s.get('speed', 0) + (mt.hom_refs_e_vs_s.get('speed', 0)*2))
endurance_AC_non_ref = hl.int32(
    mt.het_non_refs_e_vs_s.get('endurance', 0) + (mt.hom_non_refs_e_vs_s.get('endurance', 0)*2))
endurance_AC_ref = hl.int32(
    mt.het_non_refs_e_vs_s.get('endurance', 0) + (mt.hom_refs_e_vs_s.get('endurance', 0)*2))

# NOTE(review): index 2 of the gnomAD freq array is treated as the NFE population by the
# export field names used later - confirm against the table's freq_index_dict.
gnomad_AC_non_ref = mt.gnomad_v_3_1.freq.AC[2]
gnomad_AC_ref = mt.gnomad_v_3_1.freq.AN[2] - mt.gnomad_v_3_1.freq.AC[2]
polish_AC_non_ref = hl.int32(mt.polish_af.AC[0])
polish_AC_ref = hl.int32(mt.polish_af.AN - mt.polish_af.AC[0])

# 2x2 Fisher exact tests on (alt, ref) allele counts of the two populations being compared.
mt = mt.annotate_rows(
    fisher_sport_vs_control=hl.fisher_exact_test(
        sport_AC_non_ref, sport_AC_ref, control_AC_non_ref, control_AC_ref),
    fisher_e_vs_s=hl.fisher_exact_test(
        speed_AC_non_ref, speed_AC_ref, endurance_AC_non_ref, endurance_AC_ref),
    fisher_sport_vs_gnomad=hl.fisher_exact_test(
        sport_AC_non_ref, sport_AC_ref, gnomad_AC_non_ref, gnomad_AC_ref),
    fisher_sport_vs_polish=hl.fisher_exact_test(
        sport_AC_non_ref, sport_AC_ref, polish_AC_non_ref, polish_AC_ref),
)

# Remove the bookkeeping fields added by split_multi_hts.
mt = mt.drop('a_index', 'was_split')

hwe_missing_default = hl.struct(het_freq_hwe=0.0, p_value=1)
gnomad_freq = mt.gnomad_v_3_1.freq
vep_calls = mt.vep.vep

# Flat set of row fields for the export; transmute_rows drops the source fields they were read from.
# NOTE(review): the HWE values come from the 'GTS'/'sport' dicts computed before the sample
# selection, so they describe all samples carrying that label at that stage.
# NOTE(review): 'how_var_controls' / 'how_var_speed' look like typos for 'hom_var_*'; they are
# kept unchanged because renaming would alter the exported column names.
# NOTE(review): freq index 2 is labelled NFE here - confirm against the gnomAD freq_index_dict.
export_row_fields = {
    'DP_stats': mt.dp_qc,
    'GQ_stats': mt.gq_qc,
    'hwe_controls_p_value': mt.hwe.get('GTS', hwe_missing_default).p_value,
    'hwe_sport_p_value': mt.hwe.get('sport', hwe_missing_default).p_value,
    'gnomad_v3_nfe_af': gnomad_freq.AF[2],
    'gnomad_v3_nfe_homozygote_count': gnomad_freq.homozygote_count[2],
    'cadd': mt.cadd.cadd_score,
    'within_gene': mt.within_gene,
    'hpo': mt.hpo,
    'fisher_sport_vs_control': mt.fisher_sport_vs_control,
    'fisher_e_vs_s': mt.fisher_e_vs_s,
    'fisher_sport_vs_gnomad': mt.fisher_sport_vs_gnomad,
    'fisher_sport_vs_polish': mt.fisher_sport_vs_polish,
    'het_sport': mt.het_non_refs.get('sport', 0),
    'het_controls': mt.het_non_refs.get('control', 0),
    'hom_ref_sport': mt.hom_refs.get('sport', 0),
    'hom_ref_controls': mt.hom_refs.get('control', 0),
    'hom_var_sport': mt.hom_non_refs.get('sport', 0),
    'how_var_controls': mt.hom_non_refs.get('control',0),
    'het_endurance': mt.het_non_refs_e_vs_s.get('endurance', 0),
    'het_speed': mt.het_non_refs_e_vs_s.get('speed', 0),
    'hom_ref_endurance': mt.hom_refs_e_vs_s.get('endurance', 0),
    'hom_ref_speed': mt.hom_refs_e_vs_s.get('speed', 0),
    'hom_var_endurance': mt.hom_non_refs_e_vs_s.get('endurance', 0),
    'how_var_speed': mt.hom_non_refs_e_vs_s.get('speed',0),
    'most_severe_consequence': vep_calls.most_severe_consequence,
    'transcript_consequences': vep_calls.transcript_consequences,
    'intergenic_consequences': vep_calls.intergenic_consequences,
    'motif_feature_consequences': vep_calls.motif_feature_consequences,
    'regulatory_feature_consequences': vep_calls.regulatory_feature_consequences,
    'polish_af': mt.polish_af,
}
mt = mt.transmute_rows(**export_row_fields)

# Keep only the genotype call per entry, then pivot to a wide Table (one set of columns per sample).
mt = mt.select_entries('GT')

# From here on `mt` is a Hail Table (not a MatrixTable), even though the path ends in .mt.
mt = mt.make_table()
mt.write('/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/preprocessing/joint-with-gts/hfe-to-export-5.mt')

mt = hl.read_table('/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/preprocessing/joint-with-gts/hfe-to-export-5.mt')

2021-12-06 12:13:43 Hail: INFO: wrote matrix table with 34 rows and 142 columns in 3 partitions to /net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/preprocessing/joint-with-gts/hfe-selected-5.mt
    Total size: 36.04 KiB
    * Rows/entries: 35.39 KiB
    * Columns: 647.00 B
    * Globals: 11.00 B
    * Smallest partition: 1 rows (2.22 KiB)
    * Largest partition:  24 rows (23.19 KiB)
2021-12-06 12:13:45 Hail: INFO: wrote matrix table with 35 rows and 142 columns in 4 partitions to /net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/preprocessing/joint-with-gts/hfe-joined-5.mt
2021-12-06 12:13:45 Hail: INFO: Reading table to impute column types
2021-12-06 12:13:46 Hail: INFO: Finished type imputation
  Loading field 'f0' as type str (imputed)
  Loading field 'f1' as type str (imputed)
2021-12-06 12:13:46 Hail: INFO: Reading table to impute column types
2021-12-06 12:13:47 Hail: INFO: Finished type imputation
  Loading field 'sport' as type str (imputed)
  Loading field

In [56]:
# Re-load the wide per-variant Table written above (a Table despite the .mt suffix).
mt = hl.read_table('/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/preprocessing/joint-with-gts/hfe-to-export-5.mt')

In [55]:
# Flatten nested structs into top-level columns and export twice: once to the shared
# results directory and once to the current working directory.
# NOTE(review): export() is called without a delimiter argument, so the files are presumably
# tab-separated despite the .csv extension - confirm before parsing them as CSV.
to_export = mt.flatten()
for export_path in (
    '/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/results/hail-mts/hfe-single.csv',
    'hfe-single.csv',
):
    to_export.export(export_path)

2021-12-06 12:19:04 Hail: INFO: merging 4 files totalling 543.2K...
2021-12-06 12:19:04 Hail: INFO: while writing:
    /net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/results/hail-mts/hfe-single.csv
  merge time: 17.113ms
2021-12-06 12:19:04 Hail: INFO: merging 4 files totalling 543.2K...
2021-12-06 12:19:04 Hail: INFO: while writing:
    hfe-single.csv
  merge time: 16.688ms


In [61]:
# Burden analysis on the annotated matrix table (as written to results/hail-mts, i.e. with
# the 'GTS' group labels).
mt = hl.read_matrix_table('/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/results/hail-mts/hfe-5.mt')

# Per-genotype burden = alt-allele count x CADD score (0 where is_non_ref() is False),
# summed over variants into one score per sample.
alt_allele_count = mt.GT.n_alt_alleles()
cadd_weight = mt.cadd.cadd_score
mt = mt.annotate_entries(
    burden_entry=hl.if_else(mt.GT.is_non_ref(), alt_allele_count * cadd_weight, 0)
)
mt = mt.annotate_cols(burden_all=hl.agg.sum(mt.burden_entry))

columns = mt.cols()

In [62]:
# Summary statistics of the per-sample burden, split by the `group` label.
burden_summary = hl.agg.stats(columns.burden_all)
columns.aggregate(hl.agg.group_by(columns.group, burden_summary))

frozendict({'GTS': Struct(mean=19.969707317073173, stdev=15.678183358620721, min=0.0, max=50.456, n=41, sum=818.758), 'sport': Struct(mean=18.69779207920792, stdev=14.1567776059558, min=0.0, max=64.626, n=101, sum=1888.477)})

In [63]:
# Same summary, split by sport type; samples without a phenotype record end up under the None key.
burden_summary = hl.agg.stats(columns.burden_all)
columns.aggregate(hl.agg.group_by(columns.sport_phenotypes.type, burden_summary))

frozendict({'endurance': Struct(mean=19.082604166666666, stdev=13.058676797343697, min=0.0, max=51.019999999999996, n=48, sum=915.965), 'speed': Struct(mean=18.349283018867926, stdev=15.073975139451637, min=0.0, max=64.626, n=53, sum=972.5120000000001), None: Struct(mean=19.969707317073173, stdev=15.678183358620721, min=0.0, max=50.456, n=41, sum=818.758)})

In [64]:
# Pull the three per-sample fields to the driver as NumPy arrays (one collect per field,
# in the same order as before) so they can be masked and passed to SciPy.
group, pheno, burden_all_test = (
    np.array(field.collect())
    for field in (columns.group, columns.sport_phenotypes.type, columns.burden_all)
)

2021-12-06 12:25:49 Hail: INFO: Coerced sorted dataset
2021-12-06 12:25:49 Hail: INFO: Coerced sorted dataset
2021-12-06 12:25:50 Hail: INFO: Coerced sorted dataset


In [65]:
from scipy import stats

# Two-sample t-test of burden, 'sport' vs 'GTS' group (SciPy defaults, i.e. equal variances assumed).
is_sport = group == 'sport'
is_gts = group == 'GTS'
stats.ttest_ind(burden_all_test[is_sport], burden_all_test[is_gts])

Ttest_indResult(statistic=-0.46673132195131334, pvalue=0.6414176157837872)

In [66]:
# Two-sample t-test of burden, endurance vs speed athletes (SciPy defaults, i.e. equal variances assumed).
is_endurance = pheno == 'endurance'
is_speed = pheno == 'speed'
stats.ttest_ind(burden_all_test[is_endurance], burden_all_test[is_speed])

Ttest_indResult(statistic=0.25747195646956944, pvalue=0.7973488688651403)