In [1]:
# Notebook setup: display helpers, page width, and the Hail session.
from IPython.display import display
from IPython.display import HTML
import IPython.core.display as di # Example: di.display_html('<h3>%s:</h3>' % str, raw=True)

#This line will hide code by default when the notebook is exported as HTML
#di.display_html('<script>jQuery(function() {if (jQuery("body.notebook_app").length == 0) { jQuery(".input_area").toggle(); jQuery(".prompt").toggle();}});</script>', raw=True)

# Let the notebook container use the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))

import hail as hl
# Start Hail with a project temp directory, 30G for the Spark driver and for
# each executor, and GRCh38 as the default reference genome.
hl.init(tmp_dir='/net/archive/groups/plggneuromol/imdik-zekanowski-gts/data/temp', spark_conf={'spark.driver.memory': '30G', 'spark.executor.memory': '30G'}, default_reference='GRCh38') 

2022-02-02 10:25:16 WARN  NativeCodeLoader:60 - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
Running on Apache Spark version 3.1.2
SparkUI available at http://p0604.prometheus:4040
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.79-f141af259254
LOGGING: writing to /net/archive/groups/plggneuromol/imdik-zekanowski-gts/analysis/burden-and-family/hail-20220202-1025-0.2.79-f141af259254.log


In [2]:
# Plotting and data-handling imports used by the rest of the notebook.
# NOTE(review): `show` is imported here from hail.plot and again below from
# bokeh.plotting; the later import is the binding that stays in effect.
from hail.plot import show
from pprint import pprint
from bokeh.layouts import gridplot
# Enable notebook output through Hail's plotting module.
hl.plot.output_notebook()
import openpyxl


import numpy as np
import pandas as pd
from functools import reduce
from itertools import chain

from bokeh.plotting import output_notebook, show, figure
from bokeh.palettes import viridis

# Second notebook-output call, this time through bokeh's own function.
output_notebook() 

## 1. Filtering for quality 

In [None]:
# Stage 1: site-level quality filtering of the joint-called family matrix table.
#
# Each `checkpoint` result is assigned back to `mt`, so every later stage reads
# the table that was just written instead of re-running the whole pipeline from
# the unfiltered input.
rpmk = hl.read_table('/net/archive/groups/plggneuromol/imdik-zekanowski-gts/data/external-data/repeatmasker-extended-keyed.ht')
cov = hl.read_table('/net/archive/groups/plggneuromol/imdik-zekanowski-gts/data/external-data/gnomad/gnomad-cov-keyed.ht')

mt = hl.read_matrix_table('/net/archive/groups/plggneuromol/imdik-zekanowski-gts/data/joint-gts-only/fams-unfiltered.mt')

# Remove every row whose locus has an entry in the repeatmasker table.
mt = mt.filter_rows(hl.is_defined(rpmk[mt.locus]), keep = False)
mt = mt.checkpoint('/net/scratch/people/plggosborcz/temp-mts/gts-rpmk.mt')

# Keep only rows whose locus has an entry in the coverage table.
mt = mt.filter_rows(hl.is_defined(cov[mt.locus]), keep = True)
mt = mt.checkpoint('/net/scratch/people/plggosborcz/temp-mts/gts-cov.mt')

# Per-site QC aggregates over all samples: DP/GQ summary statistics, a
# Hardy-Weinberg test, and counts of low-depth / low-quality genotypes.
mt = mt.annotate_rows(dp_qc = hl.agg.stats(mt.DP),
                      gq_qc = hl.agg.stats(mt.GQ),
                      hwe = hl.agg.hardy_weinberg_test(mt.GT),
                      n_below_dp_3 = hl.agg.count_where(mt.DP < 3),
                      n_below_gq_30 = hl.agg.count_where(mt.GQ <30))
mt = mt.checkpoint('/net/scratch/people/plggosborcz/temp-mts/gts-qc.mt')

# A site passes only if all five conditions hold: mean DP > 5, mean GQ > 50,
# HWE p-value > 0.05, fewer than 3 genotypes with DP < 3, and fewer than 30
# genotypes with GQ < 30.
mt = mt.filter_rows((mt.dp_qc.mean > 5) &
                    (mt.gq_qc.mean > 50) &
                    (mt.hwe.p_value > 0.05) &
                    (mt.n_below_dp_3 < 3) &
                    (mt.n_below_gq_30 < 30))

mt = mt.checkpoint('/net/archive/groups/plggneuromol/imdik-zekanowski-gts/data/joint-gts-only/gts-fams-filtered.mt')

2021-11-17 11:38:55 Hail: INFO: wrote matrix table with 19391598 rows and 124 columns in 97220 partitions to /net/scratch/people/plggosborcz/temp-mts/gts-rpmk.mt
    Total size: 14.27 GiB
    * Rows/entries: 14.27 GiB
    * Columns: 570.00 B
    * Globals: 11.00 B
    * Smallest partition: 0 rows (20.00 B)
    * Largest partition:  6173 rows (6.94 MiB)


## 2. Split multiallelic variants and annotate with gnomad and other databases

In [8]:
# Stage 2: drop duplicate row keys, split multiallelic sites, then annotate
# rows with gnomAD, overlapping gene symbols and the HPO terms of those genes.
mt = hl.read_matrix_table('/net/archive/groups/plggneuromol/imdik-zekanowski-gts/data/joint-gts-only/gts-fams-filtered.mt')
mt = mt.distinct_by_row()
# The checkpoint's return value is not used; the table is re-read from the same path just below.
mt.checkpoint('/net/archive/groups/plggneuromol/imdik-zekanowski-gts/data/joint-gts-only/to-delete.mt')

mt = hl.read_matrix_table('/net/archive/groups/plggneuromol/imdik-zekanowski-gts/data/joint-gts-only/to-delete.mt')

# Key rows by (locus, alleles) before splitting multiallelic sites.
mt = mt.key_rows_by(mt.locus, mt.alleles)
mt = hl.split_multi_hts(mt)
mt.write('/net/archive/groups/plggneuromol/imdik-zekanowski-gts/data/joint-gts-only/fams-split.mt')

gnomad = hl.read_table('/net/archive/groups/plggneuromol/ifpan-gosborcz-ukb/raw/gnomad/gnomad.genomes.v3.1.1.sites.ht/')

mt = hl.read_matrix_table('/net/archive/groups/plggneuromol/imdik-zekanowski-gts/data/joint-gts-only/fams-split.mt')

# Attach the whole gnomAD row that matches each variant's row key (missing when there is no match).
mt = mt.annotate_rows(gnomad_v_3_1 = gnomad[mt.row_key])

mt.write('/net/archive/groups/plggneuromol/imdik-zekanowski-gts/data/joint-gts-only/fams-gnomad.mt')

# Gene table: keep only records whose chromosome is a valid GRCh38 contig.
genes = hl.read_table('/net/archive/groups/plggneuromol/imdik-zekanowski-gts/data/external-data/genecode_v32.ht')
genes = genes.filter(hl.is_valid_contig(genes['hg38.knownGene.chrom'], reference_genome='GRCh38'))
# Header-less HPO table; its columns are addressed below as f0 and f1.
hpo = hl.import_table('/net/archive/groups/plggneuromol/imdik-zekanowski-gts/data/external-data/hpo.tsv', impute = True, no_header=True)

start = genes['hg38.knownGene.txStart']
stop =  genes['hg38.knownGene.txEnd']

# Replace chrom/txStart/txEnd with a single locus interval per record.
# NOTE(review): only includes_start is overridden here; confirm that the default
# end-inclusion of hl.locus_interval matches the txEnd convention of this table.
genes = genes.transmute(interval = 
                        hl.locus_interval(genes['hg38.knownGene.chrom'], 
                                          start,
                                          stop,
                                          reference_genome='GRCh38', includes_start=False))

# Key by gene symbol so the HPO table (keyed by its first column) can be joined on it.
genes = genes.key_by(genes['hg38.kgXref.geneSymbol'])

hpo = hpo.key_by(hpo.f0)

# Collect all f1 values of the HPO rows matching the gene symbol, then re-key by interval for locus lookups.
genes = genes.annotate(hpo = hpo.index(genes['hg38.kgXref.geneSymbol'], all_matches = True)['f1'])
genes = genes.key_by(genes.interval)

mt = hl.read_matrix_table('/net/archive/groups/plggneuromol/imdik-zekanowski-gts/data/joint-gts-only/fams-gnomad.mt')

# For each variant: the distinct gene symbols, and the distinct HPO lists, of every gene interval containing its locus.
mt = mt.annotate_rows(within_gene = hl.array(hl.set(genes.index(mt.locus, all_matches=True)['hg38.kgXref.geneSymbol'])))
mt = mt.annotate_rows(hpo = hl.array(hl.set(genes.index(mt.locus, all_matches=True)['hpo'])))
mt.write('/net/archive/groups/plggneuromol/imdik-zekanowski-gts/data/joint-gts-only/fams-anno.mt')

2021-11-18 11:51:46 Hail: INFO: wrote matrix table with 16691465 rows and 124 columns in 97220 partitions to /net/archive/groups/plggneuromol/imdik-zekanowski-gts/data/joint-gts-only/to-delete.mt
    Total size: 12.50 GiB
    * Rows/entries: 12.50 GiB
    * Columns: 570.00 B
    * Globals: 11.00 B
    * Smallest partition: 0 rows (20.00 B)
    * Largest partition:  2747 rows (2.41 MiB)


<hail.matrixtable.MatrixTable at 0x2b0da8fed7b8>

## 3. Annotate with phenotypes + create a separate mt for each of the larger families

In [3]:
# Load the fully annotated matrix table written at the end of section 2.
mt = hl.read_matrix_table(
    '/net/archive/groups/plggneuromol/imdik-zekanowski-gts/data/joint-gts-only/fams-anno.mt'
)

In [4]:
# Load the phenotype sheet (comma-separated, double-quoted fields), keyed by
# the 'ID' column; Hail imputes the column types.
pheno = hl.import_table(
    '/net/archive/groups/plggneuromol/imdik-zekanowski-gts/data/pheno/GTS-coded-corrected-june-2021.csv',
    impute=True,
    key='ID',
    delimiter=',',
    quote='"',
)

2022-01-05 10:57:35 Hail: INFO: Reading table to impute column types
2022-01-05 10:57:38 Hail: INFO: Finished type imputation            (0 + 1) / 1]
  Loading field 'ID' as type str (imputed)
  Loading field 'family' as type str (imputed)
  Loading field 'sex' as type str (imputed)
  Loading field 'kinship' as type str (imputed)
  Loading field 'disease' as type str (imputed)
  Loading field 'phenotype' as type str (imputed)
  Loading field 'add_pheno' as type str (imputed)
  Loading field 'heavy_tics' as type str (imputed)
  Loading field 'heavy_tics_familial' as type str (imputed)
  Loading field 'GTS_ASD_group' as type str (imputed)
  Loading field 'nonCTD' as type str (imputed)


In [5]:
# Attach the phenotype row matching each sample ID as the column field `phenotypes`.
sample_phenotypes = pheno.index(mt.s)
mt = mt.annotate_cols(phenotypes=sample_phenotypes)

### 3.1 get CADD

In [161]:
cadds = []
cadd_per_chr = !ls /net/archive/groups/plggneuromol/imdik-zekanowski-gts/data/external-data/cadd-per-chr/

for f in cadd_per_chr:
    cadd = hl.read_table('/net/archive/groups/plggneuromol/imdik-zekanowski-gts/data/external-data/cadd-per-chr/'+f)
    cadds.append(cadd)

In [165]:
# Concatenate all per-chromosome CADD tables into one table and persist it.
cadd_all = hl.Table.union(*cadds)
cadd_all.write('/net/archive/groups/plggneuromol/imdik-zekanowski-gts/data/external-data/cadd-full.ht')

2021-11-20 14:47:16 Hail: INFO: wrote table with 8812917339 rows in 2603 partitions to /net/archive/groups/plggneuromol/imdik-zekanowski-gts/data/external-data/cadd-full.ht
    Total size: 165.53 GiB
    * Rows: 165.53 GiB
    * Globals: 11.00 B
    * Smallest partition: 198873 rows (3.75 MiB)
    * Largest partition:  6688041 rows (110.36 MiB)


## 4. For each of the families get intragenic variants with cadd > 10

In [6]:
# Distinct family labels found in the sample phenotype annotation (unordered at this point).
fams = list(set(mt.phenotypes.family.collect()))

In [7]:
# Sort the family labels in place so the per-family loops run in a stable order.
fams.sort()

In [8]:
# Open the combined CADD table written in section 3.1.
cadd = hl.read_table(
    '/net/archive/groups/plggneuromol/imdik-zekanowski-gts/data/external-data/cadd-full.ht'
)

In [None]:
# Write one matrix table per family with its intragenic variants that have a CADD score above 10.
for family_id in fams:
    # Samples of this family, and only sites where at least one of them is non-reference.
    family_mt = mt.filter_cols(mt.phenotypes.family == family_id)
    family_mt = family_mt.filter_rows(hl.agg.any(family_mt.GT.is_non_ref()))
    family_mt = family_mt.naive_coalesce(50)

    # Keep rows whose gene list is not empty.
    family_mt = family_mt.filter_rows(family_mt.within_gene != hl.empty_array(hl.tstr))

    # Attach the matching CADD row and keep scores above 10.
    family_mt = family_mt.annotate_rows(cadd = cadd[family_mt.row_key])
    family_mt = family_mt.filter_rows(family_mt.cadd.cadd_score > 10)

    output_path = '/net/archive/groups/plggneuromol/imdik-zekanowski-gts/data/joint-gts-only/fam-filtered'+family_id+'.mt'
    family_mt.write(output_path)

2022-01-04 11:28:16 Hail: INFO: wrote matrix table with 106628 rows and 7 columns in 50 partitions to /net/archive/groups/plggneuromol/imdik-zekanowski-gts/data/joint-gts-only/fam-filteredA.mt
    Total size: 744.59 MiB
    * Rows/entries: 744.59 MiB
    * Columns: 166.00 B
    * Globals: 11.00 B
    * Smallest partition: 64 rows (207.89 KiB)
    * Largest partition:  2906 rows (20.44 MiB)
2022-01-04 11:57:04 Hail: INFO: wrote matrix table with 113819 rows and 8 columns in 50 partitions to /net/archive/groups/plggneuromol/imdik-zekanowski-gts/data/joint-gts-only/fam-filteredB.mt
    Total size: 791.62 MiB
    * Rows/entries: 791.62 MiB
    * Columns: 206.00 B
    * Globals: 11.00 B
    * Smallest partition: 88 rows (241.57 KiB)
    * Largest partition:  3231 rows (22.41 MiB)
2022-01-04 12:22:29 Hail: INFO: wrote matrix table with 116218 rows and 9 columns in 50 partitions to /net/archive/groups/plggneuromol/imdik-zekanowski-gts/data/joint-gts-only/fam-filteredC.mt
    Total size: 798.4

### Additional annotation with vep:

In [8]:
# Open the precomputed VEP annotation table for the GRCh38 context.
vep = hl.read_table(
    '/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/data/external-data/vep38/grch38_context_vep_annotated.ht'
)

In [None]:
# Add the matching VEP row to every per-family table and write it under a new name.
family_dir = '/net/archive/groups/plggneuromol/imdik-zekanowski-gts/data/joint-gts-only/'
for family_id in fams:
    family_mt = hl.read_matrix_table(family_dir+'fam-filtered'+family_id+'.mt')
    family_mt = family_mt.annotate_rows(vep = vep[family_mt.row_key])
    family_mt.write(family_dir+'fam-vep'+family_id+'.mt')

2022-01-05 11:42:34 Hail: INFO: wrote matrix table with 101960 rows and 7 columns in 50 partitions to /net/archive/groups/plggneuromol/imdik-zekanowski-gts/data/joint-gts-only/fam-vepE.mt
2022-01-05 12:26:32 Hail: INFO: wrote matrix table with 120694 rows and 11 columns in 50 partitions to /net/archive/groups/plggneuromol/imdik-zekanowski-gts/data/joint-gts-only/fam-vepF.mt
2022-01-05 13:13:40 Hail: INFO: wrote matrix table with 93060 rows and 4 columns in 50 partitions to /net/archive/groups/plggneuromol/imdik-zekanowski-gts/data/joint-gts-only/fam-vepG.mt
2022-01-05 13:58:46 Hail: INFO: wrote matrix table with 105758 rows and 6 columns in 50 partitions to /net/archive/groups/plggneuromol/imdik-zekanowski-gts/data/joint-gts-only/fam-vepH.mt
2022-01-05 14:45:45 Hail: INFO: wrote matrix table with 104716 rows and 6 columns in 50 partitions to /net/archive/groups/plggneuromol/imdik-zekanowski-gts/data/joint-gts-only/fam-vepI.mt
2022-01-05 15:32:22 Hail: INFO: wrote matrix table with 9976

## 5. For each of the families find pathogenic variants 

* first in all of the genes (MAF < 0.001 or absent from gnomAD)
* then in selected genes (MAF < 0.05 or absent from gnomAD)

In [3]:
# Family labels analysed in this section.
fams = ['A', 'B', 'C','D', 'E', 'F', 'G', 'H', 'I', 'J', 'R', 'S', 'T', 'U', 'W', 'X', 'Y']

# Excel sheet names, three per family: all-genes hits, selected-genes hits, sample description.
fams_all_genes = [label + '_all_genes' for label in fams]
fams_selected_genes = [label + '_selected_genes' for label in fams]
fams_description = [label + '_description' for label in fams]

# Accumulators filled by the following cells.
fams1, fams2 = [], []
fams_to_export_all, fams_to_export_selected = [], []
descriptions_to_export = []

In [3]:
# Load the single-column candidate gene list into a DataFrame with one column, 'gene'.
# Note: this rebinds `genes`, which earlier in the notebook named a Hail table.
genes = pd.read_csv(
    '/net/archive/groups/plggneuromol/imdik-zekanowski-gts/data/external-data/burden-and-family/all-genes-burde-and-family.csv',
    names=['gene'],
)

In [4]:
# Unique gene names with the triple non-breaking-space padding removed.
all_genes = list({
    raw_name.replace('\xa0\xa0\xa0', '')
    for raw_name in set(genes.gene)
})

In [6]:
# Per family: keep SNPs that segregate with disease status, then build
#   fams1 - rare variants in any gene (frequency < 0.001 or frequency missing)
#   fams2 - variants in the candidate genes (frequency < 0.05 or frequency missing)
# and record the gene lists of the surviving variants.

# Reset the accumulators here so that re-running this cell does not append
# every family a second time.
fams1 = []
fams2 = []
genes_fams1 = []
genes_fams2 = []

# The candidate-gene literal is the same for every family; build it once.
candidate_genes = hl.literal(all_genes)

for f in fams:

    fam = hl.read_matrix_table('/net/archive/groups/plggneuromol/imdik-zekanowski-gts/data/joint-gts-only/fam-vep'+f+'.mt')

    # One aggregation pass yields both group sizes (raises KeyError if a family lacks a group).
    disease_counts = fam.aggregate_cols(hl.agg.counter(fam.phenotypes.disease))
    ctrl = disease_counts['NO']
    gts = disease_counts['YES']

    # Number of non-reference genotypes per disease status at each site.
    fam = fam.annotate_rows(segregation = hl.agg.group_by(fam.phenotypes.disease, hl.agg.count_where(fam.GT.is_non_ref())))
    # Carried by fewer than 33% of 'NO' samples and by more than 66% of 'YES' samples.
    fam = fam.filter_rows(((fam.segregation.get('NO', 0))/ctrl < 0.33) & ((fam.segregation.get('YES', 0))/gts > 0.66))

    fam = fam.filter_rows(hl.is_snp(fam.alleles[0], fam.alleles[1]))

    # Index 2 of the gnomAD frequency array is exported later in this notebook as
    # the NFE allele frequency - TODO confirm against the gnomAD frequency index.
    fam1 = fam.filter_rows(hl.if_else(hl.is_defined(fam.gnomad_v_3_1.freq.AF[2]), fam.gnomad_v_3_1.freq.AF[2] < 0.001, True))
    genes_fams1.append(fam1.within_gene.collect())
    fams1.append(fam1)

    # Variants overlapping at least one candidate gene, with the looser frequency cut-off.
    fam2 = fam.filter_rows((hl.any(lambda x: candidate_genes.contains(x), fam.within_gene)))
    fam2 = fam2.filter_rows(hl.if_else(hl.is_defined(fam2.gnomad_v_3_1.freq.AF[2]), fam2.gnomad_v_3_1.freq.AF[2] < 0.05, True))
    genes_fams2.append(fam2.within_gene.collect())
    fams2.append(fam2)



In [7]:
# Collapse each family's per-variant gene lists into one list of distinct genes.
for idx, per_variant_genes in enumerate(genes_fams1):
    genes_fams1[idx] = list(set(chain.from_iterable(per_variant_genes)))

In [8]:
# Concatenate the per-family gene lists into one flat list.
# Run once only: a second run would iterate over the gene-name strings themselves.
genes_fams1 = list(chain.from_iterable(genes_fams1))

In [9]:
# Genes occurring more than once in the flat list; every repeat occurrence is recorded.
seen = set()
dupes_1 = []
for gene in genes_fams1:
    if gene in seen:
        dupes_1.append(gene)
    else:
        seen.add(gene)

In [10]:
# Number of distinct genes that occur more than once in the flat all-genes list.
len(set(dupes_1))

393

In [11]:
# optional - export only variants in genes that were seen more than once across the family lists:

shared_genes_1 = hl.literal(dupes_1)
fams1[:] = [
    fam.filter_rows(hl.any(lambda gene: shared_genes_1.contains(gene), fam.within_gene))
    for fam in fams1
]

In [12]:
# Collapse each family's per-variant gene lists (candidate-gene set) into one list of distinct genes.
for idx, per_variant_genes in enumerate(genes_fams2):
    genes_fams2[idx] = list(set(chain.from_iterable(per_variant_genes)))

In [13]:
# Concatenate the per-family gene lists into one flat list.
# Run once only: a second run would iterate over the gene-name strings themselves.
genes_fams2 = list(chain.from_iterable(genes_fams2))

In [14]:
# Number of distinct genes in the flat candidate-gene list.
len(set(genes_fams2))

446

In [15]:
# Genes occurring more than once in the flat candidate-gene list; every repeat occurrence is recorded.
seen = set()
dupes_2 = []
for gene in genes_fams2:
    if gene in seen:
        dupes_2.append(gene)
    else:
        seen.add(gene)

In [16]:
# Number of distinct genes that occur more than once in the flat candidate-gene list.
len(set(dupes_2))

172

In [17]:
# Keep only variants in genes that were seen more than once across the family lists.
shared_genes_2 = hl.literal(dupes_2)
fams2[:] = [
    fam.filter_rows(hl.any(lambda gene: shared_genes_2.contains(gene), fam.within_gene))
    for fam in fams2
]

In [18]:
# Turn every filtered family matrix table into a pandas frame ready for Excel.
# The all-genes tables (fams1) and the selected-genes tables (fams2) go through
# the same steps, so one loop handles both; the sample description table is
# taken from the fams1 pass only.

# Start from empty lists so that re-running this cell does not duplicate sheets.
fams_to_export_all = []
fams_to_export_selected = []
descriptions_to_export = []

for family_mts, export_frames, with_description in (
        (fams1, fams_to_export_all, True),
        (fams2, fams_to_export_selected, False)):
    for fam in family_mts:
        # Drop the bookkeeping fields added by the multiallelic split.
        fam = fam.drop(fam['a_index'], fam['was_split'])
        # Flatten the nested annotations into the row fields that end up as columns.
        fam = fam.transmute_rows(DP_stats = fam.dp_qc,
                                 GQ_stats =fam.gq_qc,
                                 gnomad_v3_nfe_af = fam.gnomad_v_3_1.freq.AF[2],
                                 gnomad_v3_nfe_homozygote_count = fam.gnomad_v_3_1.freq.homozygote_count[2],
                                 cadd = fam.cadd.cadd_score,
                                 within_gene = fam.within_gene,
                                 hpo = fam.hpo,
                                 non_refs_healthy = fam.segregation.get('NO', 0),
                                 non_refs_gts = fam.segregation.get('YES', 0),
                                 most_severe_consequence = fam.vep.vep.most_severe_consequence,
                                 transcript_consequences = fam.vep.vep.transcript_consequences,
                                 intergenic_consequences = fam.vep.vep.intergenic_consequences,
                                 motif_feature_consequences = fam.vep.vep.motif_feature_consequences,
                                 regulatory_feature_consequences = fam.vep.vep.regulatory_feature_consequences)

        # Only the genotype call is exported per sample.
        fam = fam.select_entries(fam.GT)

        frame = fam.make_table().to_pandas()
        # Remove every column whose name matches the '.phased' pattern.
        frame = frame[frame.columns.drop(list(frame.filter(regex='.phased')))]
        export_frames.append(frame)

        if with_description:
            descriptions_to_export.append(fam.cols().to_pandas())

2022-01-12 18:56:21 Hail: WARN: cols(): Resulting column table is sorted by 'col_key'.
    To preserve matrix table column order, first unkey columns with 'key_cols_by()'

In [252]:
# Interleave the tables family by family (all genes, selected genes, description)
# and write each one to its own sheet of a single workbook.
list_of_tables = [fams_to_export_all, fams_to_export_selected, descriptions_to_export]
concatenated_list = list(chain.from_iterable(zip(*list_of_tables)))

sheet_names = [fams_all_genes, fams_selected_genes, fams_description]
concatenated_list_sheets = list(chain.from_iterable(zip(*sheet_names)))

workbook_path = '/net/archive/groups/plggneuromol/imdik-zekanowski-gts/results/burden-and-family/families-11-2021-duplicate-genes.xlsx'
with pd.ExcelWriter(workbook_path) as writer:
    for position, table in enumerate(concatenated_list):
        sheet_title = str(concatenated_list_sheets[position])
        table.to_excel(writer, sheet_name=sheet_title, header=True, index=False)

In [43]:
# Find loci ("contig:position") that occur more than once across the exported
# family tables, separately for the all-genes and the selected-genes tables.
# Each frame also gets a 'locus' column, which the next cell filters on.
loci_all = []
loci_selected = []

for frames, loci in ((fams_to_export_all, loci_all),
                     (fams_to_export_selected, loci_selected)):
    for f in frames:
        f['locus'] = f['locus.contig']+':'+f['locus.position'].astype(str)
        loci.extend(f['locus'])

# NOTE(review): a locus listed twice within a single family table also counts
# as duplicated here - confirm that this is intended.
seen = set()
dupes_loci_all = [x for x in loci_all if x in seen or seen.add(x)]

# Fixed: this list was previously built from loci_all, so the selected-genes
# tables were filtered with the all-genes duplicates.
seen = set()
dupes_loci_selected = [x for x in loci_selected if x in seen or seen.add(x)]

In [48]:
# Keep, in every exported frame, only the rows whose locus is in the matching duplicated-loci list.
fams_to_export_all_filtered = [
    frame[frame.locus.isin(dupes_loci_all)] for frame in fams_to_export_all
]

fams_to_export_selected_filtered = [
    frame[frame.locus.isin(dupes_loci_selected)] for frame in fams_to_export_selected
]

In [49]:
# Interleave the locus-filtered tables family by family (all genes, selected
# genes, description) and write each one to its own sheet of a single workbook.
list_of_tables = [fams_to_export_all_filtered, fams_to_export_selected_filtered, descriptions_to_export]
concatenated_list = list(chain.from_iterable(zip(*list_of_tables)))

sheet_names = [fams_all_genes, fams_selected_genes, fams_description]
concatenated_list_sheets = list(chain.from_iterable(zip(*sheet_names)))

workbook_path = '/net/archive/groups/plggneuromol/imdik-zekanowski-gts/results/burden-and-family/families-11-2021-duplicated-variants.xlsx'
with pd.ExcelWriter(workbook_path) as writer:
    for position, table in enumerate(concatenated_list):
        sheet_title = str(concatenated_list_sheets[position])
        table.to_excel(writer, sheet_name=sheet_title, header=True, index=False)

## 6. For each of the larger families find pathogenic variants according to the schema by KF

In [5]:
# Load the per-sample segregation schema, keyed by the 'sample' column.
# The delimiter is written as a raw string: it is the same two-character
# pattern as before, without relying on Python passing through the invalid
# escape sequence '\;' (which triggers a warning).
schema = hl.import_table(
    '/net/archive/groups/plggneuromol/imdik-zekanowski-gts/data/external-data/burden-and-family/familiespattern.csv',
    impute=True,
    key='sample',
    delimiter=r'\;',
    quote='"',
)

2022-02-02 10:25:24 Hail: WARN: Name collision: field 'sample' already in object dict. 
  This field must be referenced with __getitem__ syntax: obj['sample']
2022-02-02 10:25:24 Hail: INFO: Reading table to impute column types
2022-02-02 10:25:28 Hail: INFO: Finished type imputation
  Loading field 'sample' as type str (imputed)
  Loading field 'family' as type str (imputed)
  Loading field 'mutation_option_1' as type str (imputed)
  Loading field 'mutation_option_2' as type str (imputed)


In [6]:
# Family labels analysed with the segregation schema.
fams = ['A', 'B', 'C','D', 'E', 'F', 'G', 'H', 'I', 'J', 'R', 'S', 'T', 'U', 'W', 'X', 'Y']

# Sheet labels: families H and T are exported as H1 and T1. An earlier variant of
# this cell also listed H2 and T2 (with duplicated H and T tables in the data
# folder) to cover a second schema option; it is not active here.
sheet_labels = [{'H': 'H1', 'T': 'T1'}.get(label, label) for label in fams]

# Excel sheet names, three per family: all-genes hits, selected-genes hits, sample description.
fams_all_genes = [label + '_all_genes' for label in sheet_labels]
fams_selected_genes = [label + '_selected_genes' for label in sheet_labels]
fams_description = [label + '_description' for label in sheet_labels]

# Accumulators filled by the following cells.
fams1, fams2 = [], []
fams_to_export_all, fams_to_export_selected = [], []
descriptions_to_export = []

In [7]:
# Per family: keep SNPs that match the expected carrier pattern (schema column
# mutation_option_1) exactly, then build
#   fams1 - rare variants in any gene (frequency < 0.001 or frequency missing)
#   fams2 - variants in the candidate genes (frequency < 0.05 or frequency missing)
# and record the gene lists of the surviving variants.

# Reset the accumulators here so that re-running this cell does not append
# every family a second time.
fams1 = []
fams2 = []
genes_fams1 = []
genes_fams2 = []

# The candidate-gene literal is the same for every family; build it once.
candidate_genes = hl.literal(all_genes)

for f in fams:

    fam = hl.read_matrix_table('/net/archive/groups/plggneuromol/imdik-zekanowski-gts/data/joint-gts-only/fam-vep'+f+'.mt')
    fam = fam.annotate_cols(schema = schema[fam.s])
    # Number of non-reference genotypes per schema label at each site.
    fam = fam.annotate_rows(segregation = hl.agg.group_by(fam.schema.mutation_option_1, hl.agg.count_where(fam.GT.is_non_ref())))
    # Number of samples labelled 'YES' in this family.
    with_mutation = fam.aggregate_cols((hl.agg.counter(fam.schema.mutation_option_1))['YES'])
    # No 'NO' sample carries the variant and every 'YES' sample does.
    fam = fam.filter_rows((fam.segregation.get('NO', 0) == 0) & (fam.segregation.get('YES', 0) == with_mutation))

    fam = fam.filter_rows(hl.is_snp(fam.alleles[0], fam.alleles[1]))

    # Index 2 of the gnomAD frequency array is exported later in this notebook as
    # the NFE allele frequency - TODO confirm against the gnomAD frequency index.
    fam1 = fam.filter_rows(hl.if_else(hl.is_defined(fam.gnomad_v_3_1.freq.AF[2]), fam.gnomad_v_3_1.freq.AF[2] < 0.001, True))

    genes_fams1.append(fam1.within_gene.collect())
    fams1.append(fam1)

    # Variants overlapping at least one candidate gene, with the looser frequency cut-off.
    fam2 = fam.filter_rows((hl.any(lambda x: candidate_genes.contains(x), fam.within_gene)))
    fam2 = fam2.filter_rows(hl.if_else(hl.is_defined(fam2.gnomad_v_3_1.freq.AF[2]), fam2.gnomad_v_3_1.freq.AF[2] < 0.05, True))
    genes_fams2.append(fam2.within_gene.collect())
    fams2.append(fam2)



In [9]:
#add H2 and T2 option:

#fam = hl.read_matrix_table('/net/archive/groups/plggneuromol/imdik-zekanowski-gts/data/joint-gts-only/fam-vep'+fams[8]+'.mt')
#fam = fam.annotate_cols(schema = schema[fam.s])
#fam = fam.annotate_rows(segregation = hl.agg.group_by(fam.schema.mutation_option_2, hl.agg.count_where(fam.GT.is_non_ref())))
#with_mutation = fam.aggregate_cols((hl.agg.counter(fam.schema.mutation_option_2))['YES'])
#fam = fam.filter_rows((fam.segregation.get('NO', 0) == 0) & (fam.segregation.get('YES', 0) == with_mutation))
    
#fam = fam.filter_rows(hl.is_snp(fam.alleles[0], fam.alleles[1]))
    
#fam1 = fam.filter_rows(hl.if_else(hl.is_defined(fam.gnomad_v_3_1.freq.AF[2]), fam.gnomad_v_3_1.freq.AF[2] < 0.001, True))

#fams1[8] = fam1
#genes_fams1.append(fam1.within_gene.collect())
    
#fam2 = fam.filter_rows((hl.any(lambda x: hl.literal(all_genes).contains(x), fam.within_gene)))
#fam2 = fam2.filter_rows(hl.if_else(hl.is_defined(fam2.gnomad_v_3_1.freq.AF[2]), fam2.gnomad_v_3_1.freq.AF[2] < 0.05, True))
#fams2[8] = fam2
#genes_fams2.append(fam2.within_gene.collect())

#fam = hl.read_matrix_table('/net/archive/groups/plggneuromol/imdik-zekanowski-gts/data/joint-gts-only/fam-vep'+fams[14]+'.mt')
#fam = fam.annotate_cols(schema = schema[fam.s])
#fam = fam.annotate_rows(segregation = hl.agg.group_by(fam.schema.mutation_option_2, hl.agg.count_where(fam.GT.is_non_ref())))
#with_mutation = fam.aggregate_cols((hl.agg.counter(fam.schema.mutation_option_2))['YES'])
#fam = fam.filter_rows((fam.segregation.get('NO', 0) == 0) & (fam.segregation.get('YES', 0) == with_mutation))
    
#fam = fam.filter_rows(hl.is_snp(fam.alleles[0], fam.alleles[1]))
    
#fam1 = fam.filter_rows(hl.if_else(hl.is_defined(fam.gnomad_v_3_1.freq.AF[2]), fam.gnomad_v_3_1.freq.AF[2] < 0.001, True))
#fams1[14] = fam1
#genes_fams1.append(fam1.within_gene.collect())
    
#fam2 = fam.filter_rows((hl.any(lambda x: hl.literal(all_genes).contains(x), fam.within_gene)))
#fam2 = fam2.filter_rows(hl.if_else(hl.is_defined(fam2.gnomad_v_3_1.freq.AF[2]), fam2.gnomad_v_3_1.freq.AF[2] < 0.05, True))
#fams2[14] = fam2
#genes_fams2.append(fam2.within_gene.collect())



In [8]:
# Collapse each family's per-variant gene lists into one list of distinct genes.
for idx, per_variant_genes in enumerate(genes_fams1):
    genes_fams1[idx] = list(set(chain.from_iterable(per_variant_genes)))

In [9]:
# Concatenate the per-family gene lists into one flat list.
# Run once only: a second run would iterate over the gene-name strings themselves.
genes_fams1 = list(chain.from_iterable(genes_fams1))

In [10]:
# Genes occurring more than once in the flat list; every repeat occurrence is recorded.
seen = set()
dupes_1 = []
for gene in genes_fams1:
    if gene in seen:
        dupes_1.append(gene)
    else:
        seen.add(gene)

In [11]:
# Number of distinct genes that occur more than once in the flat all-genes list.
len(set(dupes_1))

113

In [12]:
# export only variants in genes that were seen more than once across the family lists:
shared_genes_1 = hl.literal(dupes_1)
fams1[:] = [
    fam.filter_rows(hl.any(lambda gene: shared_genes_1.contains(gene), fam.within_gene))
    for fam in fams1
]

In [13]:
# Collapse each family's per-variant gene lists (candidate-gene set) into one list of distinct genes.
for idx, per_variant_genes in enumerate(genes_fams2):
    genes_fams2[idx] = list(set(chain.from_iterable(per_variant_genes)))

In [14]:
# Concatenate the per-family gene lists into one flat list.
# Run once only: a second run would iterate over the gene-name strings themselves.
genes_fams2 = list(chain.from_iterable(genes_fams2))

In [15]:
# Number of distinct genes in the flat candidate-gene list.
len(set(genes_fams2))

306

In [16]:
# Genes occurring more than once in the flat candidate-gene list; every repeat occurrence is recorded.
seen = set()
dupes_2 = []
for gene in genes_fams2:
    if gene in seen:
        dupes_2.append(gene)
    else:
        seen.add(gene)

In [17]:
# Number of distinct genes that occur more than once in the flat candidate-gene list.
len(set(dupes_2))

77

In [18]:
# Keep only variants in genes that were seen more than once across the family lists.
shared_genes_2 = hl.literal(dupes_2)
fams2[:] = [
    fam.filter_rows(hl.any(lambda gene: shared_genes_2.contains(gene), fam.within_gene))
    for fam in fams2
]

In [19]:
# Turn every schema-filtered family matrix table into a pandas frame ready for
# Excel. The all-genes tables (fams1) and the selected-genes tables (fams2) go
# through the same steps, so one loop handles both; the sample description
# table is taken from the fams1 pass only.

# Start from empty lists so that re-running this cell does not duplicate sheets.
fams_to_export_all = []
fams_to_export_selected = []
descriptions_to_export = []

for family_mts, export_frames, with_description in (
        (fams1, fams_to_export_all, True),
        (fams2, fams_to_export_selected, False)):
    for fam in family_mts:
        # Drop the bookkeeping fields added by the multiallelic split.
        fam = fam.drop(fam['a_index'], fam['was_split'])
        # Flatten the nested annotations into the row fields that end up as columns.
        fam = fam.transmute_rows(DP_stats = fam.dp_qc,
                                 GQ_stats =fam.gq_qc,
                                 gnomad_v3_nfe_af = fam.gnomad_v_3_1.freq.AF[2],
                                 gnomad_v3_nfe_homozygote_count = fam.gnomad_v_3_1.freq.homozygote_count[2],
                                 cadd = fam.cadd.cadd_score,
                                 within_gene = fam.within_gene,
                                 hpo = fam.hpo,
                                 non_refs_healthy = fam.segregation.get('NO', 0),
                                 non_refs_gts = fam.segregation.get('YES', 0),
                                 most_severe_consequence = fam.vep.vep.most_severe_consequence,
                                 transcript_consequences = fam.vep.vep.transcript_consequences,
                                 intergenic_consequences = fam.vep.vep.intergenic_consequences,
                                 motif_feature_consequences = fam.vep.vep.motif_feature_consequences,
                                 regulatory_feature_consequences = fam.vep.vep.regulatory_feature_consequences)

        # Only the genotype call is exported per sample.
        fam = fam.select_entries(fam.GT)

        frame = fam.make_table().to_pandas()
        # Remove every column whose name matches the '.phased' pattern.
        frame = frame[frame.columns.drop(list(frame.filter(regex='.phased')))]
        export_frames.append(frame)

        if with_description:
            descriptions_to_export.append(fam.cols().to_pandas())

2022-02-02 10:27:37 Hail: WARN: cols(): Resulting column table is sorted by 'col_key'.
    To preserve matrix table column order, first unkey columns with 'key_cols_by()'
2022-02-02 10:27:37 Hail: INFO: Coerced sorted dataset
2022-02-02 10:27:59 Hail: INFO: Coerced sorted dataset           (39 + 11) / 50]
2022-02-02 10:28:04 Hail: INFO: Coerced sorted dataset=====>      (44 + 6) / 50]
2022-02-02 10:28:24 Hail: INFO: Coerced sorted dataset           (38 + 12) / 50]
2022-02-02 10:28:29 Hail: INFO: Coerced sorted dataset====>       (43 + 7) / 50]
2022-02-02 10:28:51 Hail: INFO: Coerced sorted dataset           (37 + 12) / 50]

In [20]:
# Interleave the tables family by family (all genes, selected genes, description)
# and write each one to its own sheet of a single workbook.
list_of_tables = [fams_to_export_all, fams_to_export_selected, descriptions_to_export]
concatenated_list = list(chain.from_iterable(zip(*list_of_tables)))

sheet_names = [fams_all_genes, fams_selected_genes, fams_description]
concatenated_list_sheets = list(chain.from_iterable(zip(*sheet_names)))

workbook_path = '/net/archive/groups/plggneuromol/imdik-zekanowski-gts/results/burden-and-family/families-11-2021-schema-duplicated-genes.xlsx'
with pd.ExcelWriter(workbook_path) as writer:
    for position, table in enumerate(concatenated_list):
        sheet_title = str(concatenated_list_sheets[position])
        table.to_excel(writer, sheet_name=sheet_title, header=True, index=False)

In [21]:
# Find loci ("contig:position") that occur more than once across the exported
# family tables, separately for the all-genes and the selected-genes tables.
# Each frame also gets a 'locus' column, which a later cell filters on.
loci_all = []
loci_selected = []

for frames, loci in ((fams_to_export_all, loci_all),
                     (fams_to_export_selected, loci_selected)):
    for f in frames:
        f['locus'] = f['locus.contig']+':'+f['locus.position'].astype(str)
        loci.extend(f['locus'])

seen = set()
dupes_loci_all = []
for locus in loci_all:
    if locus in seen:
        dupes_loci_all.append(locus)
    else:
        seen.add(locus)

seen = set()
dupes_loci_selected = []
for locus in loci_selected:
    if locus in seen:
        dupes_loci_selected.append(locus)
    else:
        seen.add(locus)

In [22]:
# Loci that occur more than once across the all-genes tables
# (this repeats the computation already done in the previous cell).
seen = set()
dupes_loci_all = []
for locus in loci_all:
    if locus in seen:
        dupes_loci_all.append(locus)
    else:
        seen.add(locus)

In [23]:
# Display the loci that occur more than once across the all-genes tables.
dupes_loci_all

['chr19:53803962', 'chrY:11322439', 'chr4:109086168', 'chr5:76482747']

In [24]:
# Keep, in every exported frame, only the rows whose locus is in the matching duplicated-loci list.
fams_to_export_all_filtered = [
    frame[frame.locus.isin(dupes_loci_all)] for frame in fams_to_export_all
]

fams_to_export_selected_filtered = [
    frame[frame.locus.isin(dupes_loci_selected)] for frame in fams_to_export_selected
]

In [25]:
# Interleave the locus-filtered tables family by family (all genes, selected
# genes, description) and write each one to its own sheet of a single workbook.
list_of_tables = [fams_to_export_all_filtered, fams_to_export_selected_filtered, descriptions_to_export]
concatenated_list = list(chain.from_iterable(zip(*list_of_tables)))

sheet_names = [fams_all_genes, fams_selected_genes, fams_description]
concatenated_list_sheets = list(chain.from_iterable(zip(*sheet_names)))

workbook_path = '/net/archive/groups/plggneuromol/imdik-zekanowski-gts/results/burden-and-family/families-11-2021-schema-duplicated-variants.xlsx'
with pd.ExcelWriter(workbook_path) as writer:
    for position, table in enumerate(concatenated_list):
        sheet_title = str(concatenated_list_sheets[position])
        table.to_excel(writer, sheet_name=sheet_title, header=True, index=False)

## 7. Find variants that are common across multiple families