In [1]:
import os

# Node-local scratch directory used for Hail checkpoints and JVM temp files.
# Read from the SCRATCH_LOCAL environment variable (presumably exported by the
# job scheduler — confirm). Fail early with a clear message instead of the
# opaque "NoneType + str" TypeError the bare .get() + '/' would raise.
_scratch_local = os.environ.get('SCRATCH_LOCAL')
if not _scratch_local:
    raise RuntimeError(
        "Environment variable SCRATCH_LOCAL is not set; "
        "it must point to the node-local scratch directory."
    )
localfs_path = _scratch_local + '/'

In [2]:
# Point the JVM temp directory at node-local scratch. This is set before Hail
# is imported/initialised so the JVM it launches picks the option up.
java_tmp_option = f'-Djava.io.tmpdir={localfs_path}'
os.environ['_JAVA_OPTIONS'] = java_tmp_option

import hail as hl

# NOTE(review): these memory sizes were chosen without tuning (the original
# author was unsure what they should be) — confirm against the node's RAM.
spark_settings = {
    'spark.driver.memory': '30G',
    'spark.executor.memory': '10G',
}
hail_tmp_dir = localfs_path + 'tmp_hail'

hl.init(
    tmp_dir=hail_tmp_dir,
    spark_conf=spark_settings,
    default_reference='GRCh38',
)

Picked up _JAVA_OPTIONS: -Djava.io.tmpdir=/localfs/4609352/
Picked up _JAVA_OPTIONS: -Djava.io.tmpdir=/localfs/4609352/


23/08/27 18:31:15 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
Running on Apache Spark version 3.3.2
SparkUI available at http://ac0767:4040
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.113-cf32652c5077
LOGGING: writing to /net/pr2/projects/plgrid/plggneuromol/imdik-zekanowski-gts/preprocessing/oligogenic-model/hail-20230827-1831-0.2.113-cf32652c5077.log


In [3]:
from bokeh.io import show, output_notebook
from bokeh.layouts import gridplot
# Configure Bokeh to render plots inline in the notebook.
# NOTE(review): `show` and `gridplot` are not used in the visible cells.
output_notebook()

In [4]:
def run_pca(mtx, mtx_path, suffix):
    """Compute 20 HWE-normalized PCs on common, LD-pruned variants.

    Pipeline: variant QC -> keep variants with ``variant_qc.AF[1] > 0.05``
    -> LD-prune (r2=0.2, 500 kb window) -> PCA with k=20 -> annotate the
    samples of the full variant-QC'ed matrix table with the PC scores.

    Three tables are checkpointed under ``localfs_path``, named
    ``'variant_qced_' + suffix + mtx_path``, ``'for_pca_20_' + suffix + mtx_path``
    and ``'after_pca_20_' + suffix + mtx_path``.

    Parameters
    ----------
    mtx : hl.MatrixTable
        Input with a ``GT`` entry field; columns are looked up by ``mtx.s``.
    mtx_path : str
        Name fragment appended to every checkpoint file name.
    suffix : str
        Name fragment inserted before ``mtx_path`` in every checkpoint file name.

    Returns
    -------
    hl.MatrixTable
        The variant-QC'ed input (all variants) with a ``pruned_scores``
        column field holding each sample's PC scores.
    """
    mtx = hl.variant_qc(mtx)
    # checkpoint() writes the table and returns it re-read from disk, so a
    # separate read_matrix_table() on the same path is unnecessary.
    mtx = mtx.checkpoint(localfs_path + 'variant_qced_' + suffix + mtx_path)

    # AF[1] is the frequency of the second allele (the alternate allele for
    # biallelic sites — assumes multi-allelic sites were split upstream; confirm).
    for_pca = mtx.filter_rows(mtx.variant_qc.AF[1] > 0.05)
    pruned_variant_table = hl.ld_prune(for_pca.GT, r2=0.2, bp_window_size=500000)
    for_pca = for_pca.filter_rows(hl.is_defined(pruned_variant_table[for_pca.row_key]))
    for_pca = for_pca.checkpoint(localfs_path + 'for_pca_20_' + suffix + mtx_path)

    # Only the per-sample scores table is needed; eigenvalues and loadings are discarded.
    _, pcs, _ = hl.hwe_normalized_pca(for_pca.GT, k=20)

    mtx = mtx.annotate_cols(pruned_scores=pcs[mtx.s].scores)
    mtx = mtx.checkpoint(localfs_path + 'after_pca_20_' + suffix + mtx_path)

    return mtx

def run_pca_no_filter(mtx, mtx_path, suffix, p=0.1, seed=None):
    """Compute 20 HWE-normalized PCs on a random subset of variants (no AF/LD filtering).

    A fraction ``p`` of rows is sampled, checkpointed to
    ``localfs_path + 'subset_' + suffix + mtx_path``, and used for PCA (k=20).
    The scores are attached to the samples of the full input table, which is
    checkpointed to ``localfs_path + 'after_pca_no_filters_' + suffix + mtx_path``.

    Parameters
    ----------
    mtx : hl.MatrixTable
        Input with a ``GT`` entry field; columns are looked up by ``mtx.s``.
    mtx_path : str
        Name fragment appended to every checkpoint file name.
    suffix : str
        Name fragment inserted before ``mtx_path`` in every checkpoint file name.
    p : float, optional
        Probability of keeping each row; the default 0.1 matches the
        previously hard-coded value.
    seed : int, optional
        Random seed passed to ``sample_rows``. The default ``None`` keeps the
        previous (unseeded) behaviour; pass an int for a reproducible subset.

    Returns
    -------
    hl.MatrixTable
        The full input with a ``scores_no_filter`` column field.
    """
    for_pca = mtx.sample_rows(p, seed=seed)

    # checkpoint() already returns the table re-read from disk, so no extra
    # read_matrix_table() call is needed.
    for_pca = for_pca.checkpoint(localfs_path + 'subset_' + suffix + mtx_path)

    # Only the per-sample scores table is needed; eigenvalues and loadings are discarded.
    _, pcs, _ = hl.hwe_normalized_pca(for_pca.GT, k=20)

    mtx = mtx.annotate_cols(scores_no_filter=pcs[mtx.s].scores)
    mtx = mtx.checkpoint(localfs_path + 'after_pca_no_filters_' + suffix + mtx_path)

    return mtx

def remove_pca_outliers(mtx, field, last_score, mtx_path, suffix, n_stdev=10):
    """Drop samples whose leading PC scores lie far from the cohort mean.

    Per-PC summary statistics are aggregated over all samples for every
    element of ``mtx[field]`` and stored in the global field ``st``. Each
    sample then gets a ``pc_outliers`` column field: one 0/1 flag per PC for
    the first ``last_score`` PCs, set when the score is more than ``n_stdev``
    standard deviations above or below that PC's mean. Samples with any flag
    set are removed and the result is checkpointed to
    ``localfs_path + 'no_outliers' + mtx_path + suffix + '.mt'``.

    Parameters
    ----------
    mtx : hl.MatrixTable
        Input carrying an array-valued column field named ``field``.
    field : str
        Name of the column field holding the PC scores.
    last_score : int
        Number of leading PCs to test (scores are sliced as ``[0:last_score]``).
    mtx_path : str
        Name fragment used in the checkpoint file name.
    suffix : str
        Name fragment appended after ``mtx_path`` in the checkpoint file name.
    n_stdev : int or float, optional
        Outlier threshold in standard deviations; the default 10 matches the
        previously hard-coded value.

    Returns
    -------
    hl.MatrixTable
        The checkpointed table without outlier samples, with the added
        global ``st`` and column field ``pc_outliers``.

    Notes
    -----
    NOTE(review): a sample whose ``field`` is missing presumably yields a
    missing filter predicate and is therefore dropped as well — confirm.
    """
    # One stats struct (mean, stdev, ...) per PC, computed across all samples.
    pc_stats = mtx.aggregate_cols(
        hl.agg.array_agg(
            lambda pc: hl.agg.stats(pc),
            mtx[field])
    )
    mtx = mtx.annotate_globals(st=pc_stats)

    # The scores are sliced to the first `last_score` PCs before being paired
    # element-wise with the per-PC stats, so only those PCs are tested.
    mtx = mtx.annotate_cols(
        pc_outliers=hl.map(
            lambda s, st: hl.int(
                (s > st['mean'] + (n_stdev * st['stdev']))
                | (s < st['mean'] - (n_stdev * st['stdev']))
            ),
            mtx[field][0:last_score],
            mtx.st
        )
    )

    # Keep only samples with no outlier flag on any tested PC.
    mtx = mtx.filter_cols(hl.sum(mtx.pc_outliers) == 0)

    mtx = mtx.checkpoint(localfs_path + 'no_outliers' + mtx_path + suffix + '.mt')

    return mtx

### Select matrices for SKAT:
1. Sportsmen vs. GTS 40
2. Polish zeros vs. GTS 40

In [5]:
# Load polesunion4.mt from the scratch directory; it is inspected in the next cell.
test = hl.read_matrix_table(localfs_path+'polesunion4.mt')

In [6]:
# Count samples per value of the `sex` column field (None = missing).
test.aggregate_cols(hl.agg.counter(test.sex))

{'F': 450, 'M': 2550, None: 185}

In [7]:
# Input matrix tables: the GTS + sportsmen table on project storage and the
# polesunion4 table on node-local scratch.
gts_and_s_path = '/net/pr2/projects/plgrid/plggneuromol/imdik-zekanowski-gts/data/mts/oligogenic-model/gts_and_s.mt'
polesunion_path = localfs_path + 'polesunion4.mt'

mt_1 = hl.read_matrix_table(gts_and_s_path)
mt_2 = hl.read_matrix_table(polesunion_path)

In [8]:
# Index 0: gts_and_s.mt, index 1: polesunion4.mt (later cells rely on this order).
mts = [mt_1, mt_2]

In [9]:
# Phenotype sheet: comma-separated, double-quoted fields, column types imputed
# by Hail. Keyed by the `ID` column so it can be joined on sample ID.
pheno_csv = '/net/pr2/projects/plgrid/plggneuromol/imdik-zekanowski-gts/data/pheno/GTS-coded-corrected-june-2021.csv'

pheno = hl.import_table(pheno_csv, impute=True, delimiter=',', quote="\"")
pheno = pheno.key_by(pheno.ID)

2023-08-27 08:55:39.387 Hail: INFO: Reading table to impute column types
2023-08-27 08:55:40.307 Hail: INFO: Finished type imputation
  Loading field 'ID' as type str (imputed)
  Loading field 'family' as type str (imputed)
  Loading field 'sex' as type str (imputed)
  Loading field 'kinship' as type str (imputed)
  Loading field 'disease' as type str (imputed)
  Loading field 'phenotype' as type str (imputed)
  Loading field 'add_pheno' as type str (imputed)
  Loading field 'heavy_tics' as type str (imputed)
  Loading field 'heavy_tics_familial' as type str (imputed)
  Loading field 'GTS_ASD_group' as type str (imputed)
  Loading field 'nonCTD' as type str (imputed)


In [10]:
# Label every sample with a cohort `group` derived from substrings of its ID,
# then attach the phenotype row keyed by the sample ID.
# Precedence: 'B' -> local_controls, 'NA'/'HG' -> 1kg_controls,
# 'polish' -> polish_controls, anything else -> GTS.
# NOTE(review): substring matching assumes the ID naming scheme never lets a
# GTS sample ID contain 'B', 'NA' or 'HG' — confirm.
for idx, mt in enumerate(mts):
    sample_id = mt.s
    is_local_control = sample_id.contains('B')
    is_1kg_control = sample_id.contains('NA') | sample_id.contains('HG')
    is_polish_control = sample_id.contains('polish')

    polish_or_gts = hl.if_else(is_polish_control, 'polish_controls', 'GTS')
    not_local_label = hl.if_else(is_1kg_control, '1kg_controls', polish_or_gts)

    mt = mt.annotate_cols(
        group=hl.if_else(is_local_control, 'local_controls', not_local_label)
    )
    mt = mt.annotate_cols(phenotypes=pheno[mt.s])

    mts[idx] = mt

In [12]:
def _controls_vs_family_dot(mt, control_group):
    """Keep samples in `control_group` or with phenotypes.family == '.', then
    drop variants that have no non-reference genotype among the kept samples."""
    keep_sample = (mt.group == control_group) | (mt.phenotypes.family == '.')
    subset = mt.filter_cols(keep_sample)
    return subset.filter_rows(hl.agg.any(subset.GT.is_non_ref()))


# Sportsmen (local controls) vs GTS 40.
# NOTE(review): family == '.' presumably selects the 40 GTS samples — confirm.
# This table is not checkpointed (previously: localfs_path+'s_vs_gts.mt').
s_vs_gts = _controls_vs_family_dot(mts[0], 'local_controls')

# Polish controls (no zeros) vs GTS 40.
pw_vs_gts = _controls_vs_family_dot(mts[1], 'polish_controls')
pw_vs_gts = pw_vs_gts.checkpoint(localfs_path + 'pw_vs_gts_2.mt')


2023-08-27 09:39:16.422 Hail: INFO: wrote matrix table with 3041614 rows and 3040 columns in 489 partitions to /localfs/4579427/pw_vs_gts_2.mt


In [13]:
#s_vs_gts = hl.read_matrix_table(localfs_path+'s_vs_gts.mt')  
# Re-read the checkpointed Polish-controls-vs-GTS table from scratch.
pw_vs_gts = hl.read_matrix_table(localfs_path+'pw_vs_gts_2.mt') 

In [14]:
#s_vs_gts = s_vs_gts.naive_coalesce(500)
# Request at most 500 partitions.
# NOTE(review): the write logs report 489 partitions both before and after
# this step, so it appears to have no effect here — confirm it is still needed.
pw_vs_gts = pw_vs_gts.naive_coalesce(500)

In [15]:
#s_vs_gts = s_vs_gts.checkpoint(localfs_path+'s_vs_gts_rep.mt') 

In [16]:
# Persist the coalesced table to scratch and continue from the on-disk copy.
pw_vs_gts = pw_vs_gts.checkpoint(localfs_path+'pw_vs_gts_rep_2.mt') 

2023-08-27 10:04:47.995 Hail: INFO: wrote matrix table with 3041614 rows and 3040 columns in 489 partitions to /localfs/4579427/pw_vs_gts_rep_2.mt


In [6]:
# Reload the checkpointed table (lets a fresh kernel resume from this point).
pw_vs_gts = hl.read_matrix_table(localfs_path+'pw_vs_gts_rep_2.mt') 

In [8]:
# Count samples per value of the `sex` column field (None = missing).
pw_vs_gts.aggregate_cols(hl.agg.counter(pw_vs_gts.sex))

{'F': 450, 'M': 2550, None: 40}

In [None]:
#for idx, mt in enumerate(mts):
#    mts[idx] = run_pca(mt, mts_paths[idx], 'six_')
    
# Run the 20-component PCA directly on a previously checkpointed table instead
# of calling run_pca. The file name matches run_pca's 'for_pca_20_' checkpoint
# for suffix 'five_' and path 'pw_vs_gts'.
# NOTE(review): no visible cell writes this table — presumably it comes from an
# earlier run_pca execution; confirm it exists in the current scratch directory.
for_pca = hl.read_matrix_table(localfs_path+'for_pca_20_five_pw_vs_gts')
# `pcs` (per-sample scores table) is consumed by the next cell; `eigenvalues`
# is not used in the visible cells.
eigenvalues, pcs, _ = hl.hwe_normalized_pca(for_pca.GT, k=20)

In [11]:
# Attach each sample's PC scores (from the LD-pruned PCA) and persist the result.
sample_scores = pcs[pw_vs_gts.s].scores
after_pca_path = localfs_path + 'after_pca_20_five_pw_vs_gts'

pw_vs_gts = pw_vs_gts.annotate_cols(pruned_scores=sample_scores)
pw_vs_gts = pw_vs_gts.checkpoint(after_pca_path)

2023-08-27 16:59:21.159 Hail: INFO: wrote matrix table with 3041614 rows and 3040 columns in 489 partitions to /localfs/4579427/after_pca_20_five_pw_vs_gts


In [13]:
mts_paths = [
    #'s_vs_gts',
    'pw_vs_gts'
]

# Copy each post-PCA table from node-local scratch to project storage.
# The scratch path is built from `localfs_path` and the loop variable rather
# than a hard-coded job-specific directory (previously '/localfs/4579427/...'),
# so the cell works in any job and for every entry of `mts_paths`.
for mt_path in mts_paths:
    mt = hl.read_matrix_table(localfs_path + 'after_pca_20_five_' + mt_path)
    mt.write(
        '/net/pr2/projects/plgrid/plggneuromol/imdik-zekanowski-gts/data/mts/oligogenic-model/redone_after_pca_20_more_'+mt_path+'.mt',
        overwrite = True
    )

2023-08-27 17:31:04.535 Hail: INFO: wrote matrix table with 3041614 rows and 3040 columns in 489 partitions to /net/pr2/projects/plgrid/plggneuromol/imdik-zekanowski-gts/data/mts/oligogenic-model/redone_after_pca_20_more_pw_vs_gts.mt


In [18]:
# Table names to process ('s_vs_gts' is currently excluded).
mts_paths = ['pw_vs_gts']

# Load the post-PCA tables (carrying `pruned_scores`) from project storage.
redone_prefix = '/net/pr2/projects/plgrid/plggneuromol/imdik-zekanowski-gts/data/mts/oligogenic-model/redone_after_pca_20_more_'

mts = []
for mt_path in mts_paths:
    redone_path = redone_prefix + mt_path + '.mt'
    mt = hl.read_matrix_table(redone_path)
    mts.append(mt)

In [17]:
# Run the unfiltered (random 10% variant subset) PCA on every table;
# checkpoint names use the 'ttt' suffix.
mts_2 = []

for idx, mt in enumerate(mts):
    with_subset_scores = run_pca_no_filter(mt, mts_paths[idx], 'ttt')
    mts_2.append(with_subset_scores)

2023-08-27 17:32:21.685 Hail: INFO: wrote matrix table with 303736 rows and 3040 columns in 489 partitions to /localfs/4579427/subset_tttpw_vs_gts
    Total size: 3.18 GiB
    * Rows/entries: 3.18 GiB
    * Columns: 499.12 KiB
    * Globals: 11.00 B
    * Smallest partition: 0 rows (20.00 B)
    * Largest partition:  1759 rows (18.69 MiB)
2023-08-27 17:32:27.970 Hail: INFO: hwe_normalize: found 300136 variants after filtering out monomorphic sites.
2023-08-27 17:32:34.865 Hail: INFO: pca: running PCA with 20 components.../ 489]
2023-08-27 17:33:44.358 Hail: INFO: wrote table with 0 rows in 0 partitions to /localfs/4579427/tmp_hail/persist_tables7HjGscFri
    Total size: 496.21 KiB
    * Rows: 0.00 B
    * Globals: 496.21 KiB
    * Smallest partition: N/A
    * Largest partition:  N/A
2023-08-27 17:36:36.743 Hail: INFO: wrote matrix table with 3041614 rows and 3040 columns in 489 partitions to /localfs/4579427/after_pca_no_filters_tttpw_vs_gts


In [20]:
# Table names processed by the following cells.
mts_paths = [
    'pw_vs_gts'
]

In [21]:
# Load the tables annotated with `scores_no_filter` from node-local scratch.
mts = []

for mt_path in mts_paths:
    no_filter_path = localfs_path + 'after_pca_no_filters_ttt' + mt_path
    mt = hl.read_matrix_table(no_filter_path)
    mts.append(mt)

In [23]:
# Load the tables annotated with `pruned_scores` from project storage.
redone_prefix = '/net/pr2/projects/plgrid/plggneuromol/imdik-zekanowski-gts/data/mts/oligogenic-model/redone_after_pca_20_more_'

mts_2 = []
for mt_path in mts_paths:
    redone_path = redone_prefix + mt_path + '.mt'
    mt = hl.read_matrix_table(redone_path)
    mts_2.append(mt)

In [24]:
# Combine both score sets in one table: take `pruned_scores` from the matching
# table in mts_2 (joined on sample ID) and add it to the table that already
# carries `scores_no_filter`, then write the result to project storage.
for idx, mt in enumerate(mts):
    pruned_cols = mts_2[idx].cols()
    mt = mt.annotate_cols(pruned_scores=pruned_cols[mt.s].pruned_scores)

    pca_out_path = '/net/pr2/projects/plgrid/plggneuromol/imdik-zekanowski-gts/data/mts/oligogenic-model/' + mts_paths[idx] + '_pca.mt'
    mt.write(pca_out_path, overwrite=True)

2023-08-27 17:54:05.560 Hail: WARN: cols(): Resulting column table is sorted by 'col_key'.
    To preserve matrix table column order, first unkey columns with 'key_cols_by()'
2023-08-27 18:07:53.303 Hail: INFO: wrote matrix table with 3041614 rows and 3040 columns in 489 partitions to /net/pr2/projects/plgrid/plggneuromol/imdik-zekanowski-gts/data/mts/oligogenic-model/pw_vs_gts_pca.mt


In [5]:
# Table names to process.
mts_paths = ['pw_vs_gts']

# Load the tables that carry both PC score sets from project storage.
pca_dir = '/net/pr2/projects/plgrid/plggneuromol/imdik-zekanowski-gts/data/mts/oligogenic-model/'

mts = []
# The loop variable `path` is read again by a later cell, so its name is kept.
for path in mts_paths:
    pca_path = pca_dir + path + '_pca.mt'
    mt = hl.read_matrix_table(pca_path)
    mts.append(mt)

In [6]:
# Count samples per value of `sex` before outlier removal (None = missing).
mts[0].aggregate_cols(hl.agg.counter(mts[0].sex))

{'F': 450, 'M': 2550, None: 40}

In [7]:
# Remove PCA outliers twice per table — once using the LD-pruned scores and
# once using the random-subset scores — testing only the first 2 PCs each time.
# Both score fields are dropped afterwards. `mts` is overwritten in place with
# the pruned-score result; the subset-score result is collected in `mtx_s_2`.
score_fields = ('scores_no_filter', 'pruned_scores')

mtx_s_2 = []

for idx, mt in enumerate(mts):
    table_name = mts_paths[idx]

    mtx = remove_pca_outliers(mt, 'pruned_scores', 2, table_name, '_no_outs_pruned_ttt')
    mtx_2 = remove_pca_outliers(mt, 'scores_no_filter', 2, table_name, '_no_outs_subs_ttt')

    mtx = mtx.drop(*score_fields)
    mtx_2 = mtx_2.drop(*score_fields)

    mts[idx] = mtx
    mtx_s_2.append(mtx_2)

2023-08-27 18:34:59.132 Hail: INFO: wrote matrix table with 3041614 rows and 3038 columns in 489 partitions to /localfs/4609352/no_outlierspw_vs_gts_no_outs_pruned_ttt.mt
2023-08-27 18:37:05.244 Hail: INFO: wrote matrix table with 3041614 rows and 3038 columns in 489 partitions to /localfs/4609352/no_outlierspw_vs_gts_no_outs_subs_ttt.mt


In [8]:
# After outlier removal: print (n_variants, n_samples) and the per-sex sample counts.
for mt in mts:
    print(mt.count()) 
    print(mt.aggregate_cols(hl.agg.counter(mt.sex)))

(3041614, 3038)
{'F': 450, 'M': 2550, None: 38}


In [10]:
# Reload the table as it was before outlier removal, for comparison with mts[0].
# The name is taken from mts_paths[0] explicitly instead of the `path` variable
# leaked from an earlier loop, so this cell does not depend on hidden loop state.
mt_og = hl.read_matrix_table(
    '/net/pr2/projects/plgrid/plggneuromol/imdik-zekanowski-gts/data/mts/oligogenic-model/' + mts_paths[0] + '_pca.mt'
)

In [11]:
# IDs of GTS samples in the table before outlier removal.
og_samples = mt_og.filter_cols(mt_og.group == 'GTS').s.collect()

In [12]:
# IDs of GTS samples that remain after outlier removal (pruned-score variant).
to_keep = mts[0].filter_cols(mts[0].group == 'GTS').s.collect()

In [13]:
len(og_samples) # number of GTS samples before outlier removal

40

In [14]:
# Number of GTS samples remaining after outlier removal.
len(to_keep)

38

In [15]:
import numpy as np

In [16]:
np.setdiff1d(og_samples,to_keep) # GTS samples removed as PCA outliers

array(['WGS_5', 'WGS_8'], dtype='<U8')

In [17]:
# Per-sex sample counts after outlier removal (None = missing).
mts[0].aggregate_cols(hl.agg.counter(mts[0].sex))

{'F': 450, 'M': 2550, None: 38}