In [1]:
import os
# Node-local scratch directory, taken from $SCRATCH_LOCAL. Fail early with a
# clear message when the variable is unset or empty, rather than hitting a
# TypeError from `None + '/'` or silently writing checkpoints under '/'.
if not os.environ.get('SCRATCH_LOCAL'):
    raise RuntimeError(
        "Environment variable SCRATCH_LOCAL is not set; "
        "it must point to the node-local scratch directory."
    )
localfs_path = os.environ['SCRATCH_LOCAL'] + '/'

In [None]:
# Point the JVM's temporary directory at node-local scratch. The variable is
# set before hl.init() so that the JVM started for Hail picks it up.
os.environ['_JAVA_OPTIONS'] = '-Djava.io.tmpdir=' + localfs_path

import hail as hl

hl.init(
    default_reference='GRCh38',
    tmp_dir=f'{localfs_path}tmp_hail',
    # TODO(review): these memory sizes were picked without tuning; confirm
    # they suit the node this notebook runs on.
    spark_conf={
        'spark.driver.memory': '30G',
        'spark.executor.memory': '10G',
    },
)

Picked up _JAVA_OPTIONS: -Djava.io.tmpdir=/localfs/3948668/


In [None]:
from bokeh.io import show, output_notebook
from bokeh.layouts import gridplot
# Configure Bokeh to render plots inline in the notebook's output cells.
output_notebook()

### In this notebook I am using the MatrixTable with outliers already preliminarily removed (in 4-1)

In [None]:
def run_pca_no_filter(mtx, mtx_path, suffix, seed=None):
    """Run an HWE-normalized PCA on a 10% row sample and attach the scores.

    A random 10% of the rows of ``mtx`` is checkpointed under ``localfs_path``
    and used to compute 20 principal components with
    ``hl.hwe_normalized_pca``. The per-sample scores are then annotated onto
    the full (unsampled) matrix table as the column field
    ``scores_no_filter``, and the annotated table is checkpointed as well.

    Parameters
    ----------
    mtx : hl.MatrixTable
        Input matrix table; its ``GT`` entry field and ``s`` column field
        are used.
    mtx_path : str
        Trailing part of both checkpoint file names (e.g. ``'1.mt'``).
    suffix : str
        Label placed directly before ``mtx_path`` in both checkpoint names.
    seed : int, optional
        Seed forwarded to ``sample_rows``. The default ``None`` keeps the
        previous unseeded behaviour; pass an integer for a reproducible
        row sample.

    Returns
    -------
    hl.MatrixTable
        ``mtx`` with the extra ``scores_no_filter`` column field, read back
        from its checkpoint.
    """
    # checkpoint() writes the table and returns it read back from disk, so a
    # separate read_matrix_table() on the same path is unnecessary.
    for_pca = mtx.sample_rows(0.1, seed=seed).checkpoint(
        localfs_path + 'subset_' + suffix + mtx_path
    )

    # Only the scores table is needed; eigenvalues and loadings are discarded.
    _, pcs, _ = hl.hwe_normalized_pca(for_pca.GT, k=20)

    mtx = mtx.annotate_cols(scores_no_filter=pcs[mtx.s].scores)
    return mtx.checkpoint(
        localfs_path + 'after_pca_no_filters_' + suffix + mtx_path
    )

def remove_pca_outliers(mtx, field, last_score, mtx_path, suffix, n_stdev=10):
    """Drop samples that are outliers on any of the first ``last_score`` PCs.

    For every element of the array column field ``field``, summary statistics
    across samples are computed and stored in the global field ``st``. A
    sample is flagged on a component when its score lies more than
    ``n_stdev`` standard deviations above or below that component's mean.
    The 0/1 flags are stored in the column field ``pc_outliers``; samples
    with at least one flag are removed and the result is checkpointed.

    Parameters
    ----------
    mtx : hl.MatrixTable
        Matrix table carrying the PCA scores.
    field : str
        Name of the column field holding the array of PC scores.
    last_score : int
        Number of leading components to test (slice ``[0:last_score]``).
    mtx_path : str
        Part of the checkpoint file name.
    suffix : str
        Part of the checkpoint file name, placed after ``mtx_path``.
    n_stdev : int or float, optional
        Outlier threshold in standard deviations. Defaults to 10, the value
        that was previously hard-coded.

    Returns
    -------
    hl.MatrixTable
        The filtered matrix table, read back from its checkpoint.
    """
    # Per-component statistics over all samples, kept as a global field.
    mtx = mtx.annotate_globals(
        st=mtx.aggregate_cols(
            hl.agg.array_agg(lambda pc: hl.agg.stats(pc), mtx[field])
        )
    )

    def _is_outlier(score, stats):
        # 1 when the score is outside mean +/- n_stdev * stdev, else 0.
        spread = n_stdev * stats['stdev']
        return hl.int(
            (score > stats['mean'] + spread) | (score < stats['mean'] - spread)
        )

    # Scores are sliced to the first `last_score` components and paired
    # positionally with the per-component statistics.
    mtx = mtx.annotate_cols(
        pc_outliers=hl.map(_is_outlier, mtx[field][0:last_score], mtx.st)
    )

    # Keep only samples without a single flagged component.
    mtx = mtx.filter_cols(hl.sum(mtx.pc_outliers) == 0)

    # NOTE(review): mtx_path precedes suffix here, so an mtx_path such as
    # '1.mt' yields a name like 'no_outliers1.mt<suffix>.mt'. Left unchanged
    # because other code may read this exact path — confirm before renaming.
    return mtx.checkpoint(localfs_path + 'no_outliers' + mtx_path + suffix + '.mt')

### Read in the Polish-without-zeros subset, with the controls randomly allocated into groups (to have ~100 control samples per group)

In [None]:
# Load the pw_vs_gts MatrixTable. The path is absolute and hard-coded, so it
# must be adjusted when the notebook is run on a different filesystem.
pw_vs_gts = hl.read_matrix_table('/net/pr2/projects/plgrid/plggneuromol/imdik-zekanowski-gts/data/mts/oligogenic-model/pw_vs_gts.mt')

In [None]:
# Low-quality samples that must not enter the analysis.
samples_to_exclude = ['S_7288', 'S_7289', 'S_7290', 'WGS_6827', 'WGS_6835']

# Keep every column whose sample ID is not on the exclusion list.
pw_vs_gts = pw_vs_gts.filter_cols(
    ~hl.literal(samples_to_exclude).contains(pw_vs_gts.s)
)

In [None]:
# Display the (rows, columns) counts of the matrix table as a sanity check.
pw_vs_gts.count()

In [None]:
# Divide the controls into 9 roughly equal groups.
# Samples whose `group` is 'GTS' get subset_no 0; every other sample gets a
# random integer drawn with hl.rand_int32(1, 10), whose upper bound is
# exclusive, so the values are 1..9. A fixed seed (0) is passed to the draw.
pw_vs_gts = pw_vs_gts.annotate_cols(
    subset_no = hl.if_else(
        pw_vs_gts.group == 'GTS',
        0,
        hl.rand_int32(1, 10, seed=0)
    )
)

In [9]:
# Count the samples per subset_no value to check the group sizes.
pw_vs_gts.aggregate_cols(hl.agg.counter(pw_vs_gts.subset_no))

{0: 37, 1: 99, 2: 123, 3: 118, 4: 104, 5: 101, 6: 109, 7: 123, 8: 107, 9: 116}

In [10]:
# One matrix table per control group: the subset-0 samples together with the
# samples of control group i, for i = 1..9.
mts = [
    pw_vs_gts.filter_cols(
        (pw_vs_gts.subset_no == 0) | (pw_vs_gts.subset_no == i)
    )
    for i in range(1, 10)
]

In [11]:
# Checkpoint file-name tails, one per control group: '1.mt' ... '9.mt'.
mts_paths = [f'{i}.mt' for i in range(1, 10)]

In [None]:
# Run the unfiltered PCA on each subset, replacing every list entry in place
# with its annotated, checkpointed matrix table.
for idx, (mt, mt_path) in enumerate(zip(mts, mts_paths)):
    mts[idx] = run_pca_no_filter(mt, mt_path, 'gts_vs_pol_subseted')

2023-07-03 09:02:35.055 Hail: INFO: wrote matrix table with 302129 rows and 136 columns in 878 partitions to /localfs/3878085/subset_gts_vs_pol_subseted1.mt
    Total size: 2.25 GiB
    * Rows/entries: 2.25 GiB
    * Columns: 65.82 KiB
    * Globals: 743.00 B
    * Smallest partition: 0 rows (20.00 B)
    * Largest partition:  1156 rows (8.98 MiB)
2023-07-03 09:03:17.552 Hail: INFO: hwe_normalize: found 278176 variants after filtering out monomorphic sites.
2023-07-03 09:03:22.008 Hail: INFO: pca: running PCA with 20 components.../ 878]
2023-07-03 09:04:28.503 Hail: INFO: wrote table with 0 rows in 0 partitions to /localfs/3878085/tmp_hail/persist_tableFwrWZQ98VA
    Total size: 22.41 KiB
    * Rows: 0.00 B
    * Globals: 22.41 KiB
    * Smallest partition: N/A
    * Largest partition:  N/A
2023-07-03 09:05:59.981 Hail: INFO: wrote matrix table with 3031891 rows and 136 columns in 878 partitions to /localfs/3878085/after_pca_no_filters_gts_vs_pol_subseted1.mt
    Total size: 22.24 GiB
