In [1]:
from IPython.display import display
from IPython.display import HTML
import IPython.core.display as di # Example: di.display_html('<h3>%s:</h3>' % str, raw=True)

#This line will hide code by default when the notebook is exported as HTML
#di.display_html('<script>jQuery(function() {if (jQuery("body.notebook_app").length == 0) { jQuery(".input_area").toggle(); jQuery(".prompt").toggle();}});</script>', raw=True)

# Inject CSS so the notebook container spans the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
import hail as hl
# Start Hail; its temporary files are placed in the directory given as tmp_dir.
hl.init(tmp_dir='/net/scratch/people/plggosborcz')

Running on Apache Spark version 2.4.3
SparkUI available at http://p0615.prometheus:4040
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.62-84fa81b9ea3d
LOGGING: writing to /net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/analysis/hail-20211008-1042-0.2.62-84fa81b9ea3d.log


In [3]:
from hail.plot import show
from pprint import pprint
from bokeh.layouts import gridplot
# Configure Hail's plotting module for inline notebook output.
hl.plot.output_notebook()


import numpy as np
import pandas as pd
from functools import reduce
from itertools import chain

# NOTE: this import rebinds `show`; from here on `show` is bokeh.plotting.show,
# not the hail.plot.show imported at the top of this cell.
from bokeh.plotting import output_notebook, show, figure
from bokeh.palettes import viridis

# bokeh's own inline-output switch, called in addition to hl.plot.output_notebook() above.
output_notebook()

### import vcfs and write to matrix tables (mts)

In [5]:
# IPython shell capture: `files` receives one output line of `ls` per matching *gz path.
files = !ls /net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/data/joint-vcf/vcf-joint-gts/*gz

In [6]:
# One name per joint VCF: the file name (last path component) truncated at the
# first '.v'. The previous version picked path component 9 by position, which
# only equals the file name for the current directory depth and silently yields
# a directory name (or an IndexError) if the data are moved.
names = [path.split('/')[-1].split('.v')[0] for path in files]

In [None]:
# Convert every joint VCF into a Hail MatrixTable named after its entry in `names`.
for idx, vcf_path in enumerate(files):
    joint_vcf = hl.import_vcf(
        vcf_path,
        reference_genome='GRCh38',
        force_bgz=True,
        array_elements_required=False,
    )
    joint_vcf.write('/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/data/hail-mts/' + names[idx] + '.mt')

In [23]:
# Re-open the MatrixTables written from the joint VCFs, in the order of `names`.
mts = [
    hl.read_matrix_table('/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/data/hail-mts/' + name + '.mt')
    for name in names
]

### filter the matrix tables with the RepeatMasker track (removes difficult-to-sequence regions)

In [10]:
# Load the RepeatMasker track as a Hail table of GRCh38 intervals; intervals that
# are invalid for the reference are skipped rather than raising.
rpmk = hl.import_bed('/net/archive/groups/plggneuromol/imdik-zekanowski-gts/data/external-data/repeatmasker_all', #this file is stored in another project
                     reference_genome='GRCh38', skip_invalid_intervals=True)

2021-02-11 09:59:39 Hail: INFO: Reading table with no type imputation
  Loading column 'f0' as type 'str' (user-specified)
  Loading column 'f1' as type 'int32' (user-specified)
  Loading column 'f2' as type 'int32' (user-specified)



In [None]:
# Persist the imported RepeatMasker table in Hail's native format.
rpmk.write('/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/data/hail-mts/repeatmasker_all.ht')

In [20]:
# Continue from the on-disk copy of the RepeatMasker table.
rpmk = hl.read_table('/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/data/hail-mts/repeatmasker_all.ht')

In [12]:
# Remove every variant whose locus lies inside a RepeatMasker interval, store the
# result under the 'rpkm-' prefix and replace the list entry with the on-disk copy.
for idx, mt in enumerate(mts):
    print(mt.count())  # size before filtering
    inside_repeat = hl.is_defined(rpmk[mt.locus])
    mt = mt.filter_rows(inside_repeat, keep=False)
    rpkm_path = '/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/data/hail-mts/rpkm-' + names[idx] + '.mt'
    mt.write(rpkm_path)
    mts[idx] = hl.read_matrix_table(rpkm_path)

### filter with gnomAD coverage (keep loci where `over_1` > 0.9, i.e. more than 90% of gnomAD samples covered above 1x) and annotate with gnomAD

In [None]:
# Load the gnomAD v3.0 genome coverage summary; column types are imputed from the data.
table = hl.import_table('/net/archive/groups/plggneuromol/imdik-zekanowski-gts/data/external-data/gnomad.genomes.r3.0.coverage.summary.tsv.bgz', impute = True)

In [7]:
# Load the repeat-filtered ('rpkm-') MatrixTables, one per entry of `names`.
mts = [
    hl.read_matrix_table('/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/data/hail-mts/rpkm-' + name + '.mt')
    for name in names
]

In [6]:
# Reduce the coverage summary to a locus-keyed table of well-covered sites:
# keep only `locus` and `over_1`, parse the locus string into a GRCh38 locus,
# retain rows with over_1 above 0.9 and key by locus for the join below.
# NOTE: the cell rebinds `table`, so it can only be run once per import.
cov = table.select('locus', 'over_1')
cov = cov.annotate(locus=hl.parse_locus(cov.locus, reference_genome='GRCh38'))
cov = cov.filter(cov.over_1 > 0.9)
table = cov.key_by('locus')

In [8]:
# Keep only variants whose locus is present in the well-covered gnomAD table,
# store the result under the 'cov-' prefix and continue from the on-disk copy.
for idx, mt in enumerate(mts):
    print(mt.count())  # size before filtering
    well_covered = hl.is_defined(table[mt.locus])
    mt = mt.filter_rows(well_covered, keep=True)
    cov_path = '/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/data/hail-mts/cov-' + names[idx] + '.mt'
    mt.write(cov_path)
    mts[idx] = hl.read_matrix_table(cov_path)

In [9]:
# Pre-built gnomAD annotation table (stored in another project directory).
gnomad = hl.read_table('/net/archive/groups/plggneuromol/imdik-zekanowski-gts/data/external-data/gnomad/gnomad_selected_filtered.ht')

In [None]:
# Annotate each coverage-filtered table with gnomAD and store it under 'gnmd-'.
#
# The table is read from the 'cov-' file into `mt` and transformed from there.
# Previously the freshly read table was stored in mts[idx] and then ignored,
# while the transformations ran on the loop variable, i.e. on whatever `mts`
# happened to hold from an earlier cell (not necessarily the 'cov-' stage).
for idx in range(len(mts)):
    cov_path = '/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/data/hail-mts/cov-' + names[idx] + '.mt'
    gnmd_path = '/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/data/hail-mts/gnmd-' + names[idx] + '.mt'

    mt = hl.read_matrix_table(cov_path)
    # Keep only rsid and the AC/AF/AN info values as row fields.
    mt = mt.select_rows(mt.rsid, mt.info.AC, mt.info.AF, mt.info.AN)
    # NOTE(review): split_multi_hts presumably leaves the AC/AF row arrays
    # un-split (one value per original alt allele) - confirm downstream use.
    mt = hl.split_multi_hts(mt)
    mt = mt.annotate_rows(gnomad_v3 = gnomad[mt.row_key])
    mt.write(gnmd_path)
    mts[idx] = hl.read_matrix_table(gnmd_path)

### annotate with genes and hpo

In [21]:
# Build an interval-keyed gene table that also carries the HPO values of each gene symbol.
chrom_col = 'hg38.knownGene.chrom'
symbol_col = 'hg38.kgXref.geneSymbol'

genes = hl.read_table('/net/archive/groups/plggneuromol/imdik-zekanowski-gts/data/external-data/genecode_v32.ht')
# Drop records whose contig name is not a valid GRCh38 contig.
genes = genes.filter(hl.is_valid_contig(genes[chrom_col], reference_genome='GRCh38'))

# HPO file has no header, so its columns are f0, f1, ...; f0 is matched against gene symbols below.
hpo = hl.import_table('/net/archive/groups/plggneuromol/imdik-zekanowski-gts/data/external-data/hpo.tsv', impute=True, no_header=True)
hpo = hpo.key_by('f0')

# Replace chrom/txStart/txEnd by a single locus interval (start excluded).
start = genes['hg38.knownGene.txStart']
stop = genes['hg38.knownGene.txEnd']
gene_span = hl.locus_interval(genes[chrom_col], start, stop,
                              reference_genome='GRCh38', includes_start=False)
genes = genes.transmute(interval=gene_span)

# Key by symbol to collect all matching f1 values, then re-key by interval for locus lookups.
genes = genes.key_by(symbol_col)
genes = genes.annotate(hpo=hpo.index(genes[symbol_col], all_matches=True)['f1'])
genes = genes.key_by('interval')

# Annotate each gnomAD-annotated table with overlapping gene symbols and their
# HPO values, and store it under the 'hpo-' prefix.
#
# The table is read from the 'gnmd-' file into `mt` and annotated from there.
# Previously the freshly read table went into mts[idx] and was ignored, while
# the annotations were applied to the stale loop variable taken from `mts`.
for idx in range(len(mts)):
    gnmd_path = '/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/data/hail-mts/gnmd-' + names[idx] + '.mt'
    hpo_path = '/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/data/hail-mts/hpo-' + names[idx] + '.mt'

    mt = hl.read_matrix_table(gnmd_path)
    # All gene records whose interval contains the variant locus (built once, used twice).
    gene_hits = genes.index(mt.locus, all_matches=True)
    mt = mt.annotate_rows(
        within_gene = hl.array(hl.set(gene_hits['hg38.kgXref.geneSymbol'])),  # de-duplicated symbols
        hpo = hl.array(hl.set(gene_hits['hpo'])),                            # de-duplicated HPO lists
    )
    mt.write(hpo_path)
    mts[idx] = hl.read_matrix_table(hpo_path)

In [18]:
# Load the fully annotated ('hpo-') MatrixTables, one per entry of `names`.
mts = [
    hl.read_matrix_table('/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/data/hail-mts/hpo-' + name + '.mt')
    for name in names
]

In [49]:
# Drop the table at position 9 from the list before the row-wise union.
# NOTE(review): the reason for excluding this table is not recorded here, and the
# cell is not idempotent - re-running it removes a further table. Confirm.
mts = [tbl for pos, tbl in enumerate(mts) if pos != 9]

In [50]:
# Concatenate the rows of all per-file tables into one joint MatrixTable.
mt_full = hl.MatrixTable.union_rows(*mts)

In [54]:
# Persist the joint MatrixTable.
mt_full.write('/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/data/hail-mts/joint.mt')

2021-02-11 19:48:32 Hail: INFO: wrote matrix table with 10751439 rows and 287 columns in 2054 partitions to /net/archive/groups/plggneuromol/sportsmen-wgs/mts/joint.mt


In [8]:
# Continue from the on-disk joint MatrixTable.
mt = hl.read_matrix_table('/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/data/hail-mts/joint.mt')

In [5]:
# (number of rows, number of columns) of the joint table.
mt.count()

(10751439, 287)

In [57]:
# Random subset keeping each row with probability 0.001; no seed is passed here.
mt_subset = mt.sample_rows(0.001)

In [59]:
# Persist the row subset of the joint table.
mt_subset.write('/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/data/hail-mts/joint-subset.mt')

2021-02-11 19:50:15 Hail: INFO: wrote matrix table with 10709 rows and 287 columns in 2054 partitions to /net/archive/groups/plggneuromol/sportsmen-wgs/mts/joint-subset.mt


In [10]:
# Continue from the on-disk row subset.
mt_subset = hl.read_matrix_table('/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/data/hail-mts/joint-subset.mt')

### PCA on all the samples

In [None]:
# HWE-normalised PCA on the genotype calls of the full joint table; `pcs` is a
# table keyed by sample with a `scores` array. The loadings are discarded.
eigenvalues, pcs, _ = hl.hwe_normalized_pca(mt.GT)

# Label samples by cohort: IDs containing "B" are sportsmen, all others are GTS.
# hl.if_else replaces the deprecated hl.cond (the rest of this notebook already
# uses hl.if_else).
mt_subset = mt_subset.annotate_cols(group = hl.if_else(mt_subset.s.contains("B"), "sport", "gts"))

# Attach the per-sample PC scores to the columns of the subset table.
mt_subset = mt_subset.annotate_cols(scores = pcs[mt_subset.s].scores)

# Scatter plot of the first two principal components, coloured by cohort.
p = hl.plot.scatter(mt_subset.scores[0],
                    mt_subset.scores[1],
                    label=mt_subset.group,
                    title='PCA', xlabel='PC1', ylabel='PC2')
show(p)

### Create a separate matrix with GTS

In [None]:
# Sample IDs retained when building the separate GTS matrix table.
to_keep = ['S_7156','WGS_108','WGS_113','WGS_122','WGS_139','WGS_13','WGS_141','WGS_149','WGS_154','WGS_15','WGS_168','WGS_171a','WGS_182a','WGS_183',
           'WGS_196','WGS_200a','WGS_201','WGS_202a','WGS_22','WGS_57a','WGS_5','WGS_6827','WGS_6857','WGS_6880','WGS_6955','WGS_6958','WGS_7039','WGS_7050',
           'WGS_7117','WGS_7168','WGS_7176','WGS_76','WGS_77','WGS_81','WGS_82','WGS_83','WGS_86','WGS_87','WGS_88','WGS_8']

In [None]:
# Number of sample IDs in the keep-list.
len(to_keep)

In [8]:
# Keep only the columns (samples) whose ID is in `to_keep`.
gts = mt.filter_cols(hl.set(to_keep).contains(mt.s), keep=True)

In [9]:
# (number of rows, number of columns) after restricting to the GTS samples.
gts.count()

(10751439, 40)

In [10]:
# Random subset keeping each row with probability 0.001; no seed is passed here.
gts_subset = gts.sample_rows(0.001)

In [11]:
# Persist the row subset of the GTS table.
gts_subset.write('/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/data/hail-mts/gts-subset.mt')

2021-03-18 14:50:06 Hail: INFO: wrote matrix table with 10709 rows and 40 columns in 2054 partitions to /net/archive/groups/plggneuromol/sportsmen-wgs/mts/gts-subset.mt
    Total size: 5.60 MiB
    * Rows/entries: 5.60 MiB
    * Columns: 224.00 B
    * Globals: 11.00 B
    * Smallest partition: 0 rows (20.00 B)
    * Largest partition:  10 rows (8.52 KiB)


In [12]:
# Pairwise kinship via PC-Relate on the subset genotypes: second positional
# argument 0.001 (minimum individual-specific MAF per the Hail signature), one
# principal component (k=1), and only the kinship statistic is computed.
pc_rel = hl.pc_relate(gts_subset.GT, 0.001, k=1, statistics='kin')

2021-03-18 15:14:18 Hail: INFO: hwe_normalized_pca: running PCA using 6705 variants.
2021-03-18 15:14:57 Hail: INFO: pca: running PCA with 1 components...
2021-03-18 15:19:30 Hail: INFO: Wrote all 3 blocks of 10709 x 40 matrix with block size 4096.


In [13]:
# Sample pairs whose estimated kinship exceeds 0.2.
pairs = pc_rel.filter(pc_rel['kin'] > 0.2)

In [14]:
# Treat related pairs as graph edges; with keep=False the result lists the
# samples outside the maximal independent set, i.e. the ones to drop.
related_samples_to_remove = hl.maximal_independent_set(pairs.i, pairs.j,
                                                       keep=False)

2021-03-18 15:19:32 Hail: INFO: wrote matrix with 2 rows and 10709 columns as 3 blocks of size 4096 to /net/scratch/people/plggosborcz/pcrelate-write-read-ZWF7qwu8GTzQluXGpF5brR.bm
2021-03-18 15:19:33 Hail: INFO: wrote matrix with 10709 rows and 40 columns as 3 blocks of size 4096 to /net/scratch/people/plggosborcz/pcrelate-write-read-G9Ly0ykYjQJVumF3Rx2IjR.bm
2021-03-18 15:19:34 Hail: INFO: wrote matrix with 40 rows and 40 columns as 1 block of size 4096 to /net/scratch/people/plggosborcz/pcrelate-write-read-R9YPy0EzWVawPXnFZP77Zl.bm
2021-03-18 15:19:34 Hail: INFO: wrote matrix with 40 rows and 40 columns as 1 block of size 4096 to /net/scratch/people/plggosborcz/pcrelate-write-read-CdQ9oinEgOBY8n76xKzODs.bm
2021-03-18 15:19:34 Hail: INFO: wrote matrix with 40 rows and 40 columns as 1 block of size 4096 to /net/scratch/people/plggosborcz/pcrelate-write-read-9wQMv87M56K9NOnxGjAAyy.bm
2021-03-18 15:19:34 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-03-18 15:19:35 Hail

In [16]:
# Display the samples flagged for removal.
related_samples_to_remove.show()

#### There are no related samples in this GTS cohort

In [65]:
# Size check of the GTS table before writing it out.
gts.count()

10751439

In [20]:
# Persist the GTS table (no samples were removed for relatedness).
gts.write('/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/data/hail-mts/unrelated-gts.mt')

2021-03-18 16:11:30 Hail: INFO: wrote matrix table with 10751439 rows and 40 columns in 2054 partitions to /net/archive/groups/plggneuromol/sportsmen-wgs/mts/unrelated-gts-samples.mt
    Total size: 3.14 GiB
    * Rows/entries: 3.14 GiB
    * Columns: 224.00 B
    * Globals: 11.00 B
    * Smallest partition: 0 rows (20.00 B)
    * Largest partition:  6444 rows (5.47 MiB)
2021-03-18 16:12:36 Hail: INFO: wrote matrix table with 10751439 rows and 40 columns in 2054 partitions to /net/archive/groups/plggneuromol/sportsmen-wgs/mts/unrelated-gts.mt
    Total size: 3.14 GiB
    * Rows/entries: 3.14 GiB
    * Columns: 224.00 B
    * Globals: 11.00 B
    * Smallest partition: 0 rows (20.00 B)
    * Largest partition:  6444 rows (5.47 MiB)


### remove samples that are not controls

In [None]:
# Start again from the full joint MatrixTable.
mt = hl.read_matrix_table('/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/data/hail-mts/joint.mt')

In [15]:
# (number of rows, number of columns) of the joint table.
mt.count()

(10751439, 287)

In [44]:
# Sample IDs of the control individuals to retain.
controls_to_keep = ['S_7212','S_7213','S_7214','S_7227','S_7236','S_7255','S_7237','S_7239','S_7241','S_7245','S_7246',
'S_7247','S_7229','S_7252','S_7254','WGS_147c','S_7261','S_7263','S_7269','S_7274','S_7291',
'S_7294','S_7306','S_7307','WGS_37b','WGS_37c','WGS_85b','WGS_7118','WGS_7120','WGS_7142','WGS_7143',
'WGS_7152','WGS_7153','WGS_163b','WGS_163d','S_7250','WGS_180b','WGS_6819','WGS_185c',
'WGS_6812','WGS_6814','WGS_D6813','WGS_D6815','462','468','475','476','477','478','479','490','492','494']

In [45]:
# Number of control sample IDs.
len(controls_to_keep)

53

In [19]:
# Sample IDs of the sportsmen cohort.
sportsmen=['B102','B156','B24','B338','B382','B399','B427','B430','B431','B432','B433','B435','B436','B437','B438','B439','B441','B442','B443','B444','B445','B446','B448','B449','B450','B451','B452','B453','B454',
'B455','B456','B457','B458','B459','B461','B462','B463','B464','B465','B466','B467','B468','B469','B470','B471','B472','B473','B474','B475','B476','B477','B478','B479','B480','B481','B482','B483','B485','B486','B487','B488',
'B489','B490','B491','B492','B493','B494','B495','B496','B497','B498','B499','B501','B502','B503','B504','B505','B506','B507','B508','B509','B515','B518','B522','B523','B524','B525','B526','B527','B528','B529','B530','B531',
'B532','B533','B534','B535','B536','B537','B538','B539','B81']

In [26]:
# Combined keep-list of controls and sportsmen (rebinds the earlier `to_keep`).
to_keep = controls_to_keep + sportsmen

In [27]:
# Keep only the columns (samples) whose ID is in the combined keep-list.
mt = mt.filter_cols(hl.set(to_keep).contains(mt.s), keep=True)

In [28]:
# (number of rows, number of columns) after the sample filter.
mt.count()

(10751439, 155)

In [29]:
# Persist the table restricted to controls and sportsmen.
mt.write('/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/data/hail-mts/joint-healthy.mt')

2021-03-29 16:14:29 Hail: INFO: wrote matrix table with 10751439 rows and 155 columns in 2054 partitions to /net/archive/groups/plggneuromol/sportsmen-wgs/mts/joint-healthy.mt
    Total size: 9.28 GiB
    * Rows/entries: 9.28 GiB
    * Columns: 691.00 B
    * Globals: 11.00 B
    * Smallest partition: 0 rows (20.00 B)
    * Largest partition:  6444 rows (9.05 MiB)


## extra control of genotype quality

In [15]:
# Continue from the on-disk controls + sportsmen table.
mt = hl.read_matrix_table('/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/data/hail-mts/joint-healthy.mt')

In [29]:
# Build a locus-keyed table holding, for every RepeatMasker interval, the base
# immediately before its start (or the start itself when that is position 1),
# write it out, and remove variants located on those loci.
# NOTE(review): this reads repeatmasker_all.ht from the imdik-zekanowski-gts
# project, not the copy written earlier under imdik-zekanowski-sportwgs - confirm
# both hold the same track.
expanded_path = '/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/data/hail-mts/repeatmasker_expanded.ht'

rpmk = hl.read_table('/net/archive/groups/plggneuromol/imdik-zekanowski-gts/data/external-data/repeatmasker_all.ht')

first_base = rpmk.interval.start
shifted_pos = hl.if_else(first_base.position == 1,
                         first_base.position,
                         first_base.position - 1)
rpmk = rpmk.annotate(larger_int=hl.locus(first_base.contig, shifted_pos, reference_genome='GRCh38'))

# Key by the shifted locus and drop every other field.
rpmk = rpmk.key_by('larger_int').select()

rpmk.write(expanded_path)
rpmk = hl.read_table(expanded_path)

mt = mt.filter_rows(hl.is_defined(rpmk[mt.locus]), keep=False)

In [36]:
# Persist the table after the additional repeat-boundary filter.
mt.write('/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/data/hail-mts/joint-healthy-filtered.mt')

2021-03-30 10:05:28 Hail: INFO: wrote matrix table with 9824949 rows and 155 columns in 2054 partitions to /net/archive/groups/plggneuromol/sportsmen-wgs/mts/joint-healthy-filtered.mt
    Total size: 8.47 GiB
    * Rows/entries: 8.47 GiB
    * Columns: 691.00 B
    * Globals: 11.00 B
    * Smallest partition: 0 rows (20.00 B)
    * Largest partition:  13167 rows (8.86 MiB)


In [66]:
# Continue from the on-disk filtered table.
mt = hl.read_matrix_table('/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/data/hail-mts/joint-healthy-filtered.mt')

In [67]:
# (number of rows, number of columns) of the filtered table.
mt.count()

(9824949, 155)

In [40]:
# Random subset keeping each row with probability 0.001 (no seed passed), written to disk.
mt_subset = mt.sample_rows(0.001)
mt_subset.write('/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/data/hail-mts/joint-subset-healthy.mt')

2021-03-30 10:09:44 Hail: INFO: wrote matrix table with 9843 rows and 155 columns in 2054 partitions to /net/archive/groups/plggneuromol/sportsmen-wgs/mts/joint-subset-healthy.mt
    Total size: 11.55 MiB
    * Rows/entries: 11.55 MiB
    * Columns: 691.00 B
    * Globals: 11.00 B
    * Smallest partition: 0 rows (20.00 B)
    * Largest partition:  15 rows (17.17 KiB)


### At this time we realised there was a mistake in genotyping of some of the samples. These are replaced in the matrix table at this point

In [48]:
# Reload both the row subset and the full filtered table before replacing samples.
mt_subset = hl.read_matrix_table('/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/data/hail-mts/joint-subset-healthy.mt')
mt = hl.read_matrix_table('/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/data/hail-mts/joint-healthy-filtered.mt')

In [8]:
# IDs of the mis-genotyped samples that are dropped from the joint tables.
samples_to_remove = hl.set(["B522", "B506", "B507", "B508", "B509",
                            "B523", "B478", "B449"]) # sample B449 could not be corrected, thus it is permanently removed

In [9]:
# Drop the listed samples from the full table.
mt = mt.filter_cols(
     samples_to_remove.contains(mt.s), keep=False)

In [10]:
# Drop the same samples from the row subset.
mt_subset = mt_subset.filter_cols(
     samples_to_remove.contains(mt_subset.s), keep=False)

In [11]:
# Persist both tables without the removed samples.
mt.write('/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/data/hail-mts/joint-incomplete-lists.mt')
mt_subset.write('/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/data/hail-mts/joint-incomplete-lists-subset.mt')

2021-04-15 13:02:25 Hail: INFO: wrote matrix table with 9824949 rows and 147 columns in 2054 partitions to /net/archive/groups/plggneuromol/sportsmen-wgs/mts/joint-incomplete-lists.mt
    Total size: 8.03 GiB
    * Rows/entries: 8.03 GiB
    * Columns: 659.00 B
    * Globals: 11.00 B
    * Smallest partition: 0 rows (20.00 B)
    * Largest partition:  13167 rows (8.43 MiB)
2021-04-15 13:02:42 Hail: INFO: wrote matrix table with 9843 rows and 147 columns in 2054 partitions to /net/archive/groups/plggneuromol/sportsmen-wgs/mts/joint-incomplete-lists-subset.mt
    Total size: 11.01 MiB
    * Rows/entries: 11.01 MiB
    * Columns: 659.00 B
    * Globals: 11.00 B
    * Smallest partition: 0 rows (20.00 B)
    * Largest partition:  15 rows (16.21 KiB)


In [6]:
# Continue from the on-disk tables that lack the removed samples.
mt_subset = hl.read_matrix_table('/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/data/hail-mts/joint-incomplete-lists-subset.mt')
mt = hl.read_matrix_table('/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/data/hail-mts/joint-incomplete-lists.mt')

In [7]:
# (number of rows, number of columns) after removing the samples.
mt.count()

(9824949, 147)

In [None]:
# Convert the single-sample VCF of every corrected sample into its own MatrixTable.
for sample_id in ('B506', 'B478', 'B507', 'B508', 'B509', 'B522', 'B523'):
    corrected = hl.import_vcf(
        '/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/data/vcf/' + sample_id + '.vcf.gz',
        reference_genome='GRCh38',
        force_bgz=True,
        array_elements_required=False,
    )
    corrected.write('/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/data/hail-mts/' + sample_id + '.mt')


2021-04-21 22:34:37 Hail: INFO: Coerced sorted dataset
2021-04-21 22:47:55 Hail: INFO: wrote matrix table with 3084456838 rows and 1 column in 147 partitions to /net/archive/groups/plggneuromol/sportsmen-wgs/mts/B478.mt
    Total size: 52.06 GiB
    * Rows/entries: 52.06 GiB
    * Columns: 16.00 B
    * Globals: 11.00 B
    * Smallest partition: 19979803 rows (351.18 MiB)
    * Largest partition:  27959649 rows (456.71 MiB)
2021-04-21 23:19:21 Hail: INFO: wrote matrix table with 3084456838 rows and 1 column in 147 partitions to /net/archive/groups/plggneuromol/sportsmen-wgs/mts/B478-filtered.mt
    Total size: 52.06 GiB
    * Rows/entries: 52.06 GiB
    * Columns: 16.00 B
    * Globals: 11.00 B
    * Smallest partition: 19979803 rows (351.18 MiB)
    * Largest partition:  27959649 rows (456.71 MiB)
2021-04-21 23:39:38 Hail: INFO: Coerced sorted dataset
2021-04-21 23:52:16 Hail: INFO: wrote matrix table with 3084402782 rows and 1 column in 141 partitions to /net/archive/groups/plggneuro

In [None]:
# Restrict each corrected single-sample table to the rows (row keys) that are
# present in the joint table `mt`, and store it with a '-filtered' suffix.
for sample_id in ('B506', 'B478', 'B507', 'B508', 'B509', 'B522', 'B523'):
    sample_base = '/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/data/hail-mts/' + sample_id
    sample = hl.read_matrix_table(sample_base + '.mt')
    in_joint = hl.is_defined(mt.index_rows(sample.row_key))
    sample = sample.filter_rows(in_joint, keep=True)
    sample.write(sample_base + '-filtered.mt')

In [9]:
# Load the filtered single-sample tables of the corrected samples, in a fixed order.
mts = [
    hl.read_matrix_table('/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/data/hail-mts/' + sample_id + '-filtered.mt')
    for sample_id in ('B506', 'B478', 'B507', 'B508', 'B509', 'B522', 'B523')
]

In [None]:
# Add the corrected samples back to the joint table one column at a time.
final = mt

for idx, f in enumerate(mts):
    # Outer row join keeps rows present in either table.
    # checkpoint() writes the table and returns the copy read back from disk;
    # assigning that result keeps `final` backed by the file. Previously the
    # return value was discarded, so every iteration re-evaluated all earlier
    # unions from scratch.
    final = final.union_cols(f, row_join_type ='outer').checkpoint(
        '/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/data/hail-mts/to-delete'+str(idx)+'.mt')

In [11]:
# Resume from the checkpoint written after the union with index 3.
final = hl.read_matrix_table('/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/data/hail-mts/to-delete3.mt')

In [12]:
# Resume the column-wise union after the to-delete3.mt checkpoint.
#
# Fixes relative to the previous version of this cell:
#  * the checkpoint path concatenated the builtin `str` type instead of
#    'to-delete' + str(idx), which raises TypeError;
#  * the loop restarted at index 0 although `final` (to-delete3.mt) already
#    contains the samples at indices 0-3, which would add them a second time;
#  * the table returned by checkpoint() was discarded.
# The recorded output (to-delete4/5/6.mt with 152/153/154 columns) matches a
# run over indices 4-6 only.
resume_from = 4

for idx, f in enumerate(mts[resume_from:], start=resume_from):
    final = final.union_cols(f, row_join_type ='outer').checkpoint(
        '/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/data/hail-mts/to-delete'+str(idx)+'.mt')

2021-04-23 14:31:14 Hail: INFO: wrote matrix table with 9824949 rows and 152 columns in 2776 partitions to /net/archive/groups/plggneuromol/sportsmen-wgs/mts/to-delete4.mt
    Total size: 10.37 GiB
    * Rows/entries: 10.37 GiB
    * Columns: 690.00 B
    * Globals: 11.00 B
    * Smallest partition: 0 rows (20.00 B)
    * Largest partition:  13167 rows (10.59 MiB)
2021-04-23 14:34:19 Hail: INFO: wrote matrix table with 9824949 rows and 153 columns in 2918 partitions to /net/archive/groups/plggneuromol/sportsmen-wgs/mts/to-delete5.mt
    Total size: 10.39 GiB
    * Rows/entries: 10.39 GiB
    * Columns: 693.00 B
    * Globals: 11.00 B
    * Smallest partition: 0 rows (20.00 B)
    * Largest partition:  13167 rows (10.59 MiB)
2021-04-23 14:38:53 Hail: INFO: wrote matrix table with 9824949 rows and 154 columns in 3061 partitions to /net/archive/groups/plggneuromol/sportsmen-wgs/mts/to-delete6.mt
    Total size: 10.41 GiB
    * Rows/entries: 10.41 GiB
    * Columns: 697.00 B
    * Globals:

In [14]:
# Persist the joint table with the corrected samples added back.
final.write('/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/data/hail-mts/sportsmen-with-corrected-samples.mt')

2021-04-23 14:44:19 Hail: INFO: wrote matrix table with 9824949 rows and 154 columns in 3061 partitions to /net/archive/groups/plggneuromol/sportsmen-wgs/mts/sportsmen-with-corrected-samples.mt
    Total size: 8.14 GiB
    * Rows/entries: 8.14 GiB
    * Columns: 676.00 B
    * Globals: 11.00 B
    * Smallest partition: 0 rows (20.00 B)
    * Largest partition:  13167 rows (8.55 MiB)
