In [1]:
import hail as hl 
import subprocess
import os

### Part 1: by region

In [2]:
# obtain the paths of the pdo region hail tables 
paths = ! gsutil ls gs://imary116/second_run/output/region
    
# for some reason it is also saving the bucket "region" path so removed it from the list  
paths = paths[1:]
# len(paths) # 25 hts for each of the 25 pdos 

In [3]:
# Load every per-PDO "region" Hail table and stack them into a single table.
tables = list(map(hl.read_table, paths))
region_ht = hl.Table.union(*tables)

# Sanity check on the row count: expected 397441080, i.e.
# 204445 regions (bed file) x 1944 samples (all 25 pdos combined).
region_ht.count() #397441080

Initializing Hail with default parameters...
Running on Apache Spark version 3.1.1
SparkUI available at http://mty-m.c.daly-neale-sczmeta.internal:39417
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.65-367cf1874d85
LOGGING: writing to /home/hail/hail-20210625-1234-0.2.65-367cf1874d85.log
2021-06-25 12:34:28 Hail: WARN: Name collision: field 'sample' already in object dict. 
  This field must be referenced with __getitem__ syntax: obj['sample']


In [15]:
# preview the first 5 rows of the combined region table
region_ht.show(5)

f0,f1,f2,f3,sample
str,int32,int32,float64,str
"""chr1""",69082,70016,213.0,"""6627"""
"""chr1""",182700,182754,183.0,"""6627"""
"""chr1""",183105,183248,433.0,"""6627"""
"""chr1""",183913,184166,726.0,"""6627"""
"""chr1""",184918,185057,23.0,"""6627"""


In [4]:
# Import the TSV holding PDO membership, CRAM path and sample ID; it is used to
# annotate samples with their PDO.
# All three columns are declared as strings instead of relying on type
# imputation: sample IDs such as "0012763881" carry leading zeros and must stay
# strings to match the `sample` field they are joined against. This also avoids
# the extra pass over the file that imputation requires.
annot_table = hl.import_table(
    'gs://imary116/second_run/data/pdo25_crampath_sampleID.tsv',
    delimiter='\t',
    types={'pdo': hl.tstr, 'cram': hl.tstr, 'sample_ID': hl.tstr},
).key_by('sample_ID')

2021-06-25 12:34:45 Hail: INFO: Reading table to impute column types
2021-06-25 12:34:51 Hail: INFO: Finished type imputation
  Loading field 'pdo' as type str (imputed)
  Loading field 'cram' as type str (imputed)
  Loading field 'sample_ID' as type str (imputed)


In [17]:
# preview the first 5 rows of the annotation table (pdo, cram, sample_ID)
annot_table.show(5)

pdo,cram,sample_ID
str,str,str
"""PDO-9490""","""gs://fc-14f056b9-50b2-45ac-99ca-dd0e9f5f081e/dalio_aberdeenwave1_Neale_StClair_schizophrenia_exome/C2003/Exome/0012763881/v2/0012763881.cram""","""0012763881"""
"""PDO-9490""","""gs://fc-14f056b9-50b2-45ac-99ca-dd0e9f5f081e/dalio_aberdeenwave1_Neale_StClair_schizophrenia_exome/C2003/Exome/0012763885/v1/0012763885.cram""","""0012763885"""
"""PDO-9490""","""gs://fc-14f056b9-50b2-45ac-99ca-dd0e9f5f081e/dalio_aberdeenwave1_Neale_StClair_schizophrenia_exome/C2003/Exome/0012763900/v1/0012763900.cram""","""0012763900"""
"""PDO-9490""","""gs://fc-14f056b9-50b2-45ac-99ca-dd0e9f5f081e/dalio_aberdeenwave1_Neale_StClair_schizophrenia_exome/C2003/Exome/0012763914/v1/0012763914.cram""","""0012763914"""
"""PDO-9490""","""gs://fc-14f056b9-50b2-45ac-99ca-dd0e9f5f081e/dalio_aberdeenwave1_Neale_StClair_schizophrenia_exome/C2003/Exome/0012763917/v1/0012763917.cram""","""0012763917"""


In [5]:
# Look up each row's sample ID in the annotation table (keyed by sample_ID)
# and attach the matching PDO as a new `pdo` field on the region table.
# 'sample' is accessed with [] because the attribute name collides with a Table attribute.
sample_annotation = annot_table[region_ht['sample']]
region_ht = region_ht.annotate(pdo=sample_annotation.pdo)

# Sanity check: the set of distinct PDO values should contain all pdos.
region_ht.aggregate(hl.agg.collect_as_set(region_ht.pdo)) # all pdos are there

In [28]:
# sanity check

# Number of records for each pdo - should add up to 397441080.
# The aggregation is run once and its live result is reused below, so the total
# always reflects the current table rather than a pasted copy of earlier output.
pdo_counts = region_ht.aggregate(hl.agg.counter(region_ht.pdo))
print(pdo_counts)

# keep the counts as a plain dictionary keyed by pdo
dic = dict(pdo_counts)
# add up the per-pdo record counts
sum(dic.values()) # 397441080

2021-06-25 11:08:37 Hail: INFO: Coerced sorted dataset
2021-06-25 11:08:37 Hail: INFO: Coerced dataset with out-of-order partitions.
2021-06-25 11:08:37 Hail: INFO: Ordering unsorted dataset with network shuffle


frozendict({'PDO-17976': 20444500, 'PDO-9436': 20444500, 'PDO-6979': 20444500, 'PDO-17756': 20444500, 'PDO-7756': 20444500, 'PDO-529': 14924485, 'PDO-2980': 20444500, 'PDO-10439': 6337795, 'PDO-6706': 20444500, 'PDO-4831': 15537820, 'PDO-6569': 20444500, 'PDO-8917': 20444500, 'PDO-17148': 408890, 'PDO-19811': 20444500, 'PDO-11912': 20444500, 'PDO-17755': 1635560, 'PDO-74': 19217830, 'PDO-14779': 6746685, 'PDO-10544': 20444500, 'PDO-6098': 20444500, 'PDO-1294': 19217830, 'PDO-9490': 20444500, 'PDO-925': 2044450, 'PDO-17294': 4702235, 'PDO-6099': 20444500})


397441080

In [30]:
# preview the first 5 rows, now including the `pdo` annotation
region_ht.show(5)

2021-06-25 11:12:14 Hail: INFO: Coerced sorted dataset
2021-06-25 11:12:14 Hail: INFO: Coerced dataset with out-of-order partitions.
2021-06-25 11:12:15 Hail: INFO: Ordering unsorted dataset with network shuffle


f0,f1,f2,f3,sample,pdo
str,int32,int32,float64,str,str
"""chr1""",69082,70016,579.0,"""0012763881""","""PDO-9490"""
"""chr1""",182700,182754,417.0,"""0012763881""","""PDO-9490"""
"""chr1""",183105,183248,881.0,"""0012763881""","""PDO-9490"""
"""chr1""",183913,184166,1470.0,"""0012763881""","""PDO-9490"""
"""chr1""",184918,185057,52.8,"""0012763881""","""PDO-9490"""


In [31]:
# print the table schema (global fields, row fields and key)
region_ht.describe()

----------------------------------------
Global fields:
    None
----------------------------------------
Row fields:
    'f0': str 
    'f1': int32 
    'f2': int32 
    'f3': float64 
    'sample': str 
    'pdo': str 
----------------------------------------
Key: []
----------------------------------------


In [6]:
# Flag each row by its mean coverage (f3 column): 1 when it is >= 10, 0 otherwise.
has_mean_cov_10 = region_ht.f3 >= 10
region_ht = region_ht.annotate(mean_coverage10=hl.if_else(has_mean_cov_10, 1, 0))

In [33]:
# preview the first 5 rows, now including the `mean_coverage10` flag
region_ht.show(5)

2021-06-25 11:12:55 Hail: INFO: Coerced sorted dataset
2021-06-25 11:12:55 Hail: INFO: Coerced dataset with out-of-order partitions.
2021-06-25 11:12:56 Hail: INFO: Ordering unsorted dataset with network shuffle


f0,f1,f2,f3,sample,pdo,mean_coverage10
str,int32,int32,float64,str,str,int32
"""chr1""",69082,70016,579.0,"""0012763881""","""PDO-9490""",1
"""chr1""",182700,182754,417.0,"""0012763881""","""PDO-9490""",1
"""chr1""",183105,183248,881.0,"""0012763881""","""PDO-9490""",1
"""chr1""",183913,184166,1470.0,"""0012763881""","""PDO-9490""",1
"""chr1""",184918,185057,52.8,"""0012763881""","""PDO-9490""",1


In [34]:
# Per pdo and per region (chr:start:end), summarise how many samples reach a
# mean coverage of >= 10:
#   mean_cov10 - number of samples in the group with mean_coverage10 == 1
#   total      - number of samples (rows) in the group
#   proportion - fraction of rows in the group with mean_coverage10 == 1
rows_by_pdo_region = region_ht.group_by(
    region_ht.pdo,
    region_ht.f0,
    region_ht.f1,
    region_ht.f2,
)
passes_cov10 = region_ht.mean_coverage10 == 1
prop_by_region = rows_by_pdo_region.aggregate(
    mean_cov10=hl.agg.count_where(passes_cov10),
    total=hl.agg.count(),
    proportion=hl.agg.fraction(passes_cov10),
)

In [35]:
# preview 5 rows of the per-pdo, per-region summary (takes a while to run)
prop_by_region.show(5)

2021-06-25 11:13:33 Hail: INFO: Coerced sorted dataset
2021-06-25 11:13:33 Hail: INFO: Coerced dataset with out-of-order partitions.
2021-06-25 11:13:33 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-06-25 11:18:12 Hail: INFO: Ordering unsorted dataset with network shuffle


pdo,f0,f1,f2,mean_cov10,total,proportion
str,str,int32,int32,int64,int64,float64
"""PDO-10439""","""chr1""",69082,70016,31,31,1.0
"""PDO-10439""","""chr1""",182700,182754,31,31,1.0
"""PDO-10439""","""chr1""",183105,183248,31,31,1.0
"""PDO-10439""","""chr1""",183913,184166,31,31,1.0
"""PDO-10439""","""chr1""",184918,185057,31,31,1.0


In [36]:
# number of (pdo, region) groups: 5111125 = 204445 regions x 25 pdos - takes a while to run
prop_by_region.count()

2021-06-25 11:20:23 Hail: INFO: Coerced sorted dataset
2021-06-25 11:20:23 Hail: INFO: Coerced dataset with out-of-order partitions.
2021-06-25 11:20:24 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-06-25 11:24:47 Hail: INFO: Ordering unsorted dataset with network shuffle


5111125

In [37]:
# Same summary as prop_by_region, but grouped by region only (all pdos pooled):
# count of samples with mean_coverage10 == 1, total samples, and their fraction.
rows_by_region = region_ht.group_by(region_ht.f0, region_ht.f1, region_ht.f2)
region_passes_cov10 = region_ht.mean_coverage10 == 1
pbr = rows_by_region.aggregate(
    mean_cov10=hl.agg.count_where(region_passes_cov10),
    total=hl.agg.count(),
    proportion=hl.agg.fraction(region_passes_cov10),
)

# preview 5 rows - takes a while to run
pbr.show(5)

2021-06-25 11:31:04 Hail: INFO: Ordering unsorted dataset with network shuffle


f0,f1,f2,mean_cov10,total,proportion
str,int32,int32,int64,int64,float64
"""chr1""",69082,70016,1942,1944,0.999
"""chr1""",182700,182754,1342,1944,0.69
"""chr1""",183105,183248,1396,1944,0.718
"""chr1""",183913,184166,1610,1944,0.828
"""chr1""",184918,185057,1214,1944,0.624


In [38]:
# number of distinct regions: 204445 - takes a while to run
pbr.count()

2021-06-25 11:43:27 Hail: INFO: Ordering unsorted dataset with network shuffle


204445

In [None]:
# Collapse the per-pdo summary back to one row per region; the result should be
# the same as pbr.
# The counts can simply be added across pdos, but the proportions cannot:
# summing up to 25 per-pdo fractions does not give the pooled fraction. The
# proportion is therefore recomputed from the summed counts. The numerator is
# cast to float64 so the field has the same type as pbr.proportion.
check = prop_by_region.group_by(
    prop_by_region.f0,
    prop_by_region.f1,
    prop_by_region.f2,
).aggregate(
    mean_cov10=hl.agg.sum(prop_by_region.mean_cov10),
    total=hl.agg.sum(prop_by_region.total),
    proportion=hl.float64(hl.agg.sum(prop_by_region.mean_cov10)) / hl.agg.sum(prop_by_region.total),
)
check.show(5)

In [None]:
# number of regions after collapsing over pdos: expected 204445 (same as pbr)
check.count()

### Part 2: by sample

In [7]:
# Add the length of each interval, defined as end minus start (f2 - f1).
interval_length = region_ht['f2'] - region_ht['f1']
region_ht = region_ht.annotate(region_len=interval_length)

In [None]:
# preview the first 5 rows, now including `region_len`
region_ht.show(5)

In [42]:
# Per sample:
#   total         - sum of all interval lengths (region_len)
#   meancov10_sum - sum of interval lengths weighted by the 1/0 flag
#                   (mean_coverage10), i.e. the length of intervals with flag == 1
rows_by_sample = region_ht.group_by('sample')
passing_length = region_ht.mean_coverage10 * region_ht.region_len
interval_sum_by_sample = rows_by_sample.aggregate(
    meancov10_sum=hl.agg.sum(passing_length),
    total=hl.agg.sum(region_ht.region_len),
)

# Proportion of the total interval length that passes the coverage flag.
passing_fraction = interval_sum_by_sample['meancov10_sum'] / interval_sum_by_sample['total']
interval_sum_by_sample = interval_sum_by_sample.annotate(prop=passing_fraction)

In [43]:
# preview the first 5 rows of the per-sample summary
interval_sum_by_sample.show(5)

2021-06-25 11:51:16 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-06-25 11:51:19 Hail: INFO: Ordering unsorted dataset with network shuffle


sample,meancov10_sum,total,prop
str,int64,int64,float32
"""0012763881""",37785427,38328938,0.986
"""0012763885""",37386207,38328938,0.975
"""0012763900""",37550468,38328938,0.98
"""0012763914""",37717353,38328938,0.984
"""0012763917""",37316233,38328938,0.974


In [44]:
# Attach each sample's pdo to the per-sample summary by looking the sample ID up
# in the annotation table (each sample belongs to one pdo).
pdo_of_sample = annot_table[interval_sum_by_sample['sample']].pdo
interval_sum_by_sample = interval_sum_by_sample.annotate(pdo=pdo_of_sample)

In [45]:
# preview the first 5 rows, now including the `pdo` annotation
interval_sum_by_sample.show(5)

2021-06-25 11:51:51 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-06-25 11:51:54 Hail: INFO: Ordering unsorted dataset with network shuffle


sample,meancov10_sum,total,prop,pdo
str,int64,int64,float32,str
"""0012763881""",37785427,38328938,0.986,"""PDO-9490"""
"""0012763885""",37386207,38328938,0.975,"""PDO-9490"""
"""0012763900""",37550468,38328938,0.98,"""PDO-9490"""
"""0012763914""",37717353,38328938,0.984,"""PDO-9490"""
"""0012763917""",37316233,38328938,0.974,"""PDO-9490"""


In [46]:
# Sanity check: the per-sample summary must have exactly one row per distinct
# sample present in the region table.
distinct_samples = region_ht.aggregate(hl.agg.collect_as_set(region_ht['sample']))
interval_sum_by_sample.count() == len(distinct_samples) # True

2021-06-25 11:52:26 Hail: INFO: Ordering unsorted dataset with network shuffle


True

In [37]:
# Save the per-sample summary as a TSV; it is used to draw box plots of the
# samples grouped by pdo.
# Note: `export` writes a tsv, whereas `write` would write a Hail table (ht).
interval_sum_tsv = 'gs://imary116/second_run/interval_sum_by_sample.tsv'
interval_sum_by_sample.export(interval_sum_tsv)

2021-06-21 19:30:51 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-06-21 19:30:55 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-06-21 19:31:34 Hail: INFO: merging 1938 files totalling 89.9K...
2021-06-21 19:31:40 Hail: INFO: while writing:
    gs://imary116/second_run/interval_sum_by_sample.tsv
  merge time: 6.364s


### Part 3: calculate average and total coverage for chrX, chrY, and autosomal regions for each sample

In [24]:
# X chromosome: keep only the rows whose contig (f0) is "chrX".
on_chrx = region_ht.f0 == "chrX"
x_region_ht = region_ht.filter(on_chrx)

# Per sample, the mean and the sum of the coverage column (f3) over all chrX regions.
x_rows_by_sample = x_region_ht.group_by('sample')
x_ave_total_cov = x_rows_by_sample.aggregate(
    mean_cov=hl.agg.mean(x_region_ht.f3),
    total_cov=hl.agg.sum(x_region_ht.f3),
)

In [20]:
# preview the first 5 rows of the chrX-only region table
x_region_ht.show(5)

2021-06-25 13:51:31 Hail: INFO: Coerced sorted dataset
2021-06-25 13:51:31 Hail: INFO: Coerced dataset with out-of-order partitions.
2021-06-25 13:51:31 Hail: INFO: Ordering unsorted dataset with network shuffle


f0,f1,f2,f3,sample,pdo,mean_coverage10,region_len
str,int32,int32,float64,str,str,int32,int32
"""chrX""",284179,284322,169.0,"""0012763881""","""PDO-9490""",1,143
"""chrX""",288724,288877,199.0,"""0012763881""","""PDO-9490""",1,153
"""chrX""",290639,290784,84.3,"""0012763881""","""PDO-9490""",1,145
"""chrX""",291490,291662,276.0,"""0012763881""","""PDO-9490""",1,172
"""chrX""",293026,293226,75.1,"""0012763881""","""PDO-9490""",1,200


In [25]:
# preview the first 5 rows of the per-sample chrX coverage summary
x_ave_total_cov.show(5)

2021-06-25 13:55:57 Hail: INFO: Ordering unsorted dataset with network shuffle


sample,mean_cov,total_cov
str,float64,float64
"""0012763881""",96.9,678000.0
"""0012763885""",46.5,325000.0
"""0012763900""",54.4,381000.0
"""0012763914""",82.6,578000.0
"""0012763917""",46.8,327000.0


In [26]:
# Y chromosome: keep only the rows whose contig (f0) is "chrY".
on_chry = region_ht.f0 == "chrY"
y_region_ht = region_ht.filter(on_chry)

# Per sample, the mean and the sum of the coverage column (f3) over all chrY regions.
y_rows_by_sample = y_region_ht.group_by('sample')
y_ave_total_cov = y_rows_by_sample.aggregate(
    mean_cov=hl.agg.mean(y_region_ht.f3),
    total_cov=hl.agg.sum(y_region_ht.f3),
)

In [27]:
# preview the first 5 rows of the per-sample chrY coverage summary
y_ave_total_cov.show(5)

2021-06-25 13:57:16 Hail: INFO: Ordering unsorted dataset with network shuffle


sample,mean_cov,total_cov
str,float64,float64
"""0012763881""",270.0,128000.0
"""0012763885""",139.0,65900.0
"""0012763900""",143.0,67400.0
"""0012763914""",243.0,115000.0
"""0012763917""",130.0,61400.0


In [28]:
# Autosomes: keep every row whose contig (f0) is neither "chrX" nor "chrY".
# NOTE(review): any other non-autosomal contig present in the data (e.g. a
# mitochondrial contig) would also be kept by this filter - confirm against the bed file.
not_chrx = region_ht.f0 != "chrX"
not_chry = region_ht.f0 != "chrY"
auto_region_ht = region_ht.filter(not_chrx & not_chry)

# Per sample, the mean and the sum of the coverage column (f3) over the remaining regions.
auto_rows_by_sample = auto_region_ht.group_by('sample')
auto_ave_total_cov = auto_rows_by_sample.aggregate(
    mean_cov=hl.agg.mean(auto_region_ht.f3),
    total_cov=hl.agg.sum(auto_region_ht.f3),
)

In [29]:
# preview the first 5 rows of the per-sample autosomal coverage summary
auto_ave_total_cov.show(5)

2021-06-25 13:58:03 Hail: INFO: Ordering unsorted dataset with network shuffle


sample,mean_cov,total_cov
str,float64,float64
"""0012763881""",147.0,28900000.0
"""0012763885""",70.4,13900000.0
"""0012763900""",82.2,16200000.0
"""0012763914""",126.0,24800000.0
"""0012763917""",71.2,14000000.0


In [32]:
# Write each per-sample coverage summary (chrX, chrY, autosomes) out as a TSV.
coverage_exports = [
    (x_ave_total_cov, 'gs://imary116/second_run/x_ave_total_cov.tsv'),
    (y_ave_total_cov, 'gs://imary116/second_run/y_ave_total_cov.tsv'),
    (auto_ave_total_cov, 'gs://imary116/second_run/auto_ave_total_cov.tsv'),
]
for coverage_table, destination in coverage_exports:
    coverage_table.export(destination)

2021-06-25 14:05:21 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-06-25 14:05:29 Hail: INFO: merging 1938 files totalling 60.6K...
2021-06-25 14:05:33 Hail: INFO: while writing:
    gs://imary116/second_run/x_ave_total_cov.tsv
  merge time: 4.215s
2021-06-25 14:05:45 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-06-25 14:05:51 Hail: INFO: merging 1938 files totalling 60.6K...
2021-06-25 14:05:56 Hail: INFO: while writing:
    gs://imary116/second_run/y_ave_total_cov.tsv
  merge time: 4.334s
2021-06-25 14:06:08 Hail: INFO: Ordering unsorted dataset with network shuffle
2021-06-25 14:06:14 Hail: INFO: merging 1938 files totalling 60.6K...
2021-06-25 14:06:19 Hail: INFO: while writing:
    gs://imary116/second_run/auto_ave_total_cov.tsv
  merge time: 4.267s
