In [2]:
import os
localfs_path = os.environ.get('SCRATCH_LOCAL') + '/'

os.environ['_JAVA_OPTIONS'] = f'-Djava.io.tmpdir={localfs_path}'

import hail as hl

hl.init(
    tmp_dir=(localfs_path+'tmp_hail'),
    spark_conf={'spark.driver.memory': '30G', 'spark.executor.memory': '10G'}, # I don't know what should be here
    default_reference='GRCh38'
) 

In [3]:
from bokeh.io import show, output_notebook, export_png

from bokeh.layouts import gridplot
output_notebook()
import numpy as np

import bokeh.palettes
from bokeh.models import CategoricalColorMapper

In [4]:
gwas = hl.import_table('/net/pr2/projects/plgrid/plggneuromol/imdik-zekanowski-gts/data/external-data/TS_Oct2018', delimiter=' ')
gwas = gwas.annotate(locus = hl.locus(gwas.CHR, hl.int32(gwas.BP), reference_genome='GRCh37'))

rg37 = hl.get_reference('GRCh37')  
rg38 = hl.get_reference('GRCh38')  

2023-09-12 12:46:16.751 Hail: INFO: Reading table without type imputation
  Loading field 'SNP' as type str (not specified)
  Loading field 'CHR' as type str (not specified)
  Loading field 'BP' as type str (not specified)
  Loading field 'A1' as type str (not specified)
  Loading field 'A2' as type str (not specified)
  Loading field 'INFO' as type str (not specified)
  Loading field 'OR' as type str (not specified)
  Loading field 'SE' as type str (not specified)
  Loading field 'P' as type str (not specified)


In [5]:
rg37.add_liftover(
    '/net/pr2/projects/plgrid/plggneuromol/imdik-zekanowski-gts/data/external-data/references_grch37_to_grch38.over.chain.gz',
    rg38
)

In [6]:
gwas = gwas.annotate(hg39locus = hl.liftover(gwas.locus, 'GRCh38'))

In [7]:
gtf = hl.import_table(
    '/net/pr2/projects/plgrid/plggneuromol/resources/genecode_43/gencode.v43.basic.annotation.gtf',
    delimiter = "\t",
    no_header = True,
    comment = "#"
)

# filter the gtf to only contain protein-coding genes with any GO term
gtf = gtf.filter((gtf.f2 == 'gene') & (gtf.f8.contains('protein_coding')))

gtf = gtf.select(
    gene_position = hl.locus_interval(
        gtf.f0,
        hl.int(gtf.f3),
        hl.int(gtf.f4),
        reference_genome='GRCh38'
    ),
    gene_symbol = gtf.f8.split("\"")[5]
)  

go_genes = hl.import_table(
    '/net/pr2/projects/plgrid/plggneuromol/resources/human-genes-with-GO-and-symbols'
)

go_genes = go_genes.key_by(go_genes['UniProtKB Gene Name symbol'])

gtf = gtf.key_by(gtf['gene_symbol'])
gtf = gtf.filter(
    hl.is_defined(
        go_genes[gtf.gene_symbol]
    )
)

# overall we are keeping 18303 genes. At this poin the within gene intervals are extended by 20kb each side
start = hl.if_else(
    gtf.gene_position.start.position <= 20000, 1, gtf.gene_position.start.position - 20000
)

contig_len = hl.contig_length(gtf.gene_position.start.contig, reference_genome='GRCh38') 

stop = hl.if_else(
    (contig_len - gtf.gene_position.end.position) <= 20000,
    contig_len,
    gtf.gene_position.end.position + 20000
)

gtf = gtf.annotate(
    interval_20kb = hl.locus_interval(
        gtf.gene_position.start.contig,
        start,
        stop,
        reference_genome='GRCh38'
    )
)

gtf = gtf.key_by(gtf.interval_20kb)

2023-09-12 12:46:18.209 Hail: INFO: Reading table without type imputation
  Loading field 'f0' as type str (not specified)
  Loading field 'f1' as type str (not specified)
  Loading field 'f2' as type str (not specified)
  Loading field 'f3' as type str (not specified)
  Loading field 'f4' as type str (not specified)
  Loading field 'f5' as type str (not specified)
  Loading field 'f6' as type str (not specified)
  Loading field 'f7' as type str (not specified)
  Loading field 'f8' as type str (not specified)
2023-09-12 12:46:19.058 Hail: INFO: Reading table without type imputation
  Loading field 'Gene stable ID' as type str (not specified)
  Loading field 'UniProtKB Gene Name symbol' as type str (not specified)


In [8]:
gwas = gwas.annotate(
    nearest_genes_20kb = hl.array(
        hl.set(
            gtf.index(gwas.hg39locus, all_matches=True)['gene_symbol']
        )
    )
)

In [9]:
gwas.checkpoint(localfs_path+'gwas-ts.ht')

2023-09-12 12:47:10.440 Hail: INFO: Ordering unsorted dataset with network shuffle
2023-09-12 12:47:12.851 Hail: INFO: Ordering unsorted dataset with network shuffle
2023-09-12 12:47:13.241 Hail: INFO: Ordering unsorted dataset with network shuffle
2023-09-12 12:47:16.090 Hail: INFO: Ordering unsorted dataset with network shuffle
2023-09-12 12:48:10.075 Hail: INFO: wrote table with 8265318 rows in 3 partitions to /localfs/4736297/gwas-ts.ht
    Total size: 385.89 MiB
    * Rows: 385.89 MiB
    * Globals: 11.00 B
    * Smallest partition: 2293984 rows (106.97 MiB)
    * Largest partition:  3489970 rows (163.69 MiB)


<hail.table.Table at 0x14aa88e9e3b0>

In [10]:
gwas = hl.read_table(localfs_path+'gwas-ts.ht')

#### divide per gene - and calculate which genes are above background (check if any of these genes are from this gwas)

In [11]:
all_genes = gwas.nearest_genes_20kb.collect()
all_genes = [item for sublist in all_genes for item in sublist]
all_genes = list(set(all_genes))

gts_genes = ['DCC', 'RBFOX', 'SLC30A9', 'DCAF4L1', 'SORCS3', 'KCNQ5', 'KCNQ-IT1', 'APOPT1', 'C14orf2', 'NAA11', 'NEGR1',
        'CHADL', 'SOX5', 'PANK2', 'COL27A1', 'PDGFB', 'CELSR3', 'OPA1', 'FBN2', 'WWC1', 'NIPBL', 
             'FN1', 'FBN2', 'SLITRK1', 'SLITRK2', 'SLITRK3', 'SLITRK4', 'SLITRK5', 'SLITRK6', 
             'HDC', 'OPRK1', 'PCDH10', 'NTSR2', 'CHD8', 'SCUBE1', 'PNKD', 'CNTNAP2', 'MOG', 
             'DRD2', 'DRD3', 'DRD4', 'DRD5', 'DAT1', 'DBH', 'HTR2A', 'TPH2', 'EAAT1', 'SAPAP3',
            'CTNNA3', 'NLGN4', 'FSCB', 'IMMP2L', 'NRXN1', 'AADAC', 'DBH', 'MAOA', 'HTR1A', 'HTR2C', 'SLC6A4',
             'TPH2', 'COL27A1', '5-HTTLPR', 'EAAT1', 'COL8A1', 'KCNE1', 'KCNE2',
         'RICTOR', 'WWC1', 'CELSR3, NIPBL', 'FN1', 'PNKD', 'CDH26', 'CADM2', 'OPCML', 'CDH9',
         'NCAM2', 'CD47', 'CDH5', 'CADM4', 'C1QBP', 'CTTN', 'LSAMP',
         'PKP4', 'PCDH1', 'CNTNAP2', 'MBP', 'GABBR2', 'GABBR2', 'GRIK4', 'NCR1', 'FLT3', 'IL12A', 'HDAC9',
         'CD180', 'CDH26', 'NCAM2', 'NTM', 'ROBO2', 'NR2F1', 'ASH1L', 'FLT3']

all_genes = [g for g in all_genes if g not in gts_genes]



In [12]:
np.random.seed(1)
top_genes = ['CHADL', 'DRD2', 'MAOA', 'PCDH10', 'HTR2A', 'SLITRK5', 'SORCS3', 'KCNQ5', 'CDH9', 'CHD8']
colors = bokeh.palettes.plasma(200)

In [13]:
# figure out a way to normalize per gene

In [14]:
plots = []
from bokeh.plotting import figure, show, output_notebook
output_notebook()

for gene in top_genes:
    
    gwas_top = gwas.filter(hl.any(gwas.nearest_genes_20kb.contains(gene)))
    p_top = gwas_top.P.collect()
    p_hist = gwas_top.aggregate(hl.expr.aggregators.hist(hl.float64(gwas_top.P), 0, 1, 100))
    
    p_values = []
    
    for p in range(100):
        randoms = np.random.randint(len(all_genes), size=1)
        genes = [j for i, j in enumerate(all_genes) if i in randoms]

        p_value = gwas.filter(hl.any(lambda x: hl.literal(genes).contains(x), gwas.nearest_genes_20kb)).P.collect()

        p_values.append(p_value)

    y = [float(i) for i in p_top]
    y = -np.log10(y)
    
    hist_top, edge_top = np.histogram(y, bins = 100)
    
    histograms = []
    edges = []

    for p in p_values:
        x = [float(i) for i in p]
        x = -np.log10(x)
        hist,edge = np.histogram(x, bins = 50)
        histograms.append(hist)
        edges.append(edge)

    p1 = figure(plot_width=800, plot_height=800, title = gene)

    for x in range(100):
        p1.quad(
            top=np.log10(histograms[x]),
            bottom=0,
            left=edges[x][:-1],
            right=edges[x][1:],
            fill_color=colors[100],
            line_color=colors[50],
            alpha=0.05
        )

    p1.quad(top=np.log10(hist_top),bottom=0, left=edge_top[:-1], right=edge_top[1:],
            fill_color=colors[0], line_color=colors[0], alpha=1)

    p1.xaxis.axis_label = '-log10(p_value)'
    p1.yaxis.axis_label = 'log10(number of SNPs)'

    p1.xaxis.axis_label_text_font_size = "15pt"
    p1.xaxis.major_label_text_font_size = "15pt"
    p1.yaxis.axis_label_text_font_size = "15pt"
    p1.yaxis.major_label_text_font_size = "15pt"
    
    show(p1)
    
    plots.append(p1)

  top=np.log10(histograms[x]),


ERROR:root:KeyboardInterrupt while sending command.                 (2 + 1) / 3]
Traceback (most recent call last):
  File "/net/pr2/projects/plgrid/plggneuromol/imdik-zekanowski-gts/venv-hail-0.2.113/lib/python3.10/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/net/pr2/projects/plgrid/plggneuromol/imdik-zekanowski-gts/venv-hail-0.2.113/lib/python3.10/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/net/software/testing/software/Python/3.10.4-GCCcore-11.3.0/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

In [None]:
# maybe try qq plots + calculate tons of random lambda GCs (any normalisation? confidence intervals?)

In [22]:
from bokeh.models.mappers import ColorMapper 

In [23]:
p = hl.plot.qq(
    hl.float(gwas.P),
    colors = ColorMapper(['#4A7A89'])
)

2023-09-12 13:04:27.136 Hail: INFO: Ordering unsorted dataset with network shuffle
  tmp_data = {c: v.values for c, v in _df.iteritems()}
2023-09-12 13:04:57.290 Hail: INFO: Ordering unsorted dataset with network shuffle

In [24]:
p.xaxis.axis_label_text_font_size = "20pt"
p.yaxis.axis_label_text_font_size = "20pt"

In [25]:
show(p)

In [26]:
top_genes = ['CHADL','DRD2', 'MAOA', 'PCDH10', 'HTR2A', 'SLITRK5', 'SORCS3', 'KCNQ5', 'CDH9', 'CHD8']

In [27]:
gwas_top = gwas.filter(hl.any(lambda x: hl.literal(top_genes).contains(x), gwas.nearest_genes_20kb))

In [28]:
p2 = hl.plot.qq(hl.float(gwas_top.P))

2023-09-12 13:14:31.601 Hail: INFO: Ordering unsorted dataset with network shuffle
  tmp_data = {c: v.values for c, v in _df.iteritems()}
2023-09-12 13:14:37.678 Hail: INFO: Ordering unsorted dataset with network shuffle

In [30]:
p2.xaxis.axis_label_text_font_size = "20pt"
p2.yaxis.axis_label_text_font_size = "20pt"

In [31]:
show(p2)

In [34]:
gwas_top.count()



5929

In [35]:
gwas_top.aggregate(hl.agg.filter(hl.float(gwas_top.P) < 0.0001, hl.agg.counter(gwas_top.nearest_genes_20kb)))



{<FrozenList(frozen=True, ['CHADL', 'L3MBTL2'])>: 8,
 <FrozenList(frozen=True, ['CHADL', 'L3MBTL2', 'RANGAP1'])>: 6,
 <FrozenList(frozen=True, ['CHADL', 'RANGAP1'])>: 5}

In [None]:
#check where these variants are

In [None]:
lgcs = []

for i in range(1000):
    
    randoms = np.random.randint(len(all_genes), size=10)
    genes = [j for i, j in enumerate(all_genes) if i in randoms]

    gwas_rand = gwas.filter(hl.any(lambda x: hl.literal(genes).contains(x), gwas.nearest_genes_20kb))

    lgcs.append(hl.lambda_gc(hl.float(gwas_rand.P)))



In [39]:
np.percentile(lgcs, 90)

1.508084816970013

In [44]:
np.percentile(lgcs, 79)

1.3305443766179947

In [47]:
(100 - 79)/ 100

0.21

https://stackoverflow.com/questions/74874235/confidence-interval-in-normal-q-q-plot-using-statsmodels