In [1]:
import time
import logging
import pandas as pd

from tqdm import tqdm
from numpy import mean
from numpy.random import choice
from gendas.engine import Gendas
from gendas.utils import flatten

In [2]:
%%bash
# Print the Gendas configuration file; the same path is later passed to
# Gendas(...) to declare the data sources used by the analysis.
cat data/gendas.conf


[cadd]
type = tabix
file = cadd_score.tsv.gz
header = CHR, POS, REF, ALT, RAW, PHRED
ctypes = str, int, str, str, float, float
sequence= CHR
begin = POS
end = POS

[variants]
type = tabix
file = variants.tsv.gz
header = CHR, POS, REF, ALT, SAMPLE
ctypes = str, int, str, str, str
sequence = CHR
begin = POS
end = POS

[tcga]
type = tabix
file = tcga.txt.gz
header = CHR, POS, REF, ALT
ctypes = str, int, str, str
sequence = CHR
begin = POS
end = POS


[exons]
type = tabix
file = cds_exons.tsv.gz
header = CHR, START, STOP, GENE
ctypes = str, int, int, str
sequence = CHR
begin = START
end = STOP
indices = GENE,

[genes]
type = tabix
file = cds_annotations.tsv.gz
header = CHR, GENE, SYMBOL, BEGIN, END, STRAND
ctypes = str, str, str, int, int, str
sequence = CHR
begin = BEGIN
end = END


In [3]:
def functional_score(gd, sampling=100000, seed=None):
    """Empirical p-value for the mean CADD PHRED score of observed variants.

    The observed scores are the ``PHRED`` values obtained by merging
    ``gd['tcga']`` with ``gd['cadd']`` on ``REF`` and ``ALT``. Their mean is
    compared against a background distribution: ``sampling`` random draws
    (with replacement, numpy's default for ``choice``), each of the same size
    as the observed set, taken from every ``PHRED`` value in ``gd['cadd']``.

    Parameters
    ----------
    gd : Gendas-like
        Object indexable by source name (``'cadd'``, ``'tcga'``).
        NOTE(review): when used through ``groupby(...).aggregate`` this is
        presumably the view restricted to one group -- confirm against gendas.
    sampling : int, optional
        Number of background sets to draw. Also fixes the smallest p-value
        that can be reported (``1 / sampling``).
    seed : int or None, optional
        When given, the background is drawn from a private
        ``numpy.random.RandomState(seed)`` so the result is reproducible.
        When ``None`` (default) the global numpy generator is used, exactly
        as before this parameter existed.

    Returns
    -------
    float
        ``1.0`` when there are no observed scores; otherwise the fraction of
        background means that are >= the observed mean, never lower than
        ``1 / sampling``.
    """
    possible_scores = list(gd['cadd']['PHRED'])
    observed_scores = list(gd['tcga'].merge(gd['cadd'], on=['REF', 'ALT'])['cadd']['PHRED'])

    # Nothing observed: no evidence against the null hypothesis.
    if len(observed_scores) == 0:
        return 1.0

    if seed is None:
        # Legacy behaviour: draw from numpy's global random state.
        draw = choice
    else:
        # Local import keeps the notebook's import cell untouched.
        from numpy.random import RandomState
        draw = RandomState(seed).choice

    observed_mean = mean(observed_scores)

    # One row per simulated variant set; the row mean is its background statistic.
    background = draw(possible_scores, size=(sampling, len(observed_scores))).mean(axis=1)
    exceeding = int((background >= observed_mean).sum())

    # Floor the count at 1 so the reported p-value is never exactly zero.
    return max(1, exceeding) / sampling

In [4]:
# Build the Gendas engine from the configuration file printed earlier.
# The sections declared there (e.g. cadd, tcga, exons) are the names used
# below as gd['<section>'].
gd = Gendas('data/gendas.conf')

In [5]:
%%time

pvalues = gd.groupby(gd['exons']['GENE']).aggregate({
    'PVALUE': functional_score
})

df = pd.DataFrame.from_dict(
    flatten(
        tqdm(pvalues, total=len(set(gd['exons']['GENE'])))
    ), orient='columns').set_index(['GENE'])
df = df.sort_values('PVALUE', ascending=True)
print(df.head(10))

100%|██████████| 242/242 [00:34<00:00,  7.00it/s]

                  PVALUE
GENE                    
ENSG00000156273  0.00001
ENSG00000154654  0.00001
ENSG00000157540  0.00001
ENSG00000160179  0.00001
ENSG00000156304  0.00001
ENSG00000159216  0.00001
ENSG00000156299  0.00001
ENSG00000177692  0.00001
ENSG00000185658  0.00001
ENSG00000159086  0.00001
CPU times: user 368 ms, sys: 25.7 ms, total: 394 ms
Wall time: 34.6 s





In [None]:
# Size of the 'tcga' source -- presumably its number of variant records
# (TODO confirm); this cell has no recorded output in the saved notebook.
len(gd['tcga'])