In [1]:
import logging
import os
import pandas as pd
from lifd.lifd import LiFD, LIFD_COL, LIFD_SUP_COL, LIFD1_SUP_COL, LIFD2_SUP_COL
from lifd.databases.hotspots_database import HotspotsDB
from lifd.databases.cgi_database import CgiDB
from lifd.databases.oncokb_database import OncoKBDB
from lifd.databases.cosmic_db import CosmicDB
from lifd.predictors.vep import Vep
from lifd.predictors.candra import Candra
from lifd.predictors.cravat import Cravat
from lifd.predictors.cgi import Cgi
from lifd.predictors.fathmm import FatHMM
from lifd.settings import HOTSPOTS_FP, ONCOGENIC_VARS_FP, ONCOKB_ALLVARS_FP, COSMIC_VARS_FP, REF_GENOME_FA_FP
from lifd.utils import NT_VAR_COL, PT_VAR_COL, FUNC_COL, MUT_EFFECT_COL, init
from lifd.mutation_effects import get_mutation_details, annotate_effects

2020-02-24 11:03:20,532 lifd:37 INFO: VarCode is used for mutation effect prediction.


In [2]:
from pathlib import Path

from lifd.cgi_settings import CGI_USER_ID, CGI_TOKEN
# Alternative ways to provide the CGI credentials:
# from lifd.excluded_module.cgi_settings import CGI_USER_ID, CGI_TOKEN
# CGI_USER_ID = '<YOUR_CGI_USERNAME>'
# CGI_TOKEN = '<YOUR_CGI_TOKEN>'
Cgi.set_login(CGI_USER_ID, CGI_TOKEN)

logger = logging.getLogger('lifd.{}'.format(__name__))

output_dir = init()

# excel file with input variants
input_fp = 'example_variants.xlsx'
# CSV file with list of putative driver genes (e.g. TCGA consensus list, Bailey et al, Cell 2018)
driver_gene_fp = 'BaileyDing2018_driverconsensus.csv'

# Derive the name/path of the annotated output file from the input file name.
input_p = Path(input_fp)
output_fn = '{}_LiFDed.xlsx'.format(input_p.stem)
output_fp = os.path.join(input_p.parent, output_fn)

# Compare the extension case-insensitively so that e.g. '.XLSX' is accepted as well.
if input_p.suffix.lower() in ('.xlsx', '.xls'):
    var_df = pd.read_excel(input_fp)
    print('Read file with {} variants: {}'.format(len(var_df), input_fp))
else:
    # Fail right here with a clear message; otherwise var_df would stay undefined
    # and the problem would only surface as a NameError when var_df is first used.
    raise ValueError(
        'Unsupported input file type {!r} (expected .xlsx or .xls): {}'.format(
            input_p.suffix, input_fp))

2020-02-24 11:03:20,545 lifd.lifd.predictors.cgi:62 INFO: Successfully set new username reiter.j@gmail.com and token.
2020-02-24 11:03:20,546 lifd.lifd.utils:44 INFO: Output directory: /Users/reiter/workspaces/LiFD_dev/lifd_examples/LiFD_TMP


Read file with 7 variants: example_variants.xlsx


In [3]:
# First phase of LiFD: one database object per source file from lifd.settings
hs_db = HotspotsDB(HOTSPOTS_FP)
cgi_db = CgiDB(ONCOGENIC_VARS_FP)
oncokb_db = OncoKBDB(ONCOKB_ALLVARS_FP)
cosmic_db = CosmicDB(COSMIC_VARS_FP)
dbs = [hs_db, cgi_db, oncokb_db, cosmic_db]

# Second phase of LiFD: predictors.
# Only Vep is enabled here; Cravat, Candra, FatHMM and Cgi are imported but left out
# (e.g. use prds = [FatHMM] to try a different one).
prds = [Vep]

lifd = LiFD(
    databases=dbs,
    predictors=prds,
    driver_gene_fp=driver_gene_fp,
    driver_gene_col='TCGADrClf',
)

2020-02-24 11:03:20,577 lifd.lifd.mutation_effects:60 INFO: Set reference genome to hg19.
2020-02-24 11:03:20,578 lifd:84 INFO: Using databases: Hotspots, CGI_Catalog, OncoKB, COSMIC
2020-02-24 11:03:20,579 lifd:98 INFO: Using predictors: Vep
2020-02-24 11:03:20,585 lifd:120 INFO: Using 299 driver genes for annotation from /Users/reiter/databases/BaileyDing2018_driverconsensus.csv.
2020-02-24 11:03:20,586 lifd:123 INFO: Initialized LiFD 0.1.1 with 4 databases and 1 predictors.


In [4]:
# Run LiFD on the input variants. The result is bound to the same name, so
# re-running this cell passes the already-processed frame back into run_lifd.
var_df = lifd.run_lifd(
    var_df,
    output_dir=output_dir,
    export_fn=output_fn,
)

INFO:pyensembl.sequence_data:Loaded sequence dictionary from /Users/reiter/Library/Caches/pyensembl/GRCh37/ensembl75/Homo_sapiens.GRCh37.75.cdna.all.fa.gz.pickle
INFO:pyensembl.sequence_data:Loaded sequence dictionary from /Users/reiter/Library/Caches/pyensembl/GRCh37/ensembl75/Homo_sapiens.GRCh37.75.ncrna.fa.gz.pickle
INFO:pyensembl.sequence_data:Loaded sequence dictionary from /Users/reiter/Library/Caches/pyensembl/GRCh37/ensembl75/Homo_sapiens.GRCh37.75.pep.all.fa.gz.pickle
2020-02-24 11:03:21,383 lifd:219 INFO: Added potentially functional driver gene mutation annotation to 5 variants.
2020-02-24 11:03:21,383 lifd:225 INFO: LIFD output directory: /Users/reiter/workspaces/LiFD_dev/lifd_examples/LiFD_TMP
2020-02-24 11:03:21,385 lifd.lifd.databases.hotspots_database:49 INFO: Reading database Hotspots from file /Users/reiter/databases/ChangTaylor_hotspots_cd2018_v2.xls.
2020-02-24 11:03:21,984 lifd.lifd.databases.hotspots_database:77 INFO: Loaded 2547 cancer hotspot variants from file 

In [5]:
# Display the variant table as returned by lifd.run_lifd.
var_df

Unnamed: 0,Chromosome,StartPosition,EndPosition,ReferenceAllele,AlternateAllele,Subject,GeneSymbol,CancerType,NtVarKey,PtVarKey,...,In_OncoKB,In_COSMIC,LiFD1_support,PolyPhen,Sift,VEP_impact,LiFD2_results,LiFD2_support,LiFD_support,LiFD
0,12,25398284,25398284,C,A,P1,KRAS,PAAD,12__25398284__C__A,KRAS__G12V,...,True,1149,4.0,0.972,0.0,MODERATE,"[0.0, 1.0]",0.0,4.0,True
1,19,48994757,48994756,-,G,P1,LMTK3,PAAD,19__48994757__-__G,LMTK3__E1407fs,...,False,20,1.0,,,HIGH,"[1.0, 1.0]",1.0,2.0,False
2,17,7573996,7573996,A,G,P2,TP53,PAAD,17__7573996__A__G,TP53__L344P,...,True,7,3.0,0.967,0.0,MODERATE,"[0.0, 1.0]",0.0,3.0,True
3,1,27058029,27058029,T,G,P2,ARID1A,PAAD,1__27058029__T__G,ARID1A__Y579*,...,False,0,0.0,,,HIGH,"[1.0, 1.0]",1.0,1.0,True
4,12,25398284,25398284,C,A,P2,KRAS,PAAD,12__25398284__C__A,KRAS__G12V,...,True,1149,4.0,0.972,0.0,MODERATE,"[0.0, 1.0]",0.0,4.0,True
5,1,119427951,119427951,T,G,P2,TBX15,PAAD,1__119427951__T__G,TBX15__S405R,...,False,0,0.0,0.719,0.05,MODERATE,"[0.0, 1.0]",0.0,0.0,False
6,12,25380275,25380275,T,A,P3,KRAS,PAAD,12__25380275__T__A,KRAS__Q61H,...,True,50,4.0,0.121,0.0,MODERATE,"[0.0, 1.0]",0.0,4.0,True


In [6]:
# List all column names of the table (the table display may elide middle columns with '...').
var_df.columns.values

array(['Chromosome', 'StartPosition', 'EndPosition', 'ReferenceAllele',
       'AlternateAllele', 'Subject', 'GeneSymbol', 'CancerType',
       'NtVarKey', 'PtVarKey', 'Transcript_ID', 'Protein_ID',
       'FatHMM_Key', 'MutationEffect', 'MaybeFunctional', 'TCGADrClf',
       'In_Hotspots', 'In_CGI_Catalog', 'In_OncoKB', 'In_COSMIC',
       'LiFD1_support', 'PolyPhen', 'Sift', 'VEP_impact', 'LiFD2_results',
       'LiFD2_support', 'LiFD_support', 'LiFD'], dtype=object)

In [7]:
# ########################### debugging ############################
# import pysam
# from lifd.settings import REF_GENOME_FA_FP

In [8]:
# genome = pysam.Fastafile(REF_GENOME_FA_FP)

In [9]:
# genome.fetch('chr5', 12755, 12858)

In [10]:
# curl -H "Authorization: Bearer [your token]" https://www.oncokb.org/api/v1/genes

# from lifd.oncokb_settings import ONCOKB_TOKEN
# import pycurl
# from io import BytesIO

# buffer = BytesIO()
# crl = pycurl.Curl()
# crl.setopt(crl.URL, 'https://www.oncokb.org/api/v1/info')
# crl.setopt(crl.WRITEDATA, buffer)
# crl.perform()
# crl.close()

# buffer.getvalue()

# buffer = BytesIO()
# crl = pycurl.Curl()
# crl.setopt(crl.URL, 'https://oncokb.org:443/api/v1/utils/allAnnotatedVariants.txt')
# crl.setopt(crl.WRITEDATA, buffer)
# crl.perform()
# crl.close()

# body = buffer.getvalue()
# body

# with open('test.tsv', 'wb') as f:
#     f.write(buffer.getvalue())
# buffer.close()