In [1]:
import os
import pandas as pd
from lifd.lifd import LiFD, LIFD_COL, LIFD_SUP_COL, LIFD1_SUP_COL, LIFD2_SUP_COL
from lifd.databases.hotspots_database import HotspotsDB
from lifd.databases.cgi_database import CgiDB
from lifd.databases.oncokb_database import OncoKBDB
from lifd.databases.cosmic_db import CosmicDB
from lifd.predictors.vep import Vep
from lifd.predictors.candra import Candra
from lifd.predictors.cravat import Cravat
from lifd.predictors.cgi import Cgi
from lifd.predictors.fathmm import FatHMM
from lifd.settings import HOTSPOTS_FP, ONCOGENIC_VARS_FP, ONCOKB_ALLVARS_FP, COSMIC_VARS_FP
from lifd.utils import NT_VAR_COL, PT_VAR_COL, FUNC_COL, MUT_EFFECT_COL, init
# from lifd.mutation_effects import get_mutation_details, annotate_effects

INFO:lifd.lifd:VarCode is used for mutation effect prediction.


In [2]:
from pathlib import Path

# CGI credentials are read from a separate cgi_settings module so that they are not
# written into the notebook. Alternatives:
# from lifd.excluded_module.cgi_settings import CGI_USER_ID, CGI_TOKEN
# CGI_USER_ID = '<YOUR_CGI_USERNAME>'
# CGI_TOKEN = '<YOUR_CGI_TOKEN>'
from cgi_settings import CGI_USER_ID, CGI_TOKEN
Cgi.set_login(CGI_USER_ID, CGI_TOKEN)

# init() provides the output directory that is later handed to LiFD
output_dir = init()

# excel file with input variants
input_fp = 'example_variants.xlsx'
# CSV file with list of putative driver genes (e.g. TCGA consensus list, Bailey et al, Cell 2018)
driver_gene_fp = 'BaileyDing2018_driverconsensus.csv'

# derive the name of the annotated output file from the input file name
input_p = Path(input_fp)
output_fn = '{}_LiFDed.xlsx'.format(input_p.stem)
output_fp = os.path.join(input_p.parent, output_fn)

# Only Excel input is supported. Compare the suffix case-insensitively and fail loudly
# for anything else, so that later cells never run with a missing or stale var_df.
EXCEL_SUFFIXES = ('.xlsx', '.xls')
if input_p.suffix.lower() in EXCEL_SUFFIXES:
    var_df = pd.read_excel(input_fp)
    print('Read file with {} variants: {}'.format(len(var_df), input_fp))
else:
    raise ValueError(
        'Unsupported input file type {!r} for {}: expected an Excel file (.xlsx or .xls)'.format(
            input_p.suffix, input_fp))

INFO:lifd.predictors.cgi:Successfully set new username reiter.j@gmail.com and token.
INFO:lifd.utils:Output directory: /Users/reiter/workspaces/ped_gits/met_heterogeneity/analysis/lifd_examples/TMP


Read file with 7 variants: example_variants.xlsx


In [3]:
# Phase 1 of LiFD: one database object per configured source file
hs_db = HotspotsDB(HOTSPOTS_FP)
cgi_db = CgiDB(ONCOGENIC_VARS_FP)
oncokb_db = OncoKBDB(ONCOKB_ALLVARS_FP)
cosmic_db = CosmicDB(COSMIC_VARS_FP)
dbs = [
    hs_db,
    cgi_db,
    oncokb_db,
    cosmic_db,
]

# Phase 2 of LiFD: the predictors to use (handed over as imported, not instantiated here)
prds = [
    Vep,
    Candra,
    Cravat,
    FatHMM,
    Cgi,
]

# Combine databases, predictors and the driver gene list into one LiFD object
lifd = LiFD(
    databases=dbs,
    predictors=prds,
    driver_gene_fp=driver_gene_fp,
    driver_gene_col='TCGADrClf',
)

INFO:lifd.mutation_effects:Set reference genome to hg19.
INFO:lifd.lifd:Using databases: Hotspots, CGI_Catalog, OncoKB, COSMIC
INFO:lifd.lifd:Using predictors: Vep, Candra, Cravat, FatHMM, Cgi
INFO:lifd.lifd:Using 299 driver genes for annotation from /Users/reiter/databases/BaileyDing2018_driverconsensus.csv.
INFO:lifd.lifd:Initialized LiFD 0.1.0 with 4 databases and 5 predictors.


In [4]:
# Run LiFD on the loaded variants; the returned value replaces var_df
var_df = lifd.run_lifd(
    var_df,
    output_dir=output_dir,
    export_fn=output_fn,
)

INFO:pyensembl.sequence_data:Loaded sequence dictionary from /Users/reiter/Library/Caches/pyensembl/GRCh37/ensembl75/Homo_sapiens.GRCh37.75.cdna.all.fa.gz.pickle
INFO:pyensembl.sequence_data:Loaded sequence dictionary from /Users/reiter/Library/Caches/pyensembl/GRCh37/ensembl75/Homo_sapiens.GRCh37.75.ncrna.fa.gz.pickle
INFO:pyensembl.sequence_data:Loaded sequence dictionary from /Users/reiter/Library/Caches/pyensembl/GRCh37/ensembl75/Homo_sapiens.GRCh37.75.pep.all.fa.gz.pickle
INFO:lifd.lifd:Added potentially functional driver gene mutation annotation to 4 variants.
INFO:lifd.lifd:LIFD output directory: /Users/reiter/workspaces/ped_gits/met_heterogeneity/analysis/lifd_examples/TMP
INFO:lifd.databases.hotspots_database:Reading database Hotspots from file /Users/reiter/databases/ChangTaylor_hotspots_cd2018_v2.xls.
INFO:lifd.databases.hotspots_database:Loaded 2547 cancer hotspot variants from file /Users/reiter/databases/ChangTaylor_hotspots_cd2018_v2.xls.
INFO:lifd.databases.cgi_database

In [None]:
# Display var_df as returned by lifd.run_lifd
var_df

In [None]:
# List the column names of var_df
var_df.columns.values