# DriverDB_v3

In [24]:
from typing import List

import pandas as pd

In [25]:
# Input table of driver genes; read below as a tab-separated file.
# Relative path: resolved against the notebook's working directory.
DATA_PATH = "mutation_download_tab.txt"

In [26]:
# Load the tab-separated table into a DataFrame; the bare name on the last
# line renders it as the cell output.
driver_data = pd.read_csv(DATA_PATH, sep="\t")
driver_data

Unnamed: 0,cancer_project,cancer_type_abbr,tool,driver_gene
0,"Adrenocortical_carcinoma(TCGA,US)",ACC,activedriver,"APC, USF3, TMCC1, CXXC1, CPS1, KIAA1671, ADNP,..."
1,"Adrenocortical_carcinoma(TCGA,US)",ACC,comet,"ANO2, ANK3, STAB1, CLTB, HUWE1, EPG5, CHD2, RB..."
2,"Adrenocortical_carcinoma(TCGA,US)",ACC,dendrix,"NLRP12, GOLGA4, NAV3, UNC13C, CMYA5, CCDC168, ..."
3,"Adrenocortical_carcinoma(TCGA,US)",ACC,driverml,"EXPH5, EML6, TP53, MUC4, NUP188"
4,"Adrenocortical_carcinoma(TCGA,US)",ACC,e-driver,"XIRP2, SVEP1, CSMD3, CCDC168, HMCN1, VWF, GRID..."
...,...,...,...,...
367,"Uveal_melanoma(TCGA,US)",UVM,msea,"EIF1AX, SF3B1, GNAQ, GNA11"
368,"Uveal_melanoma(TCGA,US)",UVM,mutex,"GNAQ, GNA11"
369,"Uveal_melanoma(TCGA,US)",UVM,mutsigcv,"GNA11, HECW2"
370,"Uveal_melanoma(TCGA,US)",UVM,netbox,"BAP1, COL14A1, CYSLTR2, EIF1AX, GNA11, GNAQ, M..."


array(['activedriver', 'comet', 'dendrix', 'driverml', 'e-driver', 'ipac',
       'msea', 'mutsigcv', 'netbox', 'oncodriveclust', 'dawnrank',
       'drivernet', 'memo', 'mutex'], dtype=object)

In [58]:
def get_genes(cancer_type, data, tool=None) -> List[str]:
    """Return the sorted, de-duplicated driver genes for a cancer type.

    Parameters
    ----------
    cancer_type : str
        Value to match against the ``cancer_type_abbr`` column, or the
        literal ``"all"`` to skip the cancer-type filter.
    data : pandas.DataFrame
        Table with ``cancer_type_abbr``, ``tool`` and ``driver_gene``
        columns, where ``driver_gene`` holds comma-separated gene names.
    tool : str, optional
        When given, only rows whose ``tool`` column equals this value are
        used. ``None`` (the default) uses rows from every tool.

    Returns
    -------
    List[str]
        Unique gene names, whitespace-stripped and sorted. Empty when no
        row matches the filters.

    Raises
    ------
    AssertionError
        If ``tool`` is given but does not occur in ``data["tool"]``.
    """
    selected = data

    if tool is not None:
        # Validate against the frame that was passed in, not a notebook
        # global, so the function works on any table and in a fresh kernel.
        known_tools = set(data["tool"].dropna().unique())
        assert tool in known_tools, (
            f"unknown tool {tool!r}; expected one of {sorted(known_tools)}"
        )
        selected = selected[selected["tool"] == tool]

    if cancer_type != "all":
        selected = selected[selected["cancer_type_abbr"] == cancer_type]

    # Missing cells are skipped, and blank tokens (from trailing or doubled
    # commas) are dropped so they never show up as an empty gene name.
    genes = {
        gene.strip()
        for entry in selected["driver_gene"].dropna()
        for gene in entry.split(",")
        if gene.strip()
    }

    return sorted(genes)

In [51]:
# BRCA genes pooled over every tool (no tool argument means no tool filter).
brca_genes = get_genes("BRCA", driver_data)
len(brca_genes)

2206

In [52]:
# "all" disables the cancer-type filter: every gene listed in the table.
all_genes = get_genes("all", driver_data)
len(all_genes)

14692

**dawnrank**

In [60]:
# BRCA genes restricted to rows whose tool is "dawnrank".
brca_genes_dawnrank = get_genes("BRCA", driver_data, "dawnrank")
len(brca_genes_dawnrank)

30

In [61]:
# Genes from "dawnrank" rows across all cancer types.
all_genes_dawnrank = get_genes("all", driver_data, "dawnrank")
len(all_genes_dawnrank)

295

**drivernet**

In [64]:
# BRCA-only and all-cancer gene sets for the "drivernet" tool.
# The first count is printed; the second is shown as the cell's output.
brca_genes_drivernet = get_genes("BRCA", driver_data, "drivernet")
print(len(brca_genes_drivernet))

all_genes_drivernet = get_genes("all", driver_data, "drivernet")
len(all_genes_drivernet)

119


279

**mutex**

In [66]:
# BRCA-only and all-cancer gene sets for the "mutex" tool.
# The first count is printed; the second is shown as the cell's output.
brca_genes_mutex = get_genes("BRCA", driver_data, "mutex")
print(len(brca_genes_mutex))

all_genes_mutex = get_genes("all", driver_data, "mutex")
len(all_genes_mutex)

18


658

### Extracting the genes

In [44]:
from datetime import datetime

In [45]:
def extract_genes(genes: List, cancer_type):
    """Write ``genes`` to a dated text file, one gene per line.

    The file is created in the current working directory and named
    ``<cancer_type>_genes_<YYYY-MM-DD>.txt`` using today's date; an existing
    file with that name is overwritten. A confirmation line is printed once
    the file has been written.

    Parameters
    ----------
    genes : List
        Gene names to write, in the order given.
    cancer_type : str
        Label used as the file-name prefix and in the confirmation message.
    """
    today = datetime.today().strftime("%Y-%m-%d")
    out_path = f"{cancer_type}_genes_{today}.txt"

    with open(out_path, "w") as handle:
        # One newline-terminated entry per gene.
        handle.writelines(f"{gene}\n" for gene in genes)

    print(f"{cancer_type} genes are extracted successfully.")


In [46]:
# Writes all_genes_<today's date>.txt to the working directory.
extract_genes(all_genes, "all")

all genes are extracted successfully.


In [47]:
# Writes BRCA_genes_<today's date>.txt to the working directory.
extract_genes(brca_genes, "BRCA")

BRCA genes are extracted successfully.


In [62]:
# The label doubles as the file-name prefix, so the tool name is appended
# to keep these files distinct from the all-tool exports.
extract_genes(all_genes_dawnrank, "all_dawnrank")
extract_genes(brca_genes_dawnrank, "BRCA_dawnrank")

all_dawnrank genes are extracted successfully.
BRCA_dawnrank genes are extracted successfully.


In [65]:
# Export the drivernet-only gene lists (tool name is part of the file prefix).
extract_genes(all_genes_drivernet, "all_drivernet")
extract_genes(brca_genes_drivernet, "BRCA_drivernet")

all_drivernet genes are extracted successfully.
BRCA_drivernet genes are extracted successfully.


In [67]:
# Export the mutex-only gene lists (tool name is part of the file prefix).
extract_genes(all_genes_mutex, "all_mutex")
extract_genes(brca_genes_mutex, "BRCA_mutex")

all_mutex genes are extracted successfully.
BRCA_mutex genes are extracted successfully.
