In [1]:
from pandas import DataFrame

In [2]:
from data_sources.drug_connectivity_map import DrugConnectivityMap
from data_sources.molecular_signatures_db import MolecularSignaturesDatabase
from data_sources.tcga import TCGA

# Instantiate one handle per data source; the cells below query these objects.
dcm = DrugConnectivityMap()
msigdb = MolecularSignaturesDatabase()
tcga = TCGA()

In [3]:
# Display the MSigDB release in use, so the counts below can be tied to a version.
msigdb.version

'6.2'

Available collections:

In [4]:
# Each entry pairs a collection name with the identifier type it is available in.
msigdb.gene_sets

[{'name': 'c2.cgp', 'id_type': 'entrez'},
 {'name': 'c7.all', 'id_type': 'entrez'},
 {'name': 'c2.all', 'id_type': 'symbols'},
 {'name': 'c2.cp', 'id_type': 'symbols'},
 {'name': 'c2.all', 'id_type': 'entrez'},
 {'name': 'c2.cp.reactome', 'id_type': 'entrez'},
 {'name': 'c2.cp.kegg', 'id_type': 'entrez'},
 {'name': 'c2.cp.reactome', 'id_type': 'symbols'},
 {'name': 'h.all', 'id_type': 'symbols'},
 {'name': 'c2.cp', 'id_type': 'entrez'},
 {'name': 'c6.all', 'id_type': 'entrez'},
 {'name': 'h.all', 'id_type': 'entrez'}]

Genes covered by the TCGA expression data and by the LINCS connectivity map:

In [5]:
# Genes reported by the TCGA expression data for 'BRCA', kept as a set so they
# can be intersected with the connectivity-map genes further down.
# NOTE(review): assumes these are Entrez IDs represented as strings - confirm.
tcga_genes = set(tcga.expression.genes('BRCA'))

In [6]:
# Entrez gene IDs known to the connectivity map, cast to str before building
# the set.
# NOTE(review): presumably cast so they compare equal to the string IDs used
# by TCGA / MSigDB - confirm.
dcm_genes = set(dcm.entrez_gene_ids.astype(str))

In [7]:
# Display name -> MSigDB collection identifier for the collections compared below.
chosen_collections = dict(
    KEGG='c2.cp.kegg',
    Reactome='c2.cp.reactome',
    Hallmarks='h.all',
)

# One summary row (a dict) per collection is appended to this list.
data = []

# Gene-set size bounds handed to `collection.trim` by `trimmed_len`.
min_genes, max_genes = 15, 500

def trimmed_len(collection):
    """Count the gene sets left in `collection` after size trimming.

    The collection is trimmed with the notebook-level `min_genes` and
    `max_genes` bounds; the number of gene sets in the trimmed result is
    returned.
    """
    trimmed = collection.trim(min_genes, max_genes)
    return len(trimmed.gene_sets)


# For every chosen collection, count the gene sets that survive trimming:
# once for the full collection and once per gene universe it is restricted to.
for name, identifier in chosen_collections.items():
    collection = msigdb.load(identifier, 'entrez')

    # Restrict first (same order as the columns), then count afterwards.
    restricted = {
        'TCGA': collection.subset(tcga_genes),
        'LINCS': collection.subset(dcm_genes),
        'TCGA $\\cap$ LINCS': collection.subset(tcga_genes & dcm_genes)
    }

    row = {
        'Name': name,
        'ID': identifier,
        'Total': trimmed_len(collection)
    }
    row.update({
        label: trimmed_len(restricted_collection)
        for label, restricted_collection in restricted.items()
    })

    data.append(row)

In [8]:
# Column order for the summary table; keys match the row dicts built above.
order = [
    'Name', 'ID', 'Total',
    'TCGA', 'LINCS', 'TCGA $\\cap$ LINCS'
]
df = DataFrame(data)[order]

# Render the table without the integer row index.
# `Styler.hide_index()` was deprecated in pandas 1.4 and removed in 2.0;
# `Styler.hide(axis='index')` is its replacement. Fall back to the old method
# so the cell still runs on pandas versions that predate `Styler.hide`.
table = df.style
table.hide(axis='index') if hasattr(table, 'hide') else table.hide_index()

Name,ID,Total,TCGA,LINCS,TCGA $\cap$ LINCS
KEGG,c2.cp.kegg,178,178,167,167
Reactome,c2.cp.reactome,521,512,471,471
Hallmarks,h.all,50,50,50,50
