In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

import matplotlib as mpl
from matplotlib import pyplot as plt

from scipy.stats import zscore

from collections import defaultdict, Counter

# Load annotations

In [7]:
cell_line_info = pd.read_csv("../data/sample_info.csv")
cell_line_annotations = pd.read_excel("../data/41586_2019_1186_MOESM4_ESM.xlsx",
                                      sheet_name="Cell Line Annotations")

hs_muts = pd.read_hdf("../data/hs_muts.h5",key="hs_muts")
damaging_muts = pd.read_hdf("../data/damaging_muts.h5",key="damaging_muts")
tertp = pd.read_excel("../data/41586_2019_1186_MOESM8_ESM.xlsx",skiprows=4)

fusions = pd.read_csv("../data/CCLE_Fusions_20181130.txt",sep="\t")
copynumber = pd.read_hdf("../data/CCLE_gene_cn.hdf",key="copynumber")
copynumber = copynumber.dropna(how="any",thresh=1000,axis=1)

tss1kb_meth = pd.read_hdf("../data/CCLE_RRBS_TSS1kb_20181022.hdf",key="tss1kb_meth")

ccle_genex = pd.read_hdf("../data/CCLE_RNAseq_rsem_genes_tpm_20180929.hdf",key="ccle_genex")
exonusage = pd.read_hdf("../data/CCLE_RNAseq_ExonUsageRatio_20180929.hdf",key="exonusage")

mirna = pd.read_hdf("../data/CCLE_miRNA_20181103.hdf",key="mirna")
chromatin_profiling = pd.read_hdf("../data/CCLE_GlobalChromatinProfiling_20181130.hdf",key="chromatin_profiling")
rppa = pd.read_hdf("../data/CCLE_RPPA_20181003.hdf",key="rppa")

msi = pd.read_excel("../data/41586_2019_1186_MOESM10_ESM.xlsx",sheet_name="MSI calls")
absolute = pd.read_excel("../data/CCLE_ABSOLUTE_combined_20181227.xlsx",
                         sheet_name = "ABSOLUTE_combined.table")

In [25]:
cell_line_annotations

Unnamed: 0,CCLE_ID,depMapID,Name,Site_Primary,Site_Subtype1,Site_Subtype2,Site_Subtype3,Histology,Hist_Subtype1,Hist_Subtype2,...,Growth.Medium,Supplements,Freezing.Medium,Doubling.Time.from.Vendor,Doubling.Time.Calculated.hrs,type,type_refined,PATHOLOGIST_ANNOTATION,mutRate,tcga_code
0,DMS53_LUNG,ACH-000698,DMS 53,lung,NS,NS,NS,carcinoma,small_cell_carcinoma,NS,...,Waymouth's +10%FBS,,5%DMSO,,450.0,lung_small_cell,lung_small_cell,Lung:SCLC,157.241638,SCLC
1,SW1116_LARGE_INTESTINE,ACH-000489,SW1116,large_intestine,colon,NS,NS,carcinoma,adenocarcinoma,NS,...,L15+10%FBS,,5%DMSO,,123.2,colorectal,colorectal,Colorectal:Carcinoma,94.373192,COAD/READ
2,NCIH1694_LUNG,ACH-000431,NCI-H1694,lung,NS,NS,NS,carcinoma,small_cell_carcinoma,NS,...,DMEM:F12 (1:1) + 5% FBS,".005 mg/ml insulin, .01 mg/ml transferrin, 30n...",5% DMSO,,81.2,lung_small_cell,lung_small_cell,Lung:SCLC,109.534987,SCLC
3,P3HR1_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,ACH-000707,P3HR-1,haematopoietic_and_lymphoid_tissue,NS,NS,NS,lymphoid_neoplasm,Burkitt_lymphoma,NS,...,,,,,,lymphoma_Burkitt,lymphoma_Burkitt,Lymphoma:NH_B_cell,166.456406,DLBC
4,HUT78_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,ACH-000509,HuT 78,haematopoietic_and_lymphoid_tissue,skin,NS,NS,lymphoid_neoplasm,mycosis_fungoides-Sezary_syndrome,NS,...,,,,,,lymphoma_other,T-cell_lymphoma_other,Lymphoma:NH_T_cell,118.217569,
5,UMUC3_URINARY_TRACT,ACH-000522,UM-UC-3,urinary_tract,bladder,NS,NS,carcinoma,transitional_cell_carcinoma,NS,...,EMEM +10%FBS,,5%DMSO,,386.7,urinary_tract,urinary_tract,Bladder:Carcinoma,124.707962,BLCA
6,HOS_BONE,ACH-000613,HOS,bone,NS,NS,NS,osteosarcoma,NS,NS,...,,,,,,osteosarcoma,osteosarcoma,Bone:Sarcoma_Osteo,127.979671,SARC
7,HUNS1_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,ACH-000829,HuNS1,haematopoietic_and_lymphoid_tissue,NS,NS,NS,lymphoid_neoplasm,plasma_cell_myeloma,NS,...,,,,,,multiple_myeloma,multiple_myeloma,Lymphoma:Multiple_Myeloma,187.913876,MM
8,AML193_HAEMATOPOIETIC_AND_LYMPHOID_TISSUE,ACH-000557,AML-193,haematopoietic_and_lymphoid_tissue,NS,NS,NS,haematopoietic_neoplasm,acute_myeloid_leukaemia,M5,...,IMDM+5%FBS,"0.005mg/ml insulin, 0.005mg/ml transferrin and...",5% DMSO,,80.3,AML,AML,Leukemia:AML,133.368637,LAML
9,RVH421_SKIN,ACH-000614,RVH-421,skin,NS,NS,NS,malignant_melanoma,NS,NS,...,RPMI1640+10%FBS,,5%DMSO,,49.5,melanoma,melanoma,Skin:Melanoma,,SKCM


In [35]:
subtypes = cell_line_annotations[["depMapID","type_refined"]].set_index("depMapID")
subtypes = subtypes["type_refined"]
subtypes = subtypes.dropna()
subtypes = subtypes.apply(lambda x: x.capitalize().replace("_"," "))

rename_map = {"T-cell lymphoma other":"Other T-cell lymphoma",
              "Aml":"AML",
              "Ewings sarcoma": "Ewing's sarcoma",
              "Fibroblast like":"Fibroblast-like",
              "Lung nsc":"Lunc, NSC",
              "Lymphoma hodgkin":"Hodgkin's lymphoma",
              "T-cell all":"T-cell ALL",
              "B-cell all":"B-cell ALL",
              "Cml":"CML",
              "B-cell lymphoma other":"Other B-cell lymphoma",
              "Leukemia other":"Other leukemia"
             }

subtypes = subtypes.rename(rename_map)

Counter(subtypes)

Counter({'Lung small cell': 53,
         'Colorectal': 61,
         'Lymphoma burkitt': 11,
         'T-cell lymphoma other': 11,
         'Urinary tract': 27,
         'Osteosarcoma': 7,
         'Multiple myeloma': 30,
         'Aml': 39,
         'Melanoma': 55,
         'Breast': 54,
         'Ewings sarcoma': 11,
         'Liver': 27,
         'Mesothelioma': 9,
         'Ovary': 54,
         'Esophagus': 27,
         'Endometrium': 28,
         'Fibroblast like': 40,
         'Glioma': 64,
         'Pancreas': 46,
         'Neuroblastoma': 17,
         'Lung nsc': 136,
         'Chondrosarcoma': 3,
         'Lymphoma hodgkin': 9,
         'Lymphoma dlbcl': 17,
         'Stomach': 38,
         'Soft tissue': 17,
         'Kidney': 38,
         'T-cell all': 16,
         'B-cell all': 13,
         'Giant cell tumour': 1,
         'Upper aerodigestive': 34,
         'Other': 4,
         'Thyroid': 13,
         'Cml': 15,
         'Bile duct': 8,
         'B-cell lymphoma other': 16,

# Top recurrent mutations

In [16]:
def get_top_muts():
    
    align_hs_muts, align_damaging_muts = hs_muts.align(damaging_muts,join="outer",axis=1)
    align_hs_muts = align_hs_muts.fillna(0)
    align_damaging_muts = align_damaging_muts.fillna(0)
    
    hs_damaging_muts = (align_hs_muts+align_damaging_muts).clip(0,1)
        
    mut_totals = hs_damaging_muts.sum()
    mut_totals = mut_totals.sort_values()
    
    return hs_damaging_muts[mut_totals.index[-8:]]
    
muts = get_top_muts()

# Fusions

In [17]:
def get_fusions():

    fusions["value"] = 1

    fusions_mat = pd.pivot_table(fusions, values="value",
                                index=["BroadID"], columns="X.FusionName", fill_value=0)
    
    fusions_mat.columns = fusions_mat.columns.map(lambda x: x.replace("--","-"))

    return fusions_mat[["BCR-ABL1","EWSR1-FLI1","KMT2A-MLLT3"]]

select_fusions = get_fusions()

# Top continuous annotations

In [18]:
def top_variable(annotations, top_n,clip_left=-3,clip_right=3):
    
    stdevs = annotations.std()
    stdevs = stdevs.sort_values()
    
    top_names = stdevs.index[-top_n:]
    
    top_annotations = annotations[top_names]
    top_annotations = (top_annotations - top_annotations.mean())/top_annotations.std()
    
    top_annotations = top_annotations.clip(clip_left,clip_right)
    
    return top_annotations

In [19]:
select_copynumber = top_variable(copynumber, 1000)
select_meth = top_variable(tss1kb_meth, 1000)
select_genex = top_variable(ccle_genex, 1000)
select_exonusage = top_variable(exonusage, 1000)
select_mirna = top_variable(mirna, 1000)
select_chromatin = top_variable(chromatin_profiling, 1000)
select_rppa = top_variable(rppa, 1000)

# MSI, ploidy, and ancestry

In [20]:
is_msi = msi[msi["CCLE.MSI.call"].isin(['inferred-MSI','inferred-MSS'])]
is_msi = is_msi[["depMapID","CCLE.MSI.call"]].set_index("depMapID")
is_msi = is_msi == "inferred-MSI"

ploidy = absolute[["depMapID","ploidy"]].set_index("depMapID")
ancestry = cell_line_annotations[["inferred_ancestry","depMapID"]].set_index("depMapID").dropna()

In [None]:
merged_annotations = pd.concat([])