# Exploration and checking things out

In [1]:
import pandas as pd
import numpy as np
import time
import pickle

## Load data files

In [2]:
# Load drug annotations file, containing list and info about drugs from GDSC
filepath = "/media/krzysztof/Nowy/Doktorat/Data/Original Data/Genomics of Drug Sensitivity in Cancer/Original GDSC Data/Drug annotations/Screened_Compounds-March_27th_2018.xlsx"
drug_annotations_df = pd.read_excel(filepath)
print(drug_annotations_df.shape)

(267, 5)


In [3]:
# Load original gene expression data for cell lines from GDSC
filepath = "/media/krzysztof/Nowy/Doktorat/Data/Original Data/Genomics of Drug Sensitivity in Cancer/Original GDSC Data/Gene expression/sanger1018_brainarray_ensemblgene_rma-March_2nd_2017.txt"
gene_expression_df = pd.read_table(filepath)
print(gene_expression_df.shape)

(17737, 1019)


In [4]:
# Load file with CNV data for cell lines from GDSC
filepath = "/media/krzysztof/Nowy/Doktorat/Data/Original Data/Genomics of Drug Sensitivity in Cancer/Original GDSC Data/Copy number variations/Gene_level_CN-July_4th_2016.xlsx"
cnv_df = pd.read_excel(filepath, sheet_name=1)
print(cnv_df.shape)

(46222, 1000)


In [5]:
cnv_df.shape

(46222, 1000)

In [6]:
# Load data with binary CNV calls
filepath = "/media/krzysztof/Nowy/Doktorat/Data/Original Data/Genomics of Drug Sensitivity in Cancer/Original GDSC Data/Copy number variations/cnv_binary_1.csv"
d1 = pd.read_csv(filepath)
filepath = "/media/krzysztof/Nowy/Doktorat/Data/Original Data/Genomics of Drug Sensitivity in Cancer/Original GDSC Data/Copy number variations/PANCANCER_Genetic_feature_cna_Mon Aug  6 16_18_51 2018 (kopia).csv"
d2 = pd.read_table(filepath)
d2.columns = ["genes_in_segment"]
def f(s):
    return s.strip(",")
cnv_binary_df = d1.copy()
cnv_binary_df["genes_in_segment"] = d2["genes_in_segment"].apply(f)
print(cnv_binary_df.shape)

(419050, 9)


In [7]:
# Load file containing drug-response data for cell lines, but the one downloaded using data download tool from GDSC
# website - not really relevant, just to check some things out
filepath = "/media/krzysztof/Nowy/Doktorat/Data/Original Data/Genomics of Drug Sensitivity in Cancer/Original GDSC Data/Sensitivity profiles/PANCANCER_IC_Mon Aug  6 15_47_28 2018.csv"
drug_response_df = pd.read_csv(filepath)
print(drug_response_df.shape)
print("Number of different max concentrations used:", drug_response_df["Max conc"].nunique())

(224202, 14)
Number of different max concentrations used: 37


In [8]:
# Load file containing drug-response data for cell lines from GDSC download site
filepath = "/media/krzysztof/Nowy/Doktorat/Data/Original Data/Genomics of Drug Sensitivity in Cancer/Original GDSC Data/Sensitivity profiles/v17.3_fitted_dose_response-March_27th_2018.xlsx"
drug_response_df = pd.read_excel(filepath)
print(drug_response_df.shape)

(224202, 13)


In [15]:
# Load binary calls from STITCH
filepath = "/media/krzysztof/Nowy/Doktorat/Data/Original Data/STITCH/STITCH_GDSC_binary_calls.csv"
stitch_calls_df = pd.read_csv(filepath)
print(stitch_calls_df.shape)

(152, 530)


In [40]:
# Load chemical-protein links from STITCH
filepath = "/media/krzysztof/Nowy/Doktorat/Data/Original Data/STITCH/9606.protein_chemical.links.transfer.v5.0.tsv"
stitch_links_df = pd.read_table(filepath)
print(stitch_links_df.shape)


(15473939, 11)


In [67]:
# Load mapping from STRING to entrez ID
filepath = "/media/krzysztof/Nowy/Doktorat/Data/Original Data/STRING/entrez_gene_id.vs.string.v10.28042015.tsv"
string_map_df = pd.read_table(filepath)
print(string_map_df.shape)

(17538, 2)


In [10]:
# Load file contaning binary coding variants for cell lines
filepath = "/media/krzysztof/Nowy/Doktorat/Data/Original Data/Genomics of Drug Sensitivity in Cancer/Original GDSC Data/Mutation calls/PANCANCER_Genetic_feature_variant_Mon Aug  6 15_45_44 2018.csv"
coding_variants_df = pd.read_csv(filepath)
print(coding_variants_df.shape)
print("Number of considered genes:", coding_variants_df.genetic_feature.nunique())

(295740, 9)
Number of considered genes: 310


## Prototype utilities for data extraction and structurization (classes, functions, methods)

In [11]:
class Drug(object):
    
    """Class representing compound from GDSC database.
    
    Attributes:
    gdsc_id: ID from GDSC website
    name: drug name
    targets: list of strings representing drug's target genes
    target_pathway: drug's target pathway
    
    
    
    Methods:
    extract_drug_response_data: generates a frame with drug-response data
    
    
    """
    # Class variables
    map_from_hgnc_to_ensembl = None
    map_from_ensembl_to_hgnc = None
    
    # Instance methods
    def __init__(self, gdsc_id, name, targets, target_pathway):
        self.gdsc_id = gdsc_id
        self.name = name
        self.targets = targets
        self.target_pathway = target_pathway
        self.ensembl_targets = []
        for x in self.targets:
            try:
                self.ensembl_targets.append(self.map_from_hgnc_to_ensembl[x])
            except KeyError:
                pass
        
    
    def extract_drug_response_data(self, sensitivity_profiles_df, metric="AUC"):
        """Generate a DataFrame containing reponses for every cell line screened for that drug.
        
        Arguments:
        sensitivity_profiles_df (DataFrame): DataFrame of drug response data from GDSC
        metric (string): which statistic to use as a response metric (default "AUC")
        
        Returns:
        None
        """
        df = drug_response_df[drug_response_df.DRUG_ID == self.gdsc_id][["COSMIC_ID", metric]]
        df.columns = ["cell_line_id", metric]   # Insert column with samples ID
        
        self.total_no_samples_screened = df.shape[0]    # Record how many screened cell lines for drug
        self.response_data = df
        
    
    def extract_screened_cell_lines(self, sensitivity_profiles_df):
        """Generate set of cell lines screened for that drug.
        
        Arguments:
        sensitivity_profiles_df (DataFrame): DataFrame of drug response data from GDSC
        
        Returns:
        None
        """
        self.screened_cell_lines = list(
            sensitivity_profiles_df[sensitivity_profiles_df.DRUG_ID == self.gdsc_id]["COSMIC_ID"])
    
    def extract_gene_expression(self, gene_expression_df):
        """Generate DataFrame of gene expression data for cell lines screened for this drug, only
        considering drug's target genes.
        
        Arguments:
        gene_expression_df: original GDSC gene expression DataFrame
        sensitivity_profiles_df (DataFrame): DataFrame of drug response data from GDSC
        
        Returns:
        None
        """
        cell_lines_str = []   # Gene expressesion DF column names are strings
        for x in self.screened_cell_lines:
            cell_lines_str.append(str(x))
        cl_to_extract = []
        for x in cell_lines_str:
            if x in list(gene_expression_df.columns):   
                cl_to_extract.append(x)   # Extract only cell lines contained in gene expression data
        gene_expr = gene_expression_df[
            gene_expression_df.ensembl_gene.isin(self.ensembl_targets)][["ensembl_gene"] + cl_to_extract]
        gene_expr_t = gene_expr.transpose()
        columns = list(gene_expr_t.loc["ensembl_gene"])
        gene_expr_t.columns = columns
        gene_expr_t = gene_expr_t.drop(["ensembl_gene"])
        rows = list(gene_expr_t.index)
        gene_expr_t.insert(0, "cell_line_id", rows)   # Insert columns with cell line IDs
        gene_expr_t.reset_index(drop=True, inplace=True)
        gene_expr_t["cell_line_id"] = pd.to_numeric(gene_expr_t["cell_line_id"])
        self.gene_expression_data = gene_expr_t
        
    def extract_mutation_data(self, mutation_df):
        """Generate a DataFrame with binary mutation calls for screened cell lines and target genes.
        
        Arguments:
        mutation_df: DataFrame with original mutation calls from GDSC
        
        Returns:
        None
        """
        targets = [x + "_mut" for x in test_compound.targets]
        df = mutation_df.copy()[
                mutation_df.cosmic_sample_id.isin(self.screened_cell_lines)]
        df = df[df.genetic_feature.isin(targets)][["cosmic_sample_id", "genetic_feature", "is_mutated"]]
        cosmic_ids = []
        genetic_features = {}
        for feature in df.genetic_feature.unique():
            genetic_features[feature] = []
        for ide in df.cosmic_sample_id.unique():
            cosmic_ids.append(ide)
            for feature in genetic_features:
                mutation_status = df[
                    (df.cosmic_sample_id == ide) & (df.genetic_feature == feature)]["is_mutated"].iloc[0]
                genetic_features[feature].append(mutation_status)
        df1 = pd.DataFrame()
        df1.insert(0, "cell_line_id", cosmic_ids)    # Insert column with samples IDs
        for feature in genetic_features:
            df1[feature] = genetic_features[feature]
        self.mutation_data = df1
        
    def extract_cnv_data(self, cnv_binary_df):
        """Generate data containing binary CNV calls for cell lines screened for the drug.
        
        Arguments:
        cnv_binary_df: DataFrame from GDSC download tool with CNV data
        
        Returns:
        None
        """
        df = cnv_binary_df[cnv_binary_df.cosmic_sample_id.isin(self.screened_cell_lines)]
        features_to_extract = []   # Map drug's targets to CNV features (segments)
        for row in cnv_binary_df.drop_duplicates(subset="genetic_feature").itertuples():
            feature_name = getattr(row, "genetic_feature")
            genes_in_segment = getattr(row, "genes_in_segment").split(",")
            for target in test_compound.targets:
                if target in genes_in_segment:
                    features_to_extract.append(feature_name)   # If target is in any segment, add it to the list
        features_to_extract = list(set(features_to_extract))
        df = df[df.genetic_feature.isin(features_to_extract)]
        cosmic_ids = []
        feature_dict = {}   # Separate lists for every column in final DataFrame
        for feature in df.genetic_feature.unique():
            feature_dict[feature] = []
        for cl_id in df.cosmic_sample_id.unique():
            cosmic_ids.append(cl_id)
            for feature in feature_dict:
                status = df[
                    (df.cosmic_sample_id == cl_id) & (df.genetic_feature == feature)]["is_mutated"].iloc[0]
                feature_dict[feature].append(status)
        new_df = pd.DataFrame()
        for feature in feature_dict:
            new_df[feature] = feature_dict[feature]
        new_df.insert(0, "cell_line_id", cosmic_ids)
        self.cnv_data = new_df

    
    def concatenate_data(self, data_combination):
        """Generate data containing chosen combination of genetic data classes.
        
        Arguments:
        data_combination: list of strings containing data classes to be included. Available options are:
            "mutation", "expression", "CNV"
        
        Returns:
        None
        """
        # Create a list of DataFrames to include
        objects = [self.response_data]
        if "mutation" in data_combination and self.mutation_data.shape[0] > 0:
            objects.append(self.mutation_data)
        if "expression" in data_combination and self.gene_expression_data.shape[0] > 0:
            objects.append(self.gene_expression_data)
        if "CNV" in data_combination and self.cnv_data.shape[0] > 0:
            objects.append(self.cnv_data)
        # Find intersection in cell lines for all desirable DataFrames
        cl_intersection = set(list(self.response_data["cell_line_id"]))
        for obj in objects:
            cl_intersection = cl_intersection.intersection(set(list(obj["cell_line_id"])))
        objects_common = []
        for obj in objects:
            objects_common.append(obj[obj["cell_line_id"].isin(cl_intersection)])
        # Check if all DataFrames have the same number of samples
        no_samples = objects_common[0].shape[0]
        for obj in objects_common:
            assert obj.shape[0] == no_samples
            obj.sort_values("cell_line_id", inplace=True)
            obj.reset_index(drop=True, inplace=True)
        cl_ids = objects_common[0]["cell_line_id"]
        df_concatenated = pd.concat(objects_common, axis=1, ignore_index=False)
        metric = self.response_data.columns[-1]   # Extract the name of metric which was used for sensitivity
        sensitivities = df_concatenated[metric]
        df_concatenated = df_concatenated.drop(["cell_line_id", metric], axis=1)
        df_concatenated.insert(0, "cell_line_id", cl_ids)
        df_concatenated.insert(df_concatenated.shape[1], metric, sensitivities)
        self.full_data = df_concatenated
        
    
    
    @classmethod
    def load_mappings(cls, filepath_hgnc_to_ensembl, filepath_ensembl_to_hgnc):
        """Load dictonaries with gene mappings between HGNC and Ensembl (from pickle files) and assugn it 
        to corresponding class variables.
        
        Arguments:
        filepath_hgnc_to_ensembl: file with accurate mapping
        filepath_ensembl_to_hgnc: file with accurate mapping
        
        Returns:
        None
        """
        cls.map_from_hgnc_to_ensembl = pickle.load(open(filepath_hgnc_to_ensembl, "rb"))
        cls.map_from_ensembl_to_hgnc = pickle.load(open(filepath_ensembl_to_hgnc, "rb"))
        
    
    @staticmethod
    def create_drugs(drug_annotations_df):
        """Create a dictionary of Drug class objects, each referenced by it's ID.

        Arguments:
        drug_annotations_df (DataFrame): DataFrame of drug annotations from GDSC website

        Returns:
        Dictionary of Drug objects as values and their ID's as keys
        """
        drugs = {}
        for row in drug_annotations_df.itertuples(index=True, name="Pandas"):
            gdsc_id = getattr(row, "DRUG_ID")
            name = getattr(row, "DRUG_NAME")
            targets = getattr(row, "TARGET").split(", ")
            target_pathway = getattr(row, "TARGET_PATHWAY")

            drugs[gdsc_id] = Drug(gdsc_id, name, targets, target_pathway)
        return drugs


In [12]:
filepath1 = "/media/krzysztof/Nowy/Doktorat/Projects/GDSC - Prediction only with data related to nominal drug targets (minimal approach)/Created data/mapping_from_ensembl_id_to_hgnc_symbol.p"
filepath2 = "/media/krzysztof/Nowy/Doktorat/Projects/GDSC - Prediction only with data related to nominal drug targets (minimal approach)/Created data/mapping_from_hgnc_symbol_to_ensembl_id.p"
Drug.load_mappings(filepath2, filepath1)

In [13]:
test_drugs = Drug.create_drugs(drug_annotations_df.iloc[100:108])
print(test_drugs.keys())

dict_keys([228, 229, 230, 231, 235, 238, 245, 249])


In [14]:
test_compound = test_drugs[249]
test_compound.extract_screened_cell_lines(drug_response_df)
print(test_compound.name)
print(test_compound.ensembl_targets)
print(test_compound.targets)
test_compound.extract_drug_response_data(drug_response_df)
test_compound.extract_gene_expression(gene_expression_df)
test_compound.extract_mutation_data(coding_variants_df)
test_compound.extract_cnv_data(cnv_binary_df)

test_compound.concatenate_data(["expression", "mutation", "CNV"])

print(test_compound.gene_expression_data.shape)
print(test_compound.mutation_data.shape)
print(test_compound.cnv_data.shape)
print(test_compound.full_data.shape)

Cabozantinib
['ENSG00000105976', 'ENSG00000165731', 'ENSG00000157404', 'ENSG00000102755', 'ENSG00000122025', 'ENSG00000037280']
['VEGFR', 'MET', 'RET', 'KIT', 'FLT1', 'FLT3', 'FLT4', 'TIE2,AXL']
(920, 7)
(887, 4)
(916, 4)
(861, 14)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [17]:
drug_annotations_df.head()

Unnamed: 0,DRUG_ID,DRUG_NAME,SYNONYMS,TARGET,TARGET_PATHWAY
0,1,Erlotinib,"Tarceva, RG-1415, CP-358774, OSI-774, Ro-50823...",EGFR,EGFR signaling
1,3,Rapamycin,"AY-22989, Sirolimus, WY-090217, Torisel, Rapamune",MTORC1,PI3K/MTOR signaling
2,5,Sunitinib,"Sutent, Sunitinib Malate, SU-11248","PDGFR, KIT, VEGFR, FLT3, RET, CSF1R",RTK signaling
3,6,PHA-665752,"PHA665752, PHA 665752",MET,RTK signaling
4,9,MG-132,"LLL cpd, MG 132, MG132","Proteasome, CAPN1",Protein stability and degradation


In [18]:
stitch_calls_df.head()

Unnamed: 0,COSMIC drug name,ProteinID / DrugID,9606.ENSP00000263377,9606.ENSP00000354526,9606.ENSP00000398516,9606.ENSP00000403946,9606.ENSP00000358813,9606.ENSP00000263967,9606.ENSP00000274335,9606.ENSP00000289153,...,9606.ENSP00000448165,9606.ENSP00000359478,9606.ENSP00000355930,9606.ENSP00000261707,9606.ENSP00000337773,9606.ENSP00000355920,9606.ENSP00000366506,9606.ENSP00000261416,9606.ENSP00000223095,9606.ENSP00000386171
0,PFI-1,CIDs71271629,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,UNC1215,CIDs57339144,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,LFM-A13,CIDs54676905,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,NG-25,CIDs53340664,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,PF-4708671,CIDs51371303,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [41]:
# for column in stitch_calls_df.columns:
#     if stitch_calls_df[stitch_calls_df["COSMIC drug name"] == "Erlotinib"][column].iloc[0] == 1:
#         print(column)

In [38]:
stitch_calls_df[stitch_calls_df["COSMIC drug name"] == "Erlotinib"]

Unnamed: 0,COSMIC drug name,ProteinID / DrugID,9606.ENSP00000263377,9606.ENSP00000354526,9606.ENSP00000398516,9606.ENSP00000403946,9606.ENSP00000358813,9606.ENSP00000263967,9606.ENSP00000274335,9606.ENSP00000289153,...,9606.ENSP00000448165,9606.ENSP00000359478,9606.ENSP00000355930,9606.ENSP00000261707,9606.ENSP00000337773,9606.ENSP00000355920,9606.ENSP00000366506,9606.ENSP00000261416,9606.ENSP00000223095,9606.ENSP00000386171
124,Erlotinib,CIDs00176870,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [48]:
# Erlotinib ID: CIDs00176870
print(stitch_links_df[stitch_links_df.chemical == "CIDs00176870"].shape)
print(stitch_links_df[stitch_links_df.chemical == "CIDs00176870"].protein.nunique())

(304, 11)
304


In [64]:
erlotinib = stitch_links_df[stitch_links_df.chemical == "CIDs00176870"]

In [65]:
erlotinib[erlotinib.experimental_direct > 0].shape

(44, 11)

In [68]:
string_map_df.head()

Unnamed: 0,#Entrez_Gene_ID,STRING_Locus_ID
0,1,9606.ENSP00000263100
1,2,9606.ENSP00000323929
2,9,9606.ENSP00000443194
3,10,9606.ENSP00000286479
4,13,9606.ENSP00000232892
