# Filtered Data Creation #

Pre-requisites to running this file
- Run the string_processing.py file to preprocess the STRING database if using it as an addition filter. Save this to STRING_full_filtered.tsv in the data_processed directory
- Run the preprocessing.ipynb if using STRING filtration
- If not using STRING filtration, modify this file to preprocess the DNA/RNA/Protein data and remove the STRING filtration code

In [1]:
from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs
import pandas as pd

from preprocess_data import *
from create_training_data import *
from string_preprocessing import *

# import everything needed

Preprocess NCI ALMANAC data

In [2]:
preprocess_drug_data()
nci_almanac_combo_df, almanac_cl_name_to_id = get_drug_data()
almanac_cell_lines = set(almanac_cl_name_to_id.keys())
print("Number of unique cell lines: " + str(len(almanac_cl_name_to_id.keys())))
print("ID for cell line ACHN should be 3: " + str(almanac_cl_name_to_id["ACHN"]))

  nci_almanac_combo_df = pd.read_csv("../data/NCI-ALMANAC/ComboDrugGrowth_Nov2017.csv")


Original shape: (3686475, 29) Preprocessed shape: (2871444, 8)
Number of unique cell lines: 61
ID for cell line ACHN should be 3: 3


Test drug_nsc_to_name fetching

In [3]:
drug_nsc_to_name = get_nsc_to_drug_name_dict()
print("Drug NSC to name dict:")
for key in list(drug_nsc_to_name.keys())[:5]:
    print(key, drug_nsc_to_name[key])

Drug NSC to name dict:
740 Methotrexate
750 Busulfan
752 Thioguanine/6-Thioguanine
755 Mercaptopurine
762 Mechlorethamine hydrochloride


get_smiles_and_fingerprints test

In [4]:
nsc_to_morgan_fingerprints = get_smiles_and_fingerprints(fp_len=256)
print(nsc_to_morgan_fingerprints)

[02:47:37] Explicit valence for atom # 0 Cl, 2, is greater than permitted
[02:47:37] ERROR: Could not sanitize molecule ending on line 6200
[02:47:37] ERROR: Explicit valence for atom # 0 Cl, 2, is greater than permitted


{'740': array([0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 1., 0., 1., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1.,
       0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0.,
       0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 1., 0., 1., 1., 0., 0., 0.,
       0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 1., 0.,
       0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0.,
       0., 0., 1., 1., 0., 0., 1., 0., 0., 1., 1., 0., 0., 0., 1., 0., 1.,
       0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
       1., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 1., 0., 0.,
       0., 0., 1., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
       1., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 1., 0., 0., 0., 0.,
       0., 0., 0.

Test get_physicochemical_properties

In [5]:
nsc_to_properties = get_physicochemical_properties()
print(nsc_to_properties)

Is 66847 in drug_nsc_to_name? True
Is 66847 in sdf_names? True
Number of NSCs in NCI ALMANAC but not in SDF file: 2
{'119875', '753082'}
Number of molecules in SDF file but not in NCI ALMANAC: 0
{'740': {'MolWt': 454.44700000000023, 'TPSA': 210.53999999999996, 'LogP': 0.26840000000000125, 'NumAliphaticRings': 0, 'NumAromaticRings': 3, 'NumHDonors': 5, 'NumHAcceptors': 10}, '750': {'MolWt': 246.30599999999998, 'TPSA': 86.74000000000001, 'LogP': -0.2809999999999997, 'NumAliphaticRings': 0, 'NumAromaticRings': 0, 'NumHDonors': 0, 'NumHAcceptors': 6}, '752': {'MolWt': 167.197, 'TPSA': 83.38, 'LogP': 0.5976899999999999, 'NumAliphaticRings': 0, 'NumAromaticRings': 2, 'NumHDonors': 3, 'NumHAcceptors': 4}, '755': {'MolWt': 152.18200000000002, 'TPSA': 57.36, 'LogP': 1.0154899999999998, 'NumAliphaticRings': 0, 'NumAromaticRings': 2, 'NumHDonors': 2, 'NumHAcceptors': 3}, '762': {'MolWt': 192.51700000000002, 'TPSA': 3.24, 'LogP': 1.8176, 'NumAliphaticRings': 0, 'NumAromaticRings': 0, 'NumHDonors':

[02:47:37] Explicit valence for atom # 0 Cl, 2, is greater than permitted
[02:47:37] ERROR: Could not sanitize molecule ending on line 6200
[02:47:37] ERROR: Explicit valence for atom # 0 Cl, 2, is greater than permitted


In [6]:
nsc_to_prop_df = pd.read_csv("../data_processed/almanac_nsc_to_properties.tsv", sep='\t', index_col=0)
nsc_to_prop_df

Unnamed: 0_level_0,Name,MolWt,TPSA,LogP,NumAliphaticRings,NumAromaticRings,NumHDonors,NumHAcceptors
NSC,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
740,Methotrexate,454.447,210.54,0.26840,0,3,5,10
750,Busulfan,246.306,86.74,-0.28100,0,0,0,6
752,Thioguanine/6-Thioguanine,167.197,83.38,0.59769,0,2,3,4
755,Mercaptopurine,152.182,57.36,1.01549,0,2,2,3
762,Mechlorethamine hydrochloride,192.517,3.24,1.81760,0,0,0,1
...,...,...,...,...,...,...,...,...
757441,Axitinib,386.480,70.67,4.63910,0,4,2,4
760766,Vandetanib,475.362,59.51,5.00420,1,3,1,6
761431,vemurafenib,489.931,91.92,5.54420,0,4,2,4
761432,Cabazitaxel,835.944,202.45,4.56780,4,2,3,14


Preprocess DNA dataset

In [7]:
# preprocess_dna_data() # Uncomment if you haven't run this as part of preprocessing.ipynb
dna_cl_names, dna_identifier_to_entrez, dna_cl_to_exome_mut = get_dna_data()
print("Number of unique cell lines: " + str(len(dna_cl_names)))
print("Number of DNA identifiers: " + str(len(dna_identifier_to_entrez.keys())))
print("Shape of DNA mutation matrix: " + str(dna_cl_to_exome_mut.shape))

# Which cell lines are in the NCI-ALMANAC dataset but not in the DNA dataset?
print("Cell lines in NCI-ALMANAC but not in DNA:")
print(almanac_cell_lines - dna_cl_names)

# Which cell lines are in the DNA dataset but not in the NCI-ALMANAC dataset?
print("Cell lines in DNA but not in NCI-ALMANAC:")
print(dna_cl_names - almanac_cell_lines)

# Which cell lines are in dataset intersection?
print("Cell lines in intersection:")
print(almanac_cell_lines.intersection(dna_cl_names))


Number of unique cell lines: 59
Number of DNA identifiers: 138892
Shape of DNA mutation matrix: (60, 138893)
Cell lines in NCI-ALMANAC but not in DNA:
{'MDA-MB-468', 'SF-539\x1a', 'MCF7'}
Cell lines in DNA but not in NCI-ALMANAC:
{'MDA-N'}
Cell lines in intersection:
{'NCI-H460', 'HCT-15', 'DU-145', 'KM12', '786-0', 'MALME-3M', 'SR', 'OVCAR-4', 'UACC-257', 'SF-295', 'A549/ATCC', 'HOP-62', 'SK-MEL-2', 'OVCAR-8', 'HT29', 'IGROV1', 'U251', 'UO-31', 'UACC-62', 'HCT-116', 'SK-MEL-5', 'SN12C', 'SK-OV-3', 'MDA-MB-231/ATCC', 'COLO 205', 'SF-268', 'A498', 'SNB-75', 'RPMI-8226', 'NCI/ADR-RES', 'RXF 393', 'SK-MEL-28', 'M14', 'SF-539', 'ACHN', 'MDA-MB-435', 'BT-549', 'HOP-92', 'NCI-H23', 'NCI-H322M', 'CAKI-1', 'TK-10', 'SNB-19', 'EKVX', 'OVCAR-3', 'T-47D', 'HS 578T', 'LOX IMVI', 'HCC-2998', 'MOLT-4', 'CCRF-CEM', 'NCI-H226', 'NCI-H522', 'K-562', 'OVCAR-5', 'PC-3', 'HL-60(TB)', 'SW-620'}


Preprocess RNA data

In [8]:
# preprocess_rna_data() # Uncomment if you haven't run this as part of preprocessing.ipynb
rna_cl_names, rna_entrez_to_gene_name, rna_cl_to_expr = get_rna_data()
print("Number of unique cell lines: " + str(len(rna_cl_names)))
print("Number of RNA entrez IDs: " + str(len(rna_entrez_to_gene_name.keys())))
print("Shape of RNA expression matrix: " + str(rna_cl_to_expr.shape))

# Which cell lines are in the NCI-ALMANAC dataset but not in the DNA dataset?
print("Cell lines in NCI-ALMANAC but not in RNA:")
print(almanac_cell_lines - rna_cl_names)

# Which cell lines are in the DNA dataset but not in the NCI-ALMANAC dataset?
print("Cell lines in RNA but not in NCI-ALMANAC:")
print(rna_cl_names - almanac_cell_lines)

# Which cell lines are in dataset intersection?
print("Cell lines in intersection:")
print(almanac_cell_lines.intersection(rna_cl_names))

Number of unique cell lines: 60
Number of RNA entrez IDs: 26178
Shape of RNA expression matrix: (60, 26179)
Cell lines in NCI-ALMANAC but not in RNA:
{'MDA-MB-468', 'SF-539\x1a'}
Cell lines in RNA but not in NCI-ALMANAC:
{'MDA-N'}
Cell lines in intersection:
{'NCI-H460', 'HCT-15', 'DU-145', 'KM12', '786-0', 'MALME-3M', 'SR', 'OVCAR-4', 'UACC-257', 'SF-295', 'A549/ATCC', 'HOP-62', 'SK-MEL-2', 'OVCAR-8', 'HT29', 'IGROV1', 'U251', 'UO-31', 'UACC-62', 'HCT-116', 'SK-MEL-5', 'SN12C', 'SK-OV-3', 'MDA-MB-231/ATCC', 'COLO 205', 'SF-268', 'A498', 'SNB-75', 'RPMI-8226', 'NCI/ADR-RES', 'RXF 393', 'SK-MEL-28', 'M14', 'SF-539', 'ACHN', 'MDA-MB-435', 'BT-549', 'HOP-92', 'NCI-H23', 'NCI-H322M', 'CAKI-1', 'TK-10', 'SNB-19', 'EKVX', 'OVCAR-3', 'T-47D', 'HS 578T', 'LOX IMVI', 'MCF7', 'HCC-2998', 'MOLT-4', 'CCRF-CEM', 'NCI-H226', 'NCI-H522', 'K-562', 'OVCAR-5', 'PC-3', 'HL-60(TB)', 'SW-620'}


Preprocess protein data

In [9]:
# preprocess_protein_data() # Uncomment if you haven't run this as part of preprocessing.ipynb
protein_cl_names, protein_identifier_to_entrez, protein_cl_to_expr = get_protein_data()
print("Number of unique cell lines: " + str(len(protein_cl_names)))
print("Number of protein identifiers: " + str(len(protein_identifier_to_entrez.keys())))
print("Shape of protein matrix: " + str(protein_cl_to_expr.shape))

# Which cell lines are in the NCI-ALMANAC dataset but not in the protein dataset?
print("Cell lines in NCI-ALMANAC but not in protein:")
print(almanac_cell_lines - protein_cl_names)

# Which cell lines are in the protein dataset but not in the NCI-ALMANAC dataset?
print("Cell lines in protein but not in NCI-ALMANAC:")
print(protein_cl_names - almanac_cell_lines)

# Which cell lines are in dataset intersection?
print("Cell lines in intersection:")
print(almanac_cell_lines.intersection(protein_cl_names))

Number of unique cell lines: 59
Number of protein identifiers: 3167
Shape of protein matrix: (59, 3168)
Cell lines in NCI-ALMANAC but not in protein:
{'MDA-MB-468', 'SF-539\x1a'}
Cell lines in protein but not in NCI-ALMANAC:
set()
Cell lines in intersection:
{'NCI-H460', 'HCT-15', 'DU-145', 'KM12', '786-0', 'MALME-3M', 'SR', 'OVCAR-4', 'UACC-257', 'SF-295', 'A549/ATCC', 'HOP-62', 'SK-MEL-2', 'OVCAR-8', 'HT29', 'IGROV1', 'U251', 'UO-31', 'UACC-62', 'HCT-116', 'SK-MEL-5', 'SN12C', 'SK-OV-3', 'MDA-MB-231/ATCC', 'COLO 205', 'SF-268', 'A498', 'SNB-75', 'RPMI-8226', 'NCI/ADR-RES', 'RXF 393', 'SK-MEL-28', 'M14', 'SF-539', 'ACHN', 'MDA-MB-435', 'BT-549', 'HOP-92', 'NCI-H23', 'NCI-H322M', 'CAKI-1', 'TK-10', 'SNB-19', 'EKVX', 'OVCAR-3', 'T-47D', 'HS 578T', 'LOX IMVI', 'MCF7', 'HCC-2998', 'MOLT-4', 'CCRF-CEM', 'NCI-H226', 'NCI-H522', 'K-562', 'OVCAR-5', 'PC-3', 'HL-60(TB)', 'SW-620'}


Get the intersection of cell lines for all data modalities

In [10]:
filtered_almanac_df, filtered_dna, filtered_rna, filtered_protein, filtered_string, intersection_entrez_ids = get_filtered_data() # Would need to modify to not use STRING if you don't have it

Number of cell lines in intersection: 58
Original shape: (2871444, 8) Filtered shape: (2774280, 8)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  almanac_df["DRUGORDER"] = swapped_order


Original shape: (60, 138893) Filtered shape: (58, 138893)
DNA NANs: False
Original shape: (60, 26179) Filtered shape: (58, 18212)
RNA NANs: False
Original shape: (59, 3168) Filtered shape: (58, 3168)
Protein NANs: False
Number of DNA entrez ids: 15664
Number of RNA entrez ids: 18211
Number of protein entrez ids: 3162
Number of string entrez ids: 18421
Number of entrez ids in intersection: 2665
Original shape: (58, 138893) Filtered shape: (58, 23373)
Original shape: (58, 18212) Filtered shape: (58, 2666)
Original shape: (58, 3168) Filtered shape: (58, 2669)
Original dataframe shape: (5171250, 5)
Protein subnetwork shape: (2173208, 5)
