# Create the DNA, RNA, and Protein H5 Files #

File to generate the H5 files for the DNA, RNA, and protein data.

Pre-requisites:
- Finish the pre-processing steps before this file, especially running nci_almanac_therapy_classification.ipynb

In [1]:
# Import everything you need
import h5py
import numpy as np
import pandas as pd
from dataset_creation.getProcessedData import *

Assume that test_create_training_data.ipynb has already preprocessed and filtered the data modalities. Filtered dataframes are stored at:
- data_processed/filtered_almcomb_pg.csv (has percent growth and score)
- data_processed/filtered_almcomb_combo_.csv (comboscore, hsa, zip)
- data_processed/filtered_dna_df.csv
- data_processed/filtered_rna_df.csv
- data_processed/filtered_protein_df.csv
- data_processed/filtered_string_df.csv

First get the datasets

In [2]:
# Load the unfiltered DNA, RNA and protein tables. Each loader (imported from
# dataset_creation.getProcessedData) returns the data table together with an
# identifier table; the identifier tables are used below to translate Entrez IDs
# into each modality's own column names.
dna_df_unfilt, dna_identifier_df = get_dna_data()
rna_df_unfilt, rna_identifier_df = get_rna_data()
protein_df_unfilt, protein_identifier_df = get_protein_data()
# Each *_df_unfilt dataframe is indexed by cell line (rows) and has one column per -omic feature.

Filter the DNA dataset down to the features in the top 5% by variance, then restrict the RNA and protein datasets to the Entrez IDs of the retained DNA features

In [3]:
# Drop the low-variance DNA features (threshold=5). The helper also hands back the
# Entrez IDs that remain, which drive the RNA and protein column selection below.
dna_df, new_intersection_entrez = remove_low_var_columns(
    dna_identifier_df,
    dna_df_unfilt,
    threshold=5,
)

# RNA: translate each retained Entrez ID into its 'Gene' column name (first match wins).
print("Original RNA data shape:", rna_df_unfilt.shape)
rna_identifiers = [
    rna_identifier_df.loc[rna_identifier_df['Entrez'] == entrez_id, 'Gene'].values[0]
    for entrez_id in new_intersection_entrez
]
rna_df = rna_df_unfilt.loc[:, rna_identifiers]
print("Filtered RNA data shape:", rna_df.shape)


# Protein: same translation, but the column names live in the 'Identifier' column.
print("Original protein data shape:", protein_df_unfilt.shape)
prot_identifiers = [
    protein_identifier_df.loc[protein_identifier_df['Entrez'] == entrez_id, 'Identifier'].values[0]
    for entrez_id in new_intersection_entrez
]
protein_df = protein_df_unfilt.loc[:, prot_identifiers]
print("Filtered protein data shape:", protein_df.shape)


Original intersection entrez IDs: 2665
Original number of features: 23372
Keeping top 5% of features = 1168 features
Number of features after removing low variance columns: 1168
Number of unique entrez IDs after removing low variance columns: 786
Original RNA data shape: (58, 2665)
Filtered RNA data shape: (58, 786)
Original protein data shape: (58, 2668)
Filtered protein data shape: (58, 786)


In [5]:
# Load the processed drug data:
#   drug_pg_df         - percent-growth rows (has 'PANEL', 'NSC1', 'NSC2' columns, used below)
#   drug_comboscore_df - comboscore rows (same columns used below)
#   nsc_to_mfp         - mapping from drug NSC to its Morgan fingerprint
#   nsc_to_prop_df     - not used in the cells shown here; presumably per-drug properties (TODO confirm)
drug_pg_df, drug_comboscore_df, nsc_to_mfp, nsc_to_prop_df = get_processed_drug_data()

Previously, we wrote CSV files that stored redundant information. This time, we create HDF5 files to reduce redundancy.

In [6]:
def _as_fixed_width_bytes(values, min_width):
    """Return ``values`` as a fixed-width byte-string array that never truncates.

    ``np.array(strings, dtype='S<n>')`` silently cuts off anything longer than
    ``n`` bytes. ``min_width`` keeps the historical on-disk width whenever the
    data fits; the width only grows when an entry is longer than that.
    """
    longest = max((len(str(value)) for value in values), default=0)
    return np.array(values, dtype=f'S{max(min_width, longest)}')


def _write_index_mapping(group, keys):
    """Create a 'mapping' sub-group whose attributes map each key to its position."""
    mapping = group.create_group('mapping')
    for position, key in enumerate(keys):
        mapping.attrs[key] = position


def _write_omics_group(h5_file, group_name, label, omics_df, identifier_min_width):
    """Write one -omics dataframe (rows: cell lines, columns: features) as an HDF5 group.

    Args:
        h5_file: open, writable ``h5py.File``.
        group_name: name of the group to create (e.g. 'dna').
        label: modality name used in the progress message (e.g. 'DNA').
        omics_df: dataframe indexed by cell line with one column per feature.
        identifier_min_width: minimum byte width of the stored feature identifiers.
    """
    group = h5_file.create_group(group_name)

    cell_lines = omics_df.index.values
    group.create_dataset('cell_lines', data=_as_fixed_width_bytes(cell_lines, 15))

    features = omics_df.values
    print(f"Storing {label} data with shape:", features.shape)
    group.create_dataset(
        'features',
        data=features,
        compression='gzip',
        compression_opts=9,
        # Chunk by blocks of up to 32 cell lines, each holding the full feature row.
        chunks=(min(32, len(cell_lines)), features.shape[1]),
    )

    feature_identifiers = omics_df.columns.values
    group.create_dataset(
        'feature_identifiers',
        data=_as_fixed_width_bytes(feature_identifiers, identifier_min_width),
    )
    group.attrs['n_cell_lines'] = len(cell_lines)
    group.attrs['n_features'] = features.shape[1]
    _write_index_mapping(group, feature_identifiers)


with h5py.File('data/ASP_dataset_slices/all_256mfpdnarnaprot.h5', 'w') as f:
    # Store drug morgan fingerprints
    drug_mfp_group = f.create_group('drug_mfp')
    drug_nscs = list(nsc_to_mfp.keys())
    drug_mfp_group.create_dataset('nscs', data=_as_fixed_width_bytes(drug_nscs, 6))
    mfp = np.array([nsc_to_mfp[nsc] for nsc in drug_nscs], dtype=np.uint8)
    print("Storing drug Morgan fingerprints with shape:", mfp.shape) #Should be Nx256
    drug_mfp_group.create_dataset(
        'mfp',
        data=mfp,
        compression='gzip',
        compression_opts=9,
        chunks=(min(64, len(drug_nscs)), mfp.shape[1]),
    )
    drug_mfp_group.attrs['n_drugs'] = len(drug_nscs)
    drug_mfp_group.attrs['mfp_length'] = mfp.shape[1]
    _write_index_mapping(drug_mfp_group, drug_nscs)

    # Store DNA, RNA and protein data. The widths are the historical minimums
    # (S38 / S10 / S83); longer identifiers widen the dtype instead of being cut off.
    _write_omics_group(f, 'dna', 'DNA', dna_df, 38)          # expected 58x1168
    _write_omics_group(f, 'rna', 'RNA', rna_df, 10)          # expected 58x786
    _write_omics_group(f, 'protein', 'protein', protein_df, 83)  # expected 58x786
        


Storing drug Morgan fingerprints with shape: (105, 256)
Storing DNA data with shape: (58, 1168)
Storing RNA data with shape: (58, 786)
Storing protein data with shape: (58, 786)


In [8]:
# Save the comboscore and percent growth dataframes as CSV next to the HDF5 file.
# NOTE(review): index=False drops the dataframe index from the CSVs, while the index
# files written later in this notebook store dataframe index labels. Those only match
# CSV row positions if both dataframes carry a default 0..N-1 index - confirm.
drug_comboscore_df.to_csv('data/ASP_dataset_slices/drug_comboscore_hsa_zip.csv', index=False)
drug_pg_df.to_csv('data/ASP_dataset_slices/drug_percent_growth.csv', index=False)

Get the indices in percent growth or comboscore datasets that correspond to specific tissue types

In [10]:
# Output file prefixes and the matching 'PANEL' values, position by position.
cancer_type_prefixes = ['all_cancer_', 'breast_', 'cns_', 'colon_', 'leukemia_', 'melanoma_', 'nsclc_', 'ovarian_', 'prostate_', 'renal_']
panel_filtering = ['ALL', 'Breast Cancer', 'CNS Cancer', 'Colon Cancer', 'Leukemia', 'Melanoma', 'Non-Small Cell Lung Cancer', 'Ovarian Cancer', 'Prostate Cancer', 'Renal Cancer']

# Row index labels of the percent growth data, grouped by panel (cancer type)
pg_cancer_type_to_row_indices = {
    panel: drug_pg_df[drug_pg_df['PANEL'] == panel].index.to_list()
    for panel in drug_pg_df['PANEL'].unique()
}

# Row index labels of the comboscore data, grouped by panel (cancer type)
comboscore_cancer_type_to_row_indices = {
    panel: drug_comboscore_df[drug_comboscore_df['PANEL'] == panel].index.to_list()
    for panel in drug_comboscore_df['PANEL'].unique()
}


# Write one index file per cancer type and task, one index per line.
# The first entry ('all_cancer_' / 'ALL') is skipped, exactly as before.
for prefix, panel in zip(cancer_type_prefixes[1:], panel_filtering[1:]):
    with open(f'data/ASP_dataset_slices/{prefix}pg_indices.txt', 'w') as f:
        f.writelines(f'{idx}\n' for idx in pg_cancer_type_to_row_indices[panel])
    with open(f'data/ASP_dataset_slices/{prefix}comboscore_indices.txt', 'w') as f:
        f.writelines(f'{idx}\n' for idx in comboscore_cancer_type_to_row_indices[panel])

This creates txt files containing the indices in the NCI_ALMANAC drug data that correspond to specific drug class types

In [11]:
# Drugs belong to one of three therapy classes: Chemotherapy, Targeted, Other.
# Every drug pair is bucketed (ignoring order) into one of:
# Chemo+Chemo, Chemo+Targeted, Chemo+Other, Targeted+Targeted, Targeted+Other, Other+Other
drug_classification_fn = 'data_processed/almanac_nsc_to_drug_types.csv'
drug_classification_df = pd.read_csv(drug_classification_fn, header=0, dtype='str')
drug_classification_df.set_index('NSC_ID', inplace=True)

# (file-name key, printed label, class A, class B), listed in the order the counts are printed.
_DRUG_CLASS_PAIRS = [
    ('chemo_chemo', 'Chemo-Chemo', 'Chemotherapy', 'Chemotherapy'),
    ('targeted_targeted', 'Targeted-Targeted', 'Targeted', 'Targeted'),
    ('other_other', 'Other-Other', 'Other', 'Other'),
    ('chemo_targeted', 'Chemo-Targeted', 'Chemotherapy', 'Targeted'),
    ('chemo_other', 'Chemo-Other', 'Chemotherapy', 'Other'),
    ('targeted_other', 'Targeted-Other', 'Targeted', 'Other'),
]


def _indices_by_drug_class_pair(drug_df, therapy_class_by_nsc, label_prefix):
    """Bucket the rows of ``drug_df`` by the therapy classes of their two drugs.

    Args:
        drug_df: dataframe with 'NSC1' and 'NSC2' columns.
        therapy_class_by_nsc: Series mapping NSC to therapy class name.
        label_prefix: prefix for the printed counts (e.g. 'PG').

    Returns:
        dict mapping each pair key (e.g. 'chemo_targeted') to a list of index
        labels of ``drug_df``. For mixed pairs the (A, B) rows come first,
        followed by the (B, A) rows.

    Prints one count per pair, then whether the buckets account for every row
    (False means some NSC had no, or an unexpected, therapy class).
    """
    drug1_classes = drug_df['NSC1'].map(therapy_class_by_nsc)
    drug2_classes = drug_df['NSC2'].map(therapy_class_by_nsc)

    indices_by_pair = {}
    for key, label, class_a, class_b in _DRUG_CLASS_PAIRS:
        indices = drug_df[(drug1_classes == class_a) & (drug2_classes == class_b)].index.to_list()
        if class_a != class_b:
            # Pairs are unordered, so also collect the rows with the classes swapped.
            indices += drug_df[(drug1_classes == class_b) & (drug2_classes == class_a)].index.to_list()
        indices_by_pair[key] = indices
        print(f"{label_prefix} {label}:", len(indices))

    # The number of indices should add up to the total number of rows
    print(drug_df.shape[0] == sum(len(indices) for indices in indices_by_pair.values()))
    return indices_by_pair


therapy_class_by_nsc = drug_classification_df['Therapy_Class']

# Percentage Growth Regression indices
pg_indices_by_class = _indices_by_drug_class_pair(drug_pg_df, therapy_class_by_nsc, 'PG')
# Verify this works - first index (0) should be in the chemo-chemo indices
assert pg_indices_by_class['chemo_chemo'][0] == 0

# Comboscore indices
cs_indices_by_class = _indices_by_drug_class_pair(drug_comboscore_df, therapy_class_by_nsc, 'CS')


# write the indices to separate files, one index per line
drug_class_file_prefix = 'data/ASP_dataset_slices/' + 'all_cancer_'
pg_indices_suffix = '_pg_indices.txt'
combo_indices_suffix = '_cs_indices.txt'

for suffix, indices_by_class in (
    (pg_indices_suffix, pg_indices_by_class),
    (combo_indices_suffix, cs_indices_by_class),
):
    for drug_class, indices in indices_by_class.items():
        with open(drug_class_file_prefix + drug_class + suffix, 'w') as f:
            f.writelines(f'{idx}\n' for idx in indices)


PG Chemo-Chemo: 1833186
PG Targeted-Targeted: 235746
PG Other-Other: 142110
PG Chemo-Targeted: 1307490
PG Chemo-Other: 1064940
PG Targeted-Other: 384912
True
CS Chemo-Chemo: 195682
CS Targeted-Targeted: 26194
CS Other-Other: 15790
CS Chemo-Targeted: 142510
CS Chemo-Other: 115976
CS Targeted-Other: 42768
True
