# Create -Omics Identifiers and Masks CSV Files #
- Create the column identifiers for each omics modality and also the masking file based on genes for the PCNNGL model

Pre-requisites:
- The pre-processing steps that come before this notebook have been completed, so the processed data files it loads already exist

In [10]:
# Import everything you need

import numpy as np
import os
import pandas as pd
import torch
from getProcessedData import *

First get the datasets

In [11]:
# Load the three -omics modalities. Each loader (presumably provided by the wildcard import
# from getProcessedData) returns a pair that is unpacked into the unfiltered data frame and a
# companion identifier frame.
# NOTE(review): later cells query the identifier frames by an 'Entrez' column together with
# 'Identifier' (DNA, protein) or 'Gene' (RNA); confirm that schema against getProcessedData.
dna_df_unfilt, dna_identifier_df = get_dna_data()
rna_df_unfilt, rna_identifier_df = get_rna_data()
protein_df_unfilt, protein_identifier_df = get_protein_data()
# dataframes have a structure of row index cell line and column index -omic feature

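Filter the DNA dataset down to the features in the top 5% by variance (1692 was previously determined to be the 95th-percentile variance cutoff). The RNA and protein datasets are then subset to the genes (Entrez IDs) that survive this DNA filter.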

In [12]:
def _first_label_for_entrez(identifier_df, label_column, entrez_id):
    """Return the first `label_column` value among the rows whose 'Entrez' equals `entrez_id`.

    Raises IndexError when no row of `identifier_df` carries that Entrez ID.
    """
    matching_rows = identifier_df[identifier_df['Entrez'] == entrez_id]
    return matching_rows[label_column].values[0]


# Keep only the high-variance DNA features; the call also yields the Entrez IDs that remain.
dna_df, new_intersection_entrez = remove_low_var_columns(
    dna_identifier_df,
    dna_df_unfilt,
    threshold=1692,
)

# RNA: pick one 'Gene' column per surviving Entrez ID, in the order the IDs were returned.
print("Original RNA data shape:", rna_df_unfilt.shape)
rna_identifiers = [
    _first_label_for_entrez(rna_identifier_df, 'Gene', entrez_id)
    for entrez_id in new_intersection_entrez
]
rna_df = rna_df_unfilt[rna_identifiers]
print("Filtered RNA data shape:", rna_df.shape)

# Protein: same selection, but the column label lives in 'Identifier'.
print("Original protein data shape:", protein_df_unfilt.shape)
prot_identifiers = [
    _first_label_for_entrez(protein_identifier_df, 'Identifier', entrez_id)
    for entrez_id in new_intersection_entrez
]
protein_df = protein_df_unfilt[prot_identifiers]
print("Filtered protein data shape:", protein_df.shape)


Original intersection entrez IDs: 2665
Original number of features: 23372
Number of features after removing low variance columns: 1171
Number of unique entrez IDs after removing low variance columns: 722
Original RNA data shape: (58, 2665)
Filtered RNA data shape: (58, 722)
Original protein data shape: (58, 2668)
Filtered protein data shape: (58, 722)


Get the drug data

In [13]:
# Load the processed drug data; the loader's return value is unpacked into four objects.
# NOTE(review): judging by the names these are the percent-growth table, the comboscore table,
# an NSC -> fingerprint mapping and an NSC -> properties frame -- confirm against
# getProcessedData. None of them is referenced by the other cells visible in this notebook.
drug_pg_df, drug_comboscore_df, nsc_to_mfp, nsc_to_prop_df = get_processed_drug_data()

Generate the CSV files for the identifiers

In [14]:
# Number of per-drug feature columns (Drug1Ft*/Drug2Ft*). Defined before the column names are
# built so that the names and the file-name prefix are always derived from the same value.
mfp_len = 256

# Comma-separated column names for each block of features
drug1_identifiers = ','.join('Drug1Ft' + str(x) for x in range(mfp_len))
drug1_conc_identifiers = ',Drug1Conc'  # leading comma: always appended right after drug1_identifiers
drug2_identifiers = ','.join('Drug2Ft' + str(x) for x in range(mfp_len))
drug2_conc_identifiers = ',Drug2Conc'  # leading comma: always appended right after drug2_identifiers
dna_identifiers = ','.join(str(x) for x in dna_df.columns)
rna_identifiers = ','.join(str(x) for x in rna_df.columns)
protein_identifiers = ','.join(str(x) for x in protein_df.columns)

# Pieces used to assemble the output file names
file_prefix = '../data/ASP_dataset_slices/'
identifiers_folder = 'identifiers_column_names/'
mf_prefix = str(mfp_len) + '_mfp'
dna_prefix = 'dna'
rna_prefix = 'rna'
prot_prefix = 'prot'
pg_prefix = '_pg'
cs_prefix = '_cs'
identifier_suffix = '_identifiers.csv'
mask1gl_suffix = '_mask1gl.csv'


# Each entry switches modalities on/off; one pg file and one cs file is written per entry.
data_combinations_to_get = [
    # Single modalities
    {'mfp': True, 'dna': False, 'rna': False, 'prot': False},
    # {'mfp': False, 'dna': True, 'rna': False, 'prot': False},
    # {'mfp': False, 'dna': False, 'rna': True, 'prot': False},
    # {'mfp': False, 'dna': False, 'rna': False, 'prot': True},

    # MFP + 1 Omic modalities
    {'mfp': True, 'dna': True, 'rna': False, 'prot': False},
    {'mfp': True, 'dna': False, 'rna': True, 'prot': False},
    {'mfp': True, 'dna': False, 'rna': False, 'prot': True},

    # MFP + 2 Omic modalities
    {'mfp': True, 'dna': True, 'rna': True, 'prot': False},
    {'mfp': True, 'dna': True, 'rna': False, 'prot': True},
    {'mfp': True, 'dna': False, 'rna': True, 'prot': True},

    # MFP + 3 Omic modalities
    {'mfp': True, 'dna': True, 'rna': True, 'prot': True},
]

# (combination key, file-name fragment, pg columns, cs columns), in the order the blocks are
# laid out in the output. Only the drug block differs between pg and cs: pg carries the two
# concentration columns, cs does not.
modality_columns = [
    (
        'mfp',
        mf_prefix,
        drug1_identifiers + drug1_conc_identifiers + ',' + drug2_identifiers + drug2_conc_identifiers,
        drug1_identifiers + ',' + drug2_identifiers,
    ),
    ('dna', dna_prefix, dna_identifiers, dna_identifiers),
    ('rna', rna_prefix, rna_identifiers, rna_identifiers),
    ('prot', prot_prefix, protein_identifiers, protein_identifiers),
]


def _append_columns(existing, addition):
    """Append `addition` to `existing`, inserting a comma only when `existing` is non-empty."""
    if existing != '':
        existing += ','
    return existing + addition


for data_combo in data_combinations_to_get:
    fn = file_prefix + identifiers_folder
    pg_identifiers = ''
    cs_identifiers = ''
    for modality, fn_fragment, pg_columns, cs_columns in modality_columns:
        if not data_combo[modality]:
            continue
        fn += fn_fragment
        pg_identifiers = _append_columns(pg_identifiers, pg_columns)
        cs_identifiers = _append_columns(cs_identifiers, cs_columns)
    pg_fn = fn + pg_prefix + identifier_suffix
    cs_fn = fn + cs_prefix + identifier_suffix

    print('Saving pg identifiers to:', pg_fn)
    print('Saving cs identifiers to:', cs_fn)

    # Both files live in the same folder; create it if needed. exist_ok avoids the
    # check-then-create race, and any other OSError still propagates to the caller.
    os.makedirs(os.path.dirname(pg_fn), exist_ok=True)

    # Each file holds a single comma-separated line of column names (no trailing newline).
    with open(pg_fn, 'w') as f:
        f.write(pg_identifiers)
    with open(cs_fn, 'w') as f:
        f.write(cs_identifiers)
    

Saving pg identifiers to: ../data/ASP_dataset_slices/identifiers_column_names/256_mfp_pg_identifiers.csv
Saving cs identifiers to: ../data/ASP_dataset_slices/identifiers_column_names/256_mfp_cs_identifiers.csv
Saving pg identifiers to: ../data/ASP_dataset_slices/identifiers_column_names/256_mfpdna_pg_identifiers.csv
Saving cs identifiers to: ../data/ASP_dataset_slices/identifiers_column_names/256_mfpdna_cs_identifiers.csv
Saving pg identifiers to: ../data/ASP_dataset_slices/identifiers_column_names/256_mfprna_pg_identifiers.csv
Saving cs identifiers to: ../data/ASP_dataset_slices/identifiers_column_names/256_mfprna_cs_identifiers.csv
Saving pg identifiers to: ../data/ASP_dataset_slices/identifiers_column_names/256_mfpprot_pg_identifiers.csv
Saving cs identifiers to: ../data/ASP_dataset_slices/identifiers_column_names/256_mfpprot_cs_identifiers.csv
Saving pg identifiers to: ../data/ASP_dataset_slices/identifiers_column_names/256_mfpdnarna_pg_identifiers.csv
Saving cs identifiers to: ../

Generate the CSV files for the 1 gene layer mask
- If further partially connected layers are added later, new masks will need to be created for them

In [15]:
final_entrez_sorted_list = sorted(new_intersection_entrez) # for 1 partially connected gene layer

# Mask1gls only change for the column feature data (MFP, DNA, RNA, Protein), so don't need to recompute for cancer type or for drug class type
# It does change between comboscore (CS) vs percent growth (PG) data because of added concentration columns
drug1_ft_names = ['Drug1Ft' + str(x) for x in range(mfp_len)]
drug2_ft_names = ['Drug2Ft' + str(x) for x in range(mfp_len)]
cs_mfp_identifier_list = drug1_ft_names + drug2_ft_names # No concentration in comboscore data
pg_mfp_identifier_list = drug1_ft_names + ['Drug1Conc'] + drug2_ft_names + ['Drug2Conc'] # With Concentration

# Drug rows are all ones: every drug feature is connected to every gene column.
# Row counts come from the identifier lists so they cannot drift from mfp_len.
cs_mfp_mask1gl = pd.DataFrame(
    np.ones((len(cs_mfp_identifier_list), len(final_entrez_sorted_list))),
    index=cs_mfp_identifier_list,
    columns=final_entrez_sorted_list,
)
pg_mfp_mask1gl = pd.DataFrame(
    np.ones((len(pg_mfp_identifier_list), len(final_entrez_sorted_list))),
    index=pg_mfp_identifier_list,
    columns=final_entrez_sorted_list,
)


def _gene_layer_mask(feature_names, identifier_df, feature_column, gene_ids):
    """Build a (feature x gene) 0/1 float mask linking each feature to its Entrez ID.

    feature_names  -- column labels of the -omics data frame (become the mask's row index)
    identifier_df  -- frame with `feature_column` and 'Entrez' columns
    feature_column -- name of the column in `identifier_df` holding the feature labels
    gene_ids       -- Entrez IDs that become the mask's columns

    Entry [row, col] is 1.0 when the feature's Entrez ID equals gene_ids[col], else 0.0.
    When a feature label occurs several times in `identifier_df`, its first row is used.
    Raises KeyError if a feature label is missing from `identifier_df`.
    """
    # Resolve every feature's Entrez ID once instead of re-scanning the identifier frame
    # for each (gene, feature) pair.
    first_rows = identifier_df.drop_duplicates(subset=feature_column)
    feature_to_entrez = dict(zip(first_rows[feature_column], first_rows['Entrez']))
    # Object arrays keep the original element-wise `==` semantics (no dtype coercion).
    feature_entrez = np.array([feature_to_entrez[name] for name in feature_names], dtype=object)
    gene_array = np.array(gene_ids, dtype=object)
    # Broadcast to a (n_features, n_genes) boolean grid, then to the 0.0/1.0 floats of the mask.
    mask_values = (feature_entrez[:, None] == gene_array[None, :]).astype(float)
    return pd.DataFrame(mask_values, index=feature_names, columns=gene_ids)


# -Omics masks: a feature is connected only to the gene column carrying its own Entrez ID
dna_mask1gl = _gene_layer_mask(dna_df.columns, dna_identifier_df, 'Identifier', final_entrez_sorted_list)
rna_mask1gl = _gene_layer_mask(rna_df.columns, rna_identifier_df, 'Gene', final_entrez_sorted_list)
protein_mask1gl = _gene_layer_mask(protein_df.columns, protein_identifier_df, 'Identifier', final_entrez_sorted_list)

# Save the mask1gls, do the concatenation of the dataframes in the model code
cs_mfp_mask1gl_fn = file_prefix + mf_prefix + cs_prefix + mask1gl_suffix
pg_mfp_mask1gl_fn = file_prefix + mf_prefix + pg_prefix + mask1gl_suffix
dna_mask1gl_fn = file_prefix + dna_prefix + mask1gl_suffix
rna_mask1gl_fn = file_prefix + rna_prefix + mask1gl_suffix
protein_mask1gl_fn = file_prefix + prot_prefix + mask1gl_suffix

pg_mfp_mask1gl.to_csv(pg_mfp_mask1gl_fn, index=True, header=True)
cs_mfp_mask1gl.to_csv(cs_mfp_mask1gl_fn, index=True, header=True)
dna_mask1gl.to_csv(dna_mask1gl_fn, index=True, header=True)
rna_mask1gl.to_csv(rna_mask1gl_fn, index=True, header=True)
protein_mask1gl.to_csv(protein_mask1gl_fn, index=True, header=True)
