# Create the Morgan Fingerprint Only Datasets #
This notebook generates the Morgan-fingerprint-only CSV files for:
- Binary ComboScore Classification
- ComboScore Regression
- Percent Growth Regression

Prerequisites:
- Complete the pre-processing steps before running this notebook

In [None]:
# Import everything you need

from getProcessedData import *

Drug combinations can be filtered by any of the following cancer types:
['ALL', 'Breast Cancer', 'CNS Cancer', 'Colon Cancer', 'Leukemia', 'Melanoma', 'Non-Small Cell Lung Cancer', 'Ovarian Cancer', 'Prostate Cancer', 'Renal Cancer']

In [2]:
# Each selectable cancer panel, written next to the prefix that is put at the
# front of the CSV file names generated for it. Keeping the two side by side
# makes it hard for the panel and prefix lists to fall out of step.
_panel_prefix_pairs = [
    ('ALL', 'all_cancer_'),
    ('Breast Cancer', 'breast_'),
    ('CNS Cancer', 'cns_'),
    ('Colon Cancer', 'colon_'),
    ('Leukemia', 'leukemia_'),
    ('Melanoma', 'melanoma_'),
    ('Non-Small Cell Lung Cancer', 'nsclc_'),
    ('Ovarian Cancer', 'ovarian_'),
    ('Prostate Cancer', 'prostate_'),
    ('Renal Cancer', 'renal_'),
]
panel_type = [panel for panel, _ in _panel_prefix_pairs]
cancer_type_prefixes = [prefix for _, prefix in _panel_prefix_pairs]

# CHANGE THIS LINE TO CHANGE THE CANCER TYPE - SHOULD ONLY NEED ALL
ct_index = 0

# Echo the selection so the chosen panel and prefix show up in the cell output.
print("Cancer type is", panel_type[ct_index])
print("Cancer type prefix is", cancer_type_prefixes[ct_index])

Cancer type is ALL
Cancer type prefix is all_cancer_


Percent Growth CSV File Generation

In [4]:
# Generate the CSV file for cancer type, MFP, percent growth.
# Every output row is laid out as
#   <fingerprint of drug 1>,CONC1,<fingerprint of drug 2>,CONC2,PERCENTGROWTH
# and no header line is written.
mfp_len = 256  # only used to build the output file name below
drug_pg_df, drug_comboscore_df, nsc_to_mfp, nsc_to_prop_df = get_processed_drug_data(cancer_type=panel_type[ct_index])

print(drug_pg_df['PANEL'].unique())
num_pg_samples = drug_pg_df.shape[0]
print(num_pg_samples)

# Pull every column the loop needs out of the DataFrame once. Looking a value
# up with drug_pg_df.iloc[i][...] builds a whole row object per sample, which
# is very slow when repeated for every row of the frame.
drug1s = drug_pg_df['NSC1'].values
drug1sconc = drug_pg_df['CONC1'].values
drug2s = drug_pg_df['NSC2'].values
drug2sconc = drug_pg_df['CONC2'].values
percgrowths = drug_pg_df['PERCENTGROWTH'].values


# File name for the CSV file
fn_mfp_percgrowth = '../data/ASP_dataset_slices/' + cancer_type_prefixes[ct_index] + str(mfp_len) + '_mfp_percgrowth.csv'
print("Writing to", fn_mfp_percgrowth)

# Comma-joined fingerprint text per NSC ID, filled in on first use. The same
# drug appears in many rows, so its fingerprint is only stringified once.
_pg_mfp_text_by_nsc = {}


def _pg_mfp_text(nsc):
    """Return the fingerprint of drug `nsc` as comma-separated text.

    Raises KeyError if the NSC ID has no entry in nsc_to_mfp.
    """
    key = str(nsc)  # nsc_to_mfp is keyed by the NSC ID as a string
    text = _pg_mfp_text_by_nsc.get(key)
    if text is None:
        text = ','.join(str(bit) for bit in nsc_to_mfp[key])
        _pg_mfp_text_by_nsc[key] = text
    return text


with open(fn_mfp_percgrowth, 'w') as f:
    # map the NSC drug IDs to the MFP bit vectors
    samples = zip(drug1s, drug1sconc, drug2s, drug2sconc, percgrowths)
    for i, (nsc1, conc1, nsc2, conc2, percgrowth) in enumerate(samples):
        fields = (
            _pg_mfp_text(nsc1),
            str(conc1),  # concentration of drug 1 in M
            _pg_mfp_text(nsc2),
            str(conc2),  # concentration of drug 2 in M
            str(percgrowth),
        )
        f.write(','.join(fields) + '\n')

        if i % 100000 == 0:
            print("percent growth sample", i)



['Renal Cancer' 'Non-Small Cell Lung Cancer' 'Leukemia' 'Colon Cancer'
 'Prostate Cancer' 'Ovarian Cancer' 'Melanoma' 'CNS Cancer'
 'Breast Cancer']
2774280
Writing to ../data/ASP_dataset_slices/all_cancer_256_mfp_percgrowth.csv
percent growth sample 0
percent growth sample 100000
percent growth sample 200000
percent growth sample 300000
percent growth sample 400000
percent growth sample 500000
percent growth sample 600000
percent growth sample 700000
percent growth sample 800000
percent growth sample 900000
percent growth sample 1000000
percent growth sample 1100000
percent growth sample 1200000
percent growth sample 1300000
percent growth sample 1400000
percent growth sample 1500000
percent growth sample 1600000
percent growth sample 1700000
percent growth sample 1800000
percent growth sample 1900000
percent growth sample 2000000
percent growth sample 2100000
percent growth sample 2200000
percent growth sample 2300000
percent growth sample 2400000
percent growth sample 2500000
percen

ComboScore Classification and Regression CSV File Generation

In [5]:
# Generate the two ComboScore CSV files for the selected cancer type:
#   - regression file:      <fingerprint of drug 1>,<fingerprint of drug 2>,SCORE
#   - classification file:  <fingerprint of drug 1>,<fingerprint of drug 2>,label
# where label is 1 when SCORE > 0 and 0 otherwise. No header line is written.
fn_mfp_comboscore = '../data/ASP_dataset_slices/' + cancer_type_prefixes[ct_index] + str(mfp_len) + '_mfp_comboscore.csv'
fn_mfp_comboscore_bc0 = '../data/ASP_dataset_slices/' + cancer_type_prefixes[ct_index] + str(mfp_len) + '_mfp_bc0_comboscore.csv'

print("Comboscore file is", fn_mfp_comboscore)
print("Comboscore file with binary classification is", fn_mfp_comboscore_bc0)

num_cs_samples = drug_comboscore_df.shape[0]
drug1cs = drug_comboscore_df['NSC1'].values
drug2cs = drug_comboscore_df['NSC2'].values
# Read the score column once instead of calling drug_comboscore_df.iloc[i]
# for every sample, which builds a whole row object each time.
# NOTE(review): this writes SCORE using the column's own dtype. If SCORE were
# stored as integers in an all-numeric frame, .iloc[i] would have written it
# as a float (e.g. "5.0" instead of "5") - confirm SCORE is a float column.
scores = drug_comboscore_df['SCORE'].values

# Comma-joined fingerprint text per NSC ID, filled in on first use. The same
# drug appears in many rows, so its fingerprint is only stringified once.
_cs_mfp_text_by_nsc = {}


def _cs_mfp_text(nsc):
    """Return the fingerprint of drug `nsc` as comma-separated text.

    Raises KeyError if the NSC ID has no entry in nsc_to_mfp.
    """
    key = str(nsc)  # nsc_to_mfp is keyed by the NSC ID as a string
    text = _cs_mfp_text_by_nsc.get(key)
    if text is None:
        text = ','.join(str(bit) for bit in nsc_to_mfp[key])
        _cs_mfp_text_by_nsc[key] = text
    return text


with open(fn_mfp_comboscore, 'w') as f, open(fn_mfp_comboscore_bc0, 'w') as f_bc0:
    # map the NSC drug IDs to the MFP bit vectors
    for i, (nsc1, nsc2, cs) in enumerate(zip(drug1cs, drug2cs, scores)):
        # Both files share the same feature columns; only the last field differs.
        features = _cs_mfp_text(nsc1) + ',' + _cs_mfp_text(nsc2) + ','

        f.write(features + str(cs) + '\n')

        # 1 for synergistic (score above zero), 0 for non-synergistic
        label = '1' if cs > 0 else '0'
        f_bc0.write(features + label + '\n')

        if i % 100000 == 0:
            print("comboscore", i)

Comboscore file is ../data/ASP_dataset_slices/all_cancer_256_mfp_comboscore.csv
Comboscore file with binary classification is ../data/ASP_dataset_slices/all_cancer_256_mfp_bc0_comboscore.csv
comboscore 0
comboscore 100000
comboscore 200000
comboscore 300000
