# Create the DNA, RNA, and Protein CSV Files #

File to generate the CSV files for the DNA, RNA, and protein data.

Pre-requisites:
- Finish the pre-processing steps before this file, especially running nci_almanac_therapy_classification.ipynb

In [1]:
# Import everything you need

import numpy as np
import pandas as pd
import torch
from getProcessedData import *

Assume that test_create_training_data.ipynb has already preprocessed and filtered the data modalities. Filtered dataframes are stored at:
- data_processed/filtered_almanac_df.csv (has percent growth and score)
- data_processed/filtered_almanac_comboscore_df.csv (comboscore)
- data_processed/filtered_dna_df.csv
- data_processed/filtered_rna_df.csv
- data_processed/filtered_protein_df.csv
- data_processed/filtered_string_df.csv

First get the datasets

In [2]:
# Load the unfiltered DNA, RNA, and protein data, each paired with its identifier dataframe.
# The data dataframes are indexed by cell line (rows) with one column per -omic feature.
(
    (dna_df_unfilt, dna_identifier_df),
    (rna_df_unfilt, rna_identifier_df),
    (protein_df_unfilt, protein_identifier_df),
) = get_dna_data(), get_rna_data(), get_protein_data()

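Filter the DNA dataset down to the features whose variance is in the top 5%; the corresponding 95th-percentile variance cutoff was previously determined to be 1692. The RNA and protein datasets are then restricted to the Entrez IDs that remain after this filter.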

In [3]:
# Drop the low-variance DNA columns; the call also returns the Entrez IDs used below
# to select the matching RNA and protein columns.
dna_df, new_intersection_entrez = remove_low_var_columns(dna_identifier_df, dna_df_unfilt, threshold=1692)


def _first_identifier(identifier_df, entrez_id, column):
    """Return the first `column` value among the rows whose 'Entrez' equals `entrez_id`."""
    matching_rows = identifier_df[identifier_df['Entrez'] == entrez_id]
    return matching_rows[column].values[0]


print("Original RNA data shape:", rna_df_unfilt.shape)
rna_identifiers = [
    _first_identifier(rna_identifier_df, entrez_id, 'Gene')
    for entrez_id in new_intersection_entrez
]
rna_df = rna_df_unfilt[rna_identifiers]
print("Filtered RNA data shape:", rna_df.shape)


print("Original protein data shape:", protein_df_unfilt.shape)
prot_identifiers = [
    _first_identifier(protein_identifier_df, entrez_id, 'Identifier')
    for entrez_id in new_intersection_entrez
]
protein_df = protein_df_unfilt[prot_identifiers]
print("Filtered protein data shape:", protein_df.shape)


Original intersection entrez IDs: 2665
Original number of features: 23372
Number of features after removing low variance columns: 1171
Number of unique entrez IDs after removing low variance columns: 722
Original RNA data shape: (58, 2665)
Filtered RNA data shape: (58, 722)
Original protein data shape: (58, 2668)
Filtered protein data shape: (58, 722)


The generated files can optionally be restricted to a single tissue type. To do so, change `ct_index` in the next cell so that it selects the matching cancer type prefix:
- all_cancer_prefix = 'all_cancer_'
- breast_cancer_prefix = 'breast_'
- cns_cancer_prefix = 'cns_'
- colon_cancer_prefix = 'colon_'
- leukemia_prefix = 'leukemia_'
- melanoma_prefix = 'melanoma_'
- nsclc_prefix = 'nsclc_'
- ovarian_cancer_prefix = 'ovarian_'
- prostate_cancer_prefix = 'prostate_'
- renal_cancer_prefix = 'renal_'

For the primary purpose, you should just use all_cancer

In [4]:
# Output file names for every (feature set, task) combination.
mfp_len = 256

cancer_type_prefixes = ['all_cancer_', 'breast_', 'cns_', 'colon_', 'leukemia_', 'melanoma_', 'nsclc_', 'ovarian_', 'prostate_', 'renal_']
panel_filtering = ['ALL', 'Breast Cancer', 'CNS Cancer', 'Colon Cancer', 'Leukemia', 'Melanoma', 'Non-Small Cell Lung Cancer', 'Ovarian Cancer', 'Prostate Cancer', 'Renal Cancer']

# CHANGE THIS LINE HERE
ct_index = 0
cancer_type_prefix = cancer_type_prefixes[ct_index]
print("Using cancer type prefix:", cancer_type_prefix)
print("Using panel filter:", panel_filtering[ct_index])

# Building blocks of each file name: directory, cancer type, feature tags, task suffix.
file_prefix = '../data/ASP_dataset_slices/'
mf_prefix = f'{mfp_len}_mfp'
dna_prefix, rna_prefix, prot_prefix = 'dna', 'rna', 'prot'
percgrowth_suffix = '_percgrowth.csv'
bc0_combo_suffix = '_bc0_comboscore.csv'
reg_combo_suffix = '_comboscore.csv'


def _slice_filenames(*feature_tags):
    """Return the (bc0, reg, percgrowth) file names for the given feature tags."""
    stem = file_prefix + cancer_type_prefix + ''.join(feature_tags)
    return stem + bc0_combo_suffix, stem + reg_combo_suffix, stem + percgrowth_suffix


# Single Omic Files
fn_dna_bc0, fn_dna_reg, fn_dna_percgrowth = _slice_filenames(dna_prefix)
fn_rna_bc0, fn_rna_reg, fn_rna_percgrowth = _slice_filenames(rna_prefix)
fn_prot_bc0, fn_prot_reg, fn_prot_percgrowth = _slice_filenames(prot_prefix)

# MFP + Single Omic Files
fn_mfpdna_bc0, fn_mfpdna_reg, fn_mfpdna_percgrowth = _slice_filenames(mf_prefix, dna_prefix)
fn_mfprna_bc0, fn_mfprna_reg, fn_mfprna_percgrowth = _slice_filenames(mf_prefix, rna_prefix)
fn_mfpprot_bc0, fn_mfpprot_reg, fn_mfpprot_percgrowth = _slice_filenames(mf_prefix, prot_prefix)

# MFP + 2 Omic Files
fn_mfpdnarna_bc0, fn_mfpdnarna_reg, fn_mfpdnarna_percgrowth = _slice_filenames(mf_prefix, dna_prefix, rna_prefix)
fn_mfpdnaprot_bc0, fn_mfpdnaprot_reg, fn_mfpdnaprot_percgrowth = _slice_filenames(mf_prefix, dna_prefix, prot_prefix)
fn_mfprnaprot_bc0, fn_mfprnaprot_reg, fn_mfprnaprot_percgrowth = _slice_filenames(mf_prefix, rna_prefix, prot_prefix)

# MFP + 3 Omic Files
fn_mfpdnarnaprot_bc0, fn_mfpdnarnaprot_reg, fn_mfpdnarnaprot_percgrowth = _slice_filenames(mf_prefix, dna_prefix, rna_prefix, prot_prefix)

Using cancer type prefix: all_cancer_
Using panel filter: ALL


This creates txt files containing the indices in the NCI_ALMANAC drug data that correspond to specific tissue types

In [5]:
# Load the processed drug data and record, per tissue type (PANEL), which rows belong to it.

# Put the comboscore df filenames here, and sync it with the ct_index
cancer_type = panel_filtering[ct_index]
drug_pg_df, drug_comboscore_df, nsc_to_mfp, nsc_to_prop_df = get_processed_drug_data() # Pass through the ct type if not just all cancer


def _row_indices_by_panel(df):
    """Map each distinct PANEL value of `df` to the list of index labels of its rows."""
    panels = df['PANEL']
    return {panel: df[panels == panel].index.to_list() for panel in panels.unique()}


# Index labels per cancer type, for the percent growth and the comboscore data
pg_cancer_type_to_row_indices = _row_indices_by_panel(drug_pg_df)
comboscore_cancer_type_to_row_indices = _row_indices_by_panel(drug_comboscore_df)


# One index per line, one file per tissue type; entry 0 ('all_cancer_' / 'ALL') is skipped.
for prefix, panel in zip(cancer_type_prefixes[1:], panel_filtering[1:]):
    with open(f'../data/ASP_dataset_slices/{prefix}pg_indices.txt', 'w') as pg_file:
        pg_file.writelines(f'{idx}\n' for idx in pg_cancer_type_to_row_indices[panel])
    with open(f'../data/ASP_dataset_slices/{prefix}comboscore_indices.txt', 'w') as cs_file:
        cs_file.writelines(f'{idx}\n' for idx in comboscore_cancer_type_to_row_indices[panel])

This creates txt files containing the indices in the NCI_ALMANAC drug data that correspond to specific drug class types

In [6]:
# We want to think about drug classes too. Chemo, Targeted, Other are the 3 classes. Want to bucket into
# Chemo+Chemo, Chemo+Targeted, Chemo+Other, Targeted+Targeted, Targeted+Other, Other+Other
drug_classification_fn = '../data_processed/almanac_nsc_to_drug_types.csv'
drug_classification_df = pd.read_csv(drug_classification_fn, header=0, dtype='str')
drug_classification_df.set_index('NSC_ID', inplace=True)

# File-name tag of each bucket -> the two Therapy_Class values that define it.
# The insertion order is the order in which the index files are written.
drug_class_pairs = {
    'chemo_chemo': ('Chemotherapy', 'Chemotherapy'),
    'chemo_targeted': ('Chemotherapy', 'Targeted'),
    'chemo_other': ('Chemotherapy', 'Other'),
    'targeted_targeted': ('Targeted', 'Targeted'),
    'targeted_other': ('Targeted', 'Other'),
    'other_other': ('Other', 'Other'),
}

# (printed label, bucket tag) in the order the bucket sizes are reported.
drug_class_report_order = [
    ('Chemo-Chemo', 'chemo_chemo'),
    ('Targeted-Targeted', 'targeted_targeted'),
    ('Other-Other', 'other_other'),
    ('Chemo-Targeted', 'chemo_targeted'),
    ('Chemo-Other', 'chemo_other'),
    ('Targeted-Other', 'targeted_other'),
]


def _indices_by_class_pair(df, drug1_classes, drug2_classes):
    """Bucket the rows of `df` by the therapy classes of their two drugs.

    Args:
        df: dataframe whose index labels are collected.
        drug1_classes: Therapy_Class of the first drug, aligned with `df`.
        drug2_classes: Therapy_Class of the second drug, aligned with `df`.

    Returns:
        Dict mapping each key of `drug_class_pairs` to a list of index labels.
        For mixed-class buckets, rows in (class_a, class_b) order come first,
        followed by rows in (class_b, class_a) order.
    """
    buckets = {}
    for tag, (class_a, class_b) in drug_class_pairs.items():
        indices = df[(drug1_classes == class_a) & (drug2_classes == class_b)].index.to_list()
        if class_a != class_b:
            # A mixed pair can appear with the drugs in either order.
            indices += df[(drug1_classes == class_b) & (drug2_classes == class_a)].index.to_list()
        buckets[tag] = indices
    return buckets


# Percentage Growth Regression indices
pg_drug1s_classes_mapped = drug_pg_df['NSC1'].map(drug_classification_df['Therapy_Class'])
pg_drug2s_classes_mapped = drug_pg_df['NSC2'].map(drug_classification_df['Therapy_Class'])
pg_indices_by_class = _indices_by_class_pair(drug_pg_df, pg_drug1s_classes_mapped, pg_drug2s_classes_mapped)

pg_chemo_chemo_indices = pg_indices_by_class['chemo_chemo']
pg_targeted_targeted_indices = pg_indices_by_class['targeted_targeted']
pg_other_other_indices = pg_indices_by_class['other_other']
pg_chemo_targeted_indices = pg_indices_by_class['chemo_targeted']
pg_chemo_other_indices = pg_indices_by_class['chemo_other']
pg_targeted_other_indices = pg_indices_by_class['targeted_other']

# Verify this works - first index (0) should be in chemo_chemo_indices
assert pg_chemo_chemo_indices and pg_chemo_chemo_indices[0] == 0, \
    "expected row 0 of the percent growth data to be a Chemotherapy+Chemotherapy pair"

for label, tag in drug_class_report_order:
    print(f"PG {label}:", len(pg_indices_by_class[tag]))

# The number of indices should add up to the total number of rows in the pg data
print(drug_pg_df.shape[0] == sum(len(indices) for indices in pg_indices_by_class.values()))

# Comboscore indices
cs_drug1s_classes_mapped = drug_comboscore_df['NSC1'].map(drug_classification_df['Therapy_Class'])
cs_drug2s_classes_mapped = drug_comboscore_df['NSC2'].map(drug_classification_df['Therapy_Class'])
cs_indices_by_class = _indices_by_class_pair(drug_comboscore_df, cs_drug1s_classes_mapped, cs_drug2s_classes_mapped)

cs_chemo_chemo_indices = cs_indices_by_class['chemo_chemo']
cs_targeted_targeted_indices = cs_indices_by_class['targeted_targeted']
cs_other_other_indices = cs_indices_by_class['other_other']
cs_chemo_targeted_indices = cs_indices_by_class['chemo_targeted']
cs_chemo_other_indices = cs_indices_by_class['chemo_other']
cs_targeted_other_indices = cs_indices_by_class['targeted_other']

for label, tag in drug_class_report_order:
    print(f"CS {label}:", len(cs_indices_by_class[tag]))

# The number of indices should add up to the total number of rows in the comboscore data
print(drug_comboscore_df.shape[0] == sum(len(indices) for indices in cs_indices_by_class.values()))


# write the indices to separate files
drug_class_file_prefix = '../data/ASP_dataset_slices/' + cancer_type_prefix
pg_indices_suffix = '_pg_indices.txt'
combo_indices_suffix = '_cs_indices.txt'

# Look the index lists up in the dicts built above instead of resolving variable names via eval().
for indices_by_class, indices_suffix in (
    (pg_indices_by_class, pg_indices_suffix),
    (cs_indices_by_class, combo_indices_suffix),
):
    for drug_class, indices in indices_by_class.items():
        with open(drug_class_file_prefix + drug_class + indices_suffix, 'w') as f:
            f.writelines(f'{idx}\n' for idx in indices)


PG Chemo-Chemo: 1023777
PG Targeted-Targeted: 131625
PG Other-Other: 79326
PG Chemo-Targeted: 730059
PG Chemo-Other: 594645
PG Targeted-Other: 214848
True
CS Chemo-Chemo: 109287
CS Targeted-Targeted: 14625
CS Other-Other: 8814
CS Chemo-Targeted: 79569
CS Chemo-Other: 64761
CS Targeted-Other: 23872
True


Generate the CSV files for percent growth regression
- Percent growth regression needs to include the concentration of drug 1 and drug 2
- NOTE THAT THIS APPENDS TO THE FILES, IF YOU NEED TO RERUN IT, DELETE THE FILES

In [7]:
# Generate the percent growth regression CSVs (MFP + one, two, or three -omic blocks).
# Each row is: drug 1 MFP, drug 1 concentration, drug 2 MFP, drug 2 concentration,
# the selected -omic features of the cell line, and finally the percent growth.
from contextlib import ExitStack

num_pg_samples = drug_pg_df.shape[0]
print(num_pg_samples)

# NOTE - all concentrations are in M
drug1s_pg = drug_pg_df['NSC1'].values
drug1s_conc = drug_pg_df['CONC1'].values
drug2s_pg = drug_pg_df['NSC2'].values
drug2s_conc = drug_pg_df['CONC2'].values
cell_lines_pg = drug_pg_df['CELLNAME'].values
# Read the target column once, like the columns above, rather than building a row Series per sample.
percgrowths_pg = drug_pg_df['PERCENTGROWTH'].values

# Output file -> the -omic blocks (in order) that follow the two drug encodings.
pg_outputs = [
    # MFP + Single Omic Files
    (fn_mfpdna_percgrowth, ('dna',)),
    (fn_mfprna_percgrowth, ('rna',)),
    (fn_mfpprot_percgrowth, ('prot',)),
    # MFP + 2 Omic Files
    (fn_mfpdnarna_percgrowth, ('dna', 'rna')),
    (fn_mfpdnaprot_percgrowth, ('dna', 'prot')),
    (fn_mfprnaprot_percgrowth, ('rna', 'prot')),
    # MFP + 3 Omic Files
    (fn_mfpdnarnaprot_percgrowth, ('dna', 'rna', 'prot')),
]

# Memoized serializations: the same drugs and cell lines recur across many samples.
_pg_mfp_strings = {}
_pg_omic_blocks = {}


def _pg_mfp_string(nsc):
    """Return the comma-joined MFP of drug `nsc`, serializing each drug only once."""
    key = str(nsc)
    if key not in _pg_mfp_strings:
        _pg_mfp_strings[key] = ','.join([str(x) for x in nsc_to_mfp[key]])
    return _pg_mfp_strings[key]


def _pg_omic_block_list(cell_line):
    """Return the comma-joined -omic features of `cell_line`, one string per entry of pg_outputs."""
    if cell_line not in _pg_omic_blocks:
        per_omic = {
            'dna': ','.join([str(x) for x in dna_df.loc[cell_line].values]),
            'rna': ','.join([str(x) for x in rna_df.loc[cell_line].values]),
            'prot': ','.join([str(x) for x in protein_df.loc[cell_line].values]),
        }
        _pg_omic_blocks[cell_line] = [
            ','.join(per_omic[omic] for omic in omics) for _, omics in pg_outputs
        ]
    return _pg_omic_blocks[cell_line]


# Open every output once, still in append mode, instead of reopening all of them for each sample.
with ExitStack() as stack:
    pg_files = [stack.enter_context(open(fn, 'a')) for fn, _ in pg_outputs]

    for i in range(num_pg_samples):
        # Concentration is appended right after each drug's MFP
        drug1_string = _pg_mfp_string(drug1s_pg[i]) + ',' + str(drug1s_conc[i])
        drug2_string = _pg_mfp_string(drug2s_pg[i]) + ',' + str(drug2s_conc[i])
        row_head = drug1_string + ',' + drug2_string + ','
        row_tail = ',' + str(percgrowths_pg[i]) + '\n'

        for f, omic_block in zip(pg_files, _pg_omic_block_list(cell_lines_pg[i])):
            f.write(row_head + omic_block + row_tail)

        if i % 100000 == 0:
            print("percent growth sample", i)



2774280
percent growth sample 0
percent growth sample 100000
percent growth sample 200000
percent growth sample 300000
percent growth sample 400000
percent growth sample 500000
percent growth sample 600000
percent growth sample 700000
percent growth sample 800000
percent growth sample 900000
percent growth sample 1000000
percent growth sample 1100000
percent growth sample 1200000
percent growth sample 1300000
percent growth sample 1400000
percent growth sample 1500000
percent growth sample 1600000
percent growth sample 1700000
percent growth sample 1800000
percent growth sample 1900000
percent growth sample 2000000
percent growth sample 2100000
percent growth sample 2200000
percent growth sample 2300000
percent growth sample 2400000
percent growth sample 2500000
percent growth sample 2600000
percent growth sample 2700000


Generate CSV for binary classification on comboscore and regression on comboscore

In [8]:
# Generate the comboscore CSVs for regression (raw score) and binary classification (score > 0).
# Each row is: drug 1 MFP, drug 2 MFP, the selected -omic features of the cell line, target.
from contextlib import ExitStack

num_cs_samples = drug_comboscore_df.shape[0]
drug1cs = drug_comboscore_df['NSC1'].values
drug2cs = drug_comboscore_df['NSC2'].values
cell_linescs = drug_comboscore_df['CELLNAME'].values
# Read the target column once, like the columns above, rather than building a row Series per sample.
scores_cs = drug_comboscore_df['SCORE'].values

# (regression file, binary classification file, -omic blocks that follow the two MFPs)
cs_outputs = [
    # MFP + Single Omic Files
    (fn_mfpdna_reg, fn_mfpdna_bc0, ('dna',)),
    (fn_mfprna_reg, fn_mfprna_bc0, ('rna',)),
    (fn_mfpprot_reg, fn_mfpprot_bc0, ('prot',)),
    # MFP + 2 Omic Files
    (fn_mfpdnarna_reg, fn_mfpdnarna_bc0, ('dna', 'rna')),
    (fn_mfpdnaprot_reg, fn_mfpdnaprot_bc0, ('dna', 'prot')),
    (fn_mfprnaprot_reg, fn_mfprnaprot_bc0, ('rna', 'prot')),
    # MFP + 3 Omic Files
    (fn_mfpdnarnaprot_reg, fn_mfpdnarnaprot_bc0, ('dna', 'rna', 'prot')),
]

# Memoized serializations: the same drugs and cell lines recur across many samples.
_cs_mfp_strings = {}
_cs_omic_blocks = {}


def _cs_mfp_string(nsc):
    """Return the comma-joined MFP of drug `nsc`, serializing each drug only once."""
    key = str(nsc)
    if key not in _cs_mfp_strings:
        _cs_mfp_strings[key] = ','.join([str(x) for x in nsc_to_mfp[key]])
    return _cs_mfp_strings[key]


def _cs_omic_block_list(cell_line):
    """Return the comma-joined -omic features of `cell_line`, one string per entry of cs_outputs."""
    if cell_line not in _cs_omic_blocks:
        per_omic = {
            'dna': ','.join([str(x) for x in dna_df.loc[cell_line].values]),
            'rna': ','.join([str(x) for x in rna_df.loc[cell_line].values]),
            'prot': ','.join([str(x) for x in protein_df.loc[cell_line].values]),
        }
        _cs_omic_blocks[cell_line] = [
            ','.join(per_omic[omic] for omic in omics) for _, _, omics in cs_outputs
        ]
    return _cs_omic_blocks[cell_line]


# Open every output once, still in append mode, instead of reopening all of them for each sample.
with ExitStack() as stack:
    cs_files = [
        (stack.enter_context(open(reg_fn, 'a')), stack.enter_context(open(bc0_fn, 'a')))
        for reg_fn, bc0_fn, _ in cs_outputs
    ]

    for i in range(num_cs_samples):
        cs = scores_cs[i]
        row_head = _cs_mfp_string(drug1cs[i]) + ',' + _cs_mfp_string(drug2cs[i]) + ','
        reg_tail = ',' + str(cs) + '\n'
        # Binary label: 1 when the comboscore is strictly positive, 0 otherwise
        bc0_tail = ',1\n' if cs > 0 else ',0\n'

        for (reg_file, bc0_file), omic_block in zip(cs_files, _cs_omic_block_list(cell_linescs[i])):
            reg_file.write(row_head + omic_block + reg_tail)
            bc0_file.write(row_head + omic_block + bc0_tail)

        if i % 100000 == 0:
            print("comboscore", i)

comboscore 0
comboscore 100000
comboscore 200000
comboscore 300000
