In [1]:
import pandas as pd
import os
from sklearn.model_selection import KFold
import numpy as np

# Import the MATADOR dataset (direct and indirect interactions), already augmented with protein sequences and drug SMILES

In [2]:
# Load the MATADOR interaction table. The path is relative to the notebook's
# working directory. The file was saved with its index, which comes back as the
# extra 'Unnamed: 0' column shown in the preview.
matador_df = pd.read_csv('../../datasets/matador_crossed.csv')

In [3]:
# Preview the first rows to check which columns were loaded.
matador_df.head()

Unnamed: 0,complexity,drug_name,drug_pubchem_id,exact_mass,hba,hbd,heavy_atoms,interaction_type,molecular_formula,mw_freebase,mw_monoisotopic,protein_chembl_id,protein_name,protein_sequence,protein_string_id,protein_uniprot_id,sequence_length,smiles,target_type,tpsa
0,126,gold sodium thiomalate,11954225,367.939,1,5,11,2,C4H4AuNaO4S,368.088,367.939,CHEMBL4929;,APCS,MNKPLLWISVLTSLLEAFAHTDLSGKVFVFPRESVTDHVNLITPLE...,9606.ENSP00000255040,P02743,223,[H+].C(C(C(=O)[O-])[S-])C(=O)[O-].[Na+].[Au+],XXXXX,81.3
1,126,gold sodium thiomalate,11954225,367.939,1,5,11,2,C4H4AuNaO4S,368.088,367.939,,FTH1,MTTASTSQVRQNYHQDSEAAINRQINLELYASYVYLSMSYYFDRDD...,9606.ENSP00000273550,P02794,183,[H+].C(C(C(=O)[O-])[S-])C(=O)[O-].[Na+].[Au+],XXXXX,81.3
2,126,gold sodium thiomalate,11954225,367.939,1,5,11,2,C4H4AuNaO4S,368.088,367.939,CHEMBL2364709;,FGG,MSWSLHPRNLILYFYALLFLSSTCVAYVATRDNCCILDERFGSYCP...,9606.ENSP00000336829,P02679,453,[H+].C(C(C(=O)[O-])[S-])C(=O)[O-].[Na+].[Au+],XXXXX,81.3
3,126,gold sodium thiomalate,11954225,367.939,1,5,11,2,C4H4AuNaO4S,368.088,367.939,,SERPINA1,MPSSVSWGILLLAGLCCLVPVSLAEDPQGDAAQKTDTSHHDQDHPT...,9606.ENSP00000348068,P01009,418,[H+].C(C(C(=O)[O-])[S-])C(=O)[O-].[Na+].[Au+],XXXXX,81.3
4,126,gold sodium thiomalate,11954225,367.939,1,5,11,2,C4H4AuNaO4S,368.088,367.939,,LCN2,MPLGLLWLGLALLGALHAQAQDSTSDLIPAPPLSKVPLQQNFQDNQ...,9606.ENSP00000277480,P80188,198,[H+].C(C(C(=O)[O-])[S-])C(=O)[O-].[Na+].[Au+],XXXXX,81.3


In [4]:
# Independent copy of the loaded table.
# NOTE(review): no visible cell reads this module-level `data`; inside
# format_data_crbm the name `data` is the function's own parameter. Confirm
# whether this copy is still needed.
data = matador_df.copy()

# Data formatting

In [5]:
# Same table, indexed by the STRING protein identifier. In the visible cells it
# is only used to list the distinct interaction types.
dti = matador_df.set_index(['protein_string_id'])

In [6]:
# Distinct identifiers, in order of first appearance in the table.
proteins = matador_df['protein_string_id'].unique()
drugs = matador_df['drug_pubchem_id'].unique()

num_drugs = len(drugs)
num_proteins = len(proteins)

# Bidirectional lookup between a drug id and its column/row position.
drug2Index = dict(zip(drugs, range(num_drugs)))
index2Drug = dict(enumerate(drugs))
# Running counter kept under its historical name; it ends at the number of drugs.
cptDrug = num_drugs

# Bidirectional lookup between a protein id and its row/column position.
protein2Index = dict(zip(proteins, range(num_proteins)))
index2Protein = dict(enumerate(proteins))
# Running counter kept under its historical name; it ends at the number of proteins.
cptProtein = num_proteins

In [7]:
# Distinct interaction types, in order of first appearance. This order fixes the
# layer order of the tensors built by format_data_crbm and the numeric prefix of
# the exported "<n>_df_X_train_*.csv" files.
interaction_types = dti['interaction_type'].unique()
num_interaction_types = len(interaction_types)

In [8]:
def format_data_crbm(data, train_data):
    """Build the per-interaction-type interaction tensors and the training masks.

    Relies on the notebook-level globals ``interaction_types``,
    ``num_interaction_types``, ``num_proteins``, ``num_drugs``,
    ``protein2Index`` and ``drug2Index``.

    Parameters
    ----------
    data : pandas.DataFrame
        Interaction table. The columns ``protein_string_id``,
        ``drug_pubchem_id`` and ``interaction_type`` are read.
    train_data : pandas.DataFrame
        Rows that belong to the training split. The columns
        ``protein_string_id`` and ``drug_pubchem_id`` are read.

    Returns
    -------
    list
        ``[output_target_rbm, isMissingDrug, output_drug_rbm, isMissingProtein]``

        * ``output_target_rbm`` -- float array of shape
          ``(num_interaction_types, num_proteins, num_drugs)``; entry
          ``[t, p, d]`` is 1 when ``data`` holds a row for protein ``p`` and
          drug ``d`` whose type equals ``interaction_types[t]``, else 0.
        * ``isMissingDrug`` -- integer array of shape
          ``(num_proteins, num_drugs)``; 0 for pairs present in
          ``train_data`` (used for training), 1 for every other pair (used
          for prediction).
        * ``output_drug_rbm`` -- ``output_target_rbm`` with its last two axes
          swapped, shape ``(num_interaction_types, num_drugs, num_proteins)``.
        * ``isMissingProtein`` -- transpose of ``isMissingDrug``.

    Raises
    ------
    KeyError
        If an identifier in ``data`` or ``train_data`` is absent from
        ``protein2Index`` / ``drug2Index``.

    Notes
    -----
    A (protein, drug) pair listed several times in ``data`` sets every layer
    whose interaction type matches one of its rows.
    """
    # Resolve each row to its (protein, drug) position once. Plain dict lookups
    # are used on purpose so that an unknown identifier raises KeyError.
    protein_idx = np.array(
        [protein2Index[p] for p in data['protein_string_id']], dtype=np.intp)
    drug_idx = np.array(
        [drug2Index[d] for d in data['drug_pubchem_id']], dtype=np.intp)

    output_target_rbm = np.zeros(
        shape=(num_interaction_types, num_proteins, num_drugs))
    for layer, interaction_type in enumerate(interaction_types):
        # Rows of `data` annotated with this interaction type.
        has_type = np.asarray(data['interaction_type'] == interaction_type)
        output_target_rbm[layer, protein_idx[has_type], drug_idx[has_type]] = 1.

    # The drug-side view holds the same information with proteins and drugs swapped.
    output_drug_rbm = output_target_rbm.transpose(0, 2, 1).copy()

    train_protein_idx = np.array(
        [protein2Index[p] for p in train_data['protein_string_id']], dtype=np.intp)
    train_drug_idx = np.array(
        [drug2Index[d] for d in train_data['drug_pubchem_id']], dtype=np.intp)

    # 1: the interaction is used for prediction, 0: the interaction is used for training.
    # The builtin `int` replaces the `np.int` alias, which NumPy 1.24 removed.
    isMissingDrug = np.ones((num_proteins, num_drugs), dtype=int)
    isMissingDrug[train_protein_idx, train_drug_idx] = 0
    isMissingProtein = isMissingDrug.T.copy()

    return [output_target_rbm, isMissingDrug, output_drug_rbm, isMissingProtein]

# Matador dataset splits to be used to create the data for CRBM (distinction and no distinction), SB-CRBM (distinction and no distinction)

In [None]:
# Number of splits
n_splits = 10

# Fixed seed so the folds are the same on every run.
kf = KFold(n_splits=n_splits, shuffle=True, random_state=333)

# Folds are numbered from 1; each one is written to its own directory.
for i, (train, test) in enumerate(kf.split(matador_df), start=1):
    print("Step " + str(i))
    # KFold yields positional row numbers, so select by position rather than by label.
    X_train = matador_df.iloc[train]
    X_test = matador_df.iloc[test]

    directory = "../../datasets/matador_splits/{0}".format(i)
    # exist_ok=True lets the cell be re-run without failing on existing folders.
    os.makedirs(directory, exist_ok=True)

    X_train.to_csv('{0}/X_train.csv'.format(directory), index=False)
    X_test.to_csv('{0}/X_test.csv'.format(directory), index=False)

In [9]:
# Number of splits
n_splits = 10

# For every fold, rebuild the CRBM inputs from the full table and that fold's
# training rows, then write them as header-less, index-less CSV matrices.
for i in range(1, n_splits + 1):
    print('Step {0}'.format(i))
    X_train = pd.read_csv('../../datasets/matador_splits/{0}/X_train.csv'.format(i))
    # Return order of format_data_crbm: protein-side tensor, protein-side mask
    # (proteins x drugs), drug-side tensor, drug-side mask (drugs x proteins).
    data_target_rbm, isMissing_target_rbm, data_drug_rbm, isMissing_drug_rbm = format_data_crbm(matador_df, X_train)

    directory = "../../datasets/crbm/{0}".format(i)
    # exist_ok=True lets the cell be re-run without failing on existing folders.
    os.makedirs(directory, exist_ok=True)

    # One matrix per interaction type; the file prefix is the type's position
    # in `interaction_types`.
    for interaction in range(num_interaction_types):
        df_X_train_target_rbm = pd.DataFrame(data_target_rbm[interaction])
        df_X_train_target_rbm.to_csv(directory + "/{0}_df_X_train_target_rbm.csv".format(interaction),
                                     index=False, header=False)

        df_X_train_drug_rbm = pd.DataFrame(data_drug_rbm[interaction])
        df_X_train_drug_rbm.to_csv(directory + "/{0}_df_X_train_drug_rbm.csv".format(interaction),
                                   index=False, header=False)

    df_isMissing_target_rbm = pd.DataFrame(isMissing_target_rbm)
    df_isMissing_target_rbm.to_csv(directory + "/df_isMissing_target_rbm.csv", index=False, header=False)

    df_isMissing_drug_rbm = pd.DataFrame(isMissing_drug_rbm)
    df_isMissing_drug_rbm.to_csv(directory + "/df_isMissing_drug_rbm.csv", index=False, header=False)

Step 1
Step 2
Step 3
Step 4
Step 5
Step 6
Step 7
Step 8
Step 9
Step 10
