In [1]:
import pandas as pd
import math
import numpy as np
import os

In [2]:
# Load the drug-protein interaction table from matador_crossed.csv; every later cell derives from this frame.
data_matador_df = pd.read_csv('../../datasets/matador_crossed.csv')

In [3]:
# Quick check of the table dimensions as (rows, columns).
data_matador_df.shape

(7767, 20)

In [4]:
# Preview the first rows to see which columns are available.
data_matador_df.head()

Unnamed: 0,complexity,drug_name,drug_pubchem_id,exact_mass,hba,hbd,heavy_atoms,interaction_type,molecular_formula,mw_freebase,mw_monoisotopic,protein_chembl_id,protein_name,protein_sequence,protein_string_id,protein_uniprot_id,sequence_length,smiles,target_type,tpsa
0,126,gold sodium thiomalate,11954225,367.939,1,5,11,2,C4H4AuNaO4S,368.088,367.939,CHEMBL4929;,APCS,MNKPLLWISVLTSLLEAFAHTDLSGKVFVFPRESVTDHVNLITPLE...,9606.ENSP00000255040,P02743,223,[H+].C(C(C(=O)[O-])[S-])C(=O)[O-].[Na+].[Au+],XXXXX,81.3
1,126,gold sodium thiomalate,11954225,367.939,1,5,11,2,C4H4AuNaO4S,368.088,367.939,,FTH1,MTTASTSQVRQNYHQDSEAAINRQINLELYASYVYLSMSYYFDRDD...,9606.ENSP00000273550,P02794,183,[H+].C(C(C(=O)[O-])[S-])C(=O)[O-].[Na+].[Au+],XXXXX,81.3
2,126,gold sodium thiomalate,11954225,367.939,1,5,11,2,C4H4AuNaO4S,368.088,367.939,CHEMBL2364709;,FGG,MSWSLHPRNLILYFYALLFLSSTCVAYVATRDNCCILDERFGSYCP...,9606.ENSP00000336829,P02679,453,[H+].C(C(C(=O)[O-])[S-])C(=O)[O-].[Na+].[Au+],XXXXX,81.3
3,126,gold sodium thiomalate,11954225,367.939,1,5,11,2,C4H4AuNaO4S,368.088,367.939,,SERPINA1,MPSSVSWGILLLAGLCCLVPVSLAEDPQGDAAQKTDTSHHDQDHPT...,9606.ENSP00000348068,P01009,418,[H+].C(C(C(=O)[O-])[S-])C(=O)[O-].[Na+].[Au+],XXXXX,81.3
4,126,gold sodium thiomalate,11954225,367.939,1,5,11,2,C4H4AuNaO4S,368.088,367.939,,LCN2,MPLGLLWLGLALLGALHAQAQDSTSDLIPAPPLSKVPLQQNFQDNQ...,9606.ENSP00000277480,P80188,198,[H+].C(C(C(=O)[O-])[S-])C(=O)[O-].[Na+].[Au+],XXXXX,81.3


In [5]:
# One row per protein: the first sequence encountered for each STRING id is kept.
proteins_df = (
    data_matador_df
    .loc[:, ['protein_string_id', 'protein_sequence']]
    .drop_duplicates(subset='protein_string_id')
)
# Same table keyed by protein id, for direct sequence lookups.
proteins_df_indexed = proteins_df.set_index('protein_string_id')

In [6]:
# One row per drug: the first SMILES string encountered for each PubChem id is kept.
drugs_df = (
    data_matador_df
    .loc[:, ['drug_pubchem_id', 'smiles']]
    .drop_duplicates(subset='drug_pubchem_id')
)
# Same table keyed by drug id, for direct SMILES lookups.
drugs_df_indexed = drugs_df.set_index('drug_pubchem_id')

In [7]:
# Distinct identifiers, in order of first appearance in the de-duplicated tables.
proteins = proteins_df['protein_string_id'].unique()
drugs = drugs_df['drug_pubchem_id'].unique()

num_drugs, num_proteins = len(drugs), len(proteins)

# Bidirectional maps between a drug id and its position along the drug axis.
drug2Index = {drug_id: position for position, drug_id in enumerate(drugs)}
index2Drug = dict(enumerate(drugs))
# The original loop left its running counter at the number of drugs; keep that name defined.
cptDrug = num_drugs

# Bidirectional maps between a protein id and its position along the protein axis.
protein2Index = {protein_id: position for position, protein_id in enumerate(proteins)}
index2Protein = dict(enumerate(proteins))
# Likewise for the protein counter.
cptProtein = num_proteins

## We load the Drug-Drug and Target-Target similarity

In [8]:
# Pre-computed pairwise similarity matrices, read from .npy files in the notebook's working directory.
# NOTE(review): weighted_profile_method indexes these with positions from drug2Index / protein2Index,
# so their row/column order presumably matches those maps — confirm against the notebook that wrote them.
sim_protein_protein_normalized = np.load('sim_target_target_normalized.npy')
sim_drug_drug_normalized = np.load('sim_drug_drug_normalized.npy')

In [9]:
# Work on an independent copy so the loaded table is never modified by later cells.
dti = data_matador_df.copy(deep=True)

# Distinct interaction-type codes; their count sizes the interaction-type axis of every tensor below.
interaction_types = dti.loc[:, 'interaction_type'].unique()
num_interaction_types = len(interaction_types)

In [10]:
def to_matrix(data):
    """Turn an interaction table into a dense 0/1 tensor.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``protein_string_id``, ``drug_pubchem_id`` and
        ``interaction_type``. Interaction types are used as 1-based codes
        (``int(code) - 1`` becomes the index on the middle axis).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(num_proteins, num_interaction_types, num_drugs)`` with
        ``1.0`` at ``[protein, type, drug]`` for every row of ``data`` and ``0.0``
        elsewhere.

    Raises
    ------
    KeyError
        If a protein or drug id is absent from ``protein2Index`` / ``drug2Index``.
    """
    mat = np.zeros((num_proteins, num_interaction_types, num_drugs))

    # Walk the rows directly: each row already names one known interaction, so there
    # is no need to probe every protein x drug combination. This also copes with a
    # pair that is listed more than once (each listed type is marked).
    rows = zip(data['protein_string_id'], data['drug_pubchem_id'], data['interaction_type'])
    for proteinID, drugID, interaction in rows:
        mat[protein2Index[proteinID], int(interaction) - 1, drug2Index[drugID]] = 1.

    return mat

In [11]:
def weighted_profile_method(drugID, drugIndex, proteinID, proteinIndex, X_train_matrix, omega):
    """Score one (protein, drug) pair from the interactions of similar drugs and proteins.

    Neighbours are the entries of the relevant similarity row that are ``>= omega``
    and not exactly ``1.0``. For each of the two interaction types (index 0 = direct,
    index 1 = indirect on the middle axis of ``X_train_matrix``) the score is the
    similarity-weighted mean of the neighbours' training interactions, computed once
    over similar drugs and once over similar proteins, then averaged.

    ``drugID`` and ``proteinID`` are accepted for interface compatibility but unused.

    Returns ``[prediction_direct, prediction_indirect]``, or ``[-1, -1]`` when the
    neighbour similarities on either side sum to zero (including "no neighbours").
    """
    drug_similarities = sim_drug_drug_normalized[drugIndex]
    protein_similarities = sim_protein_protein_normalized[proteinIndex]

    # Positions of the neighbours that pass the omega cut-off on each side.
    similar_drugs = np.flatnonzero((drug_similarities >= omega) & (drug_similarities != 1.0))
    similar_proteins = np.flatnonzero((protein_similarities >= omega) & (protein_similarities != 1.0))

    # --- Evidence from similar drugs acting on this protein ---
    direct_by_drug = X_train_matrix[proteinIndex][0][similar_drugs]
    indirect_by_drug = X_train_matrix[proteinIndex][1][similar_drugs]
    drug_weights = drug_similarities[similar_drugs]

    drug_weight_total = np.sum(drug_weights)
    if drug_weight_total == 0:
        return [-1, -1]

    direct_prob_drug = np.sum(direct_by_drug * drug_weights) / drug_weight_total
    indirect_prob_drug = np.sum(indirect_by_drug * drug_weights) / drug_weight_total

    # --- Evidence from similar proteins targeted by this drug ---
    direct_by_protein = X_train_matrix[similar_proteins, 0, drugIndex]
    indirect_by_protein = X_train_matrix[similar_proteins, 1, drugIndex]
    protein_weights = protein_similarities[similar_proteins]

    protein_weight_total = np.sum(protein_weights)
    if protein_weight_total == 0:
        return [-1, -1]

    direct_prob_protein = np.sum(direct_by_protein * protein_weights) / protein_weight_total
    indirect_prob_protein = np.sum(indirect_by_protein * protein_weights) / protein_weight_total

    # Final score: plain average of the drug-side and protein-side estimates.
    return [
        (direct_prob_drug + direct_prob_protein) / 2,
        (indirect_prob_drug + indirect_prob_protein) / 2,
    ]

In [15]:
def format_similarity_boosted_data(data, train_data, X_train_matrix, threshold = 0.0, omega = 0.0):
    """Build the drug-RBM and protein-RBM input tensors for one train/test split.

    Every (protein, drug) combination of the ids present in ``data`` is visited:

    * pair present in ``train_data``: marked as observed (value ``1``) in the
      training *and* testing tensors, with probability ``1``;
    * any other pair: if it is present in ``data`` (i.e. a held-out interaction)
      it is marked in the testing tensors only. It is then scored with
      ``weighted_profile_method`` (the original notes call this step ALADIN); when
      a score exceeds ``threshold`` the strictly larger of the direct/indirect
      scores is written into the *training* tensors as a similarity-boosted
      interaction.

    Parameters
    ----------
    data : pandas.DataFrame
        All known interactions; needs ``protein_string_id``, ``drug_pubchem_id``
        and ``interaction_type`` (1-based type codes).
    train_data : pandas.DataFrame
        The training subset, same columns.
    X_train_matrix : numpy.ndarray
        Passed through unchanged to ``weighted_profile_method``.
    threshold : float
        Minimum score for a predicted interaction to be injected.
    omega : float
        Similarity cut-off forwarded to ``weighted_profile_method``.

    Returns
    -------
    list
        ``[output_drug_rbm_training, isMissingDrugTraining, output_drug_rbm_testing,
        isMissingDrugTesting, probDrugTraining, probDrugTesting,
        output_protein_rbm_training, isMissingProteinTraining,
        output_protein_rbm_testing, isMissingProteinTesting, probProteinTraining,
        probProteinTesting]``. "Drug" tensors are shaped
        ``(num_interaction_types, num_proteins, num_drugs)`` (masks:
        ``(num_proteins, num_drugs)``); "protein" tensors carry the same values with
        the last two axes swapped.
    """
    def _pair_types(frame):
        # (protein_string_id, drug_pubchem_id) -> zero-based interaction-type index.
        # Built once so the double loop below does plain dictionary lookups instead
        # of MultiIndex membership tests and .loc calls.
        return {
            (protein, drug): int(kind) - 1
            for protein, drug, kind in zip(
                frame['protein_string_id'], frame['drug_pubchem_id'], frame['interaction_type'])
        }

    all_pair_types = _pair_types(data)
    train_pair_types = _pair_types(train_data)

    drug_shape = (num_interaction_types, num_proteins, num_drugs)
    protein_shape = (num_interaction_types, num_drugs, num_proteins)

    output_drug_rbm_training = np.zeros(shape=drug_shape)
    output_drug_rbm_testing = np.zeros(shape=drug_shape)

    output_protein_rbm_training = np.zeros(shape=protein_shape)
    output_protein_rbm_testing = np.zeros(shape=protein_shape)

    data_proteins = data['protein_string_id'].unique()
    data_drugs = data['drug_pubchem_id'].unique()

    # Masks: 1 = no observed or boosted value for this pair, 0 = a value is available.
    # The built-in int replaces the np.int alias, which NumPy 1.24 removed.
    isMissingDrugTraining = np.ones((num_proteins, num_drugs), dtype=int)
    isMissingDrugTesting = np.ones((num_proteins, num_drugs), dtype=int)
    isMissingProteinTraining = np.ones((num_drugs, num_proteins), dtype=int)
    isMissingProteinTesting = np.ones((num_drugs, num_proteins), dtype=int)

    # Conditional RBM data: confidence attached to each marked interaction.
    probDrugTraining = np.zeros(shape=drug_shape)
    probDrugTesting = np.zeros(shape=drug_shape)

    probProteinTraining = np.zeros(shape=protein_shape)
    probProteinTesting = np.zeros(shape=protein_shape)

    for proteinID in data_proteins:
        proteinIndex = protein2Index[proteinID]
        for drugID in data_drugs:
            drugIndex = drug2Index[drugID]
            pair = (proteinID, drugID)

            # Interaction observed in the training set: certain, visible everywhere.
            train_type = train_pair_types.get(pair)
            if train_type is not None:
                output_drug_rbm_training[train_type, proteinIndex, drugIndex] = 1.
                output_drug_rbm_testing[train_type, proteinIndex, drugIndex] = 1.
                isMissingDrugTraining[proteinIndex, drugIndex] = 0
                isMissingDrugTesting[proteinIndex, drugIndex] = 0

                output_protein_rbm_training[train_type, drugIndex, proteinIndex] = 1.
                output_protein_rbm_testing[train_type, drugIndex, proteinIndex] = 1.
                isMissingProteinTraining[drugIndex, proteinIndex] = 0
                isMissingProteinTesting[drugIndex, proteinIndex] = 0

                probDrugTraining[train_type, proteinIndex, drugIndex] = 1.
                probDrugTesting[train_type, proteinIndex, drugIndex] = 1.

                probProteinTraining[train_type, drugIndex, proteinIndex] = 1.
                probProteinTesting[train_type, drugIndex, proteinIndex] = 1.
                continue

            # Held-out interaction: record the ground truth in the testing tensors only.
            held_out_type = all_pair_types.get(pair)
            if held_out_type is not None:
                output_drug_rbm_testing[held_out_type, proteinIndex, drugIndex] = 1.
                output_protein_rbm_testing[held_out_type, drugIndex, proteinIndex] = 1.

            # Not known at training time: estimate it from similar drugs / proteins.
            direct_prob, indirect_prob = weighted_profile_method(drugID, drugIndex, proteinID, proteinIndex, X_train_matrix, omega)

            # -1 is the "no usable neighbours" sentinel.
            if direct_prob == -1 or indirect_prob == -1:
                continue

            if not (direct_prob > threshold or indirect_prob > threshold):
                continue

            if direct_prob > indirect_prob:
                boosted_type, boosted_prob = 0, direct_prob
            elif indirect_prob > direct_prob:
                boosted_type, boosted_prob = 1, indirect_prob
            else:
                # NOTE(review): exact ties are left as missing, as in the original code;
                # confirm this is the intended handling.
                continue

            output_drug_rbm_training[boosted_type, proteinIndex, drugIndex] = 1.
            isMissingDrugTraining[proteinIndex, drugIndex] = 0
            probDrugTraining[boosted_type, proteinIndex, drugIndex] = boosted_prob

            output_protein_rbm_training[boosted_type, drugIndex, proteinIndex] = 1.
            isMissingProteinTraining[drugIndex, proteinIndex] = 0
            probProteinTraining[boosted_type, drugIndex, proteinIndex] = boosted_prob

    return [output_drug_rbm_training, isMissingDrugTraining, output_drug_rbm_testing, isMissingDrugTesting, probDrugTraining, probDrugTesting, output_protein_rbm_training, isMissingProteinTraining, output_protein_rbm_testing, isMissingProteinTesting, probProteinTraining, probProteinTesting]

## The data can be formatted for any value of `omega` from 0.0 to 0.9 (in steps of 0.1).
## Our model performed best with `omega = 0.5`, so only that value is used below to format the final data.

## We format the data for the Similarity-Boosted Method (distinction between direct and indirect interactions)

In [16]:
n_splits = 10
threshold = 0.0

# omega values to export, expressed in tenths (5 -> omega = 0.5). Driving the sweep
# from integers keeps both omega and the folder suffix exact; accumulating
# `omega += 0.1` drifts (e.g. 0.7999...) and int(omega * 10) then names the wrong folder.
omega_tenths_to_export = [5]


def _write_matrix_csv(matrix, directory, filename):
    """Write a 2-D array to `directory/filename` as a bare CSV (no index, no header)."""
    pd.DataFrame(matrix).to_csv(directory + "/" + filename, index=False, header=False)


for omega_tenths in omega_tenths_to_export:
    omega = omega_tenths / 10
    for i in range(1, n_splits + 1):
        print('Step {0}'.format(i))
        X_train = pd.read_csv('../../datasets/matador_splits/{0}/X_train.csv'.format(i))
        X_train_matrix = to_matrix(X_train)
        (data_drug_rbm_training, isMissing_drug_rbm_training,
         data_drug_rbm_testing, isMissing_drug_rbm_testing,
         probDrugTraining, probDrugTesting,
         data_protein_rbm_training, isMissing_protein_rbm_training,
         data_protein_rbm_testing, isMissing_protein_rbm_testing,
         probProteinTraining, probProteinTesting) = format_similarity_boosted_data(dti, X_train, X_train_matrix, threshold, omega)

        folder = "similarity_boosted_omega_0_{0}/{1}".format(omega_tenths, i)
        directory = "../../datasets/{0}".format(folder)
        # exist_ok lets the cell be re-run without failing on folders from a previous run.
        os.makedirs(directory, exist_ok=True)

        # Tensors with one 2-D slice per interaction type; each slice is written to
        # "<type index>_<file name>". The file names use "target" for tensors whose
        # rows are proteins and "drug" for tensors whose rows are drugs.
        per_type_outputs = {
            "df_X_train_target_rbm.csv": data_drug_rbm_training,
            "df_X_test_target_rbm.csv": data_drug_rbm_testing,
            "df_X_train_drug_rbm.csv": data_protein_rbm_training,
            "df_X_test_drug_rbm.csv": data_protein_rbm_testing,
            "df_probTargetTraining.csv": probDrugTraining,
            "df_probTargetTesting.csv": probDrugTesting,
            "df_probDrugTraining.csv": probProteinTraining,
            "df_probDrugTesting.csv": probProteinTesting,
        }
        for interaction in range(num_interaction_types):
            for filename, tensor in per_type_outputs.items():
                _write_matrix_csv(tensor[interaction], directory, "{0}_{1}".format(interaction, filename))

        # Missing-value masks are shared by all interaction types: one file each.
        mask_outputs = {
            "df_isMissing_target_rbm_training.csv": isMissing_drug_rbm_training,
            "df_isMissing_target_rbm_testing.csv": isMissing_drug_rbm_testing,
            "df_isMissing_drug_rbm_training.csv": isMissing_protein_rbm_training,
            "df_isMissing_drug_rbm_testing.csv": isMissing_protein_rbm_testing,
        }
        for filename, mask in mask_outputs.items():
            _write_matrix_csv(mask, directory, filename)

Step 1
Step 2
Step 3
Step 4
Step 5
Step 6
Step 7
Step 8
Step 9
Step 10


## Statistics of the dataset

In [16]:
# Row counts per interaction-type code (1 = direct, 2 = indirect).
interaction_codes = data_matador_df['interaction_type']
sum_direct = int((interaction_codes == 1).sum())
sum_indirect = int((interaction_codes == 2).sum())

In [17]:
known_interactions = sum_direct + sum_indirect
total_interactions = num_drugs * num_proteins
unknown_interactions = total_interactions - known_interactions


def _percent(part, whole):
    """Return `part` as a percentage of `whole`."""
    return part * 100 / whole


# Size of the drug x protein grid.
print('Number of drugs: {0}'.format(num_drugs))
print('Number of target proteins: {0}'.format(num_proteins))

# Breakdown of the known interactions by type.
print('Number of direct interactions: {0} ({1:0.2f}% of known interactions, {2:0.2f}% of possible interactions)'.format(
    sum_direct, _percent(sum_direct, known_interactions), _percent(sum_direct, total_interactions)))
print('Number of indirect interactions: {0} ({1:0.2f}% of known interactions, {2:0.2f}% of possible interactions)'.format(
    sum_indirect, _percent(sum_indirect, known_interactions), _percent(sum_indirect, total_interactions)))

# How sparse the interaction grid is.
print('Number of possible interactions: {0}'.format(total_interactions))
print('Number of known interactions: {0} ({1:0.2f}% of possible interactions)'.format(
    known_interactions, _percent(known_interactions, total_interactions)))
print('Number of unknown interactions: {0} ({1:0.2f}% of possible interactions)'.format(
    unknown_interactions, _percent(unknown_interactions, total_interactions)))

Number of drugs: 684
Number of target proteins: 1434
Number of direct interactions: 4539 (58.44% of known interactions, 0.46% of possible interactions)
Number of indirect interactions: 3228 (41.56% of known interactions, 0.33% of possible interactions)
Number of possible interactions: 980856
Number of known interactions: 7767 (0.79% of possible interactions)
Number of unknown interactions: 973089 (99.21% of possible interactions)
