In [2]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import rbf_kernel

In [3]:
class NRLMF:
    """Low-rank factorisation of an interaction matrix by alternating ridge solves.

    The matrix ``Y`` (rows are called "drugs", columns "targets" throughout
    this class) is approximated by ``D @ T.T`` where ``D`` and ``T`` each have
    ``r`` columns. ``D`` and ``T`` are updated in turn by solving an
    L2-regularised least-squares problem with the other factor held fixed.

    Parameters
    ----------
    c, gamma, alpha, beta, theta :
        Stored on the instance. No method of this class reads them.
    lambda_d, lambda_t :
        Ridge penalties added to the diagonal of the normal equations when
        updating ``D`` and ``T`` respectively.
    r :
        Number of latent dimensions.
    max_iter :
        Number of alternating (update_D, update_T) sweeps.
    random_state :
        Optional seed for the factor initialisation. ``None`` (the default)
        draws from NumPy's global ``np.random`` state, exactly as before this
        parameter existed, so ``np.random.seed(...)`` still takes effect.
    """

    def __init__(self, c=5, gamma=1, lambda_d=1, lambda_t=1, r=10, alpha=0.01, beta=0.01, theta=0.01, max_iter=100, random_state=None):
        self.c = c
        self.gamma = gamma
        self.lambda_d = lambda_d
        self.lambda_t = lambda_t
        self.r = r
        self.alpha = alpha
        self.beta = beta
        self.theta = theta
        self.max_iter = max_iter
        self.random_state = random_state

    def fix_model(self, Y, Sd, St):
        """Fit the latent factors ``D`` and ``T`` to ``Y``.

        Parameters
        ----------
        Y :
            2-D array-like interaction matrix.
        Sd, St :
            Stored as ``self.Sd`` / ``self.St``. They are not used by the
            updates implemented in this class.

        Raises
        ------
        ValueError
            If ``Y`` is not two-dimensional.
        """
        Y = np.asarray(Y)
        if Y.ndim != 2:
            raise ValueError(f"Y must be a 2-D matrix, got an array with ndim={Y.ndim}")

        self.Y = Y
        self.Sd = Sd
        self.St = St
        self.num_drugs, self.num_targets = Y.shape

        # Fall back to the global generator when no seed is given so callers
        # relying on np.random.seed() keep their behaviour.
        if self.random_state is None:
            rng = np.random
        else:
            rng = np.random.RandomState(self.random_state)
        self.D = rng.rand(self.num_drugs, self.r)
        self.T = rng.rand(self.num_targets, self.r)

        for _ in range(self.max_iter):
            self.update_D()
            self.update_T()

    def update_D(self):
        """Recompute every row of ``D`` with ``T`` held fixed.

        Each row i solves ``(T'T + lambda_d * I) d_i = T' y_i``. The left-hand
        matrix is identical for all rows, so it is built once and all
        right-hand sides are solved in a single call.
        """
        gram = self.T.T @ self.T + self.lambda_d * np.eye(self.r)
        rhs = self.Y @ self.T  # row i is Y[i] @ T
        self.D = np.linalg.solve(gram, rhs.T).T

    def update_T(self):
        """Recompute every row of ``T`` with ``D`` held fixed.

        Each row j solves ``(D'D + lambda_t * I) t_j = D' y_j`` where ``y_j``
        is column j of ``Y``; all columns are solved in one call.
        """
        gram = self.D.T @ self.D + self.lambda_t * np.eye(self.r)
        rhs = self.Y.T @ self.D  # row j is Y[:, j] @ D
        self.T = np.linalg.solve(gram, rhs.T).T

    def predict(self):
        """Return the reconstructed score matrix ``D @ T.T`` (same shape as ``Y``)."""
        return self.D @ self.T.T

In [4]:
# Protein similarity matrix; the first CSV column is used as the row index.
protein_data = pd.read_csv(
    filepath_or_buffer='../../data/3-nrlmf/1-input_nrlmf/prot-simmilarity_bindingdb.csv',
    index_col=0,
)
protein_data

Unnamed: 0_level_0,O15379,P36894,P20393,P06213,P35354,P08235,Q9NTG7,P13945,O95140,Q9H0K1,...,P11086,Q9Y6E0,P62508,Q16552,P49763,Q01469,O43194,Q5NUL3,Q9H093,Q7Z4H4
UniProt ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
O15379,1.000000,0.445442,0.056997,0.738728,0.680135,0.142965,-0.102300,-0.329581,-0.246953,-0.095607,...,-0.329105,-0.251004,-0.189997,0.399919,0.035464,-0.133709,-0.116645,-0.151527,-0.227225,-0.135784
P36894,0.445442,1.000000,-0.473966,0.265636,0.423584,-0.311948,-0.454940,-0.159260,0.276228,-0.558050,...,-0.595314,-0.487953,0.077909,0.096086,-0.255061,0.290213,0.489863,0.429425,-0.707432,-0.543418
P20393,0.056997,-0.473966,1.000000,0.392952,-0.150037,0.945981,0.278616,-0.071074,-0.660764,0.432233,...,0.158755,0.145551,-0.182383,0.553563,0.498066,-0.169259,-0.498443,-0.669896,0.614452,0.767112
P06213,0.738728,0.265636,0.392952,1.000000,0.480054,0.441825,-0.115547,-0.434822,-0.317298,0.100204,...,-0.391025,-0.088088,-0.106384,0.725522,0.373736,-0.005924,-0.297441,-0.433404,0.123441,0.132615
P35354,0.680135,0.423584,-0.150037,0.480054,1.000000,-0.125586,-0.222264,-0.164691,-0.073429,0.129437,...,-0.587644,-0.460732,-0.302720,0.440246,0.141608,-0.313627,0.303626,0.176200,-0.380789,-0.186409
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Q01469,-0.133709,0.290213,-0.169259,-0.005924,-0.313627,-0.089763,-0.442933,-0.508036,0.486992,-0.352723,...,-0.208250,0.324278,0.305606,-0.248709,-0.336389,1.000000,-0.163729,-0.228452,-0.008381,-0.338111
O43194,-0.116645,0.489863,-0.498443,-0.297441,0.303626,-0.467428,-0.187152,0.432939,0.366611,-0.324539,...,-0.352884,-0.754025,-0.144368,-0.058878,-0.147349,-0.163729,1.000000,0.867246,-0.780265,-0.449770
Q5NUL3,-0.151527,0.429425,-0.669896,-0.433404,0.176200,-0.635694,0.094151,0.648345,0.243106,-0.468925,...,-0.056952,-0.660063,-0.267613,-0.243547,-0.273415,-0.228452,0.867246,1.000000,-0.830788,-0.467036
Q9H093,-0.227225,-0.707432,0.614452,0.123441,-0.380789,0.522877,0.199391,-0.368273,-0.288914,0.506368,...,0.280411,0.717479,0.182256,0.135897,0.307813,-0.008381,-0.780265,-0.830788,1.000000,0.655701


In [6]:
# Molecule similarity matrix, read from Parquet with the pyarrow engine.
molecule_data = pd.read_parquet(
    path='../../data/3-nrlmf/1-input_nrlmf/mol-simmilarity_bindingdb.parquet',
    engine="pyarrow",
)
molecule_data

Drug,d1,d2,d3,d4,d5,d6,d7,d8,d9,d10,...,d38709,d38710,d38711,d38712,d38713,d38714,d38715,d38716,d38717,d38718
Drug,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
d1,1.000000,0.418718,0.386244,0.319489,0.404520,0.402911,0.316723,0.311400,0.301511,0.316723,...,0.422200,0.435194,0.406558,0.231627,0.270232,0.268044,0.265908,0.265908,0.259794,0.272475
d2,0.418718,1.000000,0.716541,0.470073,0.460044,0.639209,0.628092,0.617535,0.597925,0.628092,...,0.496956,0.267261,0.374513,0.474156,0.553183,0.548703,0.563771,0.544331,0.531816,0.557773
d3,0.386244,0.716541,1.000000,0.551447,0.549021,0.770329,0.820008,0.806226,0.780625,0.820008,...,0.650250,0.439138,0.496609,0.492055,0.533060,0.549080,0.544705,0.585053,0.591312,0.537484
d4,0.319489,0.470073,0.551447,1.000000,0.572604,0.584094,0.521759,0.530089,0.513256,0.521759,...,0.704792,0.554421,0.660820,0.542685,0.627479,0.639220,0.634126,0.667502,0.668459,0.632687
d5,0.404520,0.460044,0.549021,0.572604,1.000000,0.617535,0.567646,0.596595,0.577651,0.567646,...,0.605350,0.559431,0.643224,0.610772,0.687118,0.700486,0.676123,0.694905,0.660578,0.692820
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
d38714,0.268044,0.548703,0.549080,0.639220,0.700486,0.695815,0.650361,0.672221,0.650876,0.650361,...,0.675776,0.549927,0.633614,0.845556,0.975642,1.000000,0.976031,0.976031,0.937958,0.983739
d38715,0.265908,0.563771,0.544705,0.634126,0.676123,0.690271,0.645179,0.666865,0.645689,0.645179,...,0.670391,0.545545,0.645553,0.838819,0.951737,0.976031,1.000000,0.968254,0.930484,0.959635
d38716,0.265908,0.544331,0.585053,0.667502,0.694905,0.723943,0.678265,0.699395,0.677186,0.678265,...,0.705675,0.563730,0.662541,0.871081,0.951737,0.976031,0.968254,1.000000,0.961500,0.959635
d38717,0.259794,0.531816,0.591312,0.668459,0.660578,0.707298,0.694996,0.715097,0.692390,0.694996,...,0.706687,0.568535,0.663906,0.866814,0.929855,0.937958,0.930484,0.961500,1.000000,0.937572


In [7]:
# Interaction matrix; the first CSV column is used as the row index.
interaction_data = pd.read_csv(
    filepath_or_buffer='../../data/3-nrlmf/1-input_nrlmf/matriks-chemogenomic_bindingdb.csv',
    index_col=0,
)
interaction_data

Unnamed: 0_level_0,d1,d10,d100,d1000,d10000,d10001,d10002,d10003,d10004,d10005,...,d9990,d9991,d9992,d9993,d9994,d9995,d9996,d9997,d9998,d9999
UniProt ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
O00206,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
O00763,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
O00767,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
O15264,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
O15294,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Q9ULZ1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Q9UNA0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Q9Y233,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Q9Y478,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# Total number of cells equal to 1 in the interaction matrix
# ("jumlah nilai 1" = "count of value 1").
jumlah_nilai_1 = interaction_data.eq(1).to_numpy().sum()
jumlah_nilai_1

40674

In [9]:
# Convert data to numpy arrays.
# NOTE(review): interaction_data is indexed by UniProt ID with one column per
# drug, so the rows of Y are proteins and the columns are molecules. NRLMF
# names its row axis "drugs" and its column axis "targets"; Sd / St below hold
# the row-entity (protein) and column-entity (molecule) similarities.
# NOTE(review): the displayed label order of the similarity frames differs
# from interaction_data (e.g. d1, d2, d3 vs d1, d10, d100). The model does not
# read Sd / St today, but they should be aligned to Y's labels before any code
# consumes them — TODO confirm.
Y = interaction_data.values
Sd = protein_data.values
St = molecule_data.values

# The factor matrices are initialised from NumPy's global random state; seed it
# so the predictions (and the file written from them) are reproducible on a
# fresh Restart & Run All.
SEED = 42
np.random.seed(SEED)

# Define parameters for NRLMF
params = {
    'c': 5,
    'gamma': 1,
    'lambda_d': 1,
    'lambda_t': 1,
    'r': 10,
    'alpha': 0.01,
    'beta': 0.01,
    'theta': 0.01,
    'max_iter': 100
}

# Initialize and train NRLMF model
nrlmf_model = NRLMF(**params)
nrlmf_model.fix_model(Y, Sd, St)

# Perform predictions (same shape as Y: one score per protein/molecule pair)
Y_pred = nrlmf_model.predict()

# Row / column labels used to name the selected pairs, plus the result list
protein_ids = interaction_data.index.values
molecule_ids = interaction_data.columns.values
low_prob_pairs = []

In [10]:
# For every protein, pick as many lowest-scoring *unknown* molecules as the
# protein has known interactions.
# Start from an empty list so re-running this cell does not append duplicates.
low_prob_pairs = []

for i in range(Y.shape[0]):
    known_mask = Y[i] == 1
    n_known = int(known_mask.sum())
    # Rank only the columns without a known interaction. This keeps known
    # pairs out of the result even when they outnumber the unknown ones, and
    # avoids writing sentinel values into Y_pred (Y_pred[i] is a view).
    candidates = np.flatnonzero(~known_mask)
    if n_known == 0 or candidates.size == 0:
        continue
    candidate_scores = Y_pred[i, candidates]  # fancy indexing returns a copy
    # Stable sort so ties between equal scores resolve in column order.
    order = np.argsort(candidate_scores, kind="stable")[:n_known]
    for j, score in zip(candidates[order], candidate_scores[order]):
        low_prob_pairs.append((protein_ids[i], molecule_ids[j], score))

In [11]:
# Tabulate the collected triples: each tuple is (protein id, molecule id, score).
low_prob_pairs_df = pd.DataFrame(
    data=low_prob_pairs,
    columns=['UniProt ID', 'Drug', 'score'],
)
low_prob_pairs_df

Unnamed: 0,UniProt ID,Drug,score
0,O00206,d13900,-4.515478e-170
1,O00206,d13933,-4.503280e-170
2,O00206,d13548,-4.503280e-170
3,O00206,d13467,-4.503280e-170
4,O00206,d13547,-4.503280e-170
...,...,...,...
40669,Q9Y478,d35246,-4.409523e-09
40670,Q9Y478,d35247,-4.409523e-09
40671,Q9Y6E0,d31118,-3.285669e-09
40672,Q9Y6E0,d31095,-3.285669e-09


In [12]:
# Write the selected pairs to CSV without the positional row index.
low_prob_pairs_df.to_csv(
    path_or_buf="../../data/3-nrlmf/2-output_nrlmf/neg_inter_bindingdb.csv",
    index=False,
)