In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import rbf_kernel

In [2]:
class NRLMF:
    """Low-rank factorisation of an interaction matrix, ``Y ~ D @ T.T``.

    The factors are fitted by alternating ridge-regularised least-squares
    updates (see ``update_D`` / ``update_T``).

    NOTE(review): despite the class name, the hyper-parameters ``c``,
    ``gamma``, ``alpha``, ``beta`` and ``theta`` as well as the similarity
    matrices ``Sd`` / ``St`` are only stored on the instance; no method of
    this class reads them. They are kept so existing callers keep working.

    Parameters
    ----------
    c, gamma, alpha, beta, theta : number
        Stored but currently unused (see note above).
    lambda_d : number
        Ridge penalty added to the diagonal when solving for ``D``.
    lambda_t : number
        Ridge penalty added to the diagonal when solving for ``T``.
    r : int
        Number of latent dimensions (columns of ``D`` and ``T``).
    max_iter : int
        Number of alternating ``update_D`` / ``update_T`` sweeps.
    random_state : int or None, optional
        Seed for the random initialisation of ``D`` and ``T``. With the
        default ``None`` the global ``np.random`` state is used, exactly as
        before this parameter existed.
    """

    def __init__(self, c=5, gamma=1, lambda_d=1, lambda_t=1, r=10, alpha=0.01, beta=0.01, theta=0.01, max_iter=100, random_state=None):
        self.c = c
        self.gamma = gamma
        self.lambda_d = lambda_d
        self.lambda_t = lambda_t
        self.r = r
        self.alpha = alpha
        self.beta = beta
        self.theta = theta
        self.max_iter = max_iter
        self.random_state = random_state

    def fix_model(self, Y, Sd, St):
        """Fit the latent factors ``D`` and ``T`` to ``Y``.

        Parameters
        ----------
        Y : array-like, 2-D
            Interaction matrix. Rows are paired with ``D`` and columns with
            ``T``; the attribute names ``num_drugs`` / ``num_targets`` simply
            refer to rows / columns of whatever matrix is passed in.
        Sd, St : any
            Stored on the instance but not used by the fitting procedure.

        Raises
        ------
        ValueError
            If ``Y`` is not two-dimensional.
        """
        # Coerce once so the matrix products below always run on a float
        # ndarray, whatever array-like the caller handed over.
        Y = np.asarray(Y, dtype=float)
        if Y.ndim != 2:
            raise ValueError("Y must be a 2-D interaction matrix, got %d dimension(s)" % Y.ndim)

        self.Y = Y
        self.Sd = Sd
        self.St = St
        self.num_drugs, self.num_targets = Y.shape

        # Fall back to the global generator when no seed is given so that an
        # outer ``np.random.seed(...)`` keeps controlling the initialisation.
        if self.random_state is None:
            rand = np.random.rand
        else:
            rand = np.random.RandomState(self.random_state).rand
        self.D = rand(self.num_drugs, self.r)
        self.T = rand(self.num_targets, self.r)

        for _ in range(self.max_iter):
            self.update_D()
            self.update_T()

    def update_D(self):
        """Recompute every row of ``D`` with ``T`` held fixed.

        Each row ``i`` solves ``(T.T @ T + lambda_d * I) d_i = Y[i] @ T``.
        The coefficient matrix is the same for every row, so it is built
        once and all right-hand sides are solved in a single call.
        """
        gram = self.T.T @ self.T + self.lambda_d * np.eye(self.r)
        rhs = self.Y @ self.T  # one right-hand side per row of Y
        # ``solve`` expects right-hand sides as columns, hence the transposes.
        self.D = np.linalg.solve(gram, rhs.T).T

    def update_T(self):
        """Recompute every row of ``T`` with ``D`` held fixed.

        Each row ``j`` solves ``(D.T @ D + lambda_t * I) t_j = Y[:, j] @ D``.
        As in ``update_D``, the shared system is solved once for all rows.
        """
        gram = self.D.T @ self.D + self.lambda_t * np.eye(self.r)
        rhs = self.Y.T @ self.D  # one right-hand side per column of Y
        self.T = np.linalg.solve(gram, rhs.T).T

    def predict(self):
        """Return the reconstructed score matrix ``D @ T.T`` (same shape as ``Y``)."""
        return self.D @ self.T.T

In [25]:
# Read the protein similarity table; the first CSV column becomes the row index.
protein_data = pd.read_csv(
    filepath_or_buffer='../../data/3-nrlmf/1-input_nrlmf/prot-simmilarity_chembl.csv',
    index_col=0,
)
protein_data

Unnamed: 0_level_0,Q9UBK2,O15379,P36894,P20393,P06213,P35354,P31040,P08235,Q9NTG7,P13945,...,O75751,Q01469,P41134,Q12770,P10997,Q5NUL3,Q9H093,Q96P68,O14894,P81277
UniProt ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Q9UBK2,1.000000,0.124033,-0.304538,0.703053,0.528564,-0.082183,0.055235,0.658694,-0.170334,-0.651657,...,-0.901269,0.191961,-0.138962,-0.357468,-0.499048,-0.868908,0.765716,-0.721297,-0.608353,-0.101723
O15379,0.124033,1.000000,0.472547,0.037663,0.748333,0.695818,0.094985,0.122090,-0.103689,-0.342581,...,-0.016447,-0.109970,-0.604647,-0.031207,-0.375754,-0.122732,-0.204805,0.199457,0.062177,-0.246098
P36894,-0.304538,0.472547,1.000000,-0.490480,0.326475,0.501958,-0.107800,-0.336675,-0.491087,-0.175350,...,0.335484,0.295886,-0.479159,-0.245005,0.196147,0.450608,-0.667792,0.659123,0.330075,-0.190226
P20393,0.703053,0.037663,-0.490480,1.000000,0.350182,-0.161858,0.256173,0.943384,0.267330,-0.112879,...,-0.523553,-0.184052,0.163461,0.225290,-0.339245,-0.699213,0.620324,-0.533549,-0.085914,0.129407
P06213,0.528564,0.748333,0.326475,0.350182,1.000000,0.538327,-0.014559,0.393773,-0.141708,-0.466266,...,-0.330743,0.027838,-0.570713,-0.104932,-0.403117,-0.389679,0.140253,-0.066264,-0.103027,-0.137088
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Q5NUL3,-0.868908,-0.122732,0.450608,-0.699213,-0.389679,0.212889,-0.329353,-0.665979,0.042426,0.636961,...,0.902511,-0.220592,-0.076655,0.315661,0.566723,1.000000,-0.848141,0.896835,0.477721,0.076651
Q9H093,0.765716,-0.204805,-0.667792,0.620324,0.140253,-0.342823,0.196563,0.523463,0.191997,-0.416286,...,-0.786808,-0.010630,0.175053,-0.159909,-0.438217,-0.848141,1.000000,-0.900847,-0.552793,0.217907
Q96P68,-0.721297,0.199457,0.659123,-0.533549,-0.066264,0.473888,-0.360450,-0.446538,-0.144136,0.471122,...,0.814569,-0.224738,-0.313071,0.278548,0.520949,0.896835,-0.900847,1.000000,0.502521,-0.108054
O14894,-0.608353,0.062177,0.330075,-0.085914,-0.103027,-0.120151,0.445027,-0.012309,0.359502,0.591772,...,0.708774,-0.076303,0.381709,0.556047,0.348763,0.477721,-0.552793,0.502521,1.000000,0.272054


In [26]:
# Read the molecule similarity table; the first CSV column becomes the row index.
molecule_data = pd.read_csv(
    filepath_or_buffer='../../data/3-nrlmf/1-input_nrlmf/mol-simmilarity_chembl.csv',
    index_col=0,
)
molecule_data

Unnamed: 0_level_0,COLFORSIN,DEXAMETHASONE,VORINOSTAT,TACEDINALINE,DACINOSTAT,SPLITOMICIN,ROMIDEPSIN,TRICHOSTATIN,APICIDIN,UBENIMEX,...,PHENOXYBENZAMINE,SCHISANTHERIN A,SCHISANTHEROL A,TETRACYCLINE,LINOLENIC ACID,"4,4'-DIHYDROXYCHALCONE",KISSPEPTIN-10,DAMGO,NALOXONE,NOCICEPTIN
Drug,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
COLFORSIN,1.000000,0.776396,0.415128,0.322832,0.427948,0.604165,0.585769,0.400028,0.583266,0.561644,...,0.401130,0.714488,0.720930,0.652111,0.545595,0.539164,0.567454,0.544630,0.643325,0.570937
DEXAMETHASONE,0.776396,1.000000,0.452911,0.299382,0.510252,0.442326,0.543220,0.392792,0.574705,0.498202,...,0.488240,0.640503,0.646997,0.642540,0.569210,0.500000,0.596399,0.602197,0.615840,0.602495
VORINOSTAT,0.415128,0.452911,1.000000,0.585517,0.770329,0.333890,0.551447,0.716541,0.631586,0.641026,...,0.368549,0.275086,0.293032,0.492155,0.501280,0.377426,0.556121,0.615868,0.522976,0.599501
TACEDINALINE,0.322832,0.299382,0.585517,1.000000,0.462910,0.401286,0.484322,0.593914,0.529050,0.616334,...,0.348025,0.360668,0.381529,0.617213,0.215166,0.453609,0.549021,0.608006,0.340459,0.571440
DACINOSTAT,0.427948,0.510252,0.770329,0.462910,1.000000,0.417959,0.584094,0.639209,0.750680,0.556349,...,0.571187,0.417392,0.427948,0.535714,0.478091,0.440959,0.696143,0.715868,0.727393,0.638311
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"4,4'-DIHYDROXYCHALCONE",0.539164,0.500000,0.377426,0.453609,0.440959,0.688062,0.374634,0.436436,0.450749,0.452911,...,0.464991,0.588968,0.575108,0.535450,0.368932,1.000000,0.497000,0.550395,0.545275,0.426006
KISSPEPTIN-10,0.567454,0.596399,0.556121,0.549021,0.696143,0.491398,0.722867,0.440196,0.726424,0.734874,...,0.550562,0.542387,0.567454,0.629844,0.416025,0.497000,1.000000,0.800762,0.658281,0.896718
DAMGO,0.544630,0.602197,0.615868,0.608006,0.715868,0.486908,0.654979,0.551075,0.788051,0.703849,...,0.700040,0.514851,0.544630,0.715868,0.491436,0.550395,0.800762,1.000000,0.672927,0.797993
NALOXONE,0.643325,0.615840,0.522976,0.340459,0.727393,0.595880,0.612836,0.419961,0.699395,0.501186,...,0.626412,0.616324,0.622573,0.618284,0.486864,0.545275,0.658281,0.672927,1.000000,0.562183


In [27]:
# Read the protein-by-molecule interaction matrix; the first CSV column
# becomes the row index.
interaction_data = pd.read_csv(
    filepath_or_buffer='../../data/3-nrlmf/1-input_nrlmf/matriks-chemogenomic_chembl.csv',
    index_col=0,
)
interaction_data

Unnamed: 0_level_0,(+)-EHNA,"(E)-3,4,3',5'-TETRAMETHOXYSTILBENE",(R)-9s,(R)-PIA,(R)-THIORPHAN,(S)-PIA,(S)-THIORPHAN,"(Z)-3,4,3',5'-TETRAMETHOXYSTILBENE","1,3-DIPROPYL-8-CYCLOPENTYLXANTHINE [DPCPX]","2,4-DIHYDROXYBENZOPHENONE",...,XERUBORBACTAM,XL-019,ZAPRINAST,ZD-4190,ZD-6169,ZIDOVUDINE,ZILEUTON,ZSTK-474,p-NITROPHENYLPHOSPHATE,succinyl-CoA
UniProt ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
O14686,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
O14894,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
O15264,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
O15379,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
O15519,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Q9UMX1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Q9Y233,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
Q9Y478,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Q9Y5S1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [28]:
# Total number of cells equal to 1 in the interaction matrix
# (per-column counts first, then summed over all columns).
jumlah_nilai_1 = interaction_data.eq(1).sum(axis=0).sum()
jumlah_nilai_1

679

In [29]:
# Convert data to numpy arrays.
# Y has one row per protein (index of interaction_data) and one column per
# molecule (columns of interaction_data).
Y = interaction_data.values
Sd = protein_data.values
St = molecule_data.values
# NOTE(review): the displayed similarity tables start with different labels
# than interaction_data (e.g. Q9UBK2 vs O14686), so Sd/St do not appear to be
# aligned with the row/column order of Y. The NRLMF class in this notebook
# never reads Sd/St, so this has no effect today - align them before any
# model that actually uses the similarities is plugged in.

# Define parameters for NRLMF
params = {
    'c': 5,
    'gamma': 1,
    'lambda_d': 1,
    'lambda_t': 1,
    'r': 10,
    'alpha': 0.01,
    'beta': 0.01,
    'theta': 0.01,
    'max_iter': 100
}

# The factors are initialised with np.random.rand, so fix the global seed
# right before training; otherwise every run yields different scores and a
# different exported negative set.
SEED = 42
np.random.seed(SEED)

# Initialize and train NRLMF model
nrlmf_model = NRLMF(**params)
nrlmf_model.fix_model(Y, Sd, St)

# Perform predictions (score matrix with the same shape as Y)
Y_pred = nrlmf_model.predict()

# Prepare to store low probability pairs
protein_ids = interaction_data.index.values
molecule_ids = interaction_data.columns.values
low_prob_pairs = []

In [30]:
# Find low probability pairs for each protein: for every row of Y, pick as
# many of the lowest-scoring *unobserved* molecules as the protein has known
# interactions. The scores are raw values from Y_pred, not calibrated
# probabilities (they can be negative).

# Start from an empty list so that re-running this cell does not append the
# same pairs a second time.
low_prob_pairs = []
for i in range(Y.shape[0]):
    known_interactions = Y[i]
    known_mask = known_interactions == 1
    n_known = int(known_mask.sum())
    if n_known == 0:
        # Nothing to balance against for this protein.
        continue

    # Copy the row: Y_pred[i] is a view, and writing np.inf through it would
    # silently overwrite the prediction matrix itself.
    pred_probs = Y_pred[i].copy()
    # Exclude known interactions by pushing them to the end of the ordering.
    pred_probs[known_mask] = np.inf

    # Never select more pairs than there are unobserved candidates; otherwise
    # the masked (inf) known interactions would be returned as negatives.
    n_select = min(n_known, int((~known_mask).sum()))
    low_prob_indices = np.argsort(pred_probs)[:n_select]
    for j in low_prob_indices:
        low_prob_pairs.append((protein_ids[i], molecule_ids[j], pred_probs[j]))

In [32]:
# Tabulate the collected (protein, molecule, score) tuples.
# Tuple position 0 is the protein id, position 1 the molecule name,
# position 2 the predicted score.
low_prob_pairs_df = pd.DataFrame(
    data=low_prob_pairs,
    columns=['UniProt ID', 'Drug', 'score'],
)
low_prob_pairs_df

Unnamed: 0,UniProt ID,Drug,score
0,O14686,DASB,-1.554430e-113
1,O14894,DASB,-3.498159e-113
2,O15264,ADENOSINE,-5.513514e-84
3,O15264,METHYLTHIOADENOSINE,-3.961245e-84
4,O15379,ADENOSINE,-1.822643e-07
...,...,...,...
674,Q9Y5S1,FLUOXETINE,-7.206639e-24
675,Q9Y6E0,ERLOTINIB,-9.714269e-02
676,Q9Y6E0,VANDETANIB,-4.360894e-02
677,Q9Y6E0,CANERTINIB,-3.977494e-02


In [33]:
# Persist the selected low-score pairs; the positional row index is not written.
low_prob_pairs_df.to_csv(
    path_or_buf="../../data/3-nrlmf/2-output_nrlmf/neg_inter_chembl.csv",
    index=False,
)