In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import rbf_kernel

In [7]:
class NRLMF:
    """Ridge-regularised matrix factorisation fitted by alternating least squares.

    The interaction matrix ``Y`` (rows x columns) is approximated by
    ``D @ T.T``, where ``D`` holds one latent vector of length ``r`` per row
    and ``T`` one per column.

    Only ``lambda_d``, ``lambda_t``, ``r``, ``max_iter`` and ``random_state``
    influence the fit. ``c``, ``gamma``, ``alpha``, ``beta`` and ``theta`` are
    stored on the instance but are not read by any method of this class, and
    neither are the similarity matrices passed to ``fix_model``.
    """

    def __init__(self, c=5, gamma=1, lambda_d=1, lambda_t=1, r=10, alpha=0.01, beta=0.01, theta=0.01, max_iter=100,
                 random_state=None):
        """Store the hyper-parameters.

        Parameters
        ----------
        c, gamma, alpha, beta, theta :
            Stored only; not used by the current implementation.
        lambda_d, lambda_t : float
            Ridge penalties added to the diagonal when solving for ``D`` and
            ``T`` respectively.
        r : int
            Number of latent dimensions.
        max_iter : int
            Number of alternating ``update_D`` / ``update_T`` sweeps.
        random_state : int or None
            Seed for the factor initialisation. ``None`` (the default) draws
            from NumPy's global RNG, so ``np.random.seed`` still controls it.
        """
        self.c = c
        self.gamma = gamma
        self.lambda_d = lambda_d
        self.lambda_t = lambda_t
        self.r = r
        self.alpha = alpha
        self.beta = beta
        self.theta = theta
        self.max_iter = max_iter
        self.random_state = random_state

    def fix_model(self, Y, Sd, St):
        """Fit the latent factors ``D`` and ``T`` to the interaction matrix.

        Parameters
        ----------
        Y : array-like, 2-D
            Interaction matrix; converted with ``np.asarray`` so a DataFrame
            is indexed positionally.
        Sd, St :
            Stored on the instance as ``self.Sd`` / ``self.St``; not used by
            the updates.

        Raises
        ------
        ValueError
            If ``Y`` is not two-dimensional.
        """
        self.Y = np.asarray(Y)
        if self.Y.ndim != 2:
            raise ValueError("Y must be a 2-D interaction matrix, got ndim=%d" % self.Y.ndim)
        self.Sd = Sd
        self.St = St
        self.num_drugs, self.num_targets = self.Y.shape

        # With no explicit seed, keep drawing from the global RNG so existing
        # np.random.seed(...) calls behave exactly as before.
        rng = np.random if self.random_state is None else np.random.RandomState(self.random_state)
        self.D = rng.rand(self.num_drugs, self.r)
        self.T = rng.rand(self.num_targets, self.r)

        for _ in range(self.max_iter):
            self.update_D()
            self.update_T()

    def update_D(self):
        """Solve the ridge least-squares problem for every row of ``D`` with ``T`` fixed."""
        # The system matrix is the same for every row, so build it once and
        # solve for all right-hand sides in a single call.
        gram = self.T.T @ self.T + self.lambda_d * np.eye(self.r)
        self.D[:] = np.linalg.solve(gram, (self.Y @ self.T).T).T

    def update_T(self):
        """Solve the ridge least-squares problem for every row of ``T`` with ``D`` fixed."""
        gram = self.D.T @ self.D + self.lambda_t * np.eye(self.r)
        self.T[:] = np.linalg.solve(gram, (self.Y.T @ self.D).T).T

    def predict(self):
        """Return the reconstructed score matrix ``D @ T.T`` (same shape as ``Y``)."""
        return self.D @ self.T.T

In [2]:
# Load the input data for this notebook.
# The protein and molecule similarity loads below are commented out, so this
# cell only reads the interaction matrix.
# protein_data = pd.read_csv('../../../data/nrlmf/similarity_aaindex.csv', index_col=0)
# molecule_data = pd.read_parquet('../../../data/nrlmf/similarity_maccs.parquet')
# Interaction matrix: the first CSV column becomes the row index (index_col=0).
interaction_data = pd.read_csv('../../../data/nrlmf/nrlmfpos_aaindexmaccs.csv', index_col=0)
# Display the loaded frame as the cell output.
interaction_data

Unnamed: 0_level_0,M1,M10,M100,M1000,M10000,M100001,M100003,M100005,M100006,M100007,...,M99987,M99988,M99989,M99990,M99991,M99993,M99994,M99995,M99998,M99999
uniprot_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
O00206,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
O00329,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
O00459,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
O14746,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
O14757,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Q9UBU3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Q9UHD2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Q9UM73,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Q9UNQ0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# Count the cells of the interaction matrix that equal 1
# ("jumlah nilai 1" is Indonesian for "number of 1 values").
jumlah_nilai_1 = interaction_data.eq(1).to_numpy().sum()
jumlah_nilai_1

78191

In [None]:
# NOTE(review): matrix_molecule_protein_pivot, protein_similarity_df and
# molecule_similarity_df are not defined in any cell visible in this notebook;
# they must exist in the kernel before this cell runs -- confirm where they
# are created so the notebook survives "Restart & Run All".

# Convert data to numpy arrays
Y = matrix_molecule_protein_pivot.values
Sd = protein_similarity_df.values
St = molecule_similarity_df.values

# Define parameters for NRLMF
params = {
    'c': 5,
    'gamma': 1,
    'lambda_d': 1,
    'lambda_t': 1,
    'r': 10,
    'alpha': 0.01,
    'beta': 0.01,
    'theta': 0.01,
    'max_iter': 100
}

# fix_model initialises its factors randomly from NumPy's global RNG, so seed
# it here to make the fitted scores reproducible from run to run.
SEED = 42
np.random.seed(SEED)

# Initialize and train NRLMF model
nrlmf_model = NRLMF(**params)
nrlmf_model.fix_model(Y, Sd, St)

# Perform predictions
Y_pred = nrlmf_model.predict()

# Prepare to store low probability pairs
# Rows of the pivot are used as protein ids, columns as molecule ids.
protein_ids = matrix_molecule_protein_pivot.index.values
molecule_ids = matrix_molecule_protein_pivot.columns.values
low_prob_pairs = []

In [None]:
# For each protein (row of Y), pick as many of the lowest-scoring unknown
# pairs as that protein has known interactions.
# Start from an empty list so re-running this cell does not append duplicates.
low_prob_pairs = []
for i in range(Y.shape[0]):
    # Known interactions are the entries equal to 1
    known_mask = Y[i] == 1
    # Copy the row: Y_pred[i] is a view, so masking it directly would
    # overwrite the predictions stored in Y_pred.
    pred_probs = Y_pred[i].copy()
    # Exclude known interactions by pushing them to the end of the ascending sort
    pred_probs[known_mask] = np.inf
    # Use an integer count, and never ask for more pairs than there are
    # unknown entries -- otherwise inf-masked known pairs would be selected.
    n_known = int(known_mask.sum())
    n_select = min(n_known, known_mask.size - n_known)
    # Get indices of the lowest predicted scores
    low_prob_indices = np.argsort(pred_probs)[:n_select]
    for j in low_prob_indices:
        low_prob_pairs.append((protein_ids[i], molecule_ids[j], pred_probs[j]))

In [None]:
# Tabulate the collected (protein, molecule, score) tuples.
# Column '0' holds the protein id and column '1' the molecule id.
pair_columns = ['0', '1', 'score']
low_prob_pairs_df = pd.DataFrame(data=low_prob_pairs, columns=pair_columns)
low_prob_pairs_df