In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
import torch

In [3]:
os.environ['TORCH'] = torch.__version__
print(torch.__version__)

!pip install -q torch-scatter -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q torch-sparse -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q git+https://github.com/pyg-team/pytorch_geometric.git

2.5.1+cu121
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.9/10.9 MB[0m [31m101.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.1/5.1 MB[0m [31m20.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for torch-geometric (pyproject.toml) ... [?25l[?25hdone


In [4]:
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.utils import negative_sampling

In [5]:
df = pd.read_csv('./4932.protein.physical.links.v12.0.txt', sep = " ")
df.head()

Unnamed: 0,protein1,protein2,combined_score
0,4932.Q0045,4932.YGR222W,292
1,4932.Q0045,4932.Q0120,278
2,4932.Q0045,4932.YML030W,734
3,4932.Q0045,4932.YIR024C,292
4,4932.Q0045,4932.YOL027C,286


### Taking only the most confident links

In [6]:
df = df[df['combined_score'] >= 700].reset_index(drop = True)

### Cleaning the protein codes

In [7]:
def clean_names(x):
    if('4932.' in x):
        x = x.replace('4932.', '')
    return x

df['protein1'] = df['protein1'].apply(lambda x : clean_names(x))
df['protein2'] = df['protein2'].apply(lambda x : clean_names(x))

df.head()

Unnamed: 0,protein1,protein2,combined_score
0,Q0045,YML030W,734
1,Q0045,YGL187C,999
2,Q0045,Q0275,999
3,Q0045,YLR395C,998
4,Q0045,Q0105,989


### making protein index

In [8]:
protein_names = list(set(df['protein1'].to_list() + df['protein2'].to_list()))
node_idx = {name:i for i, name in enumerate(protein_names)}

In [9]:
len(protein_names)

3384

### Loading the protein descriptions

In [10]:
df_descriptions = pd.read_csv("/content/4932.protein.info.v12.0.txt", sep = "\t")
df_descriptions['#string_protein_id'] = df_descriptions['#string_protein_id'].apply(lambda x : clean_names(x))

#take descriptions of only those proteins that we had previously filtered with combined score >= 700 in their interactions.
df_descriptions = df_descriptions[df_descriptions['#string_protein_id'].isin(protein_names)].reset_index(drop = True)

The descriptions contain semicolons (';'), which i believe can hinder the quality of the embeddings being generated from the sentence transformer model. So just replace it with a full stop '.' character to form gramatically accurate descriptions and thereby generate good quality embeddings.

In [11]:
def clean_annotation(x):
    if(';' in x):
        x = x.replace(';', '. ')
    return x

df_descriptions['annotation'] = df_descriptions['annotation'].apply(lambda x : clean_annotation(x))

In [12]:
assert len(protein_names) == df_descriptions.shape[0]

In [13]:
df_descriptions['#string_protein_id'].map(node_idx).astype(np.int64)
#this should work without any errors, otherwise there is a protein thats not mapped

Unnamed: 0,#string_protein_id
0,2718
1,1833
2,2019
3,3016
4,1831
...,...
3379,2151
3380,1281
3381,73
3382,1315


### Make the embeddings for protein descriptions

In [14]:
from sentence_transformers import SentenceTransformer
from tqdm.notebook import tqdm

model = SentenceTransformer('all-mpnet-base-v2', device='cuda')

file_name = 'protein_desc_embeddings.dat'
embeddings = np.memmap(file_name, dtype = np.float32, mode = 'w+', shape = (df_descriptions.shape[0], 768))

for index, row in tqdm(df_descriptions.iterrows(), desc = "creating embeddings", total = df_descriptions.shape[0]):
    node_name = row['#string_protein_id']
    description = row['annotation']
    embedding = model.encode(description, convert_to_tensor=False)
    embeddings[node_idx[node_name]] = embedding

embeddings.flush()
del embeddings

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

creating embeddings:   0%|          | 0/3384 [00:00<?, ?it/s]

### Removing self loops(will be added later on when predicting links)

In [15]:
df[df['protein1'] == df['protein2']]

Unnamed: 0,protein1,protein2,combined_score


In [16]:
df['idx_1'] = df['protein1'].map(node_idx)
df['idx_2'] = df['protein2'].map(node_idx)

edge_index = torch.tensor(
    df[['idx_1', 'idx_2']].to_numpy(dtype = np.int64).T
).contiguous()

In [17]:
len(set(map(tuple, df[['idx_1', 'idx_2']].to_numpy().tolist()))) #no duplicates

86060

In [18]:
from torch_geometric.data import Data
data = Data(edge_index=edge_index, num_nodes= len(protein_names))

print(data.num_nodes)
print(data.num_edges)
print(data.has_self_loops())
print(data.is_undirected())

3384
86060
False
True


In [19]:
#follow this example below to get clarity on how the train/val/test split happens at link level

'''
from torch_geometric.transforms import RandomLinkSplit
from torch_geometric.data import Data
import torch

edge_index = torch.tensor([
    [0, 1, 1, 2, 2, 3, 3, 4, 4, 0],  # Source nodes
    [1, 0, 2, 1, 3, 2, 4, 3, 0, 4]   # Target nodes (reverse for undirected)
], dtype=torch.long)

data = Data(edge_index=edge_index)

transform = RandomLinkSplit(
    is_undirected=True,
    add_negative_train_samples=False,
    num_val = 0.2,
    num_test = 0.2
)

train_data, val_data, test_data = transform(data)

print("Train Edges for train:", train_data.edge_label_index)
print("Validation Edges for val:", val_data.edge_label_index)
print("Test Edges for test:", test_data.edge_label_index)


print("Train Edges for message passing:", train_data.edge_index)
print("Validation Edges for message passing:", val_data.edge_index)
print("Test Edges for message passing:", test_data.edge_index)
'''

'\nfrom torch_geometric.transforms import RandomLinkSplit\nfrom torch_geometric.data import Data\nimport torch\n\nedge_index = torch.tensor([\n    [0, 1, 1, 2, 2, 3, 3, 4, 4, 0],  # Source nodes\n    [1, 0, 2, 1, 3, 2, 4, 3, 0, 4]   # Target nodes (reverse for undirected)\n], dtype=torch.long)\n\ndata = Data(edge_index=edge_index)\n\ntransform = RandomLinkSplit(\n    is_undirected=True,\n    add_negative_train_samples=False,\n    num_val = 0.2,\n    num_test = 0.2\n)\n\ntrain_data, val_data, test_data = transform(data)\n\nprint("Train Edges for train:", train_data.edge_label_index)\nprint("Validation Edges for val:", val_data.edge_label_index)\nprint("Test Edges for test:", test_data.edge_label_index)\n\n\nprint("Train Edges for message passing:", train_data.edge_index)\nprint("Validation Edges for message passing:", val_data.edge_index)\nprint("Test Edges for message passing:", test_data.edge_index)\n'

In [20]:
data.validate(raise_on_error=True)

True

In [21]:
from torch_geometric.transforms import RandomLinkSplit

split_transform = RandomLinkSplit(
    num_val=0.3,  # 30% links for validation
    num_test=0.3,  # 30% links for testing
    is_undirected=True,  # Set True for undirected graphs
    add_negative_train_samples=False,  # No negative edges added in train set, but will be added in validation and test set, how many ? equal to positive edges
    split_labels = True, # split positive and negative links in train, test and validation
)

train_data, val_data, test_data = split_transform(data)

In [22]:
train_data

Data(edge_index=[2, 34424], num_nodes=3384, pos_edge_label=[17212], pos_edge_label_index=[2, 17212])

In [23]:
val_data

Data(edge_index=[2, 34424], num_nodes=3384, pos_edge_label=[12909], pos_edge_label_index=[2, 12909], neg_edge_label=[12909], neg_edge_label_index=[2, 12909])

In [24]:
test_data

Data(edge_index=[2, 60242], num_nodes=3384, pos_edge_label=[12909], pos_edge_label_index=[2, 12909], neg_edge_label=[12909], neg_edge_label_index=[2, 12909])

In [25]:
class GCNLinkPrediction(torch.nn.Module):

    def __init__(self, embeddings_path, num_nodes, embedding_dim, hidden_dim, output_dim):
        super(GCNLinkPrediction, self).__init__()

        embeddings = np.memmap(embeddings_path, mode = 'r', dtype = np.float32, shape = (num_nodes, embedding_dim))
        embeddings_tensor = torch.tensor(embeddings, dtype = torch.float32)

        self.node_embeddings = torch.nn.Embedding.from_pretrained(embeddings_tensor, freeze=True)
        self.node_embeddings.weight.requires_grad = False

        self.conv1 = GCNConv(embedding_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, output_dim)

    def forward(self, edge_index):
        x = self.node_embeddings.weight
        x = self.conv1(x, edge_index)
        x = F.relu(x)

        x = self.conv2(x, edge_index)

        return x

In [26]:
num_nodes = len(protein_names)
embedding_dim = 768
hidden_dim = 64
output_dim = 32
epochs = 100

model = GCNLinkPrediction('/content/protein_desc_embeddings.dat', num_nodes, embedding_dim, hidden_dim, output_dim)
optimizer = torch.optim.Adam(model.parameters(), lr = 0.01)

In [27]:
import torch.nn.functional as F

for epoch in range(epochs):
    model.train()
    optimizer.zero_grad()

    z = model(train_data.edge_index)
    #Note :pass the message-passing behaviour(edge_index) of training set only, because message passing behaviour is different for train, val and test set.
    #The difference in message-passing behaviour is to prevent information leak between train, val and test sets.

    pos_edge_index = train_data.pos_edge_label_index
    neg_edge_index = negative_sampling(pos_edge_index, num_nodes=num_nodes)

    pos_preds = (z[pos_edge_index[0]] * z[pos_edge_index[1]]).sum(dim = 1)
    pos_labels = torch.ones(pos_preds.size(0))

    neg_preds = (z[neg_edge_index[0]] * z[neg_edge_index[1]]).sum(dim = 1)
    neg_labels = torch.zeros(neg_preds.size(0))

    preds = torch.cat([pos_preds, neg_preds], dim = 0)
    labels = torch.cat([pos_labels, neg_labels], dim = 0)

    loss = F.binary_cross_entropy_with_logits(preds, labels)
    loss.backward()
    optimizer.step()

    if(epoch%10 == 0):
        print(f"Epoch {epoch}, Loss :{loss.item()}")

Epoch 0, Loss :0.6903565526008606
Epoch 10, Loss :0.47564226388931274
Epoch 20, Loss :0.4478757679462433
Epoch 30, Loss :0.43431586027145386
Epoch 40, Loss :0.429779589176178
Epoch 50, Loss :0.4176603853702545
Epoch 60, Loss :0.4140660762786865
Epoch 70, Loss :0.4094702899456024
Epoch 80, Loss :0.4111138582229614
Epoch 90, Loss :0.4065088927745819


In [28]:
from sklearn.metrics import roc_auc_score, average_precision_score

@torch.no_grad()
def evaluate(model, data):
    model.eval()

    z = model(data.edge_index) # pass the message-passing behaviour of that particular data split

    pos_edge_index = data.pos_edge_label_index
    neg_edge_index = data.neg_edge_label_index

    pos_preds = (z[pos_edge_index[0]] * z[pos_edge_index[1]]).sum(dim = 1)
    pos_labels = torch.ones(pos_preds.size(0))

    neg_preds = (z[neg_edge_index[0]] * z[neg_edge_index[1]]).sum(dim = 1)
    neg_labels = torch.zeros(neg_preds.size(0))

    y_true = torch.cat([pos_labels, neg_labels], dim = 0)
    y_pred = torch.cat([pos_preds, neg_preds], dim = 0)

    auc = roc_auc_score(y_true, y_pred)
    ap = average_precision_score(y_true, y_pred)

    return auc, ap

# Validation
val_auc, val_ap = evaluate(model, val_data)
print(f"Validation AUC: {val_auc:.4f}, AP: {val_ap:.4f}")

# Test
test_auc, test_ap = evaluate(model, test_data)
print(f"Test AUC: {test_auc:.4f}, AP: {test_ap:.4f}")

Validation AUC: 0.9777, AP: 0.9801
Test AUC: 0.9818, AP: 0.9836
