In [109]:
import numpy as np
import pandas as pd
from pathlib import Path

## Read Node2Vec output into dataframe

In [110]:
"""
Read the node2vec embedding text file
First line = "18357 128"  → network size + embedding dim
Remaining lines: 9606.ENSP... 128 floats
"""
lut = {}                                               # lookup table: "9606.ENSP..." ID → embedding vector

with open("/home/FCAM/juli/HRP/results/ppi_node2vec.emb.txt") as f:
    f.readline()                                       # discard the "<n_nodes> <dim>" header line
                                                       # (not bound to `_`, which IPython uses for the last output)
    for line in f:                                     # one embedding row per remaining line
        items = line.split()                           # split on any whitespace run: ID + floats, no trailing '\n'
        if not items:                                  # tolerate blank lines (e.g. at the end of the file)
            continue
        pid    = items[0]                              # e.g. "9606.ENSP00000297591"
        values = np.array(items[1:], dtype=float)      # remaining strings → float array
        lut[pid] = values                              # a repeated ID overwrites the earlier row

print(list(lut.keys())[:10])
print(len(lut))

['9606.ENSP00000297591', '9606.ENSP00000263265', '9606.ENSP00000257787', '9606.ENSP00000272317', '9606.ENSP00000244537', '9606.ENSP00000269305', '9606.ENSP00000298910', '9606.ENSP00000253024', '9606.ENSP00000221419', '9606.ENSP00000388107']
18357


In [111]:
## convert lut into a dataframe
# Two columns, default integer index:
#   Ensembl_ID – the lut key with its first five characters (the "9606." prefix) removed,
#                e.g. "9606.ENSP00000297591" → "ENSP00000297591"
#   Embedding  – the vector stored under that key (one array per row)

keys = []
for full_id in lut:
    keys.append(full_id[5:])

nodevec = pd.DataFrame({"Ensembl_ID": keys, "Embedding": lut.values()})

nodevec

Unnamed: 0,Ensembl_ID,Embedding
0,ENSP00000297591,"[0.24643165, -0.11183538, -0.009804543, 0.2164..."
1,ENSP00000263265,"[-0.014286688, -0.13260867, -0.1084679, 0.1239..."
2,ENSP00000257787,"[0.07073102, 0.029735815, -0.095603794, 0.3010..."
3,ENSP00000272317,"[-0.062457982, -0.0057393736, 0.081014805, 0.2..."
4,ENSP00000244537,"[-0.054631602, -0.31432754, 0.11258947, 0.2728..."
...,...,...
18352,ENSP00000398106,"[0.008160919, -0.034701902, 0.13410378, 0.0014..."
18353,ENSP00000359340,"[-0.047072683, -0.13950668, 0.09136694, 0.0386..."
18354,ENSP00000419545,"[0.092692785, -0.09160898, 0.048314556, 0.1277..."
18355,ENSP00000340979,"[-0.054372173, -0.044822164, 0.075604595, 0.15..."


## Read the PhosphoSite table

The table is pre-joined with the GPS output, so each site already carries its kinase scores.

In [112]:
# Site table pre-joined with the GPS kinase scores; the path is relative to the
# notebook's working directory.
phos_path = Path("phossite_gps.csv")
phos_df = pd.read_csv(phos_path)
phos_df

Unnamed: 0,ACC_ID,MOD_RSD,PROTEIN,GENE,HU_CHR_LOC,SITE_GRP_ID,ORGANISM,MW_kD,DOMAIN,SITE_+/-7_AA,...,RET,SRC,SYK,TEC,TEK,TNK2,TXK,TYK2,YES1,ZAP70
0,P31946,Y21,14-3-3 beta,YWHAB,20q13.1,3426383,human,28.08,14-3-3,LAEQAERyDDMAAAM,...,22.696,20.132,16.069,283.309,0.076,-3.510000e-07,0.3820,8.706,14.834,3.295
1,P31946,Y50,14-3-3 beta,YWHAB,20q13.1,12432961,human,28.08,14-3-3,RNLLsVAykNVVGAR,...,43.900,20.266,10.192,122.378,0.828,1.730000e-06,0.3780,20.286,13.156,2.404
2,P31946,Y106,14-3-3 beta,YWHAB,20q13.1,9407297,human,28.08,14-3-3,VLELLDkyLIPNATQ,...,17.360,16.560,12.311,178.621,0.308,-2.240000e-06,-0.8760,-3.810,7.388,1.274
3,P31946,Y120,14-3-3 beta,YWHAB,20q13.1,3665374,human,28.08,14-3-3,QPEskVFyLkMkGDy,...,4.501,15.072,6.981,143.365,1.110,2.040000e-06,0.9490,-5.100,-1.100,1.431
4,P31946,Y127,14-3-3 beta,YWHAB,20q13.1,3549169,human,28.08,14-3-3,yLkMkGDyFRyLsEV,...,20.633,15.938,5.069,36.902,0.015,4.060000e-06,0.7440,10.015,7.914,-3.370
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37133,Q15942,Y316,Zyxin,ZYX,7q32,454274,human,61.28,,PQPPsFtyAQQREKP,...,26.677,19.714,6.252,286.933,0.068,8.590000e-07,1.3690,-0.815,20.476,4.406
37134,O43149,Y702,ZZEF1,ZZEF1,17p13.2,479230,human,331.08,,QGLTISGyLRPARAE,...,8.153,16.037,11.752,263.107,0.028,3.720000e-06,0.5320,26.138,5.995,3.604
37135,O43149,Y976,ZZEF1,ZZEF1,17p13.2,484695,human,331.08,,GSLLSWCyLQLKSTD,...,21.171,17.088,2.987,202.841,0.345,5.200000e-06,0.7050,6.665,6.764,4.592
37136,O43149,Y2494,ZZEF1,ZZEF1,17p13.2,22778456,human,331.08,,LQMKkTDyFFLEVQk,...,24.007,14.854,6.798,47.557,-0.167,4.640000e-06,-0.9440,1.222,3.208,0.804


## Read the mapping table that maps Ensembl IDs to UniProt IDs

The table was downloaded from the UniProt ID-mapping site.

The source Ensembl IDs are extracted from the idmapping_2025_06_12.tsv file

In [113]:
# Tab-separated ID-mapping export: the "From" column holds the Ensembl protein ID,
# "Entry" the matching UniProt accession.
mapping_path = Path("/home/FCAM/juli/HRP/idmapping_2025_06_12.tsv")
mapping = pd.read_csv(mapping_path, sep="\t")
mapping

Unnamed: 0,From,Entry,Reviewed,Entry Name,Protein names,Gene Names,Organism,Length
0,ENSP00000297591,Q69YN4,reviewed,VIR_HUMAN,Protein virilizer homolog,VIRMA KIAA1429 MSTP054,Homo sapiens (Human),1812
1,ENSP00000263265,Q9H4M7,reviewed,PKHA4_HUMAN,Pleckstrin homology domain-containing family A...,PLEKHA4 PEPP1,Homo sapiens (Human),779
2,ENSP00000257787,Q53H80,reviewed,AKIR2_HUMAN,Akirin-2,AKIRIN2 C6orf166,Homo sapiens (Human),203
3,ENSP00000272317,P62979,reviewed,RS27A_HUMAN,Ubiquitin-ribosomal protein eS31 fusion protei...,RPS27A UBA80 UBCEP1,Homo sapiens (Human),156
4,ENSP00000244537,P62805,reviewed,H4_HUMAN,Histone H4,H4C1 H4/A H4FA HIST1H4A; H4C2 H4/I H4FI HIST1H...,Homo sapiens (Human),103
...,...,...,...,...,...,...,...,...
18041,ENSP00000398106,Q49AG3,reviewed,ZBED5_HUMAN,Zinc finger BED domain-containing protein 5 (T...,ZBED5 Buster1,Homo sapiens (Human),693
18042,ENSP00000359340,Q8IVV8,reviewed,NKAI4_HUMAN,Sodium/potassium-transporting ATPase subunit b...,NKAIN4 C20orf58 FAM77A,Homo sapiens (Human),208
18043,ENSP00000419545,G5E9W9,unreviewed,G5E9W9_HUMAN,"GTPase, IMAP family member 4",GIMAP4 hCG_1643312,Homo sapiens (Human),343
18044,ENSP00000340979,Q8N4F7,reviewed,RN175_HUMAN,RING finger protein 175,RNF175,Homo sapiens (Human),328


## Join the PhosSite table with the node2vec table

The two tables use different IDs, so we rely on the mapping table to match them:

PhosSite table has UniProt IDs
nodevec table has Ensembl IDs

- Add an Ensembl_ID column to the PhosSite table (join with the mapping table)
- Add the embedding values to the PhosSite table (join with nodevec on Ensembl_ID)
  

In [114]:
# View of the site table keyed by UniProt accession (ACC_ID becomes the index;
# phos_df itself is left unchanged).
src = phos_df.set_index(keys="ACC_ID")
src

Unnamed: 0_level_0,MOD_RSD,PROTEIN,GENE,HU_CHR_LOC,SITE_GRP_ID,ORGANISM,MW_kD,DOMAIN,SITE_+/-7_AA,LT_LIT,...,RET,SRC,SYK,TEC,TEK,TNK2,TXK,TYK2,YES1,ZAP70
ACC_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
P31946,Y21,14-3-3 beta,YWHAB,20q13.1,3426383,human,28.08,14-3-3,LAEQAERyDDMAAAM,,...,22.696,20.132,16.069,283.309,0.076,-3.510000e-07,0.3820,8.706,14.834,3.295
P31946,Y50,14-3-3 beta,YWHAB,20q13.1,12432961,human,28.08,14-3-3,RNLLsVAykNVVGAR,,...,43.900,20.266,10.192,122.378,0.828,1.730000e-06,0.3780,20.286,13.156,2.404
P31946,Y106,14-3-3 beta,YWHAB,20q13.1,9407297,human,28.08,14-3-3,VLELLDkyLIPNATQ,,...,17.360,16.560,12.311,178.621,0.308,-2.240000e-06,-0.8760,-3.810,7.388,1.274
P31946,Y120,14-3-3 beta,YWHAB,20q13.1,3665374,human,28.08,14-3-3,QPEskVFyLkMkGDy,,...,4.501,15.072,6.981,143.365,1.110,2.040000e-06,0.9490,-5.100,-1.100,1.431
P31946,Y127,14-3-3 beta,YWHAB,20q13.1,3549169,human,28.08,14-3-3,yLkMkGDyFRyLsEV,,...,20.633,15.938,5.069,36.902,0.015,4.060000e-06,0.7440,10.015,7.914,-3.370
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Q15942,Y316,Zyxin,ZYX,7q32,454274,human,61.28,,PQPPsFtyAQQREKP,,...,26.677,19.714,6.252,286.933,0.068,8.590000e-07,1.3690,-0.815,20.476,4.406
O43149,Y702,ZZEF1,ZZEF1,17p13.2,479230,human,331.08,,QGLTISGyLRPARAE,,...,8.153,16.037,11.752,263.107,0.028,3.720000e-06,0.5320,26.138,5.995,3.604
O43149,Y976,ZZEF1,ZZEF1,17p13.2,484695,human,331.08,,GSLLSWCyLQLKSTD,,...,21.171,17.088,2.987,202.841,0.345,5.200000e-06,0.7050,6.665,6.764,4.592
O43149,Y2494,ZZEF1,ZZEF1,17p13.2,22778456,human,331.08,,LQMKkTDyFFLEVQk,,...,24.007,14.854,6.798,47.557,-0.167,4.640000e-06,-0.9440,1.222,3.208,0.804


In [115]:
# View of the mapping table keyed the same way as `src`: rename the UniProt
# accession column "Entry" to "ACC_ID" and use it as the index.
dst = mapping.rename(columns={"Entry": "ACC_ID"}).set_index("ACC_ID")
dst

Unnamed: 0_level_0,From,Reviewed,Entry Name,Protein names,Gene Names,Organism,Length
ACC_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Q69YN4,ENSP00000297591,reviewed,VIR_HUMAN,Protein virilizer homolog,VIRMA KIAA1429 MSTP054,Homo sapiens (Human),1812
Q9H4M7,ENSP00000263265,reviewed,PKHA4_HUMAN,Pleckstrin homology domain-containing family A...,PLEKHA4 PEPP1,Homo sapiens (Human),779
Q53H80,ENSP00000257787,reviewed,AKIR2_HUMAN,Akirin-2,AKIRIN2 C6orf166,Homo sapiens (Human),203
P62979,ENSP00000272317,reviewed,RS27A_HUMAN,Ubiquitin-ribosomal protein eS31 fusion protei...,RPS27A UBA80 UBCEP1,Homo sapiens (Human),156
P62805,ENSP00000244537,reviewed,H4_HUMAN,Histone H4,H4C1 H4/A H4FA HIST1H4A; H4C2 H4/I H4FI HIST1H...,Homo sapiens (Human),103
...,...,...,...,...,...,...,...
Q49AG3,ENSP00000398106,reviewed,ZBED5_HUMAN,Zinc finger BED domain-containing protein 5 (T...,ZBED5 Buster1,Homo sapiens (Human),693
Q8IVV8,ENSP00000359340,reviewed,NKAI4_HUMAN,Sodium/potassium-transporting ATPase subunit b...,NKAIN4 C20orf58 FAM77A,Homo sapiens (Human),208
G5E9W9,ENSP00000419545,unreviewed,G5E9W9_HUMAN,"GTPase, IMAP family member 4",GIMAP4 hCG_1643312,Homo sapiens (Human),343
Q8N4F7,ENSP00000340979,reviewed,RN175_HUMAN,RING finger protein 175,RNF175,Homo sapiens (Human),328


In [116]:
# Spot check: Ensembl protein ID(s) listed for UniProt entry Q69YN4; show the first as a string.
hits = mapping.loc[mapping["Entry"] == "Q69YN4", "From"]
hits.to_numpy().astype(str)[0]

'ENSP00000297591'

## Create a LUT dict. Key: PhosGrpID, Value: Nodevec embeddings

In [117]:
from tqdm import tqdm

# UniProt accession → list of Ensembl protein IDs (as strings), built once so the
# mapping table is not re-scanned for every phospho-site row.
# Rows with a missing accession or a missing Ensembl ID can never yield an
# embedding, so they are skipped here.
ensp_by_uniprot = {}
for uniprot_id, ensp_id in zip(mapping["Entry"], mapping["From"]):
    if pd.isna(uniprot_id) or pd.isna(ensp_id):
        continue
    ensp_by_uniprot.setdefault(uniprot_id, []).append(str(ensp_id))

# SITE_GRP_ID → mean node2vec vector over every embedding reachable from the
# group's rows (row ACC_ID → mapped ENSP IDs → lut). A group whose rows map to
# no embedding at all is left out of the dict.
nodevec_dict = {}
for grp_id, acc_ids in tqdm(phos_df.groupby("SITE_GRP_ID")["ACC_ID"]):
    embeddings = []
    for uniprot_id in acc_ids:                         # one entry per table row, duplicates included
        for ensp_id in ensp_by_uniprot.get(uniprot_id, ()):
            vector = lut.get("9606." + ensp_id)        # lut keys carry the "9606." prefix
            if vector is not None:                     # protein absent from the node2vec output
                embeddings.append(vector)

    if embeddings:
        nodevec_dict[grp_id] = np.mean(embeddings, axis=0)

print(f"length: {len(nodevec_dict)}")
print(nodevec_dict[3426383])

100%|██████████| 36503/36503 [00:39<00:00, 920.68it/s]

length: 32734
[ 0.20560712 -0.40410975  0.44077367  0.23842801  0.44620377  0.17134702
 -0.06048868  0.19801281  0.11811811  0.28440544 -0.08867627  0.06471123
 -0.08423166  0.20233163 -0.06681283 -0.1444069  -0.23018396 -0.29364705
 -0.02031346 -0.03437082  0.30774942  0.07809994 -0.23814562 -0.3645629
  0.18758477  0.41970226 -0.28729612  0.11624064  0.11314426  0.01782569
  0.06223418  0.06382155  0.37176144 -0.17948903  0.31896538  0.29221445
  0.54248244 -0.1097855   0.2126385   0.00502301  0.15146354  0.13151553
  0.03156454 -0.04236247  0.24652256 -0.06415425  0.05768368  0.2080914
 -0.02076083 -0.29550532  0.21443094 -0.22377634  0.00111076  0.04714504
 -0.13867499  0.20432389  0.1941468   0.17197448 -0.01914304  0.17133449
 -0.18044461  0.41257283 -0.02704768  0.12608223  0.08819468  0.26710963
  0.10048352  0.13067739 -0.08491678 -0.08913625  0.10556964 -0.15011492
 -0.03975274  0.04086697 -0.2837278  -0.05147271  0.27524266 -0.15470977
 -0.43387815 -0.10882109 -0.19292666 -0




## Create new dictionary mapping PhosGrpID to full embedding

The full embedding is the concatenation of two parts:

- the node2vec embedding, looked up by protein ID
- the kinase scores based on Kinpred

In [118]:
# Kinase score columns read from the site table, in the order they appear in
# the vector returned by get_embedding().
KINASE_SCORE_COLUMNS = [
    'ABL1', 'ABL2', 'ALK', 'AXL', 'BAZ1B', 'BLK', 'BMX', 'BTK', 'CSF1R',
    'CSK', 'DDR2', 'EGFR', 'EPHA2', 'EPHA3', 'EPHA4', 'EPHA8', 'EPHB1',
    'EPHB2', 'ERBB2', 'ERBB4', 'FER', 'FES', 'FGFR1', 'FGFR2', 'FGFR3',
    'FGFR4', 'FGR', 'FLT1', 'FLT3', 'FLT4', 'FRK', 'FYN', 'HCK', 'IGF1R',
    'INSR', 'ITK', 'JAK1', 'JAK2', 'JAK3', 'KDR', 'KIT', 'LCK', 'LYN',
    'MATK', 'MERTK', 'MET', 'MST1R', 'MUSK', 'NTRK1', 'NTRK2', 'PDGFRA',
    'PDGFRB', 'PTK2', 'PTK2B', 'PTK6', 'RET', 'SRC', 'SYK', 'TEC', 'TEK',
    'TNK2', 'TXK', 'TYK2', 'YES1', 'ZAP70',
]


def get_embedding(phospho_group_id, *, df=None):
    """Return the mean kinase-score vector of one phospho-site group.

    Parameters
    ----------
    phospho_group_id
        Value matched against the ``SITE_GRP_ID`` column.
    df : pandas.DataFrame, optional
        Table with a ``SITE_GRP_ID`` column and every column listed in
        ``KINASE_SCORE_COLUMNS``. When omitted, the notebook-level ``phos_df``
        is used; it is looked up at call time, so re-loading ``phos_df``
        takes effect without re-running this cell.

    Returns
    -------
    numpy.ndarray
        1-D float array with one entry per kinase column: the column-wise mean
        over all rows of the group. A NaN score in any of those rows makes the
        corresponding entry NaN.

    Raises
    ------
    ValueError
        If no row of ``df`` has the requested group id.
    """
    if df is None:
        df = phos_df

    rows = df.loc[df["SITE_GRP_ID"] == phospho_group_id]

    if len(rows) == 0:
        raise ValueError("Invalid group id")

    features = rows[KINASE_SCORE_COLUMNS].to_numpy().astype(float)

    return features.mean(axis=0)

In [119]:
# str(SITE_GRP_ID) → [kinase scores..., node2vec values...] as a plain list.
# Groups whose kinase-score vector contains any NaN are left out.
grp_id_lut = {}

for group_id, node_vec in tqdm(nodevec_dict.items()):
    kinase_vec = get_embedding(group_id)

    if np.isnan(kinase_vec).any():
        continue

    combined = np.r_[kinase_vec, node_vec]             # kinase scores first, then the node2vec part
    grp_id_lut[str(group_id)] = combined.tolist()

print(len(grp_id_lut))
print(grp_id_lut["3426383"])

100%|██████████| 32734/32734 [00:10<00:00, 3049.63it/s]

31624
[3.259, 12.932, -0.521, 0.000376, -2.86, 1.466, 1.169, 0.897, 8.75e-05, 16.136, 0.258, 10.125, 0.6559999999999999, -0.0608, 1.111, -0.6, 0.6509999999999999, 0.039, 2.18, 1.54e-06, 0.009, -2.21, 8.205, -0.285, 2.48e-07, 0.7190000000000001, 2.855, 1.6130000000000002, -0.102, 12.753, 1.386, 4.137, 7.0120000000000005, 1.183, 16.302, 1.489, 9.124, 7.282999999999999, 8.12, -15.4, -0.214, 3.553, 0.08, 1.679, 0.573, 35.825, 0.000731, -0.0482, 3.92e-07, -1.99e-06, 5.36, 39.411, 5.16e-05, 283.526, -9.02, 22.696, 20.132, 16.069000000000006, 283.309, 0.076, -3.51e-07, 0.382, 8.706, 14.834, 3.295, 0.20560712, -0.40410975, 0.44077367, 0.23842801, 0.44620377, 0.17134702, -0.060488682, 0.19801281, 0.118118115, 0.28440544, -0.088676274, 0.06471123, -0.08423166, 0.20233163, -0.06681283, -0.1444069, -0.23018396, -0.29364705, -0.020313457, -0.03437082, 0.30774942, 0.078099936, -0.23814562, -0.3645629, 0.18758477, 0.41970226, -0.28729612, 0.11624064, 0.113144256, 0.017825693, 0.062234182, 0.06382155,




In [120]:
# Write the group-id → embedding table to a JSON file in the working directory.

import json

out_path = Path("grp_id_to_embedding.json")
with out_path.open("w") as fh:
    json.dump(grp_id_lut, fh)