In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/protein-final-embeddings/protein_embeddings_combined.pkl
/kaggle/input/protein-sequences/protein_sequences.pkl
/kaggle/input/ppi-dataset/9606.protein.physical.links.v12.0.txt


In [2]:
# %pip (not !pip) installs into the environment of the running kernel; -q keeps the log short.
%pip install -q transformers accelerate


Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch>=2.0.0->accelerate)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cusparse-cu12==12.3.1.170 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl.metadata (

In [3]:
import pandas as pd

In [8]:
# Read the space-delimited protein link table; columns seen downstream are
# protein1, protein2 and combined_score.
df = pd.read_csv(
    "/kaggle/input/ppi-dataset/9606.protein.physical.links.v12.0.txt",
    sep=" ",
)
df

Unnamed: 0,protein1,protein2,combined_score
0,9606.ENSP00000000233,9606.ENSP00000257770,311
1,9606.ENSP00000000233,9606.ENSP00000226004,161
2,9606.ENSP00000000233,9606.ENSP00000434442,499
3,9606.ENSP00000000233,9606.ENSP00000262455,531
4,9606.ENSP00000000233,9606.ENSP00000303145,499
...,...,...,...
1477605,9606.ENSP00000501317,9606.ENSP00000444357,292
1477606,9606.ENSP00000501317,9606.ENSP00000296785,962
1477607,9606.ENSP00000501317,9606.ENSP00000361930,195
1477608,9606.ENSP00000501317,9606.ENSP00000370745,188


In [5]:
# Quick sanity check: (rows, columns) of the loaded interaction table.
df.shape

(1477610, 3)

In [7]:
# Extract unique protein IDs
def extract_protein_ids(df):
    """Return the distinct IDs found in the ``protein1``/``protein2`` columns.

    The two columns are flattened row by row (protein1, protein2, protein1, ...)
    and de-duplicated with ``pd.unique``, so the result keeps first-seen order.

    Args:
        df: DataFrame with ``protein1`` and ``protein2`` columns.

    Returns:
        1-D array of unique protein identifiers.
    """
    pair_columns = df[['protein1', 'protein2']]
    flattened = pair_columns.values.ravel()
    return pd.unique(flattened)

# Collect every distinct protein ID that appears in either column of the pair table.
protein_ids = extract_protein_ids(df)

In [8]:
# Inspect the array of unique protein IDs.
protein_ids

array(['9606.ENSP00000000233', '9606.ENSP00000257770',
       '9606.ENSP00000226004', ..., '9606.ENSP00000493768',
       '9606.ENSP00000493325', '9606.ENSP00000492978'], dtype=object)

In [9]:
# Derive bare Ensembl protein IDs by deleting the literal "9606." text from each
# identifier (in these IDs it appears as the leading prefix).
for raw_col, clean_col in (("protein1", "prot1_clean"), ("protein2", "prot2_clean")):
    df[clean_col] = df[raw_col].str.replace("9606.", "", regex=False)

df[['protein1', 'protein2', 'prot1_clean', 'prot2_clean', 'combined_score']]

Unnamed: 0,protein1,protein2,prot1_clean,prot2_clean,combined_score
0,9606.ENSP00000000233,9606.ENSP00000257770,ENSP00000000233,ENSP00000257770,311
1,9606.ENSP00000000233,9606.ENSP00000226004,ENSP00000000233,ENSP00000226004,161
2,9606.ENSP00000000233,9606.ENSP00000434442,ENSP00000000233,ENSP00000434442,499
3,9606.ENSP00000000233,9606.ENSP00000262455,ENSP00000000233,ENSP00000262455,531
4,9606.ENSP00000000233,9606.ENSP00000303145,ENSP00000000233,ENSP00000303145,499
...,...,...,...,...,...
1477605,9606.ENSP00000501317,9606.ENSP00000444357,ENSP00000501317,ENSP00000444357,292
1477606,9606.ENSP00000501317,9606.ENSP00000296785,ENSP00000501317,ENSP00000296785,962
1477607,9606.ENSP00000501317,9606.ENSP00000361930,ENSP00000501317,ENSP00000361930,195
1477608,9606.ENSP00000501317,9606.ENSP00000370745,ENSP00000501317,ENSP00000370745,188


In [10]:
import requests
import time

# Every cleaned ID that occurs on either side of an interaction pair.
unique_proteins = set(df['prot1_clean']) | set(df['prot2_clean'])

In [None]:
protein_seq_dict = {}
ensembl_url = "https://rest.ensembl.org/sequence/id/{}?type=protein"
headers = {"Content-Type": "text/plain"}

# Fetch sequences.
# A single Session reuses the HTTPS connection instead of opening one per request.
with requests.Session() as session:
    for i, protein_id in enumerate(unique_proteins):
        sequence = ""  # placeholder stored when the lookup fails
        try:
            response = session.get(ensembl_url.format(protein_id), headers=headers, timeout=10)

            if response.status_code == 429:
                # Rate limited: wait as long as the server asks, then retry once.
                try:
                    delay = float(response.headers.get("Retry-After", 1))
                except ValueError:
                    delay = 1.0
                time.sleep(delay)
                response = session.get(ensembl_url.format(protein_id), headers=headers, timeout=10)

            if response.ok:
                sequence = response.text.strip()
            else:
                # Log the status so empty entries can be told apart from real failures later.
                print(f"HTTP {response.status_code} for {protein_id}")
        except Exception as e:
            # Best effort: one bad lookup must not abort the whole multi-minute loop.
            print(f"Error for {protein_id}: {e}")

        protein_seq_dict[protein_id] = sequence

        if i % 500 == 0:
            print(f"Fetched {i}/{len(unique_proteins)} sequences")

        # Fixed pause between requests to stay under the service's rate limit.
        time.sleep(0.1)

In [11]:
import pickle

# Restore the previously fetched {protein_id: sequence} mapping from disk.
# NOTE(review): pickle.load can execute arbitrary code; only load files you trust.
with open("/kaggle/input/protein-sequences/protein_sequences.pkl", mode="rb") as seq_file:
    protein_seq_dict = pickle.load(seq_file)

print("✅ Loaded", len(protein_seq_dict), "protein sequences")

✅ Loaded 18767 protein sequences


In [12]:
# Preview at most five entries: running number, protein ID and the first 50 residues.
for rank, (protein_id, sequence) in zip(range(1, 6), protein_seq_dict.items()):
    print(f"{rank}. {protein_id} → {sequence[:50]}...")

1. ENSP00000419970 → MGDSHVDTSSTVSEAVAEEVSLFSMTDMILFSLIVGLLTYWFLFRKKKEE...
2. ENSP00000216373 → MQQAPQPYEFFSEENSPKWRGLLVSALRKVQEQVHPTLSANEESLYYIEE...
3. ENSP00000482523 → MDARRMKKEEGLTENTGLPRKLLEKHDPWPAYVTYTSQTVKRLIEKSKTR...
4. ENSP00000295709 → MEKYHVLEMIGEGSFGRVYKGRRKYSAQVVALKFIPKLGRSEKELRNLQR...
5. ENSP00000329982 → MDGENHSVVSEFLFLGLTHSWEIQLLLLVFSSVLYVASITGNILIVFSVT...


## Extracting embeddings from protein sequences

In [13]:
import torch
from transformers import T5Tokenizer, T5EncoderModel
import numpy as np

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ProtT5 tokenizer; do_lower_case=False keeps the residue letters as given.
tokenizer = T5Tokenizer.from_pretrained("Rostlab/prot_t5_xl_uniref50", do_lower_case=False)

# Encoder-only ProtT5, switched to inference mode and moved onto the chosen device.
model = T5EncoderModel.from_pretrained("Rostlab/prot_t5_xl_uniref50")
model = model.eval().to(device)

2025-07-26 04:25:40.696236: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753503940.929623      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753503941.001881      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


tokenizer_config.json:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/238k [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/546 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


pytorch_model.bin:   0%|          | 0.00/11.3G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/11.3G [00:00<?, ?B/s]

In [None]:
# def get_embedding(sequence):
#     if not sequence or sequence.strip() == "":
#         return np.zeros(1024)
#     sequence = ' '.join(list(sequence.strip())) 
#     tokens = tokenizer(sequence, return_tensors='pt', padding=True, truncation=True)
#     input_ids = tokens['input_ids'].to(device)
#     attention_mask = tokens['attention_mask'].to(device)
#     with torch.no_grad():
#         output = model(input_ids=input_ids, attention_mask=attention_mask)
#     embedding = output.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()
#     return embedding

In [None]:
# import pickle

# embedding_dict = {}

# for i, (protein_id, seq) in enumerate(protein_seq_dict.items()):
#     try:
#         embedding_dict[protein_id] = get_embedding(seq)
#     except Exception as e:
#         print(f"Error processing {protein_id}: {e}")
#         embedding_dict[protein_id] = np.zeros(1024)

#     if i % 500 == 0:
#         print(f"Processed {i}/{len(protein_seq_dict)} proteins")

In [15]:
def safe_get_embedding(sequence, protein_id, max_lengths=(1024, 512, 256)):
    """Return a mean-pooled ProtT5 embedding for one protein sequence.

    The sequence is tokenized with the module-level ``tokenizer`` and encoded
    with the module-level ``model`` on ``device``. Each value in
    ``max_lengths`` is tried in order as the tokenizer truncation length; if an
    attempt raises ``RuntimeError`` (e.g. CUDA out of memory), the next value
    is tried.

    Args:
        sequence: Amino-acid string, one letter per residue. ``None``, empty or
            whitespace-only input yields a zero vector.
        protein_id: Identifier used only in log messages.
        max_lengths: Truncation lengths (in tokens) to try, in order.

    Returns:
        1-D numpy array. A float32 zero vector of length 1024 is returned when
        the sequence is empty or every attempt failed.
    """
    import re  # local import so the function does not depend on another cell

    # float32 matches the dtype of the embeddings produced by the model as it is
    # loaded in this notebook, so stacking real and placeholder vectors does not
    # silently upcast everything to float64.
    if not sequence or sequence.strip() == "":
        return np.zeros(1024, dtype=np.float32)

    # The ProtT5 model card instructs mapping the rare/ambiguous residues
    # U, Z, O and B to X before tokenization.
    residues = re.sub(r"[UZOB]", "X", sequence.strip())

    # ProtT5 expects residues separated by single spaces.
    sequence = ' '.join(list(residues))

    for max_len in max_lengths:
        try:
            tokens = tokenizer(
                sequence,
                return_tensors='pt',
                padding=True,
                truncation=True,
                max_length=max_len
            )
            input_ids = tokens['input_ids'].to(device)
            attention_mask = tokens['attention_mask'].to(device)

            with torch.no_grad():
                output = model(input_ids=input_ids, attention_mask=attention_mask)

            # Average over the token axis to get one fixed-size vector per protein.
            # NOTE(review): the mean presumably includes the tokenizer's
            # end-of-sequence position; confirm this is intended.
            embedding = output.last_hidden_state.mean(dim=1).squeeze(0).cpu().numpy()
            return embedding

        except RuntimeError as e:
            # Report the actual error so OOM can be told apart from other failures.
            print(f"⚠️ Failed for {protein_id} with max_length={max_len} ({e}), retrying...")

            # Free up memory (for GPU)
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

    print(f"Could not process {protein_id}. Returning zero embedding.")
    return np.zeros(1024, dtype=np.float32)


In [None]:
protein_embeddings = {}
partial_path = "/kaggle/working/protein_embeddings_partial.pkl"

for i, (prot_id, seq) in enumerate(protein_seq_dict.items()):
    emb = safe_get_embedding(seq, prot_id)
    protein_embeddings[prot_id] = emb

    # Checkpoint after every 1000 completed proteins (i is 0-based, hence i + 1).
    if (i + 1) % 1000 == 0:
        print(f"✅ Processed {i + 1}/{len(protein_seq_dict)}")
        # Write to a temp file and rename it, so an interruption mid-write
        # cannot corrupt the last good checkpoint.
        tmp_path = partial_path + ".tmp"
        with open(tmp_path, "wb") as f:
            pickle.dump(protein_embeddings, f)
        os.replace(tmp_path, partial_path)


In [19]:
import pickle
# Save as pickle
# Persist the full {protein_id: embedding} mapping to the notebook's output directory.
with open("/kaggle/working/protein_embeddings.pkl", mode="wb") as out_file:
    pickle.dump(protein_embeddings, out_file)

print("Saved all embeddings to protein_embeddings.pkl")

✅ Saved all embeddings to protein_embeddings.pkl


In [5]:
import pickle

# Load the precomputed {protein_id: embedding} mapping from the attached dataset.
# NOTE(review): pickle.load can execute arbitrary code; only load files you trust.
with open("/kaggle/input/protein-final-embeddings/protein_embeddings_combined.pkl", mode="rb") as emb_file:
    embedding_dict = pickle.load(emb_file)

print("Loaded embeddings for", len(embedding_dict), "proteins")

Loaded embeddings for 18767 proteins


In [10]:
# Display the current state of the interaction table.
df

Unnamed: 0,protein1,protein2,combined_score,prot1_clean,prot2_clean
0,9606.ENSP00000000233,9606.ENSP00000257770,311,ENSP00000000233,ENSP00000257770
1,9606.ENSP00000000233,9606.ENSP00000226004,161,ENSP00000000233,ENSP00000226004
2,9606.ENSP00000000233,9606.ENSP00000434442,499,ENSP00000000233,ENSP00000434442
3,9606.ENSP00000000233,9606.ENSP00000262455,531,ENSP00000000233,ENSP00000262455
4,9606.ENSP00000000233,9606.ENSP00000303145,499,ENSP00000000233,ENSP00000303145
...,...,...,...,...,...
1477605,9606.ENSP00000501317,9606.ENSP00000444357,292,ENSP00000501317,ENSP00000444357
1477606,9606.ENSP00000501317,9606.ENSP00000296785,962,ENSP00000501317,ENSP00000296785
1477607,9606.ENSP00000501317,9606.ENSP00000361930,195,ENSP00000501317,ENSP00000361930
1477608,9606.ENSP00000501317,9606.ENSP00000370745,188,ENSP00000501317,ENSP00000370745


In [11]:
# Mapping proteins to embeddings
import numpy as np

# Attach an embedding to each side of every pair by looking up the cleaned ID.
for id_col, emb_col in (("prot1_clean", "prot1_emb"), ("prot2_clean", "prot2_emb")):
    df[emb_col] = df[id_col].map(embedding_dict)

# Keep only pairs where both proteins were found in embedding_dict.
# Only missing lookups are dropped; all-zero placeholder vectors are kept.
has_both = df['prot1_emb'].notna() & df['prot2_emb'].notna()
df = df.loc[has_both]


In [12]:
# Display the table after attaching embeddings and dropping unmapped rows.
df

Unnamed: 0,protein1,protein2,combined_score,prot1_clean,prot2_clean,prot1_emb,prot2_emb
0,9606.ENSP00000000233,9606.ENSP00000257770,311,ENSP00000000233,ENSP00000257770,"[0.07326313, 0.12325806, -0.006426698, -0.0015...","[0.029869774, 0.10243675, 0.0372968, 0.0233545..."
1,9606.ENSP00000000233,9606.ENSP00000226004,161,ENSP00000000233,ENSP00000226004,"[0.07326313, 0.12325806, -0.006426698, -0.0015...","[0.036910426, 0.05981237, 0.002925644, -0.0113..."
2,9606.ENSP00000000233,9606.ENSP00000434442,499,ENSP00000000233,ENSP00000434442,"[0.07326313, 0.12325806, -0.006426698, -0.0015...","[-0.013162708, 0.030456692, 0.026405046, 0.022..."
3,9606.ENSP00000000233,9606.ENSP00000262455,531,ENSP00000000233,ENSP00000262455,"[0.07326313, 0.12325806, -0.006426698, -0.0015...","[0.014434807, 0.05334871, 0.034285888, 0.01530..."
4,9606.ENSP00000000233,9606.ENSP00000303145,499,ENSP00000000233,ENSP00000303145,"[0.07326313, 0.12325806, -0.006426698, -0.0015...","[0.029557526, 0.047400177, 0.021726472, -0.030..."
...,...,...,...,...,...,...,...
1477605,9606.ENSP00000501317,9606.ENSP00000444357,292,ENSP00000501317,ENSP00000444357,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.014785956, 0.056924693, 0.06229085, -0.0340..."
1477606,9606.ENSP00000501317,9606.ENSP00000296785,962,ENSP00000501317,ENSP00000296785,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.035299234, -0.027414408, -0.027663784, 0.05..."
1477607,9606.ENSP00000501317,9606.ENSP00000361930,195,ENSP00000501317,ENSP00000361930,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.05269495, 0.05626375, -0.016120307, -0.0209..."
1477608,9606.ENSP00000501317,9606.ENSP00000370745,188,ENSP00000501317,ENSP00000370745,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.030423379, 0.07824146, 0.027084596, 0.01682..."


In [14]:
# Positional lookup: df was filtered without resetting its index, so the
# label 0 is not guaranteed to exist; .iloc[0] always returns the first row.
len(df['prot1_emb'].iloc[0])

1024