This notebook predicts protein-protein interface (PPI) hotspots from an ESM-2-based sequence representation. To explore an approach based on both sequence and structure features, please use the following web server: https://ppihotspotid.limlab.dnsalias.org/

In [None]:
#@title Install dependencies
%%time
# Provides the `esm` package imported by the prediction cell.
!pip install -q fair-esm
# Upgrade pip itself before installing AutoGluon.
!python -m pip install --upgrade pip
!python -m pip install -q autogluon
# gdown is the Google Drive downloader used just below.
!pip install -q gdown

# Download the archive from Google Drive, then unpack it quietly (-q),
# overwriting any files left from a previous run (-o).
# NOTE(review): presumably the download is saved as data.zip and contains the
# AutogluonModels/ directory loaded by the prediction cell - confirm.
!gdown https://drive.google.com/uc?id=1IeKyYtby7I8KrGEirfMhbnktC_G1ZoUi
!unzip -q -o data.zip

In [2]:
#@title Input protein sequence, then hit `Runtime` -> `Run all` (in case sequence is too long and memory error appears, paste just the region of interest)
# Amino-acid sequence as a plain string; the "Run Prediction" cell reads this variable.
sequence = 'VQNKTHENICARYHWPSMCCCDQMRNDHDLFQSETGGRYSSVQTVHEWHHGNKWIYEARNDMKLTNLRAKCRSDKMSLQIFVQEKMQDRRVPYNSRTKCQLNIRGKEFYQWEHSISLSKLHCDWIKSSLDTIMPTNIHLNQMKFRKQWLCQPDFWWCIGPEGPQCFIFMDTWAPALQMFRAHNKLSAQQALQCYGDMFHFLHIDAGYYYKAHCGCAHVMMWTQSPYYWAKWQWPNKWHLRMWYEHCNEQWTAKDDFHMFLEHNKCHIYHVHAEMCDHFQGIWLLFCSLKKCGVKMLIERC'#@param {type:"string"}

In [None]:
#@title Run Prediction
import torch
import esm
from autogluon.tabular import TabularDataset, TabularPredictor
import pandas as pd
import random
import string
import numpy as np

# Load the pretrained ESM-2 checkpoint named by this loader (t33 / 650M / UR50D)
# together with the alphabet used to tokenise sequences for it.
model, alphabet = esm.pretrained.esm2_t33_650M_UR50D()

def prep_seq(seq):
    """Build one model-input window per residue of ``seq``.

    For every residue index ``i`` a ``(label, window)`` pair is produced, where
    ``label`` is ``str(i)`` and ``window`` is a slice of ``seq``:

    * ``i < 50``            -> the first 101 letters, ``seq[:101]``
    * ``i + 50 > len(seq)`` -> the last 101 letters, ``seq[-101:]``
    * otherwise             -> the 100 letters ``seq[i-50:i+50]``

    Alongside, ``pos[i]`` records which position of the per-window model output
    the caller should read for residue ``i``.

    Args:
        seq: The sequence as a string.

    Returns:
        A tuple ``(datum, pos)``: ``datum`` is the list of ``(label, window)``
        pairs in residue order, ``pos`` is a dict mapping residue index to the
        position to read. An empty ``seq`` yields ``([], {})``.

    NOTE(review): the offset between a residue's index inside its window and
    the returned position differs per branch (head: window index ``i`` ->
    ``i``; middle: window index 50 -> 49; tail: window index ``i - start`` ->
    ``i - start - 2``). If the tokenizer also prepends a start token, each is
    shifted by one more. This is kept as-is because the downstream predictor
    was presumably trained on features extracted this way - confirm against
    the training pipeline before changing it.
    """
    n_res = len(seq)
    # ``seq[-101:]`` starts at ``n_res - 101`` only when the sequence has at
    # least 101 letters; for shorter input the slice is the whole sequence and
    # starts at 0. Without this clamp the tail position overran the window for
    # short sequences (e.g. length 60, i=50 gave 89) and the caller's indexing
    # raised IndexError. For ``n_res >= 101`` this equals ``i + 99 - n_res``.
    tail_start = max(n_res - 101, 0)

    datum = []
    pos = {}
    for i in range(n_res):
        if i < 50:
            datum.append((str(i), seq[:101]))
            pos[i] = i
        elif i + 50 > n_res:
            datum.append((str(i), seq[-101:]))
            pos[i] = i - tail_start - 2
        else:
            datum.append((str(i), seq[i-50:i+50]))
            pos[i] = 49
    return (datum, pos)

# One (label, window) pair per residue, plus the position of the model output
# to read for each window. prep_seq is called once and unpacked.
data, pos = prep_seq(sequence)
if not data:
    raise ValueError("sequence is empty: nothing to predict")

batch_converter = alphabet.get_batch_converter()
model.eval()  # evaluation mode for inference

#data = [('some_id;', "100_letter_aminoacid_sequence")]
batch_labels, batch_strs, batch_tokens = batch_converter(data)
batch_lens = (batch_tokens != alphabet.padding_idx).sum(1)
with torch.no_grad():
    # Only the layer-33 representations are read below, so contact prediction
    # is not requested; this avoids keeping attention maps for every window.
    results = model(batch_tokens, repr_layers=[33], return_contacts=False)

# Gather, for window k, the representation vector at position pos[k].
layer_reps = results['representations'][33]
window_idx = torch.arange(len(data), device=layer_reps.device)
token_idx = torch.tensor([pos[k] for k in range(len(data))],
                         dtype=torch.long, device=layer_reps.device)
features = layer_reps[window_idx, token_idx].cpu().numpy()

# The features still go through results.csv so the predictor sees the same
# layout as before: an unnamed index column followed by columns "0", "1", ...
# NOTE(review): presumably the predictor was trained on this layout - confirm
# before feeding it an in-memory frame instead.
df = pd.DataFrame(features)
df.to_csv('results.csv')

predictor = TabularPredictor.load("AutogluonModels/ag-20240209_040758")

predict_set = TabularDataset('results.csv')
predictions = predictor.predict(predict_set)

# One character per residue: the first character of each predicted label.
seq_res = ''.join(label[0] for label in predictions)

print ("")
print ("the prediction is:")
print (seq_res)