In [None]:
# We're using the UniRep paper by Alley et al 2019, that has trained an encoding
# space for proteins (taking variable-length sequences into a vector of 64 
# encodings). To make this work, we need a specific older version of TensorFlow.
!pip install tensorflow==1.15.5

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorflow==1.15.5
  Downloading tensorflow-1.15.5-cp37-cp37m-manylinux2010_x86_64.whl (110.5 MB)
[K     |████████████████████████████████| 110.5 MB 1.2 kB/s 
Collecting tensorboard<1.16.0,>=1.15.0
  Downloading tensorboard-1.15.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 32.2 MB/s 
[?25hCollecting keras-applications>=1.0.8
  Downloading Keras_Applications-1.0.8-py3-none-any.whl (50 kB)
[K     |████████████████████████████████| 50 kB 3.9 MB/s 
Collecting h5py<=2.10.0
  Downloading h5py-2.10.0-cp37-cp37m-manylinux1_x86_64.whl (2.9 MB)
[K     |████████████████████████████████| 2.9 MB 26.6 MB/s 
[?25hCollecting tensorflow-estimator==1.15.1
  Downloading tensorflow_estimator-1.15.1-py2.py3-none-any.whl (503 kB)
[K     |████████████████████████████████| 503 kB 5.3 MB/s 
Collecting gast==0.2.2
  Downloading gast-0.2.2.tar.gz (10 kB)
Collecting 

In [None]:
# We're cloning the github repo that ahs the code we need to run this - note
# that the first time you run it, it will do the cloning; subsequent times will
# give a 'directory already made' error.
!git clone https://github.com/churchlab/UniRep.git

In [None]:
# Go into our cloned github repo with code from Alley et al 2019.
cd UniRep

In [None]:
# Dependencies for the rest of the notebook.
# Note: this must be run in a kernel with TF version 1.15 - that is what the
# install cell above pins.
import copy
import random as python_random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from google.colab import drive, files

# Seed every random number generator in use: NumPy, Python's `random`, and
# TensorFlow.
np.random.seed(768)
python_random.seed(869)
tf.random.set_random_seed(1234)

# Mount Google Drive so the paths under /content/drive are readable.
drive.mount('/content/drive')

# `unirep` is expected to resolve from the cloned UniRep repo, which is the
# working directory after the `cd` cell above.
from unirep import babbler64 as babbler

print(tf.__version__)

Mounted at /content/drive

1.15.5


In [None]:
# Location (on the mounted Drive) of the premade weights from the UniRep
# encoding paper (Alley et al 2019).
weights_path = (
    "/content/drive/MyDrive/DeepLearning_Summer2022/Final_Project/"
    "Data/UniRep_Weights/64_weights/"
)

In [None]:
# Sanity check: build the UniRep model from the pretrained weights to confirm
# that the weights and the model code load together.
batch_size = 12
b = babbler(model_path=weights_path, batch_size=batch_size)



Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API

Instructions for updating:
dim is deprecated, use axis instead
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Please use `layer.__call__` method instead.
Instructions for updating:
The TensorFlow Distributions library has moved to TensorFlow Probability (https://github.com/tensorflow/probability). You should update all references to use `tfp.distributions` instead of `tf.distributions`.
Instructions for updating:
The TensorFlow Distributions library has moved to TensorFlow Probability (https://github.com/tensorflow/probability). You should update all 

In [None]:
# Encode one individual sequence as a smoke test. `get_rep` returns three
# values, which are unpacked into avg_hidden, final_hidden and final_cell.
test_sequence = "MANLGCWMLVLFVATWSDLGLCKKRPKPGGWNTGGSRYPGQGSPGGNRYPPQGGGGWGQPHGGGWGQPHGGGWGQPHGGGWGQPHGGGWGQGGGTHSQWNKPSKPKTNMKHMAGAAAAGAVVGGLGGYMLGSAMSRPIIHFGSDYEDRYYRENMHRYPNQVYYRPMDEYSNQNNFVHDCVNITIKQHTVTTTTKGENFTETDVKMMERVVEQMCITQYERESQAYYQRGSSMVLFSSPPVILLISFLIFLIVG"
(avg_hidden, final_hidden, final_cell) = b.get_rep(test_sequence)






In [None]:
# Print the shape of each of the three representations, in the order
# avg_hidden, final_hidden, final_cell.
for rep in (avg_hidden, final_hidden, final_cell):
  print(rep.shape)

(64,)
(64,)
(64,)


In [None]:
# Display the final_cell array obtained for the test sequence.
final_cell

array([-0.03392499,  0.4642311 ,  0.5833186 , -2.6741865 ,  0.12040318,
       -0.98775893,  1.378801  ,  1.0598173 ,  0.96727955,  0.6003299 ,
        1.251148  ,  1.9921563 ,  0.09310557,  1.6740963 ,  0.15809879,
       -2.1900048 , -1.4988172 ,  0.05234914,  0.60021186, -3.717322  ,
       -0.34242752,  3.0133345 , -2.5500371 , -0.5607802 ,  4.998315  ,
       -0.12100416,  0.13841838,  0.218849  ,  4.139292  , -3.3598323 ,
        1.5299058 ,  0.5911101 ,  4.848772  , -0.4276914 ,  0.868774  ,
        0.19071291, -0.6580446 ,  0.15731928,  0.57896996, -0.5961143 ,
        0.08143958,  0.47155276,  0.15586367,  2.195258  , -0.04211713,
        2.1800363 , -5.5114126 , -9.715301  ,  1.9706156 ,  1.7465513 ,
        0.42815694,  0.32075572, -0.21909057, -0.21982038,  0.31077018,
        0.27156875,  0.06501038,  0.07555409, -0.28645086, 18.441698  ,
       -0.16832477, -0.05181318,  6.2898474 , -0.32685822], dtype=float32)

In [12]:
# Here's the big run. We load the table of all canonical human proteins and
# their sequences, and run each sequence individually through the encoder to
# get one encoding array per protein. This takes a while to run (about 5 hours
# for the 20387 rows in the recorded run).
df = pd.read_csv('/content/drive/MyDrive/DeepLearning_Summer2022/Final_Project/Data/precursor_files/uniprot_canonical_human_proteins.tsv',sep='\t')
N = len(df)

from tqdm import tqdm

# Store output in a dictionary, with the Uniprot identifier ('Entry' column) as
# the key and the final-cell encoding array as the value.
protein_encodings = {}
# Iterate over the two needed columns directly: `df.iloc[i][col]` materialises
# a full row Series on every access (twice per protein), which is wasted work
# over tens of thousands of rows. `total=N` keeps the progress bar's
# percentage/ETA, since `zip` has no length.
for uniprot, sequence in tqdm(zip(df['Entry'], df['Sequence']), total=N):
  _, _, final_cell = b.get_rep(sequence)
  protein_encodings[uniprot] = final_cell
  # NOTE(review): `b` was built before this loop, and resetting the default
  # graph does not rebuild it. Kept as in the original run - confirm that
  # `get_rep` does not depend on the default graph, and whether this reset is
  # actually needed.
  tf.reset_default_graph()

100%|██████████| 20387/20387 [5:01:02<00:00,  1.13it/s]


In [13]:
# Persist the {Uniprot identifier: encoding} dictionary to Drive as a pickle
# file so that it can be loaded later without repeating the long run.
import pickle

encodings_file = '/content/drive/MyDrive/DeepLearning_Summer2022/Final_Project/Data/protein_full_length_encodings.pickle'
with open(encodings_file, mode='wb') as pickle_handle:
  pickle.dump(
      protein_encodings,
      pickle_handle,
      protocol=pickle.HIGHEST_PROTOCOL,
  )