<a href="https://colab.research.google.com/github/igemto-drylab/igemto-drylab/blob/master/Track-2/UniRep.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Show which GPU is attached to this runtime before doing any heavy work.
!nvidia-smi #make sure to have a Tesla P100 to use UniRep properly

In [0]:
%%capture
# Install the AWS command-line client; a later cell uses `aws s3 sync` to
# download the UniRep weight files. `%%capture` (which has to stay on the
# first line of the cell) hides the installer output.
!pip install awscli

In [0]:
#getting the github repo
push_acc = input("push access? (y/n):")

if push_acc == "y":

    from getpass import getpass

    password = getpass("type password:")
    !git clone https://epicrunze:{password}@github.com/epicrunze/UniRep.git 

else:

    !git clone https://github.com/epicrunze/UniRep.git 
%cd UniRep
!git pull


In [0]:
# Select a working version of TensorFlow: the cells below use the TF 1.x API
# (e.g. tf.set_random_seed) and expect version 1.15.
# NOTE(review): `%tensorflow_version` looks like a Colab-specific magic --
# confirm it is available in the runtime you use.
%tensorflow_version 1.x

# Setup


In [0]:
# Model size switch: if True use the 1900-dimensional model, else use the
# 64-dimensional one. The weight-download cell reads this flag to choose both
# the weight files and the babbler class.
USE_FULL_1900_DIM_MODEL = True

In [0]:
import tensorflow as tf
import numpy as np

print(tf.__version__) #should be  1.15

# Set seeds
tf.set_random_seed(42)
np.random.seed(42)

if USE_FULL_1900_DIM_MODEL:
    # Sync relevant weight files
    !aws s3 sync --no-sign-request --quiet s3://unirep-public/1900_weights/ 1900_weights/
    
    # Import the mLSTM babbler model
    from unirep import babbler1900 as babbler
    
    # Where model weights are stored.
    MODEL_WEIGHT_PATH = "./1900_weights"
    
else:
    # Sync relevant weight files
    !aws s3 sync --no-sign-request --quiet s3://unirep-public/64_weights/ 64_weights/
    
    # Import the mLSTM babbler model
    from unirep import babbler64 as babbler
    
    # Where model weights are stored.
    MODEL_WEIGHT_PATH = "./64_weights"

# Creating a pipeline for converting sequences to UniRep vectors


In [0]:
# Time to convert sequences to UniRep vectors.
# Initialize the UniRep babbler (LSTM network) chosen in the weight-download
# cell, pointing it at the weight directory MODEL_WEIGHT_PATH.
model = babbler(model_path=MODEL_WEIGHT_PATH)

In [0]:
import pandas as pd

# Directory that will receive one saved vector file per protein.
storage_dir = "/content/UniRep/UniRep_Vecs"
# Directory the input files are read from (note the trailing slash).
data_dir = "/content/UniRep/finished_files/"

# Load humans_with_seq.csv from the input directory into a DataFrame.
df = pd.read_csv("{}humans_with_seq.csv".format(data_dir))

# Preview the first five rows.
df.head()

In [0]:
# Convert the DataFrame to a dictionary keyed by column header, where each
# value is the list of that column's entries (orient="list").
dfDict = df.to_dict(orient="list")
# Drop the DataFrame name now that the dictionary holds the data.
# NOTE: because `df` is deleted here, re-running this cell without first
# re-running the CSV-loading cell raises NameError.
del df

In [0]:
# creating functions to write data to folder
from time import time

sequence_limit = 1000 # due to memory concerns

size = len(dfDict["proteinID"]) # size of dataset

stepsize = 500 # how many sequences to process in one graph initiation

for i in range(0, size, stepsize):
    st = time()
    cap = i + stepsize if i + stepsize < size else size # making sure we don't go out of index

    tempProtIDs = []
    tempSeqs = []
    for protID, seq in zip(dfDict["proteinID"][i:cap], dfDict["sequence"][i:cap]):
        if len(seq) < sequence_limit:
            tempProtIDs.append(protID)
            tempSeqs.append(seq)


    fusionVecs = model.get_rep(tempSeqs)

    for protID, vec in zip(tempProtIDs, fusionVecs):
        np.save("{}/{}.npy".format(storage_dir, protID), vec)
    
    del fusionVecs
    del tempSeqs

    !git add -A;git commit -m "add {i} to {cap} human vectors";git push

    et = time()

    print("Process took {} seconds".format(et-st))
