In [87]:
import numpy as np


def DSSPseq(dssp,chain="A"):
    """Return the one-letter residue sequence of one chain of a DSSP object.

    The keys of ``dssp`` are visited in the order ``dssp.keys()`` yields them.
    A key is kept when its first element equals ``chain``; element 1 of the
    matching ``dssp.property_dict`` entry is taken as the residue letter.
    A chain that is not present yields an empty string.
    """
    residues = []
    for key in dssp.keys():
        if key[0] != chain:
            continue
        residues.append(dssp.property_dict[key][1])
    return ''.join(residues)

def fastaSeq(fasta, chain="A"):
    """Return the sequence of the first FASTA record whose header lists ``chain``.

    The second ``|``-separated field of each record description is expected to
    look like ``Chain A`` or ``Chains A, B, C`` (optionally ``A[auth X]``).
    NOTE(review): this is the layout assumed for RCSB downloads - confirm
    against the files actually in use.

    Parameters
    ----------
    fasta : path or handle accepted by ``Bio.SeqIO.parse``.
    chain : chain identifier, compared literally (not as a regular expression).

    Returns
    -------
    ``record.seq`` of the first matching record, or ``''`` when no record
    lists the chain. Records without a parsable chain field are skipped.
    """
    from Bio import SeqIO
    import re

    # Capture the whole chain list up to the next '|'; the previous pattern
    # ([A-Z,]+) stopped at the first blank and therefore only ever saw the
    # first chain of "Chains A, B".
    chainField = re.compile(r"[Cc]hains? ([^|]+)")
    authLabel = re.compile(r"\[auth ([^\]]+)\]")
    for record in SeqIO.parse(fasta, "fasta"):
        fields = record.description.split("|")
        if len(fields) < 2:
            continue
        m = chainField.search(fields[1])
        if m is None:
            continue
        labels = set()
        for token in m.group(1).split(","):
            token = token.strip()
            # "A[auth X]" contributes both "A" and "X".
            labels.add(token.split("[")[0].strip())
            labels.update(a.strip() for a in authLabel.findall(token))
        if chain in labels:
            return record.seq
    return ''

def oneHot(residue):
    """One-hot encode a single amino-acid letter as a length-20 float vector.

    The column order is ``ACDEFGHIKLMNPQRSTVWY``. Anything that is not exactly
    one of those letters (gap ``-``, ``X``, an empty string, a multi-letter
    string, ...) is encoded as an all-zero vector.
    """
    mapping = dict(zip("ACDEFGHIKLMNPQRSTVWY", range(20)))
    encoding = np.zeros(20)
    # Exact dictionary lookup: a substring test against the alphabet would let
    # "" or "AC" through and then fail with KeyError.
    index = mapping.get(residue)
    if index is not None:
        encoding[index] = 1.0
    return encoding
    
    
def reverseOneHot(encoding):
    """Decode a sequence of one-hot rows back into an amino-acid string.

    Each ``encoding[i]`` whose maximum is greater than zero contributes the
    letter at its arg-max position in ``ACDEFGHIKLMNPQRSTVWY``; rows without a
    positive entry (padding / unknown residues) contribute nothing.
    """
    letterAt = dict(enumerate("ACDEFGHIKLMNPQRSTVWY"))
    return ''.join(
        letterAt[np.argmax(encoding[row])]
        for row in range(len(encoding))
        if np.max(encoding[row]) > 0
    )

def dssp8(classLetter):
    """One-hot encode a DSSP secondary-structure letter as a length-8 float vector.

    The column order is ``GHIBESTC``. Any other value (including ``'-'``, an
    empty string or a multi-letter string) is encoded as an all-zero vector.
    """
    mapping = dict(zip("GHIBESTC", range(8)))
    encoding = np.zeros(8)
    # Exact dictionary lookup: a substring test against the alphabet would let
    # "" or "ST" through and then fail with KeyError.
    index = mapping.get(classLetter)
    if index is not None:
        encoding[index] = 1.0
    return encoding

In [28]:
#this is just to show how to download files and parse them with Bio.PDB
from Bio.PDB import PDBParser
from Bio.PDB.DSSP import DSSP
from Bio import pairwise2
import wget
urlBase='https://files.rcsb.org/download/'
fastaBase='https://www.rcsb.org/fasta/entry/'
#dataFolder="/Users/pamar/Desktop/projects/august/data/"
currentStructure='1OAT'
currentChain="A"
# NOTE(review): absolute, user-specific download directory kept from the
# original cell; point it at a project-relative folder before sharing.
downloadDir="/Users/pamar/Downloads/"

filename=wget.download(urlBase+currentStructure+'.pdb', out=downloadDir+currentStructure+".pdb")
sequenceFilename=wget.download(fastaBase+currentStructure, out=downloadDir+currentStructure+".fasta")

p = PDBParser()
structure = p.get_structure(currentStructure, filename)
model = structure[0]
dssp = DSSP(model, filename)
seq1=DSSPseq(dssp,currentChain)
seq2=fastaSeq(sequenceFilename,currentChain)

# Align the DSSP-derived sequence (observed residues) against the full FASTA
# sequence; only the first reported alignment is used.
alignments = pairwise2.align.globalms(seq1, seq2,1,-1,-5,-.5,penalize_extend_when_opening=True,penalize_end_gaps=False)
alignedDssp, alignedFasta = alignments[0][0], alignments[0][1]
# seq1 only contains residues of currentChain, so the running counter must
# index that chain's keys rather than the keys of every chain in the model.
chainKeys=[key for key in dssp.keys() if key[0]==currentChain]
dsspCont=0
dsspList=[]
for dsspResidue, fastaResidue in zip(alignedDssp, alignedFasta):
    if dsspResidue=="-":
        #this is a disordered residue, add a dssp element like this
        # reverseOneHot expects a sequence of rows, so the single vector is
        # wrapped in a list (decoding the bare vector walks its 20 scalars).
        print(fastaResidue,oneHot(fastaResidue), reverseOneHot([oneHot(fastaResidue)]))

        dsspList.append([oneHot(fastaResidue)])
    else:
        #this is a normal element, add dssp to it
        print(fastaResidue,oneHot(fastaResidue))
        print(dssp.property_dict[chainKeys[dsspCont]])
        dsspCont+=1

reverseOneHot(dsspList)




LE 20
LE0 (20,)
M [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.] 


AttributeError: 'numpy.ndarray' object has no attribute 'append'

In [18]:
np.array(dsspList)

array([[[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
         0., 0., 0., 0.]],

       [[0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0.]],

       [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
         0., 0., 0., 0.]],

       [[0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0.]],

       [[0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0.]],

       [[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0.]],

       [[0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0.]],

       [[0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0.]],

       [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
         0., 0., 0., 0.]],

       [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
         0., 0., 0

In [10]:
dssp.property_dict[dssp.keys()[1]]

(2,
 'P',
 '-',
 0.2867647058823529,
 -47.0,
 118.6,
 0,
 0.0,
 490,
 -0.1,
 0,
 0.0,
 489,
 -0.1)

In [11]:
import concurrent.futures
import os
# Module-level accumulator; appList() appends each worker result to it.
length = []


# Sample inputs that startProcessing() submits to the process pool.
pdbList=["a","b","cc"]


def processPdb(pdb,chain,fasta):
    """Stub for per-structure processing.

    The formatted DSSP is not computed yet: the three arguments are handed
    back unchanged, in order, as a list.
    """
    return list((pdb, chain, fasta))


def appList(l):
    """Append ``l`` as a single element to the module-level ``length`` list."""
    length.extend((l,))

def startProcessing():
    """Run ``processPdb`` on every entry of ``pdbList`` in a process pool.

    Each entry is submitted with chain ``"A"`` and fasta ``"X"``; results are
    passed to ``appList`` in completion order, not submission order.
    """
    with concurrent.futures.ProcessPoolExecutor() as pool:
        pending = {}
        for name in list(pdbList):
            pending[pool.submit(processPdb, name, "A", "X")] = name
        for finished in concurrent.futures.as_completed(pending):
            appList(finished.result())


if __name__ == "__main__":
    # Fill ``length`` through the process pool, then report how many results
    # were collected.
    startProcessing()
    collected = len(length)
    print(collected)
    
    
    
    


3


In [50]:
from concurrent import futures
import threading
import time


def task(n):
    """Sleep for ``n / 10`` seconds, logging start and end with the thread name.

    Returns the sleep duration (``n / 10``).
    """
    delay = n / 10
    startMessage = '{}: sleeping {}'.format(threading.current_thread().name, n)
    print(startMessage)
    time.sleep(delay)
    endMessage = '{}: done with {}'.format(threading.current_thread().name, n)
    print(endMessage)
    return delay


# Demonstrates Executor.map: results come back lazily, in submission order.
# The executor is used as a context manager so its worker threads are shut
# down when the block ends instead of lingering in the kernel.
# (Swap in futures.ProcessPoolExecutor(max_workers=2) to compare with processes.)
print('main: starting')
with futures.ThreadPoolExecutor(max_workers=2) as ex:
    results = ex.map(task, range(5, 0, -1))
    print('main: unprocessed results {}'.format(results))
    print('main: waiting for real results')
    real_results = list(results)
print('main: results: {}'.format(real_results))

main: starting
ThreadPoolExecutor-0_0: sleeping 5ThreadPoolExecutor-0_1: sleeping 4
main: unprocessed results <generator object Executor.map.<locals>.result_iterator at 0x7fa3beed3fc0>
main: waiting for real results

ThreadPoolExecutor-0_1: done with 4
ThreadPoolExecutor-0_1: sleeping 3
ThreadPoolExecutor-0_0: done with 5
ThreadPoolExecutor-0_0: sleeping 2
ThreadPoolExecutor-0_1: done with 3
ThreadPoolExecutor-0_1: sleeping 1
ThreadPoolExecutor-0_0: done with 2
ThreadPoolExecutor-0_1: done with 1
main: results: [0.5, 0.4, 0.3, 0.2, 0.1]


In [119]:
import numpy as np
import pandas as pd
from Bio import SeqIO
from Bio.PDB import PDBList
import os
from Bio.PDB.DSSP import DSSP
import pickle
from Bio.PDB import PDBParser
#from Bio.PDB.DSSP import DSSP
from Bio import pairwise2



# Read the query sequences; IDs are expected to be a 4-character PDB code
# followed by the chain letter as last character (e.g. "12ASA").
SeqID50 = []
Sequence50 = []
for record in SeqIO.parse("../data/test.fasta", "fasta"):
    Sequence50.append(str(record.seq))
    SeqID50.append(record.id)
SeqDict50 = dict(zip(SeqID50,Sequence50))
p = PDBParser()
pdbl=PDBList()
OHCAA = []   # per-sequence (length, 20) amino-acid one-hot matrices
OHCSS = []   # per-sequence (length, 8) secondary-structure one-hot matrices
for pdb in SeqID50:
    currentStructure = pdb[:4]
    chain = pdb[-1]
    # Use the path reported by retrieve_pdb_file instead of rebuilding the
    # file name and chdir-ing into the folder: this removes the dependency on
    # `owd` from another cell and on a case-insensitive file system.
    pdbPath = pdbl.retrieve_pdb_file(currentStructure,pdir='PDB',file_format='pdb')
    structure = p.get_structure(currentStructure,pdbPath)
    model = structure[0]
    dssp = DSSP(model, pdbPath)
    # Everything needed is in memory now; the download is only a scratch file.
    os.remove(pdbPath)
    seq1=DSSPseq(dssp,chain)
    seq2=SeqDict50[pdb]
    alignments = pairwise2.align.globalms(seq1, seq2,1,-1,-5,-.5,penalize_extend_when_opening=True,penalize_end_gaps=False)
    alignedDssp, alignedFasta = alignments[0][0], alignments[0][1]
    # seq1 holds only residues of `chain`, so the running counter must index
    # that chain's DSSP keys, not the keys of every chain in the model.
    chainKeys = [key for key in dssp.keys() if key[0]==chain]
    dsspCont = 0
    aaRows = []
    ssRows = []
    for dsspResidue, fastaResidue in zip(alignedDssp, alignedFasta):
        aaRows.append(oneHot(fastaResidue))
        if dsspResidue == '-':
            # residue missing from the structure: no secondary-structure label
            ssRows.append(dssp8('-'))
        else:
            #this is a normal element, add dssp to it
            ssRows.append(dssp8(dssp.property_dict[chainKeys[dsspCont]][2]))
            dsspCont+=1
    # Rows are collected in lists and stacked once (growing an array with
    # hstack inside the loop copies it on every residue).
    OHCAA.append(np.array(aaRows).reshape(len(alignedDssp),20))
    OHCSS.append(np.array(ssRows).reshape(len(alignedDssp),8))
maxlength = max([x.shape[0] for x in OHCAA], default=0)
# Zero-pad every matrix to the length of the longest sequence
for i in range(len(OHCAA)):
    missing = maxlength - OHCAA[i].shape[0]
    if missing > 0:
        OHCAA[i] = np.vstack((OHCAA[i],np.zeros((missing,20))))
        OHCSS[i] = np.vstack((OHCSS[i],np.zeros((missing,8))))
# pickle.dump([OHCAA,OHCSS],open('red50.pickle','wb'))
#print(OHCAA)
#print(OHCSS)
print('Finished with 50 redundancy data set')
print('Done')

/Users/pamar/Desktop/projects/august/script
Downloading PDB structure '12AS'...




Downloading PDB structure '16PK'...
Finished with 50 redundancy data set
Done


In [121]:
OHCAA[0][0:10]


array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        1., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
        0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
        0., 0., 0., 0.]])

In [116]:
import numpy as np
import pandas as pd
from Bio import SeqIO
from Bio.PDB import PDBList
import os
from Bio.PDB.DSSP import DSSP
import pickle
from Bio.PDB import PDBParser
#from Bio.PDB.DSSP import DSSP
from Bio import pairwise2
import pickle

# NOTE(review): absolute, user-specific working directory kept from the
# original cell; all relative '../data/...' paths below depend on it.
os.chdir("/Users/pamar/Desktop/projects/august/script/")
owd = os.getcwd()

# (FASTA id prefix, input pickle, per-dataset FASTA output)
benchmarkSets = [
    ("CASP",  '../data/Casp12Data.pickle', '../data/CASP.fasta'),
    ("CB513", '../data/CB513.pickle',      '../data/CB513.fasta'),
    ("TS115", '../data/TS115.pickle',      '../data/TS115.fasta'),
]

# Every record is written twice: to its own dataset file and to the combined
# test-set file. Records are numbered from 0 within each dataset.
with open('../data/testSet.fasta', 'w') as out_file_big:
    for prefix, pickleFile, fastaFile in benchmarkSets:
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # only use it on trusted data files.
        with open(pickleFile,'rb') as dataFile:
            data = pickle.load(dataFile)
        with open(fastaFile, 'w') as out_file:
            for cont in range(data[0].shape[0]):
                # The first 20 feature columns are decoded as the amino-acid
                # one-hot encoding; decode once and write to both outputs.
                entry = ">"+prefix+"_"+str(cont)+"\n"+reverseOneHot(data[0][cont,:,0:20])+"\n"
                out_file.write(entry)
                out_file_big.write(entry)




In [52]:
import numpy as np
import pandas as pd
from Bio import SeqIO
import pickle

def cleanDF(dataset,seq_threshold,length_threshold=0.5):
    """Return the query IDs that have at least one redundant hit.

    Columns are read by position: 0 = query id, 2 = percent identity,
    3 = alignment length, 4 = query length (column 1, the subject id, is not
    used). A row counts as redundant when its identity is strictly above
    ``seq_threshold`` and its alignment length is strictly above
    ``length_threshold * query length``.
    NOTE(review): rows where query and subject are the same sequence are not
    excluded - confirm the input contains no self-hits.

    Returns
    -------
    list of unique query IDs, in order of first appearance.
    """
    qseqid = dataset.iloc[:,0]
    pident = dataset.iloc[:,2]
    length = dataset.iloc[:,3]
    qlen   = dataset.iloc[:,4]
    # Vectorised filter; the per-row loop with `id not in list` was quadratic
    # in the number of hits.
    redundant = (pident > seq_threshold) & (length_threshold * qlen < length)
    # dict.fromkeys de-duplicates while keeping first-appearance order.
    return list(dict.fromkeys(qseqid[redundant]))


seqid = 25    # percent-identity threshold passed to cleanDF
minCov=.50    # minimum aligned fraction of the query passed to cleanDF

# For each dataset, drop every sequence that cleanDF reports as redundant and
# write the survivors to "<dataset>_reduced.fasta".
for blastFile in ['../data/dataset70','../data/dataset50','../data/dataset30']:

    blastData = pd.read_csv(blastFile+'.blast.out',sep='\t',header=None).iloc[:,0:5]

    labelsRed = cleanDF(blastData,seqid,minCov)
    # Set membership is O(1); scanning the list for every FASTA record made
    # this loop quadratic.
    redundantIds = set(labelsRed)
    finalSeqs=[seqs for seqs in SeqIO.parse(blastFile+".fasta", "fasta")
               if seqs.id not in redundantIds]
    SeqIO.write(finalSeqs, blastFile+"_reduced.fasta", "fasta")
    

KeyboardInterrupt: 

In [160]:
# =============================================================================
# OneHotEncode new data
# =============================================================================
import numpy as np
import pandas as pd
from Bio import SeqIO
from Bio.PDB import PDBList
import os
from Bio.PDB.DSSP import DSSP
import pickle
import argparse
from concurrent import futures
import threading
import time



# Command-line interface: the FASTA file to encode and the worker count.
parser = argparse.ArgumentParser(description='create NSP2 training sets.')
for flag, options in (
    ('--file', dict(dest='file', help='fasta file to process')),
    ('--procs', dict(dest='procs', default=2, type=int,
                     help='number of processors to use - default 2')),
):
    parser.add_argument(flag, **options)

# Parses the current process arguments (sys.argv).
args = parser.parse_args()



def DSSPseq(dssp,chain="A"):
    """Return the one-letter residue sequence of one chain of a DSSP object.

    The keys of ``dssp`` are visited in the order ``dssp.keys()`` yields them.
    A key is kept when its first element equals ``chain``; element 1 of the
    matching ``dssp.property_dict`` entry is taken as the residue letter.
    A chain that is not present yields an empty string.
    """
    residues = []
    for key in dssp.keys():
        if key[0] != chain:
            continue
        residues.append(dssp.property_dict[key][1])
    return ''.join(residues)

def fastaSeq(fasta, chain="A"):
    """Return the sequence of the first FASTA record whose header lists ``chain``.

    The second ``|``-separated field of each record description is expected to
    look like ``Chain A`` or ``Chains A, B, C`` (optionally ``A[auth X]``).
    NOTE(review): this is the layout assumed for RCSB downloads - confirm
    against the files actually in use.

    Parameters
    ----------
    fasta : path or handle accepted by ``Bio.SeqIO.parse``.
    chain : chain identifier, compared literally (not as a regular expression).

    Returns
    -------
    ``record.seq`` of the first matching record, or ``''`` when no record
    lists the chain. Records without a parsable chain field are skipped.
    """
    from Bio import SeqIO
    import re

    # Capture the whole chain list up to the next '|'; the previous pattern
    # ([A-Z,]+) stopped at the first blank and therefore only ever saw the
    # first chain of "Chains A, B".
    chainField = re.compile(r"[Cc]hains? ([^|]+)")
    authLabel = re.compile(r"\[auth ([^\]]+)\]")
    for record in SeqIO.parse(fasta, "fasta"):
        fields = record.description.split("|")
        if len(fields) < 2:
            continue
        m = chainField.search(fields[1])
        if m is None:
            continue
        labels = set()
        for token in m.group(1).split(","):
            token = token.strip()
            # "A[auth X]" contributes both "A" and "X".
            labels.add(token.split("[")[0].strip())
            labels.update(a.strip() for a in authLabel.findall(token))
        if chain in labels:
            return record.seq
    return ''

def oneHot(residue):
    """One-hot encode a single amino-acid letter as a length-20 float vector.

    The column order is ``ACDEFGHIKLMNPQRSTVWY``. Anything that is not exactly
    one of those letters (gap ``-``, ``X``, an empty string, a multi-letter
    string, ...) is encoded as an all-zero vector.
    """
    mapping = dict(zip("ACDEFGHIKLMNPQRSTVWY", range(20)))
    encoding = np.zeros(20)
    # Exact dictionary lookup: a substring test against the alphabet would let
    # "" or "AC" through and then fail with KeyError.
    index = mapping.get(residue)
    if index is not None:
        encoding[index] = 1.0
    return encoding

def dssp8(classLetter):
    """One-hot encode a DSSP secondary-structure letter as a length-8 float vector.

    The column order is ``GHIBESTC``. Any other value (including ``'-'``, an
    empty string or a multi-letter string) is encoded as an all-zero vector.
    """
    mapping = dict(zip("GHIBESTC", range(8)))
    encoding = np.zeros(8)
    # Exact dictionary lookup: a substring test against the alphabet would let
    # "" or "ST" through and then fail with KeyError.
    index = mapping.get(classLetter)
    if index is not None:
        encoding[index] = 1.0
    return encoding
    
    
def fasta2dssp(pdb,seq2):
    """Download one structure and encode it against its reference sequence.

    Relies on the module-level ``pdbl`` (PDBList), ``p`` (PDBParser), ``DSSP``
    and ``pairwise2`` objects.

    Parameters
    ----------
    pdb  : identifier whose first four characters are the PDB code and whose
           last character is the chain letter (e.g. ``"12ASA"``).
    seq2 : reference amino-acid sequence for that chain.

    Returns
    -------
    ``[aa, ss]`` where ``aa`` is an (L, 20) amino-acid one-hot matrix and
    ``ss`` an (L, 8) DSSP one-hot matrix, L being the alignment length.
    ``[[], []]`` is returned when the structure file is missing or when no
    alignment could be produced (e.g. the chain is absent from the structure).

    NOTE(review): the downloaded file is deleted after use; two IDs sharing a
    PDB code that are processed concurrently may race on that file.
    """
    currentStructure = pdb[:4]
    chain = pdb[-1]
    print("PDB is",pdb)
    # Use the path reported by retrieve_pdb_file rather than rebuilding the
    # file name from the (upper-case) ID, which only resolves on
    # case-insensitive file systems.
    pdbPath = pdbl.retrieve_pdb_file(currentStructure,pdir='../data/PDB',file_format='pdb')
    try:
        structure = p.get_structure(currentStructure,pdbPath)
    except FileNotFoundError:
        return [[],[]]
    try:
        model = structure[0]
        print(model)
        dssp = DSSP(model, pdbPath)
    finally:
        # The download is a scratch file: remove it even when DSSP fails.
        os.remove(pdbPath)
    seq1=DSSPseq(dssp,chain)
    #seq2=seqDict[pdb]
    alignments = pairwise2.align.globalms(seq1, seq2,1,-1,-5,-.5,penalize_extend_when_opening=True,penalize_end_gaps=False)
    if not alignments:
        return [[],[]]
    alignedDssp, alignedFasta = alignments[0][0], alignments[0][1]
    # seq1 holds only residues of `chain`, so the running counter must index
    # that chain's DSSP keys, not the keys of every chain in the model.
    chainKeys = [key for key in dssp.keys() if key[0]==chain]
    dsspCont = 0
    aaRows = []
    ssRows = []
    for dsspResidue, fastaResidue in zip(alignedDssp, alignedFasta):
        aaRows.append(oneHot(fastaResidue))
        if dsspResidue == '-':
            # residue missing from the structure: no secondary-structure label
            ssRows.append(dssp8('-'))
        else:
            #this is a normal element, add dssp to it
            ssRows.append(dssp8(dssp.property_dict[chainKeys[dsspCont]][2]))
            dsspCont+=1
    # Stack once at the end instead of growing arrays with hstack per residue.
    return [np.array(aaRows).reshape(len(alignedDssp),20),np.array(ssRows).reshape(len(alignedDssp),8)]
            
            
            
            

from Bio.PDB import PDBParser
#from Bio.PDB.DSSP import DSSP
from Bio import pairwise2
#
#urlBase='https://files.rcsb.org/download/'
#fastaBase='https://www.rcsb.org/fasta/entry/'

# Read the query sequences named on the command line.
SeqID30 = []
Sequence30 = []
file = args.file
procs=args.procs
for record in SeqIO.parse(file, "fasta"):
    Sequence30.append(str(record.seq))
    SeqID30.append(record.id)
SeqDict30 = dict(zip(SeqID30,Sequence30))
# Created before the pool so fasta2dssp can reach them as module globals.
p = PDBParser()
pdbl=PDBList()

print('main: starting')
# Context manager guarantees the worker processes are shut down.
with futures.ProcessPoolExecutor(max_workers=procs) as ex:
    results = ex.map(fasta2dssp, SeqID30,Sequence30)
    print('main: waiting for real results')
    real_results = list(results)

# fasta2dssp returns [[], []] for structures it could not process; reshaping
# with -1 turns those into (0, 20) / (0, 8) matrices so they can be padded
# like any other entry and list positions keep matching SeqID30.
OHCAA=[np.array(x[0]).reshape(-1,20) for x in real_results]
OHCSS=[np.array(x[1]).reshape(-1,8) for x in real_results]

maxlength = max([x.shape[0] for x in OHCAA], default=0)
# Zero-pad every matrix to the length of the longest sequence
for i in range(len(OHCAA)):
    missing = maxlength - OHCAA[i].shape[0]
    if missing > 0:
        OHCAA[i] = np.vstack((OHCAA[i],np.zeros((missing,20))))
        OHCSS[i] = np.vstack((OHCSS[i],np.zeros((missing,8))))
with open(file+'.pickle','wb') as pickleFile:
    pickle.dump([OHCAA,OHCSS],pickleFile)
#print(OHCAA)
#print(OHCSS)
print('Finished with {} redundancy data set'.format(file))

main: starting
PDB is 16PKA
PDB is 12ASA
Downloading PDB structure '12AS'...
Downloading PDB structure '16PK'...
main: unprocessed results <generator object _chain_from_iterable_of_lists at 0x7fa3bae94200>
main: waiting for real results
<Model id=0>




<Model id=0>
main: results: [[array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]]), array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])], [array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]]), array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])]]

In [148]:
[OHCAA,OHCSS]=list(zip(*real_results))

In [159]:
len(OHCSS[1])

415

In [132]:
Sequence30

['MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQAPILSRVGDGTQDNLSGAEKAVQVKVKALPDAQFEVVHSLAKWKRQTLGQHDFSAGEGLYTHMKALRPDEDRLSPLHSVYVDQWDWERVMGDGERQFSTLKSTVEAIWAGIKATEAAVSEEFGLAPFLPDQIHFVHSQELLSRYPDLDAKGRERAIAKDLGAVFLVGIGGKLSDGHRHDVRAPDYDDWSTPSELGHAGLNGDILVWNPVLEDAFELSSMGIRVDADTLKHQLALTGDEDRLELEWHQALLRGEMPQTIGGGIGQSRLTMLLLQLPHIGQVQAGVWPAAVRESVPSLL',
 'EKKSINECDLKGKKVLIRVDFNVPVKNGKITNDYRIRSALPTLKKVLTEGGSCVLMSHLGRPKGIPMAQAGKIRSTGGVPGFQQKATLKPVAKRLSELLLRPVTFAPDCLNAADVVSKMSPGDVVLLENVRFYKEEGSKKAKDREAMAKILASYGDVYISDAFGTAHRDSATMTGIPKILGNGAAGYLMEKEISYFAKVLGNPPRPLVAIVGGAKVSDKIQLLDNMLQRIDYLLIGGAMAYTFLKAQGYSIGKSKCEESKLEFARSLLKKAEDRKVQVILPIDHVCHTEFKAVDSPLITEDQNIPEGHMALDIGPKTIEKYVQTIGKCKSAIWNGPMGVFEMVPYSKGTFAIAKAMGRGTHEHGLMSIIGGGDSASAAELSGEAKRMSHVSTGGGASLELLEGKTLPGVTVLDDK']

In [None]:
import matplotlib.pyplot as plt import numpy as np
import torch
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
from torch.nn.functional import relu, elu, relu6, sigmoid, tanh, softmax from torch.autograd import Variable
import torch.nn as nn
import sklearn.model_selection as model_selection
import pickle
# Defining a function to obtain the prediction accuracies
def getAccuracy(predictions,targets):
targets = np.squeeze(targets.permute(0,2,1).argmax(dim=2).view(-1,targets.shape[0]*ta pred = np.squeeze(softmax(predictions.permute(0,2,1),dim=2).argmax(dim=2).view(-1,pre correct = 0
pred = pred[targets != 8]
targets = targets[targets != 8] correct = (pred == targets).sum() return correct/np.size(targets)
### Load train and test data
traindata = pickle.load(open('train.pickle','rb'))
X = traindata[0][:,:,0:20]
y = traindata[0][:,:,57:65]
CASP12data = pickle.load(open('Casp12Data.pickle','rb')) X_casp = CASP12data[0][:,:,0:20]
y_casp = CASP12data[0][:,:,57:65]
TS115data = pickle.load(open('TS115.pickle','rb')) X_TS115 = TS115data[0][:,:,0:20]
y_TS115 = TS115data[0][:,:,57:65]
CB513data = pickle.load(open('CB513.pickle','rb')) X_CB513 = CB513data[0][:,:,0:20]
y_CB513 = CB513data[0][:,:,57:65]
https://colab.research.google.com/drive/1wbAxkPjUyiX68kEKdazAHBFgn751f7D7#scrollTo=9dXoZb1A7g0h&printMode=true 4/11
r d
 14.12.2020 Data redundancy in developing deep learning tools on biological data.ipynb - Colaboratory
# Since smaller sequences are extrapolated with zeros to match the length of # the longest sequence in the dataset we need to add a "ninth" class of SS in # order not to make Pythn confuse the first class with the extrapolated ones if y.shape[2] == 8:
EPzeros = np.expand_dims(np.zeros((y.shape[1])),1)
EPzeros_casp = np.expand_dims(np.zeros((y_casp.shape[1])),1)
EPzeros_TS115 = np.expand_dims(np.zeros((y_TS115.shape[1])),1)
EPzeros_CB513 = np.expand_dims(np.zeros((y_CB513.shape[1])),1)
y = np.asarray([np.hstack((y[i],EPzeros)) for i in range(y.shape[0])])
y_casp = np.asarray([np.hstack((y_casp[i],EPzeros_casp)) for i in range(y_casp.shape[ y_TS115 = np.asarray([np.hstack((y_TS115[i],EPzeros_TS115)) for i in range(y_TS115.sh y_CB513 = np.asarray([np.hstack((y_CB513[i],EPzeros_CB513)) for i in range(y_CB513.sh for i in range(y.shape[0]):
for j in range(y[i].shape[0]): if np.all(y[i][j,:] == 0):
y[i][j,:][-1] = 1 for i in range(y_casp.shape[0]):
for j in range(y_casp[i].shape[0]): if np.all(y_casp[i][j,:] == 0):
y_casp[i][j,:][-1] = 1 for i in range(y_TS115.shape[0]):
for j in range(y_TS115[i].shape[0]): if np.all(y_TS115[i][j,:] == 0):
y_TS115[i][j,:][-1] = 1 for i in range(y_CB513.shape[0]):
for j in range(y_CB513[i].shape[0]): if np.all(y_CB513[i][j,:] == 0):
y_CB513[i][j,:][-1] = 1
# Converting to tensors for PyTorch
X_train = torch.tensor(X, dtype = torch.float)
y_train = torch.tensor(y, dtype = torch.float).permute(0,2,1) X_casp = torch.tensor(X_casp,dtype = torch.float)
y_casp = torch.tensor(y_casp,dtype=torch.float).permute(0,2,1) X_TS115 = torch.tensor(X_TS115,dtype=torch.float)
y_TS115 = torch.tensor(y_TS115,dtype=torch.float).permute(0,2,1) X_CB513 = torch.tensor(X_CB513,dtype=torch.float)
y_CB513 = torch.tensor(y_CB513,dtype=torch.float).permute(0,2,1)
class MyDataset(Dataset):
    def __init__(self, X, y):
self.data = X self.targets = y
    def __getitem__(self, index):
x = self.data[index]
y = self.targets[index]
return x, y
def __len__(self): return len(self.data)
# Creating dataloaders for training in batches for later.
batch_size = 65
TrainLoader = DataLoader(MyDataset(X_train,y_train),batch_size=batch_size) CASPLoader = DataLoader(MyDataset(X_casp,y_casp),batch_size=3)
 https://colab.research.google.com/drive/1wbAxkPjUyiX68kEKdazAHBFgn751f7D7#scrollTo=9dXoZb1A7g0h&printMode=true 5/11
0 a a

 14.12.2020 Data redundancy in developing deep learning tools on biological data.ipynb - Colaboratory
TS115Loader = DataLoader(MyDataset(X_TS115,y_TS115),batch_size=batch_size) CB513Loader = DataLoader(MyDataset(X_CB513,y_CB513),batch_size=batch_size)
# Defining the CNN architecture
 channels = 20 kernel_size_conv1 padding_conv1 = 7 stride_conv1 = 1 kernel_size_conv2 padding_conv2 = 4 stride_conv2 = 1 kernel_size_conv3 padding_conv3 = 2 stride_conv3 = 1
= 15 = 9 = 5
class Net(nn.Module): def __init__(self):
''''''
super().__init__()
self.conv1 = nn.Sequential( nn.Conv1d(in_channels=channels,out_channels=25,kernel_size=kernel_size_conv1 nn.ReLU(),
nn.BatchNorm1d(25),
nn.Dropout(p=0.5))
self.conv2 = nn.Sequential(
nn.Conv1d(in_channels=25, out_channels=35, kernel_size=kernel_size_conv2, str nn.ReLU(),
nn.BatchNorm1d(35),
nn.Dropout(p=0.5))
self.conv3 = nn.Sequential(
nn.Conv1d(in_channels=35,out_channels=40, kernel_size=kernel_size_conv3, stri nn.ReLU(),
nn.BatchNorm1d(40),
nn.Dropout(p=0.5))
# Using a convolutional layer instead of linear. self.fc1_encode1 = nn.Sequential(
nn.Conv1d(in_channels=40,out_channels=9,kernel_size=1,stride=1,padding=0,bias
nn.BatchNorm1d(9) )
def forward(self, x):
x = self.conv1(x)
x = self.conv2(x)
x = self.conv3(x)
x = self.fc1_encode1(x) return x
net = Net()
# Defining criterion and optimizer
https://colab.research.google.com/drive/1wbAxkPjUyiX68kEKdazAHBFgn751f7D7#scrollTo=9dXoZb1A7g0h&printMode=true 6/11
,
i
d
=

 14.12.2020 Data redundancy in developing deep learning tools on biological data.ipynb - Colaboratory
import torch.optim as optim
 def criterion(input, target):
labels = torch.argmax(target,2) #print(input)
#print("T")
#print(input.shape)
#print(labels.shape)
return nn.CrossEntropyLoss()(input, labels)
# Using thr ADAM optimizer
optimizer = optim.Adam(net.parameters(), lr=0.001,betas=(0.85,0.95),weight_decay=1e-6)
# Training for 50 epochs
num_epoch = 50 # Your code here!
# Creating lists to store loss and accuracies in train_loss = []
train_accuracy = []
casp_loss = []
casp_accuracy = []
TS115_loss = []
TS115_accuracy = []  # per-epoch TS115 accuracy; this is the name appended to and plotted later in the cell
CB513_loss = []
CB513_accuracy = []
# Training
for epoch in range(num_epoch):  # loop over the dataset multiple times
print('Epoch ',epoch+1,' of ',num_epoch) running_loss = 0.0
running_acc = 0.0
net.train()
print('Training net') for data in TrainLoader:
inputs, labels = data
# Wrap them in Variable
inputs, labels = Variable(inputs.permute(0,2,1)), Variable(labels) # zero the parameter gradients
optimizer.zero_grad()
# forward + backward + optimize
outputs = net(inputs)
running_acc += getAccuracy(outputs,labels)
loss = criterion(outputs,labels.permute(0,2,1))
loss.backward()
running_loss += loss.data.numpy()
optimizer.step()
# Nromalizing the error and append it to list. train_loss.append(running_loss/len(TrainLoader))
# Turning off gradients while validating on test set. net.eval()
with torch.no_grad():
running_loss = 0.0 running_acc = 0.0 print('Testing on CASP') for data in CASPLoader:
inputs, labels = data
tt t( ibl(i t t(02)))
https://colab.research.google.com/drive/1wbAxkPjUyiX68kEKdazAHBFgn751f7D7#scrollTo=9dXoZb1A7g0h&printMode=true 7/11

14.12.2020 Data redundancy in developing deep learning tools on biological data.ipynb - Colaboratory
outputs = net(Variable(inputs.permute(0,2,1))) loss = criterion(outputs,labels.permute(0,2,1)) running_acc += getAccuracy(outputs,labels) running_loss += loss.data.numpy()
casp_loss.append(running_loss/len(CASPLoader)) casp_accuracy.append(running_acc/len(CASPLoader)) running_loss = 0.0
running_acc = 0.0
print('Testing on TS115') for data in TS115Loader:
inputs, labels = data
outputs = net(Variable(inputs.permute(0,2,1))) loss = criterion(outputs,labels.permute(0,2,1)) running_acc += getAccuracy(outputs,labels) running_loss += loss.data.numpy()
TS115_loss.append(running_loss/len(TS115Loader)) TS115_accuracy.append(running_acc/len(TS115Loader)) running_loss = 0.0
running_acc = 0.0
print('Testing on CB513') for data in CB513Loader:
inputs, labels = data
outputs = net(Variable(inputs.permute(0,2,1))) loss = criterion(outputs,labels.permute(0,2,1)) running_acc += getAccuracy(outputs,labels) running_loss += loss.data.numpy()
CB513_loss.append(running_loss/len(CB513Loader))
CB513_accuracy.append(running_acc/len(CB513Loader)) print('Finished Training')
scale = list(range(1,num_epoch+1))
# Plotting of results
fig, axs = plt.subplots(1, 2, constrained_layout=False) axs[0].plot(scale, train_loss,label='Train') axs[0].plot(scale, casp_loss,label='CASP12') axs[0].plot(scale, TS115_loss,label='TS115') axs[0].plot(scale, CB513_loss,label='CB513') axs[0].set_xlabel('Epochs')
axs[0].set_ylabel('Loss')
axs[0].legend(loc='upper right')
axs[1].plot(scale, train_accuracy,label='Train') axs[1].plot(scale, casp_accuracy,label='CASP12') axs[1].plot(scale, TS115_accuracy,label='TS115') axs[1].plot(scale, CB513_accuracy,label='CB513') axs[1].set_xlabel('Epochs') axs[1].set_ylabel('Accuracy')
axs[1].legend(loc='lower right') fig.suptitle('Dropout, BN and L2', fontsize=16,y=1.05) plt.tight_layout()
plt.show()
    
    