In [6]:
import re
import numpy as np
import glob
import os
import tensorflow as tf


In [7]:
helixCord=[]
sheetCord=[]
coilCord=[]

patt=re.compile('(ATOM)\s+([0-9]+)\s+(CA)\s+([A-Z]+)\s+([A-Z]{1})\s+([0-9]+)\s+([-+]?\d+\.\d+)\s+([-+]?\d+\.\d+)\s+([-+]?\d+\.\d+)\s+([-+]?\d+\.\d+)\s+([-+]?\d+\.\d+)\s+', flags=re.S)

amino_acid_id={'ALA':0, 'CYS':1, 'ASP':2, 'GLU':3, 'PHE':4, 'GLY':5, 'HIS':6, 'ILE':7, 'LYS':8, 'LEU':9, 'MET':10,
              'ASN':11, 'PRO':12, 'GLN':13, 'ARG':14, 'SER':15, 'THR':16, 'VAL':17, 'TRP':18, 'TYR':19}


In [8]:
def get_files(path): #opens each file and extract coordinates of CA+extra 2 values and stores in temp
    arr=[]

    for file in glob.glob(path, recursive=True):
        temp=[]
        with open(file) as fi:
            try:
                for results in patt.findall(fi.read()):
                    val=results[3:9]
                    val_=list(val)
                    del val_[2]
                    del val_[1]
                    if len(val_[0])!=3: #for replacing 4word acid name with 3word
                        name=val_[0][1:]
                        val_[0]=name
                    acid_id=amino_acid_id[val_[0]]
                    val_[0]=acid_id
                    val_final=[float(x) for x in val_]
                    temp.append(val_final)
            except UnicodeDecodeError:
                os.remove(file)
                #check.append(results)
    
        arr.append(temp)    # finally append temp array which contains details of CA of a single file 
                                #into another array
   
    return arr   

In [5]:
# reads inputs
helixCord=get_files('sample/helix/*')
sheetCord=get_files('sample/sheet/*')
coilCord=get_files('sample/coil/*')

    

In [4]:
def remove_unwanted(arr): #remove array with lengths other than given in list
    for val in arr:
        if len(val) not in [9,6,5,3]:
            arr.remove(val)
    arr_=[np.array(val, dtype=np.float32) for val in arr]
    arr_=np.array(arr_)
    return arr_


def pad(arr):     # pads array with zero
    final=[]
    for val in arr:
        zeros=np.zeros((9,4))
        zeros[:val.shape[0], :val.shape[1]]=val
        final.append(zeros)
    return final

    

In [7]:
helixfinal=[]
sheetfinal=[]
coilfinal=[]

In [12]:
helixfinal=remove_unwanted(helixCord)
sheetfinal=remove_unwanted(sheetCord)
coilfinal=remove_unwanted(coilCord)

In [9]:
print(len(helixfinal))
print(len(sheetfinal))
print(len(coilfinal))

39485
39905
39837


In [13]:
helixfinal=pad(helixfinal)
sheetfinal=pad(sheetfinal)
coilfinal=pad(coilfinal)

In [None]:
# just for verification purpose..

from collections import Counter
sam=[]
for i in coilfinal:
    sam.append(i.shape)
    
print(Counter(sam))

In [14]:
#stores no. of input values of each shapes (for preparing labels)

num_sheet=np.array(sheetfinal).shape[0]
num_helix=np.array(helixfinal).shape[0]
num_coil=np.array(coilfinal).shape[0]

#scales array values between -1 and 1

sheetfinal_=np.array([2.*(a - np.min(a))/np.ptp(a)-1 for a in sheetfinal])
helixfinal_=np.array([2.*(a - np.min(a))/np.ptp(a)-1 for a in helixfinal])
coilfinal_=np.array([2.*(a - np.min(a))/np.ptp(a)-1 for a in coilfinal])

features1=np.concatenate((sheetfinal_,helixfinal_, coilfinal_), axis=0) #concatenate each array into single one
features_=np.array([arr[:, :, np.newaxis] for arr in features1]) #adds extra axis to mention depth of array

#labels creation..
# index 0 - sheet
#index 1 - helix
#index 2 - coil
labels1=np.concatenate((np.array([[1.,0.,0.]*num_sheet]), np.array([[0.,1.,0.]*num_helix]), np.array([[0.,0.,1.]*num_coil]) ), axis=1 )
labels_=np.reshape(labels1, (num_sheet+num_helix+num_coil,3))



In [15]:
print(len(features_)) #for verifying features and labels are of same length..
print(len(labels_))

119218
119218


In [16]:
#randomly shuffle the dataset
from random import shuffle
idx=[i for i in range(len(features_))]
shuffle(idx)
features_=features_[idx, :, :, :]
labels_=labels_[idx, ]

In [17]:
#split dataset into train, test and validation
#test dataset consist of 10% total 
#valid dataset consist of 10-20 % of train dataset

features=features_[:109218]
labels=labels_[:109218]

test_feat=features_[109218:]
test_lab=labels_[109218:]

valid_feat=features_[:21843]
valid_labels=labels_[:21843]



In [14]:
#Hyperparameters

epochs=200
batch_size=512
display_step=10
num_classes=3


In [15]:
tf.reset_default_graph() #for resetting current graph
train_graph=tf.Graph()

def get_summary(vars_):
    tf.contrib.slim.model_analyzer.analyze_vars(vars_, print_info=True)

#def model(input_, num_classes, dropout): #model (CNN) definition    
with train_graph.as_default():

    x=tf.placeholder(dtype=tf.float32,shape=[None, 9, 4,1])
    y=tf.placeholder(dtype=tf.float32, shape=[None, 3])
    keep_prob=tf.placeholder(dtype=tf.float32)

    layer0=tf.layers.conv2d(inputs=x, filters=16, kernel_size=2, padding='SAME',
                                    activation=tf.nn.relu)
    print(layer0.shape)
    layer1=tf.layers.conv2d(inputs=layer0, filters=16, kernel_size=2, padding='SAME',
                                    activation=tf.nn.relu)
    print(layer1.shape)
    layer2=tf.layers.conv2d(inputs=layer1, filters=32, kernel_size=2, padding='SAME',
                                    activation=tf.nn.relu)
    print(layer2.shape)
    layer2=tf.layers.max_pooling2d(layer2, strides=1, pool_size=2)
    print(layer2.shape)
    layer3=tf.layers.conv2d(inputs=layer2, filters=64, kernel_size=2, padding='SAME',
                                    activation=tf.nn.relu)
    print(layer3.shape)
    layer4=tf.layers.conv2d(inputs=layer3, filters=64, kernel_size=2, padding='SAME',
                                    activation=tf.nn.relu)
    layer5=tf.layers.conv2d(inputs=layer4, filters=64, kernel_size=2, padding='SAME',
                                    activation=tf.nn.relu)

    #layer4=tf.contrib.layers.max_pool2d(layer4, 2)
    print(layer4.shape)

    fc1=tf.contrib.layers.flatten(layer5)
    
    fc1=tf.contrib.layers.fully_connected(fc1,128)
    fc1=tf.contrib.layers.dropout(fc1, keep_prob=keep_prob, is_training=True)

    fc2=tf.contrib.layers.fully_connected(fc1,256)
    fc2=tf.contrib.layers.dropout(fc2, keep_prob=keep_prob, is_training=True)
    fc3=tf.contrib.layers.fully_connected(fc2,512)
    fc4=tf.contrib.layers.fully_connected(fc3,1024)
    fc4=tf.contrib.layers.dropout(fc4, keep_prob=keep_prob, is_training=True)

        #drop=tf.contrib.layers.dropout(fc2, keep_prob=dropout, is_training=True)
        #fc3=tf.contrib.layers.fully_connected(fc2,512)
        #fc4=tf.contrib.layers.fully_connected(fc3,1024)
        #drop=tf.contrib.layers.dropout(fc4, keep_prob=dropout, is_training=True)
    logits=tf.contrib.layers.fully_connected(fc2, num_outputs=num_classes, activation_fn=tf.nn.softmax)
    loss=tf.losses.softmax_cross_entropy(y, logits)#mention loss function
    optm=tf.train.AdamOptimizer(learning_rate=1e-3).minimize(loss)#mention optimizer with learning rate
    correctPred=tf.equal(tf.argmax(logits,1), tf.argmax(y,1))# an array which stores 1 for each correct prediction and                                                    # 0 for wrong prediction 
    acc=tf.reduce_mean(tf.cast(correctPred, tf.float32)) # acc calculated by taking mean of above array

    tf.summary.scalar('loss', loss)
    tf.summary.scalar('accuracy', acc)
    

    saver=tf.train.Saver()
    
    save_file='./training_logs11.ckpt'
    model_vars=tf.trainable_variables()
    get_summary(model_vars)

    


(?, 9, 4, 16)
(?, 9, 4, 16)
(?, 9, 4, 32)
(?, 8, 3, 32)
(?, 8, 3, 64)
(?, 8, 3, 64)
---------
Variables: name (type shape) [size]
---------
conv2d/kernel:0 (float32_ref 2x2x1x16) [64, bytes: 256]
conv2d/bias:0 (float32_ref 16) [16, bytes: 64]
conv2d_1/kernel:0 (float32_ref 2x2x16x16) [1024, bytes: 4096]
conv2d_1/bias:0 (float32_ref 16) [16, bytes: 64]
conv2d_2/kernel:0 (float32_ref 2x2x16x32) [2048, bytes: 8192]
conv2d_2/bias:0 (float32_ref 32) [32, bytes: 128]
conv2d_3/kernel:0 (float32_ref 2x2x32x64) [8192, bytes: 32768]
conv2d_3/bias:0 (float32_ref 64) [64, bytes: 256]
conv2d_4/kernel:0 (float32_ref 2x2x64x64) [16384, bytes: 65536]
conv2d_4/bias:0 (float32_ref 64) [64, bytes: 256]
conv2d_5/kernel:0 (float32_ref 2x2x64x64) [16384, bytes: 65536]
conv2d_5/bias:0 (float32_ref 64) [64, bytes: 256]
fully_connected/weights:0 (float32_ref 1536x128) [196608, bytes: 786432]
fully_connected/biases:0 (float32_ref 128) [128, bytes: 512]
fully_connected_1/weights:0 (float32_ref 128x256) [32768, b

In [26]:
def get_batches(features, labels, batch_size):
    out_batches=[]
    
    for start in range(0,len(features),batch_size):
        end=start+batch_size
        batches=[features[start:end], labels[start:end]]
        out_batches.append(batches)
    
    return out_batches

def epoch_stats(sess, epoch, last_feat, last_lab, valid_features,valid_labels, trn_loss, merge, train_writer2,train_writer3, counter):
    
    summary, cur_cost=sess.run([merge, loss], feed_dict={x:last_feat, y:last_lab,keep_prob:0.6})
    train_writer2.add_summary(summary, counter)
    summ, cur_Acc=sess.run([merge, acc], feed_dict={x:valid_features, y:valid_labels, keep_prob:0.6})
    train_writer3.add_summary(summ, counter)
    print('epoch : {:<4} - train_loss : {:<8.3} - val_loss : {:<8.3} - acc: {:<5.3}'.format(epoch, trn_loss, cur_cost, cur_Acc) )


In [35]:
with tf.Session(graph=train_graph) as sess:
    sess.run(tf.global_variables_initializer())
    
    train_writer1=tf.summary.FileWriter('summary/train_loss', train_graph)
    train_writer2=tf.summary.FileWriter('summary/valid_loss', train_graph)
    train_writer3=tf.summary.FileWriter('summary/', train_graph)

    merge=tf.summary.merge_all()
    counter=0

    for e in range(epochs):
        for values in get_batches(features, labels, batch_size):
            train_feat=values[0]
            train_lab=values[1]
            counter+=1
            summ, loss_, _ = sess.run([merge, loss, optm], feed_dict={x:train_feat, y:train_lab, keep_prob:0.6})# runs optimizer on input features
            train_writer1.add_summary(summ, counter)
        if (e%display_step==0):
            epoch_stats(sess, e, train_feat, train_lab, valid_feat, valid_labels, loss_, merge, train_writer2,train_writer3,counter)
            
    model_acc=sess.run(acc, feed_dict={x:test_feat, y:test_lab, keep_prob:0.6})
    print(model_acc)
    saver.save(sess, save_file)

epoch : 0    - train_loss : 0.992    - val_loss : 0.975    - acc: 0.532
epoch : 10   - train_loss : 0.783    - val_loss : 0.765    - acc: 0.742
epoch : 20   - train_loss : 0.727    - val_loss : 0.717    - acc: 0.79 
epoch : 30   - train_loss : 0.69     - val_loss : 0.689    - acc: 0.802
epoch : 40   - train_loss : 0.671    - val_loss : 0.672    - acc: 0.818
epoch : 50   - train_loss : 0.653    - val_loss : 0.648    - acc: 0.835
epoch : 60   - train_loss : 0.662    - val_loss : 0.654    - acc: 0.84 
epoch : 70   - train_loss : 0.644    - val_loss : 0.643    - acc: 0.843
epoch : 80   - train_loss : 0.629    - val_loss : 0.637    - acc: 0.845
epoch : 90   - train_loss : 0.659    - val_loss : 0.659    - acc: 0.858
epoch : 100  - train_loss : 0.628    - val_loss : 0.62     - acc: 0.858
epoch : 110  - train_loss : 0.616    - val_loss : 0.612    - acc: 0.856
epoch : 120  - train_loss : 0.62     - val_loss : 0.616    - acc: 0.874
epoch : 130  - train_loss : 0.609    - val_loss : 0.607    - acc

In [None]:
"87/80 keep prob- 0.6 for train, 0.7 for valid"
"86/80 log10"

In [5]:
#loads sample test cases

testcases=get_files('s_93.txt')
testcases=[np.array(i) for i in testcases]
padded=pad(testcases)
inputs=np.array([2.*(a - np.min(a))/np.ptp(a)-1 for a in padded])
inputs_=np.array([arr[:, :, np.newaxis] for arr in inputs])


NameError: name 'patt' is not defined

In [3]:
with tf.Session(graph=train_graph) as sess:
    loader = tf.train.import_meta_graph('./training_logs11.ckpt.meta') #restore trained weights
    loader.restore(sess, save_file) 
    predict=[]
    predict.append(sess.run( tf.argmax(logits, 1), feed_dict={x:inputs_, keep_prob:0.7}))#sample test cases are 
                                                                            #given here
    
print(predict)

NameError: name 'train_graph' is not defined