# Bioinformatics - Protein subcellular location

In [58]:
import os
from IPython.display import display, HTML
import matplotlib.pyplot as plt
import xgboost
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

from  sklearn import preprocessing
from collections import defaultdict
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression,RandomizedLogisticRegression
from sklearn.metrics import f1_score,confusion_matrix, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
import numpy as np
import pandas as pd
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from collections import Counter
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.optimizers import SGD
from Bio import SeqIO
import re
import tensorflow as tf







##  Load data and feature extraction

In [2]:
def preprocess_pipeline(*files):
    #p = re.compile("(\w+\|\w+)\|(\w+\s[0-9a-zA-Z_\s\(\)\-\/,\.\>\:\'\[\]\+]+)OS=([0-9a-zA-Z_\s\(\)\-\/,\.\>\:\']+)GN=([0-9a-zA-Z_\s\(\)\-\/,\.\>\:\']+)PE=([0-9])+\s[SV=]+([0-9])|(\w+\|\w+)\|(\w+\s[0-9a-zA-Z_\s\(\)\-\/,\.\>\:\'\[\]]+)OS=([0-9a-zA-Z_\s\(\)\-\/,\.\>\:\']+)PE=([0-9])+\s[SV=]+([0-9])")
    p=re.compile("\|\w+\s(.+)OS=([0-9a-zA-Z_\s\(\)\-\/,\.\>\:\']+)(?:\sGN|\sPE)")
    data_features = []
    data_labels = []
    sequence = ''
    list_meta=[]
    for file in files:
        label = os.path.splitext(file)[0]
        f = open(file, "r")
        dict_meta = defaultdict(float)
        first_line = f.readline()
        meta_info = p.search(first_line)
        try:
            dict_meta["organism"] = meta_info.group(2)
            dict_meta["protein"] = meta_info.group(1)
            dict_meta["class"] = label
        except:
            print(first_line)
        list_meta.append(dict_meta)
        for line in f:
            line = line.rstrip('\n')
            if line[0] != '>':
                sequence += line
            else:
                dict_meta["sequence"] = sequence
                dict_meta = defaultdict(float)
                meta_info = p.search(line)
                try:
                    dict_meta["organism"] = meta_info.group(2)
                    dict_meta["protein"] = meta_info.group(1)
                    dict_meta["class"] = label
                except:
                    print(line)
                list_meta.append(dict_meta)
                data_features.append(sequence)
                data_labels.append(label)
                sequence = ''
        #Last input
        list_meta[-1]["sequence"] = sequence
        data_features.append(sequence)
        data_labels.append(label)
        sequence = ''



    return data_features, data_labels,list_meta

dic_properties = {
    'small' : ['A','G','C','S','P','N','C','T','D'],
    'tiny' : ['A','G','C','S'],
    'polar' : ['K','H','R','D','E','Q','N','S','C','T','Y','W'],
    'charged' : ['K','H','R','D','E'],
    'positive' : ['K','H','R'],
    'negative' :  ['D','E'],
    'hidrophobic' : ['F','Y','W','H','I','L','V','A','G','C','M','K','T'],
    'aromatic' : ['F','Y','W','H'],
    'aliphatic' : ['I','L','V']
    
}

def feat_extract(sequences):
    list_dict_feat = []
    for sequence in sequences:
        
        protein = ProteinAnalysis(sequence)
        sequence_feat = defaultdict(float)
        sequence_len = len(sequence)

        sequence_feat["sequence_length"] = sequence_len        
        sequence_feat["aromaticty"] = protein.aromaticity()
        sequence_feat["isoeletric_point"] = protein.isoelectric_point()
        #sequence_feat["flexibility"] = protein.flexibility()
        if ('X' not in sequence) and ('O' not in sequence) and ('U' not in sequence) and ('B' not in sequence):
            sequence_feat["molecular_weight"] = protein.molecular_weight()
        for letter in sequence:
            sequence_feat["relative_fre_{}".format(letter)] += 1/sequence_len
            for property in dic_properties:
                if letter in dic_properties[property]:
                    sequence_feat['freq_{}'.format(property)] += 1
        for letter in sequence[0:50]:    
            sequence_feat["relative_fre_start{}".format(letter)] += 1/50
        for letter in sequence[-51:-1]:    
            sequence_feat["relative_fre_end{}".format(letter)] += 1/50
        list_dict_feat.append(sequence_feat)
    return list_dict_feat

label_encoder = preprocessing.LabelBinarizer()
vectorizer = DictVectorizer(sparse=False)

## Linear Models

In [3]:
def train(x,y):
    
    labels_enc = label_encoder.fit_transform(y)
    features_enc = vectorizer.fit_transform(feat_extract(x))
    
    
    #model = RandomForestClassifier(class_weight='balanced',n_estimators=15)
    
    model = xgboost.XGBClassifier(
                 learning_rate =0.1,
                 n_estimators=1000,
                 max_depth=5,
                 min_child_weight=1,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread=4,
 scale_pos_weight=1,
 seed=27)
    
    #model = SVC(class_weight='balanced', probability=True)
    #model = LogisticRegression(class_weight='balanced')
    #model = RandomizedLogisticRegression()
    model.fit(features_enc, labels_enc)
    
    return model

def validate(x,model):
    
    
    features = vectorizer.transform(feat_extract(x))
    predicts = model.predict_proba(features)
    predicts_label = np.argmax(predicts,1)
    labels_predicted = label_encoder.inverse_transform(predicts_label)
    label_and_confidence = list(zip(labels_predicted,np.amax(predicts,1)))

    return labels_predicted#,np.amax(predicts,1)

## Neural Network

In [54]:
def create_model(ip_dim=80):
    model = Sequential()
    model.add(Dense(1000,input_dim=ip_dim,activation="tanh",init='uniform'))
    model.add(Dense(4,activation="softmax",init='uniform'))
    model.compile(loss='categorical_crossentropy', optimizer=SGD(lr=0.00001, momentum=0.8, decay=0.0, nesterov=False), metrics=['accuracy'])
    return model
    
def train_nn(x,y):
    labels_enc = label_encoder.fit_transform(y)
    features_enc = vectorizer.fit_transform(feat_extract(x))
    print(labels_enc)
    print(features_enc.shape)
    #features_enc = np.array([[0.3,0.1,0.9],[0.45,0.7,0.1],[0.7,0.7,0.1],[0.3,0.9,0.1], [1,0,0],[0.11,4,2],[0,0,5]])
    #labels_enc = np.array([[0,0,1],[0,1,0],[1,0,0],[0,1,0], [1,0,0],[0,1,0],[0,0,1]])
    model = create_model(features_enc.shape[1])
    model.fit(features_enc, labels_enc, nb_epoch=1000, batch_size=1,verbose=2)
    return model

def validate_nn(x,y,model):
    labels_enc = label_encoder.fit_transform(y)
    features_enc = vectorizer.fit_transform(feat_extract(x))
    loss_and_metrics = model.evaluate(features_enc,labels_enc, batch_size=32)
    return loss_and_metrics


In [55]:
#data_sequence, data_labels, meta_info = preprocess_pipeline('cyto.fasta', 'mito.fasta','nucleus.fasta','secreted.fasta')
#train_x, val_x, train_y, val_y = train_test_split(data_sequence,data_labels,test_size=0.998,random_state=3)

#rf = train(train_x,train_y)

#pred_y = validate(val_x,rf)
nn = train_nn(train_x,train_y)
#print(validate_nn(val_x,val_y,nn))
#print(meta_info)

#df = pd.DataFrame(meta_info)

#print(df)
#with open('ola.csv','w') as f:
#    df.to_csv(f)
#df.groupby("organism").count()

[[1 0 0 0]
 [0 0 0 1]
 [1 0 0 0]
 [1 0 0 0]
 [0 1 0 0]
 [0 1 0 0]
 [0 0 1 0]
 [1 0 0 0]
 [0 0 1 0]
 [0 0 0 1]
 [1 0 0 0]
 [0 0 1 0]
 [1 0 0 0]
 [0 0 0 1]
 [0 0 1 0]
 [0 0 0 1]
 [1 0 0 0]
 [0 0 1 0]]
(18, 73)
Epoch 1/1000
0s - loss: 1.4103 - acc: 0.0556
Epoch 2/1000
0s - loss: 1.3690 - acc: 0.3333
Epoch 3/1000
0s - loss: 1.3616 - acc: 0.2222
Epoch 4/1000
0s - loss: 1.3503 - acc: 0.2222
Epoch 5/1000
0s - loss: 1.3417 - acc: 0.3333
Epoch 6/1000
0s - loss: 1.3474 - acc: 0.3889
Epoch 7/1000
0s - loss: 1.3291 - acc: 0.3889
Epoch 8/1000
0s - loss: 1.3152 - acc: 0.3889
Epoch 9/1000
0s - loss: 1.3160 - acc: 0.3889
Epoch 10/1000
0s - loss: 1.3145 - acc: 0.3889
Epoch 11/1000
0s - loss: 1.3116 - acc: 0.3889
Epoch 12/1000
0s - loss: 1.3171 - acc: 0.3889
Epoch 13/1000
0s - loss: 1.3207 - acc: 0.3889
Epoch 14/1000
0s - loss: 1.3103 - acc: 0.3889
Epoch 15/1000
0s - loss: 1.3101 - acc: 0.3889
Epoch 16/1000
0s - loss: 1.3256 - acc: 0.3889
Epoch 17/1000
0s - loss: 1.3148 - acc: 0.3889
Epoch 18/1000
0s - 

In [93]:
n_hidden = 10
lambda_l2 = 0
shp = 73
### MODEL ###
tf.reset_default_graph()
## PLACEHOLDERS
features_tf = tf.placeholder(tf.float32, [None,shp], "feat")       
label_tf = tf.placeholder(tf.int32, [None], "label")             

batch_size = tf.shape(features_tf)[0]


### WEIGHTS AND BIASES ######

weights = {
    'h1': tf.get_variable(name='wh1',shape=[shp, n_hidden]),
    #'h2': tf.get_variable(name='wh2',shape=[n_hidden, n_hidden],initializer=tf.contrib.layers.xavier_initializer()),
    'out': tf.get_variable(name='whout',shape=[n_hidden, 4])
}

biases ={
    'h1': tf.get_variable(name='bh1',shape=[1,n_hidden],initializer=tf.contrib.layers.xavier_initializer()),
    #'h2': tf.get_variable(name='bh2',shape=[1,n_hidden],initializer=tf.contrib.layers.xavier_initializer()),
    'out': tf.get_variable(name='bout',shape=[1,4],initializer=tf.contrib.layers.xavier_initializer())
}




###### Layers ######
h1 = tf.nn.relu(tf.add(tf.matmul(features_tf,weights['h1']),biases['h1']))
#h2 = tf.nn.relu(tf.add(tf.matmul(h1,weights['h2']),biases['h2']))

logits_flat = tf.add(tf.matmul(h1,weights['out']),biases['out'])   # [batch_size x 5*target_size]


# loss 
loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits_flat, label_tf))
                    

# prediction function
#softmaxes = [tf.nn.softmax(tensor) for tensor in logits_flat]
#softmaxed_logits = tf.pack(softmaxes, axis=1)
#predict = tf.arg_max(softmaxed_logits, 2)

opt_op = tf.train.AdamOptimizer(2).minimize(loss)
BATCH_SIZE = 1
import random
import time
with tf.Session() as sess:
    sess.run(tf.initialize_all_variables())
    

    for epoch in range(10):
        print('----- Epoch', epoch, '-----')
        total_loss = 0
        t = time.time()
        n=18
        for i in range(n // BATCH_SIZE):
            #labels_enc = label_encoder.fit_transform(train_y)
            labels_enc = preprocessing.LabelEncoder().fit(train_y).transform(train_y)
            features_enc = vectorizer.fit_transform(feat_extract(train_x))
            feed_dict= {features_tf:features_enc,label_tf:labels_enc}
            wei,lf,current_loss = sess.run([weights['h1'],logits_flat,loss], feed_dict=feed_dict)
            total_loss += current_loss
            
        #print(wei)
        print(' Train loss:', total_loss / n)

Instructions for updating:
Use `tf.global_variables_initializer` instead.
----- Epoch 0 -----
 Train loss: 11340.0195313
----- Epoch 1 -----
 Train loss: 11340.0195313
----- Epoch 2 -----
 Train loss: 11340.0195313
----- Epoch 3 -----
 Train loss: 11340.0195313
----- Epoch 4 -----
 Train loss: 11340.0195313
----- Epoch 5 -----
 Train loss: 11340.0195313
----- Epoch 6 -----
 Train loss: 11340.0195313
----- Epoch 7 -----
 Train loss: 11340.0195313
----- Epoch 8 -----
 Train loss: 11340.0195313
----- Epoch 9 -----
 Train loss: 11340.0195313


In [102]:
# Parameters
tf.reset_default_graph()

learning_rate = 0.001
training_epochs = 50
batch_size = 1
display_step = 1

# Network Parameters
n_hidden_1 = 25 # 1st layer number of features
n_hidden_2 = 25 # 2nd layer number of features
n_input = 73 # MNIST data input (img shape: 28*28)
n_classes = 4 # MNIST total classes (0-9 digits)

# tf Graph input
x = tf.placeholder("float", [None, n_input])
y = tf.placeholder("float", [None, n_classes])


# Create model
def multilayer_perceptron(x, weights, biases):
    # Hidden layer with RELU activation
    layer_1 = tf.add(tf.matmul(x, weights['h1']), biases['b1'])
    layer_1 = tf.nn.relu(layer_1)
    # Hidden layer with RELU activation
    layer_2 = tf.add(tf.matmul(layer_1, weights['h2']), biases['b2'])
    layer_2 = tf.nn.relu(layer_2)
    # Output layer with linear activation
    out_layer = tf.matmul(layer_2, weights['out']) + biases['out']
    return out_layer

# Store layers weight & bias
weights = {
    'h1': tf.Variable(tf.random_normal([n_input, n_hidden_1])),
    'h2': tf.Variable(tf.random_normal([n_hidden_1, n_hidden_2])),
    'out': tf.Variable(tf.random_normal([n_hidden_2, n_classes]))
}
biases = {
    'b1': tf.Variable(tf.random_normal([n_hidden_1])),
    'b2': tf.Variable(tf.random_normal([n_hidden_2])),
    'out': tf.Variable(tf.random_normal([n_classes]))
}

# Construct model
pred = multilayer_perceptron(x, weights, biases)
soft_m = tf.nn.softmax(pred)
# Define loss and optimizer
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)

# Initializing the variables
init = tf.global_variables_initializer()

# Launch the graph
with tf.Session() as sess:
    sess.run(init)

    # Training cycle
    for epoch in range(training_epochs):
        avg_cost = 0.
        
        # Loop over all batches
        for i in range(18//batch_size):
            batch_y =label_encoder.fit_transform(train_y)
            batch_x =vectorizer.fit_transform(feat_extract(train_x))
            # Run optimization op (backprop) and cost op (to get loss value)
            _, c,predi = sess.run([optimizer, cost,soft_m], feed_dict={x: batch_x,
                                                          y: batch_y})
            # Compute average loss
            avg_cost += c / 18
        # Display logs per epoch step
        if epoch % display_step == 0:
            print("Epoch:", '%04d' % (epoch+1), "cost=", \
                "{:.9f}".format(avg_cost))
            print(predi)
    print("Optimization Finished!")

Epoch: 0001 cost= 151785.533420139
[[ 1.  0.  0.  0.]
 [ 1.  0.  0.  0.]
 [ 1.  0.  0.  0.]
 [ 1.  0.  0.  0.]
 [ 1.  0.  0.  0.]
 [ 1.  0.  0.  0.]
 [ 1.  0.  0.  0.]
 [ 1.  0.  0.  0.]
 [ 1.  0.  0.  0.]
 [ 1.  0.  0.  0.]
 [ 1.  0.  0.  0.]
 [ 1.  0.  0.  0.]
 [ 1.  0.  0.  0.]
 [ 1.  0.  0.  0.]
 [ 1.  0.  0.  0.]
 [ 1.  0.  0.  0.]
 [ 1.  0.  0.  0.]
 [ 1.  0.  0.  0.]]
Epoch: 0002 cost= 47390.713541667
[[ 0.  0.  1.  0.]
 [ 0.  0.  1.  0.]
 [ 0.  0.  1.  0.]
 [ 0.  0.  1.  0.]
 [ 0.  0.  1.  0.]
 [ 0.  0.  1.  0.]
 [ 0.  0.  1.  0.]
 [ 0.  0.  1.  0.]
 [ 0.  0.  1.  0.]
 [ 0.  0.  1.  0.]
 [ 0.  0.  1.  0.]
 [ 0.  0.  1.  0.]
 [ 0.  0.  1.  0.]
 [ 0.  0.  1.  0.]
 [ 0.  0.  1.  0.]
 [ 0.  0.  1.  0.]
 [ 0.  0.  1.  0.]
 [ 0.  0.  1.  0.]]
Epoch: 0003 cost= 23158.296440972
[[  0.00000000e+00   0.00000000e+00   1.00000000e+00   0.00000000e+00]
 [  0.00000000e+00   0.00000000e+00   1.00000000e+00   0.00000000e+00]
 [  0.00000000e+00   0.00000000e+00   1.00000000e+00   0.00000000e+00

## Validation

In [None]:
cm = confusion_matrix(val_y,pred_y)
stats =  precision_recall_fscore_support(val_y,pred_y)

stats = pd.DataFrame(data=np.transpose(np.array(stats[0:3])),columns=['precision','recall','f1'])
print(cm)

display(stats)


In [None]:
#ola = feat_extract(val_x)
#ola[23]
count = 0
for i in range(0,len(val_y)):
    if val_y[i] == pred_y[i]:
        count += 1
print(count/len(val_y))
#len(pred_y)

## Test set

In [None]:
def test_blind(file,model):
    f = open(file,'r')
    preds = open('blind_predictions.txt','w')
    sequence = ''
    
    first_line= f.readline()
    first_line = first_line.rstrip('\n')
    preds.write(first_line + ' ')
    
    for line in f.readlines():
        line = line.rstrip('\n')
        if line[0] != '>':
            sequence += line
        else:
            #import pdb;pdb.set_trace()
            feature = vectorizer.transform(feat_extract([sequence]))
            predict = model.predict_proba(feature)
            predict_label = np.argmax(predict,1)
            label_predicted = label_encoder.inverse_transform(predict_label)
            #preds.write(label_predicted[0] + ' \t\t' + str(np.amax(predict,1)[0]) + '\n' + line + ' ')
            preds.write("{0} {1:>8} \n{2} ".format(label_predicted[0],str(np.amax(predict,1)[0]),line))
            sequence = ''
    feature = vectorizer.transform(feat_extract([sequence]))
    predict = model.predict_proba(feature)
    predict_label = np.argmax(predict,1)
    label_predicted = label_encoder.inverse_transform(predict_label)
    #preds.write(label_predicted[0] + ' \t\t' + str(np.amax(predict,1)[0]) + '\n' + line + ' ')
    preds.write("{0} {1}".format(label_predicted[0],str(np.amax(predict,1)[0]),line))
    sequence = ''
    preds.close()
    f.close()
    

# Playground

In [None]:
test_blind('blind.fasta',rf)

In [None]:
data_sequence, data_labels = preprocess_pipeline('cyto.fasta', 'mito.fasta','nucleus.fasta','secreted.fasta')
train_x, val_x, train_y, val_y = train_test_split(data_sequence,data_labels,test_size=0.3,random_state=3)

hist_train = Counter(train_y)
hist_val = Counter(val_y)

print(hist_train)
print(hist_val)


In [None]:
hist = Counter([len(x) for x in data_sequence])
#df = pd.DataFrame(hist,index=[0])
#df = pd.DataFrame.from_dict(hist,orient='index')
transposed = np.array(list(hist.items())).T
x, y = transposed

plt.plot(x,y)
plt.show()


In [None]:
vectorizer.fit_transform(feat_extract(train_x)).shape[0]

In [None]:
with open('cyto.fasta') as fasta_file:  # Will close handle cleanly
    identifiers = []
    lengths = []
    for seq_record in SeqIO.parse(fasta_file, 'fasta'):  # (generator)
        identifiers.append(seq_record.id)
        lengths.append(len(seq_record.seq))

In [None]:
p=re.compile("OS=([0-9a-zA-Z_\s\(\)\-\/,\.\>\:\']+) (?:GN|PE)")#
meta_info = p.search('OS=Penicillium funiculosum PE=1 SV=1')
meta_info.group(1)

In [None]:
import re
(\w+\|\w+)\|(\w+\s[0-90-9a-zA-Z_\s\(\)\-\/]+)OS=([0-90-9a-zA-Z_\s\(\)\-\/]+)GN=([0-90-9a-zA-Z_\s-]+)PE=([0-9])+\s[SV=]+([0-9])

In [None]:
label_encoder.fit_transform(train_y)

In [None]:
train_y[-4:-1]


In [39]:
nn.predict(np.array([[1,0,0]]))

ValueError: Error when checking : expected dense_input_10 to have shape (None, 76) but got array with shape (1, 3)

In [66]:
aa = preprocessing.LabelEncoder().fit(train_y)

In [103]:
train_y

['cyto',
 'secreted',
 'cyto',
 'cyto',
 'mito',
 'mito',
 'nucleus',
 'cyto',
 'nucleus',
 'secreted',
 'cyto',
 'nucleus',
 'cyto',
 'secreted',
 'nucleus',
 'secreted',
 'cyto',
 'nucleus']