In [1]:
import csv
import numpy as np
import keras.utils.np_utils as kutils
from tensorflow.keras import backend

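# Input format (per the original note): label, proteinName, position, sites, shortsequence.
# NOTE: getMatrixInput reads only the first three columns, as proteinName, position, sequence.
# The input must be a .csv file.
# positive_position_file_name is the path to that CSV file.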


def getMatrixInput(positive_position_file_name,sites, window_size=51, empty_aa = '*'):
    """Build one-hot encoded sequence windows centred on candidate sites.

    Each non-blank CSV row is read as ``proteinName, position, sequence``
    (columns 0, 1 and 2; any further columns are ignored), where ``position``
    is the 1-based index of the candidate residue inside ``sequence``. Rows
    whose residue at ``position`` is not in ``sites`` are skipped.

    :param positive_position_file_name: path of the CSV file to read
    :param sites: container of accepted centre residues, e.g. 'Y' or ('S', 'T')
    :param window_size: total window length; ``(window_size - 1) // 2``
                        residues are taken on each side of the centre
    :param empty_aa: character used to pad windows that run past either end
                     of the sequence; it is encoded like '*' (index 20)
    :return: ``(Matr, targetY, prot, pos)`` where
             ``Matr`` is a float array of shape (n_sites, window_size, 21),
             ``targetY`` is a float32 array of shape (n_sites, 2) with labels
             that do not depend on the file contents (first five rows are
             class 0, all later rows class 1),
             ``prot`` is the list of protein names and ``pos`` the list of
             positions (kept as the strings read from the file)
    :raises IndexError: if a position lies outside its sequence
    :raises KeyError: if a window contains a residue with no one-hot index
    """
    ONE_HOT_SIZE = 21
    half_len = (window_size - 1) // 2

    # Residue -> one-hot column. '*' (index 20) is the padding symbol.
    letterDict = {aa: idx for idx, aa in enumerate('ACDEFGHIKLMNPQRSTVWY*')}
    # A custom padding character must be encodable too; setdefault keeps the
    # existing index if empty_aa happens to be a regular residue letter.
    letterDict.setdefault(empty_aa, letterDict['*'])

    prot = []  # protein name of every kept site
    pos = []  # position (as read from the file) of every kept site
    short_seqs = []  # padded window around every kept site

    with open(positive_position_file_name, 'r', newline='') as rf:
        for row in csv.reader(rf):
            if not row:
                # csv.reader yields [] for blank lines (e.g. a trailing newline).
                continue
            sseq = row[2]
            position = int(row[1])
            # Positions are 1-based. Without this check 0 or a negative value
            # would silently wrap around to the end of the sequence.
            if not 1 <= position <= len(sseq):
                raise IndexError(
                    'position {} is outside the sequence of {} (length {})'.format(
                        position, row[0], len(sseq)))
            center = sseq[position - 1]
            if center not in sites:
                continue

            prot.append(row[0])
            pos.append(row[1])

            # Up to half_len residues on each side; slicing clamps at the ends.
            left_seq = sseq[max(position - 1 - half_len, 0):position - 1]
            right_seq = sseq[position:position + half_len]
            # Pad whatever is missing so every window has the same length.
            left_seq = empty_aa * (half_len - len(left_seq)) + left_seq
            right_seq = right_seq + empty_aa * (half_len - len(right_seq))
            short_seqs.append(left_seq + center + right_seq)

    # One label row per window, always two classes wide. The first five
    # windows get class 0 and the rest class 1, as before, but the label count
    # now follows the number of windows even when there are fewer than five.
    all_label = np.ones(len(short_seqs), dtype=int)
    all_label[:5] = 0
    targetY = np.eye(2, dtype='float32')[all_label]

    # For an even window_size the window holds window_size - 1 residues, so the
    # last row of each sample stays all-zero.
    Matr = np.zeros((len(short_seqs), window_size, ONE_HOT_SIZE))
    for sample_no, seq in enumerate(short_seqs):
        for aa_no, aa in enumerate(seq):
            try:
                index = letterDict[aa]
            except KeyError:
                raise KeyError(
                    'unsupported residue {!r} in window for {} position {}'.format(
                        aa, prot[sample_no], pos[sample_no])) from None
            Matr[sample_no, aa_no, index] = 1

    return Matr, targetY, prot, pos

Using TensorFlow backend.


In [2]:
import functools
import itertools
import os
import random
import pandas as pd
import matplotlib
matplotlib.use('Agg')
import csv
import matplotlib.pyplot as plt
import numpy as np
from sklearn import metrics
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, KFold, cross_val_score

from keras.layers import Dense, Activation, Flatten, Dropout, Reshape
from keras.layers import Conv1D,Conv2D, MaxPooling2D
from keras.models import Sequential,Model
from keras.utils.np_utils import to_categorical
from keras import optimizers
from keras.optimizers import Adam,SGD
from keras.layers.normalization import BatchNormalization
from keras.regularizers import l2
import copy

def _resolve_model_files(sites, predictFrame, hierarchy, kinase):
    # Map the prediction settings to (output file prefix, weight file path).
    if predictFrame == 'general':
        wanted = set(sites)
        if wanted == {'S', 'T'}:
            site_label = 'S,T'
        elif wanted == {'Y'}:
            site_label = 'Y'
        else:
            raise ValueError(
                "general prediction supports sites ('S', 'T') or 'Y', got {!r}".format(sites))
        return ('general_{:s}'.format(site_label),
                './models/model_general_{:s}.h5'.format(site_label))

    if predictFrame == 'kinase':
        if hierarchy is None or kinase is None:
            raise ValueError(
                "predictFrame='kinase' requires both hierarchy and kinase")
        return ('kinase_{:s}_{:s}'.format(hierarchy, kinase),
                './models/model_{:s}_{:s}.h5'.format(hierarchy, kinase))

    raise ValueError(
        "predictFrame must be 'general' or 'kinase', got {!r}".format(predictFrame))


def predict_for_deepphos(train_file_name,sites,predictFrame = 'general',
                         hierarchy=None, kinase=None):
    '''
    Score candidate sites with a saved model and write the scores to a file.

    :param train_file_name: path of the CSV file to predict on; it is passed
                            to getMatrixInput, which does the parsing
    :param sites: the residues to predict: ('S', 'T') or 'Y'
    :param predictFrame: 'general' or 'kinase'
    :param hierarchy: required when predictFrame is 'kinase': the hierarchy
            level (group, family, subfamily, kinase) used to choose the model
    :param kinase: required when predictFrame is 'kinase': the kinase name
    :return: None. A tab-separated file named
             ``<prefix>prediction_phosphorylation.txt`` is written to the
             current directory, one row per site: id, position and column 1
             of the model's prediction output.
    :raises ValueError: if the sites / predictFrame / hierarchy / kinase
             combination does not select a model
    '''
    # Validate the settings first so a bad argument fails before the data is
    # encoded and the network is built.
    outputfile, model_weight = _resolve_model_files(sites, predictFrame, hierarchy, kinase)

    # The model takes three views of every site, one per window size.
    win1 = 51
    win2 = 33
    win3 = 15
    [X_test1,y_test,ids,position] = getMatrixInput(train_file_name, sites, win1)
    [X_test2,_,_,_] = getMatrixInput(train_file_name, sites, win2)
    [X_test3,_,_,_]  = getMatrixInput(train_file_name, sites, win3)

    from methods.model_n import model_net
    # nb_epoch = 0 is passed to model_net before the saved weights are
    # loaded below - presumably so it only builds the network; confirm in
    # methods.model_n.
    model = model_net(X_test1, X_test2, X_test3, y_test,nb_epoch = 0)

    # load model weight
    model.load_weights(model_weight)
    predictions_t = model.predict([X_test1, X_test2, X_test3])
    results_ST = np.column_stack((ids, position,predictions_t[:, 1]))

    result = pd.DataFrame(results_ST)
    result.to_csv(outputfile + "prediction_phosphorylation.txt", index=False, header=None, sep='\t',
                  quoting=csv.QUOTE_NONNUMERIC)
    
# if __name__ == '__main__':
#     train_file_name = 'test data.csv'
#     site = 'S','T'
#     predict_for_deepphos(train_file_name, site, predictFrame='kinase',
#                          hierarchy='group', kinase='AGC')






In [3]:
# Run the kinase-specific prediction for 'Y' sites (hierarchy 'family', kinase 'Src').
predict_for_deepphos(
    '/home/k4thryn/Repos/EMBER_revised/data_dev/test_deepphos.csv',
    'Y',
    predictFrame='kinase',
    hierarchy='family',
    kinase='Src',
)

AttributeError: module 'tensorflow' has no attribute 'get_default_graph'

In [None]:
# Load the test CSV; read_csv treats the first row as column names by default.
data = pd.read_csv("test_deepphos.csv")
# Preview the first 5 rows. Only a cell's last expression is rendered
# automatically, so the preview is displayed explicitly.
display(data.head())
# The column dtypes are shown as the cell's output.
data.dtypes