In [78]:
import pandas as pd
import numpy as np
import os
import math as mt
import scipy as sc
import matplotlib.pyplot as plt

def onehot(seq_list,alphabet):
    """Flattened one-hot encoding of a collection of sequences.

    Every sequence is right-padded with '-' up to the length of the longest
    sequence. Each position then contributes ``len(alphabet)`` 0/1 flags, laid
    out position-major (all flags of position 0, then position 1, ...). A
    character that does not occur in ``alphabet`` yields an all-zero group.

    Parameters
    ----------
    seq_list : iterable of str
        The sequences to encode (pandas Series, list, tuple, ...).
    alphabet : str
        Ordered symbols; the index of a symbol selects which flag is set.

    Returns
    -------
    list of list of int
        One row per sequence, each of length ``max_len * len(alphabet)``.
        An empty input returns ``[]``.
    """
    sequences = list(seq_list)
    if not sequences:
        # max() on an empty collection would raise; there is nothing to encode.
        return []
    max_len = max(len(seq_str) for seq_str in sequences)
    n_symbols = len(alphabet)
    onehot_seq = []
    for seq_str in sequences:
        padded = seq_str + '-'*(max_len - len(seq_str))
        flags = [0]*(max_len*n_symbols)
        for pos, symbol in enumerate(padded):
            ind = alphabet.find(symbol)
            if ind != -1:
                # Row-major offset of (position, symbol) in the flat vector.
                flags[pos*n_symbols + ind] = 1
        onehot_seq.append(flags)
    return onehot_seq

def ordinal(seq_list,alphabet):
    """Ordinal (integer) encoding of a collection of sequences.

    Every sequence is right-padded with '-' up to the length of the longest
    sequence. Each character is replaced by its 1-based index in ``alphabet``;
    characters that do not occur in ``alphabet`` are encoded as 0.

    Parameters
    ----------
    seq_list : iterable of str
        The sequences to encode (pandas Series, list, tuple, ...).
    alphabet : str
        Ordered symbols defining the codes.

    Returns
    -------
    list of list of int
        One row per sequence, each of length ``max_len``. An empty input
        returns ``[]``.
    """
    sequences = list(seq_list)
    if not sequences:
        # max() on an empty collection would raise; there is nothing to encode.
        return []
    max_len = max(len(seq_str) for seq_str in sequences)
    ordinal_seq = []
    for seq_str in sequences:
        padded = seq_str + '-'*(max_len - len(seq_str))
        # str.find returns -1 for a missing symbol, so +1 maps "unknown" to 0
        # and known symbols to 1..len(alphabet).
        ordinal_seq.append([alphabet.find(symbol) + 1 for symbol in padded])
    return ordinal_seq


def composition(seq_list,alphabet):
    """Relative frequency of each alphabet symbol in one or more sequences.

    Counts are divided by the full sequence length, so characters that are
    not in ``alphabet`` still count towards the denominator.

    Parameters
    ----------
    seq_list : str or iterable of str
        A single sequence, or a collection of sequences (pandas Series,
        list, tuple, ...).
    alphabet : str
        Ordered symbols; entry ``k`` of a result vector is the frequency of
        ``alphabet[k]``.

    Returns
    -------
    list
        For a single string: a flat list of ``len(alphabet)`` floats.
        For a collection: a list with one such vector per sequence.
        An empty sequence gives an all-zero vector instead of dividing by zero.
    """
    single = isinstance(seq_list, str)
    sequences = [seq_list] if single else list(seq_list)
    AAC_list = []
    for seq_str in sequences:
        counts = [0]*len(alphabet)
        for symbol in seq_str:
            ind = alphabet.find(symbol)
            if ind != -1:
                counts[ind] += 1
        N = len(seq_str)
        # Guard the empty sequence: no residues means every frequency is 0.
        AAC_vec = [count/N if N else 0.0 for count in counts]
        AAC_list.append(AAC_vec)
    if single:
        # A lone string is returned as a flat vector, not wrapped in a list.
        return AAC_list[0]
    return AAC_list


def PC_prop(seq_list,alphabet,AAindex_loc,PCprop):
    """Translate each sequence into the values of one property from a CSV table.

    The table at ``AAindex_loc`` is read with pandas; the rows whose
    'Property' column equals ``PCprop`` are selected. For every character of a
    sequence that occurs in ``alphabet``, the values found in the column named
    after that character (within the selected rows) are appended to the
    sequence's vector. Characters outside ``alphabet`` are skipped.

    Parameters
    ----------
    seq_list : pandas.Series of str
        Sequences, accessed positionally through ``.iloc``.
    alphabet : str
        Symbols to translate; each must be a column of the table.
    AAindex_loc : str
        Path of the CSV table.
    PCprop : str
        Value looked up in the 'Property' column.

    Returns
    -------
    list of list
        One vector of property values per sequence.
    """
    aaindex_table = pd.read_csv(AAindex_loc)
    selected_rows = aaindex_table.loc[aaindex_table['Property'] == PCprop]
    encoded = []
    for position in range(len(seq_list)):
        sequence = seq_list.iloc[position]
        values = []
        for residue in sequence:
            idx = alphabet.find(residue)
            if idx == -1:
                # Unknown symbol: contributes nothing to the vector.
                continue
            values.extend(selected_rows[alphabet[idx]].tolist())
        encoded.append(values)
    return encoded

def dipeptide_comp(seq_list,alphabet):
    """Dipeptide (adjacent-pair) composition of a collection of sequences.

    For each sequence, every pair of consecutive characters that are both in
    ``alphabet`` is counted in a ``len(alphabet) x len(alphabet)`` table
    (rows = first symbol, columns = second symbol). Counts are divided by the
    number of adjacent pairs, ``len(sequence) - 1``, and the table is
    flattened row by row.

    Parameters
    ----------
    seq_list : iterable of str
        The sequences to encode (pandas Series, list, tuple, ...).
    alphabet : str
        Ordered symbols defining the rows/columns of the pair table.

    Returns
    -------
    list of numpy.ndarray
        One float vector of length ``len(alphabet)**2`` per sequence.
        Sequences with fewer than two characters contain no pair and give an
        all-zero vector (instead of NaN from a division by zero).
    """
    n_symbols = len(alphabet)
    dipept_list = []
    for seq_str in list(seq_list):
        dipept_comp = np.zeros((n_symbols, n_symbols), dtype=int)   ## rows=first aa, columns=second aa
        for first, second in zip(seq_str, seq_str[1:]):
            ind_first = alphabet.find(first)
            ind_second = alphabet.find(second)
            if ind_first != -1 and ind_second != -1:
                dipept_comp[ind_first, ind_second] += 1
        dipept_quant = len(seq_str) - 1
        if dipept_quant > 0:
            frequencies = dipept_comp/dipept_quant
        else:
            # No adjacent pair exists; avoid dividing by 0 (or by -1).
            frequencies = dipept_comp.astype(float)
        dipept_list.append(np.reshape(frequencies, n_symbols**2))
    return dipept_list

def digitalize(PC_list):
    """Amplitude spectrum of zero-padded numeric vectors.

    Every vector is zero-padded to a common length ``pad_len``, the smallest
    power of two that is >= the longest vector. The FFT magnitude is taken,
    only the first ``pad_len // 2`` bins are kept, and each bin is scaled by
    ``2 / pad_len``.

    Parameters
    ----------
    PC_list : sequence of sequences of numbers
        One numeric vector per sequence. The input is not modified.

    Returns
    -------
    list of list of float
        One spectrum of length ``pad_len // 2`` per input vector. An empty
        input returns ``[]``.

    Raises
    ------
    ValueError
        If every vector is empty, since no padding length can be derived.
    """
    # Work on copies so the caller's vectors are not padded in place.
    vectors = [list(vec) for vec in PC_list]
    if not vectors:
        return []
    max_len = max(len(vec) for vec in vectors)
    if max_len == 0:
        raise ValueError('digitalize() needs at least one non-empty vector')
    # Exact integer form of 2**ceil(log2(max_len)); avoids float rounding in log().
    pad_len = 1 << (max_len - 1).bit_length()
    nyq_lim = pad_len // 2
    digi_list = []
    for vec in vectors:
        padded = vec + [0]*(pad_len - len(vec))
        # np.fft.fft is used because scipy.fft is a module (not a callable)
        # in current SciPy releases.
        spectrum = np.abs(np.fft.fft(padded))[:nyq_lim]
        digi_list.append((2*spectrum/pad_len).tolist())
    return digi_list
    
    

class encoding:
    """A sequence dataset loaded from a CSV file, with several feature encoders.

    The CSV is expected to have a 'sequence' column, a label column in second
    position (named 'class' or 'response' for ``fasta_conv``), and optionally
    an 'is_outlier' column.

    Attributes set by ``__init__`` (only when the path ends in '.csv'):
        datasets    -- the loaded pandas DataFrame
        subsetnames -- file name without the '.csv' extension
        setname     -- first space-separated word of ``subsetnames``

    Every ``*_encoding`` method takes ``exportpath``: when it is '' the encoded
    DataFrame is stored on the instance and returned; otherwise a CSV is
    written to ``exportpath + <prefix> + subsetnames + '.csv'`` and ``None`` is
    returned. ``exportpath`` is used as a plain string prefix, so a directory
    must end with a path separator.
    """

    # Symbols recognised by the encoders; order defines column/code order.
    alphabet = 'ARNDCQEGHILKMFPSTWYV'

    def __init__(self,dataset_path):
        """Load ``dataset_path`` if it is a .csv file; otherwise only print a notice."""
        if dataset_path.endswith('.csv'):
            self.datasets = pd.read_csv(dataset_path)
            self.subsetnames = os.path.basename(dataset_path)[0:-4]
            # First word of the file name (indexing [0] first would only
            # ever yield the first character of the string).
            self.setname = self.subsetnames.split(' ', 1)[0]
        else:
            # NOTE: no attribute is set in this case, so the other methods
            # will fail on such an instance.
            print('ARCHIVO NO APLICABLE')

    def remove_outliers(self):
        """Drop rows whose 'is_outlier' value equals 1 and renumber the index.

        Does nothing when the dataset has no 'is_outlier' column.
        """
        if 'is_outlier' in self.datasets.columns:
            keep = self.datasets['is_outlier'] != 1
            self.datasets = self.datasets[keep].reset_index(drop=True)

    def _to_frame(self, encoded, columns=None):
        """Build the feature DataFrame, with the label column inserted first.

        ``encoded`` is a rectangular list of per-sequence feature vectors.
        When ``columns`` is None, features are named 'p0', 'p1', ...
        """
        datacells = np.array(encoded, dtype=float)
        if columns is None:
            columns = ['p'+str(k) for k in range(datacells.shape[1])]
        df = pd.DataFrame(datacells, columns=columns)
        label = self.datasets.columns[1]
        # Attach labels by position: the encoders read sequences positionally,
        # so labels must not be re-aligned on the dataset's index.
        df.insert(0, label, self.datasets[label].values)
        return df

    def _export_or_store(self, df, exportpath, file_prefix, attr_name):
        """Write ``df`` to CSV, or store it as ``self.<attr_name>`` and return it."""
        if exportpath != '':
            outfilename = exportpath + file_prefix + self.subsetnames + '.csv'
            # to_csv opens and closes the file itself; no extra handle needed.
            df.to_csv(outfilename, index = False, header = True, sep = ',', encoding = 'utf-8')
            return None
        setattr(self, attr_name, df)
        return df

    def one_hot_encoding(self,exportpath):
        """One-hot encode the sequences (file prefix 'One_hot ', attribute ``onehot_enc``)."""
        onehot_list = onehot(self.datasets['sequence'], self.alphabet)
        df = self._to_frame(onehot_list)
        return self._export_or_store(df, exportpath, 'One_hot ', 'onehot_enc')

    def ordinal_encoding(self,exportpath):
        """Ordinal encode the sequences (file prefix 'Ordinal ', attribute ``ordinal_enc``)."""
        ordinal_list = ordinal(self.datasets['sequence'], self.alphabet)
        df = self._to_frame(ordinal_list)
        return self._export_or_store(df, exportpath, 'Ordinal ', 'ordinal_enc')

    def AAC_encoding(self,exportpath):
        """Symbol-composition encoding, one column per alphabet symbol
        (file prefix 'AAC ', attribute ``AAC_enc``)."""
        AAC_list = composition(self.datasets['sequence'], self.alphabet)
        df = self._to_frame(AAC_list, columns=list(self.alphabet))
        return self._export_or_store(df, exportpath, 'AAC ', 'AAC_enc')

    def dipept_encoding(self,exportpath):
        """Dipeptide-composition encoding, one column per ordered symbol pair
        (file prefix 'dipeptide_comp ', attribute ``dipept_enc``)."""
        dipept_list = dipeptide_comp(self.datasets['sequence'], self.alphabet)
        # Same row-major order as the flattened pair table: first symbol varies slowest.
        columns = [first + second for first in self.alphabet for second in self.alphabet]
        df = self._to_frame(dipept_list, columns=columns)
        return self._export_or_store(df, exportpath, 'dipeptide_comp ', 'dipept_enc')

    def digit_encoding(self,exportpath,AAindex_loc,PCprop):
        """Property-value translation followed by ``digitalize``.

        ``AAindex_loc`` is the CSV table path and ``PCprop`` the property code,
        both passed to ``PC_prop``. File prefix is
        'digitalization <PCprop> '; the in-memory result is stored as
        ``digit_enc``.
        """
        PC_list = PC_prop(self.datasets['sequence'], self.alphabet, AAindex_loc, PCprop)
        digit_list = digitalize(PC_list)
        df = self._to_frame(digit_list)
        return self._export_or_store(df, exportpath, 'digitalization ' + PCprop + ' ', 'digit_enc')

    @staticmethod
    def _write_fasta(outfilename, sequences):
        """Write each sequence preceded by a bare '>' header line."""
        with open(outfilename, 'w') as outfile:
            for sequence in sequences:
                outfile.write('>' + '\n')
                outfile.write("".join(sequence) + '\n')

    def fasta_conv(self,exportpath):
        """Export the sequences as FASTA files with empty ('>') headers.

        If the second column is named 'class', one file per distinct class is
        written: ``exportpath + 'fasta <class> <subsetnames>.fasta'``.
        If it is named 'response', a single file
        ``exportpath + 'fasta <subsetnames>.fasta'`` is written.
        Any other second-column name writes nothing.
        """
        df = self.datasets
        prob_type = df.columns[1]
        if prob_type == 'class':
            for subset in set(df['class']):
                members = df.loc[df['class'] == subset, 'sequence']
                outfilename = exportpath + 'fasta ' + str(subset) + ' ' + self.subsetnames + '.fasta'
                self._write_fasta(outfilename, members)
        if prob_type == 'response':
            outfilename = exportpath + 'fasta ' + self.subsetnames + '.fasta'
            self._write_fasta(outfilename, df['sequence'])

# This script can be run on its own using the functions and methods defined above.
# Input: the file to encode; it must be a .csv with 3 columns: sequence, class or response, is_outlier.
# Output: the file encoded with the chosen method, in .csv format. It contains class or response in the first column.

## Usage instructions:
# 1. Declare the file to encode
# 2. Declare the folder where the encoded file should be saved
# 3. For digitalization, declare the file that holds the AAindex, as a .csv.
# 4. Declare the code of the physicochemical property to use.
# 5. Define the dataset by creating an instance of the "encoding" class
# 6. Use the "remove_outliers()" method to drop the outliers
# 7. Use the different encoding methods

# Example (adjust directories as needed):

inputpath = 'C:/Users/Kevin/Desktop/Datasets CSV/VaxinPad.csv'
exportpath = 'C:/Users/Kevin/Desktop/'
AAindex_path = 'C:/Users/Kevin/Desktop/AAindex.csv'
props = ['PRAM900102','PRAM900103','COSI940101','HOPT810101','JOND750101','RADA880106','GRAR740103','FASG760101']

Dataset = encoding(inputpath) #Defines the dataset; the object gets the attributes "datasets", "subsetnames" and "setname"
Dataset.remove_outliers() #Drops the outliers according to the is_outlier column
#Dataset.digit_encoding(exportpath,AAindex_path,PCprop) #Writes a file named like "digitalization ANDN920101 VaxinPad.csv" (PCprop is not defined in this cell; set it first)
#Dataset.one_hot_encoding(exportpath) #Writes a file named "One_hot VaxinPad.csv"
#Dataset.ordinal_encoding(exportpath) #Writes a file named "Ordinal VaxinPad.csv"
#Dataset.AAC_encoding(exportpath) #Writes a file named "AAC VaxinPad.csv"
Dataset.dipept_encoding(exportpath) #Writes a file named "dipeptide_comp VaxinPad.csv"


In [51]:
#for prop in props:
#    print(prop)
#    exportpath = exportloc + prop + '/' 
#    for root,dirs,files in os.walk(inputpath):
#        for file in files:
#            if file.endswith(".csv"):
#                filename = inputpath + '/' + file
#                print(filename)
#                subset = encoding(filename)
#                subset.remove_outliers()
#                subset.digit_encoding(exportpath,AAindex_path,prop)

255


In [12]:
#fastapath = 'C:/Users/Kevin/Desktop/fasta sequences/'
#for root,dirs,files in os.walk(inputpath):
#    for file in files:
#        if file.endswith(".csv"):
#            filename = inputpath + '/' + file
#            print(filename)
#            subset = encoding(filename)
#            subset.remove_outliers()
#            subset.fasta_conv(fastapath)

C:/Users/Kevin/Desktop/Datasets CSV//ACP-DL.csv
C:/Users/Kevin/Desktop/Datasets CSV//AntiTb_primary.csv
C:/Users/Kevin/Desktop/Datasets CSV//AntiTb_secondary.csv
C:/Users/Kevin/Desktop/Datasets CSV//DBP.csv
C:/Users/Kevin/Desktop/Datasets CSV//enantioselectivity.csv
C:/Users/Kevin/Desktop/Datasets CSV//iACP.csv
C:/Users/Kevin/Desktop/Datasets CSV//iAMP-2L_binary.csv
C:/Users/Kevin/Desktop/Datasets CSV//iAMP-2L_multiclass.csv
C:/Users/Kevin/Desktop/Datasets CSV//localization.csv
C:/Users/Kevin/Desktop/Datasets CSV//Pop_ara.csv
C:/Users/Kevin/Desktop/Datasets CSV//Pop_chlamy.csv
C:/Users/Kevin/Desktop/Datasets CSV//Pop_yeast.csv
C:/Users/Kevin/Desktop/Datasets CSV//QSP.csv
C:/Users/Kevin/Desktop/Datasets CSV//RT_hela.csv
C:/Users/Kevin/Desktop/Datasets CSV//RT_misc.csv
C:/Users/Kevin/Desktop/Datasets CSV//RT_yeast.csv
C:/Users/Kevin/Desktop/Datasets CSV//Solub.csv
C:/Users/Kevin/Desktop/Datasets CSV//t50.csv
C:/Users/Kevin/Desktop/Datasets CSV//VaxinPad.csv


In [53]:
# Scratch check: flatten a 2x3 array with reshape(-1), print it, and
# collect the flattened array as the single element of a list.
A = np.array([[1, 2, 3], [4, 5, 6]])
B = list()
print(A.reshape(-1))
B.append(A.reshape(-1))

[1 2 3 4 5 6]
2
