In [1]:
import os
import pandas as pd
import numpy as np
import random
import math
from decimal import Decimal
from sklearn import linear_model

In [2]:
# Project root; every other location used in this notebook is built from it.
path = 'D:/Bioinformatics/Diplom/'
os.chdir(path)

# Workbook that lists the proteins of each data set, one sheet per set.
file = 'Lists.xlsx'
xf = pd.ExcelFile(file)

# Sheet of the active data set (the alternative sheet is 'Set200').
dfNames = xf.parse('Set45')

# Protein names and protein lengths as plain Python lists.
names, protLen = (dfNames[column].tolist() for column in ("Name", "Length"))

# Continue inside the folder of the active set; the per-protein workbooks are
# opened by bare file name from here (the alternative folder is path+"Set/").
os.chdir(path + "Set45/")
# Extension appended to a protein name to obtain its workbook file name.
f = '.xlsx'

# Numeric code of every amino acid: the 20 one-letter symbols map, in this
# order, onto the consecutive integers 10..29.
tableAN = pd.Series(range(10, 30), index=list('ARNDCEQGHILKMFPSTWYV'))

In [3]:
# Load the parameter table used by the energy and the entropy calculations.
file = 'Params_FullSet_new.xlsx'
# Load spreadsheet
xf = pd.ExcelFile(file)
# Load the sheet 'KBES' into a DataFrame: dfParams
dfParams = xf.parse('KBES')    # to be used for energy calc
# calcLogEntropy writes working columns into the frame it receives, so it gets
# an independent copy. A plain assignment would only create a second name for
# dfParams and those working columns would show up in dfParams as well.
dfCalc = dfParams.copy()       # to be used in entropy calc

In [4]:
def factorialLog(number):
    """Return ln(number!) as the running sum ln(1) + ln(2) + ... + ln(number).

    Working with the sum of logarithms avoids building the (huge) factorial
    itself. For number < 1 there is nothing to add and the integer 0 is
    returned.
    """
    total = 0
    for term in map(math.log, range(1, number + 1)):
        total += term
    return total

In [5]:
def logPow(a, b):
    """Return ln(a**b), evaluated as b * ln(a).

    Parameters
    ----------
    a : number
        Base of the power; must be positive unless b is 0.
    b : number
        Exponent.

    Returns
    -------
    float
        b * ln(a), or 0.0 when both a and b are 0.

    Raises
    ------
    ValueError
        From math.log when a is not positive and the 0**0 case does not apply.
    """
    # a**0 is 1 for every base, including 0, so its logarithm is 0. Without
    # this guard math.log(0) raises although the term is perfectly defined.
    if a == 0 and b == 0:
        return 0.0
    x = b*math.log(a)
    return x

In [6]:
def calcLogEntropy(protein, dfCalc):
    """Return the log-entropy term of one protein.

    For every row of dfCalc (one residue type) the protein's residues are
    counted separately for b0e1 == 0 (count n_B) and b0e1 == 1 (count n_E).
    With N the sum of all these counts the returned value is

        sum(n_B * ln(F_RB)) + sum(n_E * ln(F_RE)) + ln(N!)
            - sum(ln(n_B!)) - sum(ln(n_E!))

    which has the form of the logarithm of a multinomial probability.

    Parameters
    ----------
    protein : pandas.DataFrame
        Must provide the columns "Residue" and "b0e1".
    dfCalc : pandas.DataFrame
        Must provide the columns Residue, F_RB and F_RE. As a side effect the
        working columns B, E, PBpowN, PEpowN, Bfactorial and Efactorial are
        (over)written in this frame on every call.

    Returns
    -------
    float
        The value of the expression above.

    Notes
    -----
    The partial sums start from 0, the identity of a sum of logarithms. An
    earlier version started each of them from 1; the offsets cancelled in the
    final expression, so the result is the same up to floating-point rounding.
    """
    # Residue x state contingency table of this protein.
    dfN = pd.crosstab(index = protein["Residue"], columns = protein["b0e1"])

    def occurrences(residue, state):
        # A residue or a state that never occurs has no row/column in dfN.
        if residue in dfN.index and state in dfN.columns:
            return dfN.at[residue, state]
        return 0

    # Counts per residue type, in the row order of dfCalc.
    residues = dfCalc.Residue.values
    B = [occurrences(r, 0) for r in residues]
    E = [occurrences(r, 1) for r in residues]
    dfCalc['B'] = B
    dfCalc['E'] = E

    # n * ln(frequency) per class. Frequencies and counts are paired by
    # position, so the result does not depend on the index labels of dfCalc.
    PBpowN = [logPow(freq, n) for freq, n in zip(dfCalc.F_RB, B)]
    PEpowN = [logPow(freq, n) for freq, n in zip(dfCalc.F_RE, E)]
    dfCalc['PBpowN'] = PBpowN
    dfCalc['PEpowN'] = PEpowN

    # ln(n!) per class.
    Bfactorial = [factorialLog(n) for n in B]
    Efactorial = [factorialLog(n) for n in E]
    dfCalc['Bfactorial'] = Bfactorial
    dfCalc['Efactorial'] = Efactorial

    # Products turn into sums in log space.
    nb = math.fsum(Bfactorial)
    ne = math.fsum(Efactorial)
    pb = math.fsum(PBpowN)
    pe = math.fsum(PEpowN)

    # ln(N!) for the total number of counted residues.
    lf = factorialLog(int(sum(B) + sum(E)))

    entropy = (pb + pe + lf) - (nb + ne)

    return entropy

In [7]:
def calcEnergy(protein, dfParams):
    """Return the summed per-residue energy of one protein.

    Every residue contributes the value that dfParams lists for its residue
    type: column KBenergy_RB when the residue's b0e1 flag is 0 and column
    KBenergy_RE when it is 1. Residues with any other flag contribute nothing.

    Parameters
    ----------
    protein : pandas.DataFrame
        Must provide the columns Residue and b0e1, one row per residue.
    dfParams : pandas.DataFrame
        Must provide the columns Residue, KBenergy_RB and KBenergy_RE.

    Returns
    -------
    number
        The accumulated energy; the integer 0 when nothing was added.

    Raises
    ------
    ValueError
        When a residue type does not match exactly one row of dfParams.
    """
    # Column of dfParams that holds the energy for each b0e1 flag.
    columnOf = {0: "KBenergy_RB", 1: "KBenergy_RE"}
    # The table scan is done once per (column, residue type) instead of once
    # per residue of the protein.
    cache = {}
    energy = 0
    for r, state in zip(protein.Residue, protein.b0e1):
        if state == 0:
            column = columnOf[0]
        elif state == 1:
            column = columnOf[1]
        else:
            continue
        key = (column, r)
        if key not in cache:
            cache[key] = dfParams.loc[dfParams["Residue"] == r, column].item()
        energy = energy + cache[key]
    return energy

In [10]:
# Feature table with one row per protein: the summed residue and burial codes
# (X1, X2) next to the energy (E), entropy (S) and likelihood (L).
# NOTE(review): the output name below is fixed to 'RawData_200', but the
# proteins processed are whatever `names` holds, i.e. the sheet chosen in the
# setup cell - confirm that the 200-protein set is active before running this.
dfRawData = pd.DataFrame()
X1, X2 = [], []   # summed residue codes / summed burial codes per protein
E = []            # energy per protein
S = []            # entropy per protein
L = []            # likelihood per protein

for count, name in enumerate(names):
    # Every protein has its own workbook; the sheet 'data' holds its residues.
    protFile = name+f
    xf = pd.ExcelFile(protFile)
    dfProtein = xf.parse('data')

    aa = []  # residue letters
    be = []  # buried (0) / exposed (1) flags
    ac = []  # residue codes taken from tableAN -> X1
    pc = []  # burial codes -> X2

    for pos, residue in enumerate(dfProtein.Residue):
        aa.append(residue)
        be.append(dfProtein.b0e1[pos])
        # Number corresponding to the amino acid
        n = tableAN[residue]
        ac.append(n)
        # Burial code: 0.2 for a buried residue (0), 0.5 for an exposed one (1),
        # the marker 'err' for any other flag.
        flag = dfProtein.b0e1[pos]
        pc.append(0.2 if flag == 0 else 0.5 if flag == 1 else 'err')

    # Table holding the sequence, the B/E pattern and both codes of the protein.
    dfCodeProt = pd.DataFrame().assign(AA=aa, BE=be, X1=ac, X2=pc)

    # Save the coding table of every protein as csv.
    dfCodeProt.to_csv(path+'ML/Coding/ Coding_'+str(count)+'_'+name+'.csv')

    # Per-protein values: code sums, energy, entropy and likelihood.
    x1sum = dfCodeProt.X1.sum()
    x2sum = dfCodeProt.X2.sum()
    energy = calcEnergy(dfProtein, dfParams)
    entropy = calcLogEntropy(dfProtein, dfCalc)
    likelihood = energy - entropy

    # Append each value to its list.
    for bucket, value in zip((X1, X2, E, S, L),
                             (x1sum, x2sum, energy, entropy, likelihood)):
        bucket.append(value)

# Collect the lists as columns of the feature table.
dfRawData = dfRawData.assign(name=names, Length=protLen,
                             X1=X1, X2=X2, E=E, S=S, L=L)

# Save the values needed for ML in a .csv file.
dfRawData.to_csv(path+'ML/ RawData_200.csv')

In [8]:
# For the 45 Test Data set

# Feature table with one row per protein: the summed residue and burial codes
# (X1, X2) next to the energy (E), entropy (S) and likelihood (L).
dfRawData = pd.DataFrame()
X1, X2 = [], []   # summed residue codes / summed burial codes per protein
E = []            # energy per protein
S = []            # entropy per protein
L = []            # likelihood per protein

for count, name in enumerate(names):
    # Every protein has its own workbook; the sheet 'data' holds its residues.
    protFile = name+f
    xf = pd.ExcelFile(protFile)
    dfProtein = xf.parse('data')

    aa = []  # residue letters
    be = []  # buried (0) / exposed (1) flags
    ac = []  # residue codes taken from tableAN -> X1
    pc = []  # burial codes -> X2

    for pos, residue in enumerate(dfProtein.Residue):
        aa.append(residue)
        be.append(dfProtein.b0e1[pos])
        # Number corresponding to the amino acid
        n = tableAN[residue]
        ac.append(n)
        # Burial code: 0.2 for a buried residue (0), 0.5 for an exposed one (1),
        # the marker 'err' for any other flag.
        flag = dfProtein.b0e1[pos]
        pc.append(0.2 if flag == 0 else 0.5 if flag == 1 else 'err')

    # Table holding the sequence, the B/E pattern and both codes of the protein.
    dfCodeProt = pd.DataFrame().assign(AA=aa, BE=be, X1=ac, X2=pc)

    # Save the coding table of every protein as csv.
    dfCodeProt.to_csv(path+'ML/Coding/ Coding_'+str(count)+'_'+name+'.csv')

    # Per-protein values: code sums, energy, entropy and likelihood.
    x1sum = dfCodeProt.X1.sum()
    x2sum = dfCodeProt.X2.sum()
    energy = calcEnergy(dfProtein, dfParams)
    entropy = calcLogEntropy(dfProtein, dfCalc)
    likelihood = energy - entropy

    # Append each value to its list.
    for bucket, value in zip((X1, X2, E, S, L),
                             (x1sum, x2sum, energy, entropy, likelihood)):
        bucket.append(value)

# Collect the lists as columns of the feature table.
dfRawData = dfRawData.assign(name=names, Length=protLen,
                             X1=X1, X2=X2, E=E, S=S, L=L)

# Save the values needed for ML in a .csv file.
dfRawData.to_csv(path+'ML/ RawData_45.csv')

In [45]:
# Coding for one protein:

count = 0
name = "1atx_"
protFile = name+f

# Load the workbook of the protein and read its sheet 'data'.
xf = pd.ExcelFile(protFile)
dfProtein = xf.parse('data')
aa = []    # residue letters
be = []    # buried (0) / exposed (1) flags
ac = []    # residue codes taken from tableAN
code = []  # residue code scaled by the burial factor

for c, i in enumerate(dfProtein.Residue):
    aa.append(i)
    be.append(dfProtein.b0e1[c])
    # Number corresponding to the amino acid
    n = tableAN[i]
    ac.append(n)
    # Multiply 'n' by 0.2 if the residue is buried (0) or by 0.5 if it is
    # exposed (1); any other flag is recorded with the marker 'err'.
    if dfProtein.b0e1[c] == 0:
        code.append(n*0.2)
    elif dfProtein.b0e1[c] == 1:
        code.append(n*0.5)
    else:
        code.append('err')

# Start from a fresh frame. Reusing a dfCodeProt left behind by another cell
# would keep that cell's columns and fail on a different protein length, and
# on a fresh kernel the name would not exist at all.
dfCodeProt = pd.DataFrame()
dfCodeProt['AA'] = aa
dfCodeProt['BE'] = be
dfCodeProt['N'] = ac
dfCodeProt['Code'] = code

# Sum of the scaled codes over the whole protein.
x = dfCodeProt.Code.sum()

energy = calcEnergy(dfProtein, dfParams)
entropy = calcLogEntropy(dfProtein, dfCalc)
likelihood = energy - entropy

# Save the coding table of the protein as csv.
dfCodeProt.to_csv(path+'ML/Coding/ Coding_'+str(count)+'_'+name+'.csv')

In [25]:
# Preview the first rows of the single-protein coding table.
dfCodeProt.head()

Unnamed: 0,AA,BE,N,Code
0,G,1,17,8.5
1,A,1,10,5.0
2,A,1,10,5.0
3,C,0,14,2.8
4,L,1,20,10.0


In [37]:
# Show the summed code (x) and the likelihood of the single protein.
print(x, likelihood)

393.40000000000003 57.36518240998795
