In [1]:
import os
import pandas as pd
import numpy as np
import random
import math
from sklearn.utils import resample

In [2]:
# Load the list of protein IDs that the resampling below draws from.

# Change directory 
os.chdir("C:/Kate/Bioinformatics/Diplom")

# Assign spreadsheet filename to `file`
file = 'Selection_434.xlsx'

# Open the workbook inside a `with` block so its file handle is released as
# soon as the sheet has been read (an open handle keeps the .xlsx locked on
# Windows).
with pd.ExcelFile(file) as xf:
    # Print the sheet names
    print(xf.sheet_names)

    # Load the 'Selection' sheet into a DataFrame: dfSelection
    dfSelection = xf.parse('Selection')

# Convert the protein names in the "ID" column to a plain list
dataSet = dfSelection["ID"].tolist()

['Sheet1', 'Sheet2', 'Selection', 'List', 'AA', 'tableAA', 'Test']


In [3]:
# One-letter residue codes, and the same codes in alphabetical order
AA = list('ARNDCEQGHILKMFPSTWYV')
R = list(AA)
R.sort()

# Accumulator for the per-residue parameters summed over every sample:
# one row per residue code in R, every numeric column starting at 0.
dfSum = pd.DataFrame(
    {'Residue': R, 'F_RB': 0, 'F_RE': 0, 'KBenergy_RB': 0, 'KBenergy_RE': 0},
    columns=['Residue', 'F_RB', 'F_RE', 'KBenergy_RB', 'KBenergy_RE'],
)

In [5]:
# Resample and calculate parameters
#
# For each of `repeats` rounds: draw `size` protein IDs from `dataSet` with
# replacement, pool the 'data' sheets of the drawn proteins, compute the
# per-residue counts / frequencies / KBenergy values, add the F_* and
# KBenergy_* columns to `dfSum`, and write one 'Sample<n>.xlsx' workbook.
#
# This cell no longer calls os.chdir(); every file is addressed by an
# explicit path, so the working directory is left unchanged.

path = 'C:/Kate/Bioinformatics/Diplom/Set/'
f = '.xlsx'
outPath = 'C:/Kate/Bioinformatics/Diplom/Resample50x100/'

repeats = 100 
size = 50

# Set to an int for reproducible draws. None keeps the original behaviour
# (resample() then uses NumPy's global random state).
seed = None

# Upper bound on how often one round may be redrawn when the pooled sample
# lacks a residue type; prevents an endless loop on unsuitable data.
maxAttempts = 100

wholeColumns = ['Index','Residue','sumSASA','rSASA','maxProtOr','RSA','b0e1']
sumColumns = ['F_RB','F_RE','KBenergy_RB','KBenergy_RE']

# 'data' sheets already read in this run, keyed by protein ID. Sampling is
# with replacement over many rounds, so the same file is requested repeatedly.
_proteinCache = {}


def _load_protein(name):
    """Return the 'data' sheet of protein `name`, reading each workbook once."""
    if name not in _proteinCache:
        # The context manager closes the workbook handle after parsing.
        with pd.ExcelFile(os.path.join(path, name + f)) as book:
            _proteinCache[name] = book.parse('data')
    return _proteinCache[name]


def _draw_sample(rng):
    """Draw one sample of protein IDs and pool their per-residue rows.

    Returns (names, dfWhole): the drawn IDs and the concatenated 'data' sheets.
    """
    names = resample(dataSet, replace=True, n_samples=size, random_state=rng)
    # Concatenate once instead of appending inside a loop (DataFrame.append
    # copies the whole frame each time and was removed in pandas 2.0).
    pooled = pd.concat([_load_protein(name) for name in names], ignore_index=True)
    # Keep the expected columns first, followed by any extra sheet columns.
    ordered = wholeColumns + [c for c in pooled.columns if c not in wholeColumns]
    return names, pooled.reindex(columns=ordered)


def _residue_params(dfWhole):
    """Compute the per-residue parameter table for one pooled sample.

    Returns (dfParams, numBE). dfParams has one row per residue type found in
    dfWhole; numBE holds the number of rows per 'b0e1' value.
    """
    # Length of the pooled data set
    Length = len(dfWhole)

    # Count residue types: R = RE + RB
    numR = dfWhole.groupby('Residue')['b0e1'].count()
    # Count the buried residues per type (b0e1 == 0)
    subNumRB = dfWhole[dfWhole['b0e1'] == 0].groupby('Residue')['b0e1'].count()
    # Count the exposed residues per type (b0e1 == 1)
    subNumRE = dfWhole[dfWhole['b0e1'] == 1].groupby('Residue')['b0e1'].count()

    dfParams = pd.DataFrame()
    dfParams['Residue'] = numR.index
    dfParams['numR'] = numR.values
    # Residue types absent from a subset get a count of 0.
    dfParams['RB'] = subNumRB.reindex(numR.index, fill_value=0).values
    dfParams['RE'] = subNumRE.reindex(numR.index, fill_value=0).values

    dfParams['F_RB'] = dfParams['RB'] / Length
    dfParams['F_RE'] = dfParams['RE'] / Length

    # Count buried and exposed residues
    numBE = dfWhole.groupby('b0e1')['Residue'].count()
    # Expected probability for a residue to be buried / exposed
    expPB = numBE.at[0] / Length
    expPE = numBE.at[1] / Length

    # Observed probability that a residue of a certain type is buried
    dfParams['PobsRB'] = dfParams['RB'] / dfParams['numR']
    # Observed probability that a residue of a certain type is exposed
    dfParams['PobsRE'] = dfParams['RE'] / dfParams['numR']

    # log(expected / observed). NOTE: an observed probability of 0 gives an
    # infinite value here, which then carries over into dfSum.
    ratioRB = expPB / dfParams['PobsRB']
    ratioRE = expPE / dfParams['PobsRE']
    # LOG with base 'e' (ln)
    dfParams['KBenergy_RB'] = np.log(ratioRB)
    dfParams['KBenergy_RE'] = np.log(ratioRE)
    # LOG base 10
    dfParams['KBenergyRB_log10'] = np.log10(ratioRB)
    dfParams['KBenergyRE_log10'] = np.log10(ratioRE)

    return dfParams, numBE


rng = np.random.RandomState(seed) if seed is not None else None

# Start from zero so that re-running this cell does not stack a second run
# on top of the sums of the previous one.
for col in sumColumns:
    dfSum[col] = 0.0

requiredResidues = set(dfSum['Residue'])

for n in range(repeats):
    # Make sure that the set contains all types of residues. A sample that
    # lacks one is redrawn rather than ending the loop: stopping early would
    # leave fewer than `repeats` samples in dfSum while the averaging step
    # still divides by `repeats`.
    for attempt in range(maxAttempts):
        names, dfWhole = _draw_sample(rng)
        if requiredResidues.issubset(set(dfWhole['Residue'].dropna())):
            break
    else:
        raise RuntimeError(
            'No sample containing every residue type was drawn in '
            + str(maxAttempts) + ' attempts (repeat ' + str(n) + ')'
        )

    dfNames = pd.DataFrame({'Names': names})

    # Proteins that were not drawn into this sample (set lookup, order of
    # dataSet preserved)
    drawn = set(names)
    oob = [x for x in dataSet if x not in drawn]
    dfoob = pd.DataFrame({'OutOfBox': oob})

    dfParams, numBE = _residue_params(dfWhole)

    # Parameters for Entropy and Energy calc in a DataFrame dfKBES
    dfKBES = dfParams[['Residue'] + sumColumns].copy()

    # Add values to the DataFrame that holds the sum of important params.
    # Rows are matched on the residue code, not on row position, so an extra
    # residue type in the data cannot shift values onto the wrong residue.
    byResidue = dfKBES.set_index('Residue')
    for col in sumColumns:
        dfSum[col] = dfSum[col] + dfSum['Residue'].map(byResidue[col])

    # Write the DataFrames of this sample to one workbook. The context
    # manager saves and closes the file (writer.save() was removed in
    # pandas 2.0).
    outFile = os.path.join(outPath, 'Sample' + str(n) + '.xlsx')
    with pd.ExcelWriter(outFile, engine='xlsxwriter') as writer:
        dfNames.to_excel(writer, sheet_name='Sample')
        dfKBES.to_excel(writer, sheet_name='KBES')
        dfParams.to_excel(writer, sheet_name='Params')
        numBE.to_excel(writer, sheet_name='numBE')
        dfWhole.to_excel(writer, sheet_name='FullSet')
        dfoob.to_excel(writer, sheet_name='OutOfBox')

In [6]:
# Display the per-residue totals (F_RB, F_RE, KBenergy_RB, KBenergy_RE) summed over the samples
dfSum

Unnamed: 0,Residue,F_RB,F_RE,KBenergy_RB,KBenergy_RE
0,A,3.830236,4.318942,-32.75299,22.26131
1,C,0.98679,0.666559,-56.735525,50.779491
2,D,0.858479,5.185268,88.489936,-26.404662
3,E,0.606413,6.05046,133.827374,-32.231511
4,F,2.277693,1.65071,-53.895687,45.822031
5,G,2.458405,4.803401,0.55675,-0.119808
6,H,0.607858,1.903445,34.915067,-13.835595
7,I,3.675005,2.105293,-63.367671,60.253415
8,K,0.284813,5.944492,204.329039,-37.158736
9,L,5.38376,3.731375,-55.871664,48.383284


In [7]:
# Per-residue averages: every summed column of dfSum divided by the number
# of repeats, with the residue codes carried over unchanged.
dfAverage = pd.DataFrame({'Residue': dfSum['Residue']})
for column in ('F_RB', 'F_RE', 'KBenergy_RB', 'KBenergy_RE'):
    dfAverage[column] = dfSum[column] / repeats
dfAverage

Unnamed: 0,Residue,F_RB,F_RE,KBenergy_RB,KBenergy_RE
0,A,0.038302,0.043189,-0.32753,0.222613
1,C,0.009868,0.006666,-0.567355,0.507795
2,D,0.008585,0.051853,0.884899,-0.264047
3,E,0.006064,0.060505,1.338274,-0.322315
4,F,0.022777,0.016507,-0.538957,0.45822
5,G,0.024584,0.048034,0.005568,-0.001198
6,H,0.006079,0.019034,0.349151,-0.138356
7,I,0.03675,0.021053,-0.633677,0.602534
8,K,0.002848,0.059445,2.04329,-0.371587
9,L,0.053838,0.037314,-0.558717,0.483833


In [9]:
# Write the totals (dfSum) and the averages (dfAverage) to one workbook whose
# name records the sample size and the number of repeats.
# The file is addressed by its full path instead of via os.chdir(), so the
# working directory is left unchanged.
outFile = os.path.join(
    "C:/Kate/Bioinformatics/Diplom/Set/",
    'Params_size' + str(size) + '_repeats' + str(repeats) + '.xlsx',
)

# The context manager saves and closes the workbook on exit, also when a
# write fails (writer.save() was removed in pandas 2.0).
with pd.ExcelWriter(outFile, engine='xlsxwriter') as writer:
    # Write your DataFrame to a file
    dfSum.to_excel(writer, sheet_name='Total')
    dfAverage.to_excel(writer, sheet_name='KBES')