# ConvBoost-CPP

## Calculation of molecular descriptors
* cLogP, HBA, NAR, NRB, Fsp3, NPA, NG, NetC, NNCAA, N, O
* Hydrophobicity - calculated with the PepFun framework, available at https://github.com/rochoa85/PepFun/blob/master/tutorial_PepFun.ipynb

In [3]:
pip install rdkit



In [4]:
from rdkit import Chem
from rdkit.Chem import Descriptors, Lipinski, rdMolDescriptors
from Bio import PDB
from google.colab import files
import pandas as pd
import os

# Make sure the folder that receives the descriptors CSV (written at the end
# of this cell) exists; exist_ok=True keeps re-runs of the cell from failing.
os.makedirs('descriptors_results', exist_ok=True)

def calculate_descriptors(pdb_file):
    """Compute the molecular descriptors of the structure stored in a PDB file.

    The file is parsed twice: by RDKit for the atom/bond-level descriptors and
    by Biopython for the residue names used in the ``NNCAA`` count.

    Args:
        pdb_file: Path to the PDB file to analyse.

    Returns:
        A one-row ``pandas.DataFrame`` whose columns are, in order, ``file``,
        ``Fsp3``, ``cLogP``, ``HBA``, ``NAR``, ``NRB``, ``NetC``, ``NPA``,
        ``N``, ``O``, ``NG`` and ``NNCAA``.

    Raises:
        ValueError: If RDKit cannot build a molecule from ``pdb_file``.
    """
    mol = Chem.MolFromPDBFile(pdb_file)
    if mol is None:
        # The input is a PDB file (not SMILES); name it so the failure is traceable.
        raise ValueError(
            f"It was not possible to create the molecule from the PDB file: {pdb_file!r}"
        )

    structure = PDB.PDBParser(QUIET=True).get_structure('structure', pdb_file)

    # Collect the element symbols once and reuse them for the C / N / O counts.
    symbols = [atom.GetSymbol() for atom in mol.GetAtoms()]

    # Column order is kept exactly as before: downstream code drops the first
    # column and passes the remaining ones positionally to the models.
    descriptors = {
        'file': [pdb_file],
        'Fsp3': [rdMolDescriptors.CalcFractionCSP3(mol)],
        'cLogP': [Descriptors.MolLogP(mol)],
        'HBA': [Lipinski.NumHAcceptors(mol)],
        'NAR': [rdMolDescriptors.CalcNumAromaticRings(mol)],
        'NRB': [Lipinski.NumRotatableBonds(mol)],
        # NOTE(review): 'NetC' is the number of carbon atoms here; confirm the
        # name is not meant to be the net charge.
        'NetC': [symbols.count('C')],
        # NOTE(review): 'NPA' is computed with a rotatable-bond counter, like
        # 'NRB' above; confirm this is the intended descriptor. Left unchanged
        # because the saved models consume this column as-is.
        'NPA': [rdMolDescriptors.CalcNumRotatableBonds(mol)],
        'N': [symbols.count('N')],
        'O': [symbols.count('O')],
        'NG': [Chem.Fragments.fr_guanido(mol)],
        # NOTE(review): only residues literally named 'NNCAA' are counted;
        # PDB residue names are normally three characters, so this presumably
        # always yields 0 -- verify against the training pipeline.
        'NNCAA': [
            sum(
                1
                for residue in structure.get_residues()
                if residue.get_resname() == 'NNCAA'
            )
        ],
    }
    return pd.DataFrame(descriptors)

if __name__ == "__main__":
    # change the PDB file containing CPP information here
    pdb_file = 'data/Aminopeptase.pdb'

    # Compute the descriptors, write them to CSV, then hand the CSV to the
    # Colab download helper.
    descriptors = calculate_descriptors(pdb_file)
    csv_path = 'descriptors_results/descriptors.csv'
    descriptors.to_csv(csv_path)
    files.download(csv_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## ConvBoost-CPP classifier

In [5]:
from tensorflow.keras.models import load_model
import xgboost as xgb
import numpy as np

def imCNN(data: pd.DataFrame) -> pd.DataFrame:
    """Run the saved imCNN Keras model on a table of descriptors.

    Args:
        data: Descriptor table, one row per sample and one column per feature.

    Returns:
        The output of ``model.predict`` wrapped in a ``pandas.DataFrame``
        (default integer column labels).
    """
    # The model is loaded from disk on every call.
    model = load_model('models/imCNN.h5')

    features = np.array(data)
    # Add a trailing axis: for 2-D input this turns (rows, columns) into
    # (rows, columns, 1).
    features = features.reshape(features.shape[0], features.shape[1], -1)

    return pd.DataFrame(model.predict(features))

def xgboost(data: pd.DataFrame) -> pd.DataFrame:
    """Run the saved XGBoost classifier on a table of descriptors.

    Args:
        data: Descriptor table, one row per sample and one column per feature.

    Returns:
        The output of ``predict_proba`` wrapped in a ``pandas.DataFrame``
        (default integer column labels).
    """
    # The classifier is re-created and loaded from disk on every call.
    classifier = xgb.XGBClassifier()
    classifier.load_model('models/xgboost.json')

    return pd.DataFrame(classifier.predict_proba(np.array(data)))

def ConvBoost_CPP(data):
    """Combine the imCNN and XGBoost outputs into one class index per row.

    The first column of ``data`` is skipped and the remaining columns are fed
    to both models. Their outputs are placed side by side; all columns labelled
    0 are averaged per row, likewise all columns labelled 1, and the label with
    the larger average is returned for each row.

    Args:
        data: DataFrame whose first column is not a feature and whose other
            columns are the model inputs.

    Returns:
        A NumPy array with one entry (0 or 1) per row of ``data``.
    """
    features = data[data.columns[1:]]

    # Both model wrappers return frames with integer column labels, so after
    # the column-wise concat ``stacked[k]`` picks label k from every model.
    stacked = pd.concat([imCNN(features), xgboost(features)], axis=1)
    mean_label0 = stacked[0].mean(axis=1)
    mean_label1 = stacked[1].mean(axis=1)

    return np.argmax(np.array([mean_label0, mean_label1]), axis=0)

## Prediction

In [6]:
# Class labels, indexed by the value returned by ConvBoost_CPP.
classes = ['non-CPP', 'CPP']

data = pd.read_csv('descriptors_results/descriptors.csv', index_col=0)
data['hydroph'] = [-2.06]  # calculated in the PepFun framework

os.makedirs('classification_results', exist_ok=True)

# Single-row input, so take the first (only) prediction.
result = ConvBoost_CPP(data)[0]

# Peptide name = file name without directory and extension. Derived from the
# path itself, not from fixed slice offsets, so any folder or extension length
# works.
peptide = os.path.splitext(os.path.basename(data['file'].iloc[0]))[0]

df_result = pd.DataFrame([[peptide, classes[result]]], columns=['Peptide', 'label'])
df_result.to_csv('classification_results/cls_result.csv')

print('\n\n',50*'-','RESULTS',50*'-')
df_result



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 883ms/step


 -------------------------------------------------- RESULTS --------------------------------------------------


Unnamed: 0,Peptide,label
0,Aminopeptase,CPP
