In [81]:
!pip install rdkit mordred joblib pandas



In [82]:
import os
import pandas as pd
from rdkit.Chem import AllChem
from rdkit import Chem
from rdkit.Chem import rdFingerprintGenerator
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors
from mordred import Calculator, descriptors
import joblib

In [145]:
import ipywidgets as widgets
from IPython.display import display

# Create a text area widget for user input
smile_input = "C[C@@H](O)[C@@H]1NC(=O)[C@@H]2CCCN2C(=O)[C@H](CC(N)=O)NC(=O)[C@H](CO)NC(=O)[C@H](CO)NC(=O)[C@H](CO)NC1=O" # @param {"type":"string","placeholder":"Input your SMILES here and press the play button located on the left."}
display(smiles_input)

# Process input into a DataFrame
input_smiles = smiles_input.value.strip().split('\n')  # <- note: single backslash here
dataset = pd.DataFrame(input_smiles)
seq_list = [str(x) for x in dataset.iloc[:, 0]]
id_list = list(range(1, len(seq_list) + 1))

dataset['SMILES_cano'] = canonical_smiles(dataset.iloc[:, 0])
dataset_new = dataset.copy()


Textarea(value='[H]NCCCC[C@H](NC(CNC([C@H](CSSC[C@H]1C(N[C@@H](CCC(N[H])=O)C(N[C@@H](CC2=CN([H])C3=C2C=CC=C3)C…

In [146]:
def custom_Kappa3(A, alpha, Pi):
    if A % 2 == 0:
        return ((A + alpha - 3) * (A + alpha - 2) ** 2 / ((Pi + alpha) ** 2))
    else:
        return (A + alpha - 1) * (A + alpha - 3) ** 2 / ((Pi + alpha) ** 2)

def canonical_smiles(smiles):
    mols = [Chem.MolFromSmiles(smi) for smi in smiles]
    return [Chem.MolToSmiles(mol) for mol in mols]


In [147]:
mols = [Chem.MolFromSmiles(i) for i in dataset_new['SMILES_cano']]
calc = MoleculeDescriptors.MolecularDescriptorCalculator([x[0] for x in Descriptors._descList])
desc_names = calc.GetDescriptorNames()
kappa3_idx = desc_names.index('Kappa3')

Mol_descriptors = []
for mol in mols:
    mol = Chem.AddHs(mol)
    descriptors_values = list(calc.CalcDescriptors(mol))
    Pi = len(Chem.FindAllPathsOfLengthN(mol, 3))
    alpha = Descriptors.HallKierAlpha(mol)
    A = mol.GetNumAtoms()
    custom_kappa3_value = custom_Kappa3(A, alpha, Pi)
    descriptors_values[kappa3_idx] = custom_kappa3_value
    Mol_descriptors.append(descriptors_values)

df_with_rdkit_descriptors = pd.DataFrame(Mol_descriptors, columns=desc_names)
dataset_new = pd.concat([dataset_new, df_with_rdkit_descriptors], axis=1)


In [148]:
Morgan_fpts = []
mfpgen = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=2048)

for smi in dataset_new['SMILES_cano']:
    mol = Chem.MolFromSmiles(smi)
    scfp = mfpgen.GetCountFingerprintAsNumPy(mol)
    Morgan_fpts.append(scfp)

df_Morgan_fpts = pd.DataFrame(Morgan_fpts, columns=[f"morgan{i+1}" for i in range(len(Morgan_fpts[0]))])
dataset_new = pd.concat([dataset_new, df_Morgan_fpts], axis=1)


In [149]:
calc_mordred = Calculator([
    descriptors.WalkCount,
    descriptors.Autocorrelation,
    descriptors.ExtendedTopochemicalAtom,
    descriptors.AtomCount
], ignore_3D=False)

mordred_descriptors = calc_mordred.pandas(mols)
dataset_new = pd.concat([dataset_new, mordred_descriptors], axis=1)


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 12.42it/s]


In [150]:
from sklearn.ensemble import ExtraTreesRegressor
import pandas as pd

# === PAMPA MODEL ===
# Load training data
data_training_pampa = pd.read_csv('/home/jpvelloso/cipps_cicpep_permeability/database_CycPeptMPDB1.2/PAMPA/PAMPA_greedy-23-01-25/all/ExtraTrees_12/CycPeptMPDB_Peptide_Assay_PAMPA_onlyInfo.csv_0.01features_final_df.csv.train_set_butina_0.3.csv_afterGreedyBLIND.csv')
# Prepare features and labels
X_pampa = pd.get_dummies(data_training_pampa)
y_pampa = X_pampa['PAMPA']
X_pampa.drop(['PAMPA', 'ID'], axis=1, inplace=True)
# Train model
model_pampa = ExtraTreesRegressor(n_estimators=100, n_jobs=-1, random_state=0)
model_pampa.fit(X_pampa, y_pampa)

# === CACO-2 MODEL ===
# Load training data
data_training_caco2 = pd.read_csv('/home/jpvelloso/cipps_cicpep_permeability/database_CycPeptMPDB1.2/caco2/Caco2_greedy-23-01-25/all/ExtraTrees_14/CycPeptMPDB_Peptide_Assay_Caco2_onlyInfo.csv_0.01features_final_df.csv.train_set_butina_0.3.csv_afterGreedyBLIND.csv')
# Prepare features and labels
X_caco2 = pd.get_dummies(data_training_caco2)
y_caco2 = X_caco2['Caco2']
X_caco2.drop(['Caco2', 'ID'], axis=1, inplace=True)
# Train model
model_caco2 = ExtraTreesRegressor(n_estimators=100, n_jobs=-1, random_state=0)
model_caco2.fit(X_caco2, y_caco2)

PAMPA_Features = ['BCUT2D_CHGLO','MolLogP','ATSC3c','VSA_EState2','ATSC0c','ATS8s',
                  'AATSC7p','SRW05','fr_NH0','GATS4c','morgan688','morgan799']

Caco2_Features = ['ATSC0se','Chi0n','AATS0i','morgan66','morgan131','AATSC0d','fr_halogen',
                  'AATS8m','BalabanJ','ATSC7p','morgan1952','AATSC0m','MaxAbsPartialCharge','TSRW10']

methods_df_pampa = pd.DataFrame(dataset_new, columns=PAMPA_Features)
methods_df_caco2 = pd.DataFrame(dataset_new, columns=Caco2_Features)

prediction_pampa = list(map(float, model_pampa.predict(methods_df_pampa).round(2)))
prediction_caco2 = list(map(float, model_caco2.predict(methods_df_caco2).round(2)))

In [151]:
prediction_df = pd.DataFrame({
    'id': id_list,
    'seq': seq_list,
    'PAMPA_pred': prediction_pampa,
    'Caco2_pred': prediction_caco2
})

prediction_df


Unnamed: 0,id,seq,PAMPA_pred,Caco2_pred
0,1,[H]NCCCC[C@H](NC(CNC([C@H](CSSC[C@H]1C(N[C@@H]...,-6.76,-6.58
