In [1]:
!pip install rdkit==2025.3.2 mordred==1.2.0 networkx==2.8.8 joblib pandas numpy==1.26.4

Collecting rdkit==2025.3.2
  Downloading rdkit-2025.3.2-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.0 kB)
Collecting mordred==1.2.0
  Downloading mordred-1.2.0.tar.gz (128 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/128.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━[0m [32m122.9/128.8 kB[0m [31m4.7 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m128.8/128.8 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting networkx==2.8.8
  Downloading networkx-2.8.8-py3-none-any.whl.metadata (5.1 kB)
Collecting numpy==1.26.4
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m381.5 kB/s[0m eta [36m0:00:00[0m
Downloading rdkit-2025.3.2-cp311-cp311-manyl

In [1]:
import os
import pandas as pd
from rdkit.Chem import AllChem
from rdkit import Chem
from rdkit.Chem import rdFingerprintGenerator
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors
from mordred import Calculator, descriptors
import joblib

In [3]:
import ipywidgets as widgets
from IPython.display import display

# SMILES to predict, one molecule per line. The `# @param` annotation exposes this
# string as a Colab form field; no ipywidgets widget is created here.
smiles_input = "O=C(N[C@@H](CC1CCCCC1)C(N[C@@H]([C@H](O)C)C(N[C@@H](CC2=CC=CC=C2)C(N[C@@H](CC3CCCCC3)C(N4[C@H]5CCC4)=O)=O)=O)=O)[C@H](C)NC5=O" # @param {"type":"string","placeholder":"Input your SMILES here and press the play button located on the left."}
display(smiles_input)

# Process input into a DataFrame.
# Blank lines and surrounding whitespace are discarded so that a stray newline in the
# form field does not become an empty "molecule" further down the pipeline.
input_smiles = [line.strip() for line in smiles_input.splitlines() if line.strip()]
if not input_smiles:
    raise ValueError("No SMILES provided: enter at least one SMILES string.")

# Single unnamed column (label 0) holding the SMILES exactly as entered.
dataset = pd.DataFrame(input_smiles)
seq_list = [str(x) for x in dataset.iloc[:, 0]]
# 1-based running identifier, one per input molecule.
id_list = list(range(1, len(seq_list) + 1))

def canonical_smiles(smiles):
    """Return the RDKit canonical SMILES for every entry of ``smiles``.

    Parameters
    ----------
    smiles : iterable of str
        SMILES strings to canonicalise.

    Returns
    -------
    list of str
        Canonical SMILES, in the same order as the input.

    Raises
    ------
    ValueError
        If RDKit cannot parse one of the inputs; the message names it.
    """
    canonical = []
    for smi in smiles:
        mol = Chem.MolFromSmiles(smi)
        # MolFromSmiles reports a parse failure by returning None; handing that to
        # MolToSmiles would fail with an error that does not name the offending input.
        if mol is None:
            raise ValueError(f"Could not parse SMILES: {smi!r}")
        canonical.append(Chem.MolToSmiles(mol))
    return canonical

# Canonicalise the SMILES held in the first column and store them next to the input.
raw_smiles_column = dataset.iloc[:, 0]
dataset['SMILES_cano'] = canonical_smiles(raw_smiles_column)

# `dataset_new` is the frame the following cells extend with descriptor columns;
# it starts as an independent copy of `dataset`.
dataset_new = dataset.copy()

'O=C(N[C@@H](CC1CCCCC1)C(N[C@@H]([C@H](O)C)C(N[C@@H](CC2=CC=CC=C2)C(N[C@@H](CC3CCCCC3)C(N4[C@H]5CCC4)=O)=O)=O)=O)[C@H](C)NC5=O'

In [4]:
def custom_Kappa3(A, alpha, Pi):
    """Kappa3-type shape index used in place of the 'Kappa3' descriptor entry.

    Parameters
    ----------
    A : int
        Atom count; its parity selects which numerator is used.
    alpha : float
        Alpha correction added to both the atom count and the path count.
    Pi : int
        Path count; the descriptor cell passes the number of length-3 paths.

    Returns
    -------
    float
        ``(A + alpha - 3) * (A + alpha - 2)**2 / (Pi + alpha)**2`` for even ``A``,
        ``(A + alpha - 1) * (A + alpha - 3)**2 / (Pi + alpha)**2`` for odd ``A``.

    Raises
    ------
    ZeroDivisionError
        For plain Python numbers, when ``Pi + alpha`` equals zero.
    """
    shifted_count = A + alpha
    if A % 2 == 0:
        numerator = (shifted_count - 3) * (shifted_count - 2) ** 2
    else:
        numerator = (shifted_count - 1) * (shifted_count - 3) ** 2
    # The denominator is the same for both parities.
    return numerator / ((Pi + alpha) ** 2)

# NOTE(review): this re-defines the `canonical_smiles` already defined in the
# input-processing cell and shadows it; consider keeping a single definition.
def canonical_smiles(smiles):
    """Return the RDKit canonical SMILES for every entry of ``smiles``.

    Parameters
    ----------
    smiles : iterable of str
        SMILES strings to canonicalise.

    Returns
    -------
    list of str
        Canonical SMILES, in the same order as the input.

    Raises
    ------
    ValueError
        If RDKit cannot parse one of the inputs; the message names it.
    """
    canonical = []
    for smi in smiles:
        mol = Chem.MolFromSmiles(smi)
        # MolFromSmiles reports a parse failure by returning None; handing that to
        # MolToSmiles would fail with an error that does not name the offending input.
        if mol is None:
            raise ValueError(f"Could not parse SMILES: {smi!r}")
        canonical.append(Chem.MolToSmiles(mol))
    return canonical


In [5]:
# Compute every descriptor listed in `Descriptors._descList` for each molecule and
# overwrite the 'Kappa3' entry with the notebook's own `custom_Kappa3`.
mols = [Chem.MolFromSmiles(i) for i in dataset_new['SMILES_cano']]
calc = MoleculeDescriptors.MolecularDescriptorCalculator([x[0] for x in Descriptors._descList])
desc_names = calc.GetDescriptorNames()
kappa3_idx = desc_names.index('Kappa3')

Mol_descriptors = []
for mol in mols:
    # Descriptors are evaluated on the hydrogen-explicit molecule. Only the loop
    # variable is rebound, so the entries of `mols` stay as parsed from the SMILES.
    mol = Chem.AddHs(mol)
    descriptors_values = list(calc.CalcDescriptors(mol))

    # Inputs of the custom Kappa3: number of length-3 paths, Hall-Kier alpha and the
    # atom count of the hydrogen-explicit molecule.
    Pi = len(Chem.FindAllPathsOfLengthN(mol, 3))
    alpha = Descriptors.HallKierAlpha(mol)
    A = mol.GetNumAtoms()
    custom_kappa3_value = custom_Kappa3(A, alpha, Pi)
    descriptors_values[kappa3_idx] = custom_kappa3_value
    Mol_descriptors.append(descriptors_values)

df_with_rdkit_descriptors = pd.DataFrame(Mol_descriptors, columns=desc_names)

# Re-running this cell must not append a second copy of every descriptor column
# (duplicate labels break the by-name feature selection done before prediction),
# so columns left over from a previous run are dropped first. On a fresh kernel
# none of them exist yet and the drop is a no-op.
dataset_new = pd.concat(
    [
        dataset_new.drop(columns=df_with_rdkit_descriptors.columns, errors='ignore'),
        df_with_rdkit_descriptors,
    ],
    axis=1,
)


In [6]:
# Morgan count fingerprints for every canonical SMILES, stored as columns
# morgan1 .. morgan<MORGAN_FP_SIZE>.
MORGAN_RADIUS = 2
MORGAN_FP_SIZE = 2048

Morgan_fpts = []
mfpgen = rdFingerprintGenerator.GetMorganGenerator(radius=MORGAN_RADIUS, fpSize=MORGAN_FP_SIZE)

for smi in dataset_new['SMILES_cano']:
    mol = Chem.MolFromSmiles(smi)
    scfp = mfpgen.GetCountFingerprintAsNumPy(mol)
    Morgan_fpts.append(scfp)

# Column names are derived from the configured fingerprint size instead of
# `Morgan_fpts[0]`, so an empty input does not raise IndexError here.
morgan_columns = [f"morgan{i+1}" for i in range(MORGAN_FP_SIZE)]
df_Morgan_fpts = pd.DataFrame(Morgan_fpts, columns=morgan_columns)

# Drop fingerprint columns left over from a previous run of this cell before
# concatenating; otherwise a re-run duplicates all of them. No-op on a fresh kernel.
dataset_new = pd.concat(
    [
        dataset_new.drop(columns=df_Morgan_fpts.columns, errors='ignore'),
        df_Morgan_fpts,
    ],
    axis=1,
)


In [7]:
# mordred descriptors, restricted to the four descriptor modules listed below.
# NOTE(review): `ignore_3D=False` is passed although `mols` are parsed from SMILES
# in the descriptor cell — confirm whether 3D descriptors are actually intended.
calc_mordred = Calculator([
    descriptors.WalkCount,
    descriptors.Autocorrelation,
    descriptors.ExtendedTopochemicalAtom,
    descriptors.AtomCount
], ignore_3D=False)

# One row per molecule in `mols`, one column per mordred descriptor.
mordred_descriptors = calc_mordred.pandas(mols)

# Drop mordred columns left over from a previous run of this cell before
# concatenating; otherwise a re-run duplicates all of them. No-op on a fresh kernel.
dataset_new = pd.concat(
    [
        dataset_new.drop(columns=mordred_descriptors.columns, errors='ignore'),
        mordred_descriptors,
    ],
    axis=1,
)


100%|██████████| 1/1 [00:00<00:00,  3.49it/s]


In [8]:
from sklearn.ensemble import ExtraTreesRegressor
import pandas as pd

# === PAMPA MODEL ===
# Load training data
!wget https://raw.githubusercontent.com/jpvlinhares/cypps_files/refs/heads/main/training_testing_files/CycPeptMPDB_Peptide_Assay_PAMPA_onlyInfo.csv_0.01features_final_df.csv.train_set_butina_0.3.csv_afterGreedyBLIND.csv
data_training_pampa = pd.read_csv('https://raw.githubusercontent.com/jpvlinhares/cypps_files/refs/heads/main/training_testing_files/CycPeptMPDB_Peptide_Assay_PAMPA_onlyInfo.csv_0.01features_final_df.csv.train_set_butina_0.3.csv_afterGreedyBLIND.csv')
# Prepare features and labels
X_pampa = pd.get_dummies(data_training_pampa)
y_pampa = X_pampa['PAMPA']
X_pampa.drop(['PAMPA', 'ID'], axis=1, inplace=True)
# Train model
model_pampa = ExtraTreesRegressor(n_estimators=100, n_jobs=-1, random_state=0)
model_pampa.fit(X_pampa, y_pampa)

# === CACO-2 MODEL ===
!wget https://raw.githubusercontent.com/jpvlinhares/cypps_files/refs/heads/main/training_testing_files/CycPeptMPDB_Peptide_Assay_Caco2_onlyInfo.csv_0.01features_final_df.csv.train_set_butina_0.3.csv_afterGreedyBLIND.csv
# Load training data
data_training_caco2  = pd.read_csv('CycPeptMPDB_Peptide_Assay_Caco2_onlyInfo.csv_0.01features_final_df.csv.train_set_butina_0.3.csv_afterGreedyBLIND.csv')
# Prepare features and labels
X_caco2 = pd.get_dummies(data_training_caco2)
y_caco2 = X_caco2['Caco2']
X_caco2.drop(['Caco2', 'ID'], axis=1, inplace=True)
# Train model
model_caco2 = ExtraTreesRegressor(n_estimators=100, n_jobs=-1, random_state=0)
model_caco2.fit(X_caco2, y_caco2)

PAMPA_Features = ['BCUT2D_CHGLO','MolLogP','ATSC3c','VSA_EState2','ATSC0c','ATS8s',
                  'AATSC7p','SRW05','fr_NH0','GATS4c','morgan688','morgan799']

Caco2_Features = ['ATSC0se','Chi0n','AATS0i','morgan66','morgan131','AATSC0d','fr_halogen',
                  'AATS8m','BalabanJ','ATSC7p','morgan1952','AATSC0m','MaxAbsPartialCharge','TSRW10']

methods_df_pampa = pd.DataFrame(dataset_new, columns=PAMPA_Features)
methods_df_caco2 = pd.DataFrame(dataset_new, columns=Caco2_Features)

prediction_pampa = list(map(float, model_pampa.predict(methods_df_pampa).round(2)))
prediction_caco2 = list(map(float, model_caco2.predict(methods_df_caco2).round(2)))

--2025-06-12 05:07:00--  https://raw.githubusercontent.com/jpvlinhares/cypps_files/refs/heads/main/training_testing_files/CycPeptMPDB_Peptide_Assay_PAMPA_onlyInfo.csv_0.01features_final_df.csv.train_set_butina_0.3.csv_afterGreedyBLIND.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1006616 (983K) [text/plain]
Saving to: ‘CycPeptMPDB_Peptide_Assay_PAMPA_onlyInfo.csv_0.01features_final_df.csv.train_set_butina_0.3.csv_afterGreedyBLIND.csv’


2025-06-12 05:07:01 (16.0 MB/s) - ‘CycPeptMPDB_Peptide_Assay_PAMPA_onlyInfo.csv_0.01features_final_df.csv.train_set_butina_0.3.csv_afterGreedyBLIND.csv’ saved [1006616/1006616]

--2025-06-12 05:07:02--  https://raw.githubusercontent.com/jpvlinhares/cypps_files/refs/heads/main/training_testing_files/CycPeptMPDB_Peptide_A

In [9]:
# Assemble the per-molecule results: running id, input SMILES and both predictions.
prediction_df = pd.DataFrame(
    dict(
        id=id_list,
        seq=seq_list,
        PAMPA_pred=prediction_pampa,
        Caco2_pred=prediction_caco2,
    )
)

# Bare last expression so the notebook renders the table.
prediction_df


Unnamed: 0,id,seq,PAMPA_pred,Caco2_pred
0,1,O=C(N[C@@H](CC1CCCCC1)C(N[C@@H]([C@H](O)C)C(N[...,-5.05,-6.25
