In [None]:
!pip install rdkit-pypi chembl_webresource_client pandas


In [None]:
from chembl_webresource_client.new_client import new_client
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem.FilterCatalog import *

# Handle to the ChEMBL "activity" resource.
activity = new_client.activity

# Lazily query every activity recorded against target CHEMBL279 that carries a
# pChEMBL value, asking the server to return the highest pChEMBL values first.
# No slice is applied here, so iterating the result walks the full result set.
activities = (
    activity
    .filter(target_chembl_id='CHEMBL279', pchembl_value__isnull=False)
    .order_by('-pchembl_value')
)

# Define filtering function
def filter_compounds(activities, *, max_mw=300, max_logp=3, max_hbd=3, max_hba=3,
                     max_tpsa=140, max_rot_bonds=10):
    """Build a table of parseable compounds and keep only those passing property and PAINS filters.

    Parameters
    ----------
    activities : iterable of mapping
        Activity records. Each record is read with ``.get`` for the keys
        ``'molecule_chembl_id'``, ``'canonical_smiles'`` and ``'pchembl_value'``.
        Records missing an id or a SMILES string, or whose SMILES cannot be
        parsed by RDKit, are skipped.
    max_mw, max_logp, max_hbd, max_hba, max_tpsa, max_rot_bonds : number, keyword-only
        Inclusive upper bounds for molecular weight, logP, H-bond donors,
        H-bond acceptors, topological polar surface area and rotatable bonds.
        The defaults reproduce the thresholds that were previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        One row per retained activity record with the columns ``chembl_id``,
        ``smiles``, ``pchembl_value`` (stored exactly as received), ``mol``,
        the six descriptor columns and ``is_pains``.
        NOTE: despite its name, ``is_pains`` is True when the molecule does
        NOT match any PAINS pattern; the column name is kept for backward
        compatibility, and every returned row has it set to True.
        If no record yields a parseable molecule, an empty frame without
        columns is returned.
    """
    compounds = []
    for act in activities:
        chembl_id = act.get('molecule_chembl_id')
        smiles = act.get('canonical_smiles')
        pchembl_value = act.get('pchembl_value')

        if not (chembl_id and smiles):
            continue

        try:
            mol = Chem.MolFromSmiles(smiles)
        except Exception as e:
            print(f"Failed to process molecule {chembl_id}: {e}")
            continue

        # MolFromSmiles signals an unparseable SMILES by returning None.
        if mol is None:
            continue

        compounds.append({
            'chembl_id': chembl_id,
            'smiles': smiles,
            'pchembl_value': pchembl_value,
            'mol': mol
        })

    df = pd.DataFrame(compounds)
    if df.empty:
        return df

    # Calculate molecular descriptors needed for filtering
    descriptor_funcs = {
        'mw': Descriptors.MolWt,
        'logp': Descriptors.MolLogP,
        'tpsa': Descriptors.TPSA,
        'hba': Descriptors.NumHAcceptors,
        'hbd': Descriptors.NumHDonors,
        'rot_bonds': Descriptors.NumRotatableBonds,
    }
    for column, func in descriptor_funcs.items():
        df[column] = df['mol'].apply(func)

    # Apply combined filtering criteria
    passes_properties = (
        (df['mw'] <= max_mw) & (df['tpsa'] <= max_tpsa) & (df['rot_bonds'] <= max_rot_bonds) &
        (df['logp'] <= max_logp) & (df['hbd'] <= max_hbd) & (df['hba'] <= max_hba)
    )
    # Take an explicit copy: a column is added below, and assigning into a
    # boolean-indexed slice of `df` would trigger pandas' SettingWithCopy path.
    df_filtered = df[passes_properties].copy()

    # PAINS filtering
    params = FilterCatalogParams()
    params.AddCatalog(FilterCatalogParams.FilterCatalogs.PAINS)
    catalog = FilterCatalog(params)
    # Cast to bool so the mask is a real boolean indexer even when the frame
    # is empty (an empty object-dtype Series is not a boolean mask).
    df_filtered['is_pains'] = (
        df_filtered['mol'].apply(lambda x: not catalog.HasMatch(x)).astype(bool)
    )

    return df_filtered[df_filtered['is_pains']]

# Filter compounds
final_df = filter_compounds(activities)
if len(final_df) < 100:
    print("Less than 100 valid compounds found after filtering.")

# pchembl_value is stored exactly as the API delivered it, which is typically
# text. Sorting text is lexicographic ("10.1" would rank below "9.8"), so make
# the column numeric before ranking. Values that cannot be parsed become NaN
# and are placed last by sort_values.
final_df = final_df.assign(
    pchembl_value=pd.to_numeric(final_df['pchembl_value'], errors='coerce')
)

# Sort by pChEMBL value (most potent first) and select top 200
final_df = final_df.sort_values(by='pchembl_value', ascending=False).head(200)

# Save to CSV
final_df[['chembl_id', 'smiles', 'pchembl_value']].to_csv('top_potent_vegf2.csv', index=False)

# Show the exported columns (pandas abbreviates long frames when printing)
print(final_df[['chembl_id', 'smiles', 'pchembl_value']])


In [None]:
import pandas as pd

# Read the ranked hits written by the filtering cell above. That cell saves
# 'top_potent_vegf2.csv'; the previous path ('top_potent.csv') is never
# produced by this notebook, so a fresh Restart & Run All could not find it.
df = pd.read_csv('top_potent_vegf2.csv')

# Rank first, then de-duplicate: keep='first' then retains the most potent
# measurement of each compound regardless of the row order in the file.
# A stable sort keeps the file order among equal pChEMBL values.
df = df.sort_values(by='pchembl_value', ascending=False, kind='mergesort')
df = df.drop_duplicates(subset=['chembl_id'], keep='first')
df = df.head(100)

# to_csv returns None when given a path, so do not rebind `df` to its result.
# NOTE(review): the file name says "jak2" although the query above targets
# CHEMBL279; the name is kept because the SDF export cell reads this path.
df.to_csv('Sorted100jak2.csv', index=False)


In [None]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import PandasTools

# Load the de-duplicated, ranked table written by the previous cell
df = pd.read_csv('Sorted100jak2.csv')

# Add an RDKit molecule column ('Mol') built from the SMILES column
PandasTools.AddMoleculeColumnToFrame(df, smilesCol='smiles', molCol='Mol')

# Every column except the molecule object itself is stored as an SDF property
properties = [col for col in df.columns if col != 'Mol']

# Write to SDF; try/finally guarantees the writer is flushed and closed even
# if a row fails part-way through.
writer = Chem.SDWriter('output_jak2.sdf')
skipped = 0
try:
    for _, row in df.iterrows():
        mol = row['Mol']
        if mol is None:
            # No molecule could be built for this row; count it instead of
            # dropping it silently.
            skipped += 1
            continue
        # Set properties as molecule attributes (missing values are omitted)
        for prop in properties:
            if pd.notna(row[prop]):
                mol.SetProp(prop, str(row[prop]))
        writer.write(mol)
finally:
    writer.close()

if skipped:
    print(f"Skipped {skipped} row(s) without a valid molecule.")
print("SDF file created successfully.")
