In [10]:
import pandas as pd

In [11]:
# key property list (ADMET): one prediction column per ADMET category

key_properties = {'absorption': '[Absorption/Human Oral Bioavailability 50%] Predictions',
                  'distribution': '[Distribution/Blood-Brain Barrier] Predictions',
                  'metabolism': '[Metabolism/CYP 1A2 Inhibitor] Predictions',
                  'excretion': '[Excretion/Half-Life of Drug] Predictions',
                  'toxicity': '[Toxicity/Liver Injury I (DILI)] Predictions',}

# Input files: the property table and the assessment table (one 'LLM_Assessment' per SMILES)
property_file_path = "/home/hoon/dd-agent/llm_dd/examples/BCL-2/property/combined_dummy.csv"
assess_file_path = "/home/hoon/dd-agent/llm_dd/examples/BCL-2/assessment/objective_dummy2.csv"

df_prop = pd.read_csv(property_file_path)
df_assess = pd.read_csv(assess_file_path)

# check whether SMILES in df_assess is the same as in df_prop
if df_assess['SMILES'].equals(df_prop['SMILES']):
    print("The SMILES columns in both DataFrames match.")
    # Rows line up one-to-one, so the assessments can be used positionally.
    assessment = df_assess['LLM_Assessment']
else:
    print("The SMILES columns in the DataFrames do not match.")
    # The rows do not line up, so a mask built from df_assess would select the
    # wrong molecules in df_prop (or fail outright). Look each property row up
    # by its SMILES instead; rows without an assessment end up in neither group.
    assessment_by_smiles = (
        df_assess.drop_duplicates(subset='SMILES')
        .set_index('SMILES')['LLM_Assessment']
    )
    assessment = df_prop['SMILES'].map(assessment_by_smiles)

# Missing assessments become '' so they match neither prefix; a NaN inside the
# boolean mask would make the row selection below fail.
assessment = assessment.fillna('').astype(str)

# extract entries from df_prop whose 'LLM_Assessment' starts with 'YES' / 'NO'
df_in = df_prop[assessment.str.startswith('YES')]
df_out = df_prop[assessment.str.startswith('NO')]


df = df_in.copy()
# Extract the relevant columns based on the key_properties dictionary
key_property_values = {}
for key, column in key_properties.items():
    if column in df.columns:
        key_property_values[key] = df[column].tolist()
    else:
        print(f"Column '{column}' not found in the DataFrame.")
        key_property_values[key] = []  # Assign an empty list if the column is not found
# Print the extracted key_property values
for key, values in key_property_values.items():
    print(f"{key}: {values}")


The SMILES columns in both DataFrames match.
absorption: ['Bioavailable']
distribution: ['Non-Penetrable']
metabolism: ['Non-Inhibitor']
excretion: ['Half-Life < 3hs']
toxicity: ['Toxic']


In [None]:
def drug_chemical_feasibility(smiles: str):
    '''
    This tool takes the SMILES of a drug candidate and outputs its chemical feasibility together with its
    Synthetic Accessibility (SA) and Quantitative Estimate of Drug-likeness (QED) scores.

    SA estimates the ease of synthesis of a compound from its synthetic complexity, which combines
    starting-material information and structural complexity. A lower SA means better synthesizability.
    QED combines eight physicochemical properties (molecular weight, LogP, H-bond donors, H-bond acceptors,
    charge, aromaticity, stereochemistry and solubility) into a score between 0 and 1.

    Args:
        smiles: SMILES string of the candidate. Newlines and surrounding whitespace are ignored.

    Returns:
        A tuple ``(status, sa_score, qed_score)``. ``status`` is "Valid SMILES" when the string could be
        parsed and "Invalid SMILES" otherwise; in the invalid case both scores are 0.
    '''
    # NOTE(review): Chem, QED and sascorer must already be in scope when this is called; the visible
    # notebook imports Chem and QED in a later cell and never imports sascorer -- confirm before use.

    # Non-string input (None, NaN from a DataFrame, ...) cannot be a SMILES.
    if not isinstance(smiles, str):
        return "Invalid SMILES", 0, 0

    smiles = smiles.replace("\n", "").strip()
    # Reject empty input up front: RDKit appears to parse "" into an atom-less molecule rather than
    # returning None, which would let a meaningless molecule reach the scorers below.
    if not smiles:
        return "Invalid SMILES", 0, 0

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return "Invalid SMILES", 0, 0

    # Calculate SA score
    sa_score = sascorer.calculateScore(mol)

    # Calculate QED score
    qed_score = QED.qed(mol)
    return "Valid SMILES", sa_score, qed_score

In [15]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import QED

# Define key ADMET properties (plus binding affinity and QED) as {label: column name}
key_properties = {
    'absorption': '[Absorption/Human Oral Bioavailability 50%] Predictions',
    'distribution': '[Distribution/Blood-Brain Barrier] Predictions',
    'metabolism': '[Metabolism/CYP 1A2 Inhibitor] Predictions',
    'excretion': '[Excretion/Half-Life of Drug] Predictions',
    'toxicity': '[Toxicity/Liver Injury I (DILI)] Predictions',
    'binding_affinity':  'Affinity [pKa]', 
    'qed_score': 'QED',
}

# Load property and assessment files
property_file_path = "/home/hoon/dd-agent/llm_dd/examples/BCL-2/property/combined_dummy.csv"
assess_file_path = "/home/hoon/dd-agent/llm_dd/examples/BCL-2/assessment/objective_dummy2.csv"

df_prop = pd.read_csv(property_file_path)
df_assess = pd.read_csv(assess_file_path)

# Merge based on SMILES to ensure alignment
df_merged = pd.merge(df_prop, df_assess[['SMILES', 'LLM_Assessment']], on='SMILES', how='inner')


def _safe_qed(smiles):
    """Return the QED score for a SMILES string, or NaN when it cannot be parsed."""
    mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    # MolFromSmiles returns None for unparsable input; QED.qed cannot score None.
    return QED.qed(mol) if mol is not None else float("nan")


# Add QED column (NaN for rows whose SMILES is missing or invalid)
df_merged['QED'] = df_merged['SMILES'].apply(_safe_qed)

# Split into YES and NO groups. Missing assessments become '' so they fall into
# neither group instead of putting NaN into the boolean masks.
assessment = df_merged['LLM_Assessment'].fillna('').astype(str)
df_yes = df_merged[assessment.str.startswith('YES')]
df_no = df_merged[assessment.str.startswith('NO')]




# Function to extract key properties
def extract_key_properties(df, group_label, properties=None):
    """Print the values of each key property column for one group of molecules.

    Args:
        df: DataFrame holding the rows of the group.
        group_label: Name of the group; shown upper-cased in the printed header.
        properties: Optional mapping of ``{label: column name}`` to report. When
            omitted, the module-level ``key_properties`` mapping is used.

    Returns:
        None. The header and one line per property are written to stdout; a
        property whose column is absent from ``df`` is reported as not found.
    """
    if properties is None:
        # Fall back to the notebook-level mapping so existing two-argument calls still work.
        properties = key_properties

    print(f"\n--- {str(group_label).upper()} GROUP ---")
    for key, column in properties.items():
        if column in df.columns:
            values = df[column].tolist()
            print(f"{key}: {values}")
        else:
            print(f"{key}: Column '{column}' not found.")

# Report the key properties of each assessment group, YES group first
for group_label, group_df in (("yes", df_yes), ("no", df_no)):
    extract_key_properties(group_df, group_label)


--- YES GROUP ---
absorption: ['Bioavailable']
distribution: ['Non-Penetrable']
metabolism: ['Non-Inhibitor']
excretion: ['Half-Life < 3hs']
toxicity: ['Toxic']
binding_affinity: [6.181617736816406]
qed_score: [0.43956485821392066]

--- NO GROUP ---
absorption: ['Bioavailable']
distribution: ['Non-Penetrable']
metabolism: ['Non-Inhibitor']
excretion: ['Half-Life < 3hs']
toxicity: ['Toxic']
binding_affinity: [5.73140811920166]
qed_score: [0.3767950017283207]


# Playground

In [1]:
import os, sys
cwd = os.getcwd()
print("Current working directory:", cwd)
# The directory two levels above the working directory is put on the import path.
home_dir = os.path.dirname(os.path.dirname(cwd))
print("Home directory:", home_dir)
# Guard the append so re-running this cell does not pile up duplicate sys.path entries.
if home_dir not in sys.path:
    sys.path.append(home_dir)

Current working directory: /home/hoon/dd-agent/llm_dd/examples/BCL-2
Home directory: /home/hoon/dd-agent/llm_dd


In [2]:
import pandas as pd
from agentD.analysis_utils import rule_based_evaluation

In [7]:
# CSV file loaded into df for the playground cells.
data_path = "/home/hoon/dd-agent/llm_dd/examples/BCL-2/property/combined_dummy.csv"

# Other input files that have been swapped in for data_path:
#   /home/hoon/dd-agent/llm_dd/examples/BCL-2/assessment/objective_dummy2.csv
#   /home/hoon/dd-agent/llm_dd/examples/BCL-2/property/combined_dummy.csv
#   /home/hoon/dd-agent/llm_dd/examples/BCL-2/downselection/objective_dummy.csv
df = pd.read_csv(filepath_or_buffer=data_path)
df

Unnamed: 0,SMILES,[Absorption/Caco-2 (logPaap)] Predictions,[Absorption/Caco-2 (logPaap)] Interpretation,[Absorption/Human Oral Bioavailability 20%] Predictions,[Absorption/Human Oral Bioavailability 20%] Probability,[Absorption/Human Oral Bioavailability 20%] Interpretation,[Absorption/Human Intestinal Absorption] Predictions,[Absorption/Human Intestinal Absorption] Probability,[Absorption/Human Intestinal Absorption] Interpretation,[Absorption/Madin-Darby Canine Kidney] Predictions,...,[General Properties/Log S] Interpretation,[General Properties/Log(Vapor Pressure)] Predictions,[General Properties/Log(Vapor Pressure)] Interpretation,[General Properties/Melting Point] Predictions,[General Properties/Melting Point] Interpretation,[General Properties/pKa Acid] Predictions,[General Properties/pKa Acid] Interpretation,[General Properties/pKa Basic] Predictions,[General Properties/pKa Basic] Interpretation,Affinity [pKa]
0,O=C(NS(=O)(=O)c1ccc(N2CCN(CCO)CC2)cc1)c1ccc(N2...,-5.66,,Bioavailable,0.686,Bioavailable (Medium Confidence),Absorbed,0.971,Absorbed (High Confidence),-5.29,...,Proper Value: -4 to 0.5 log mol/L,-10.98,Vapor (Gas) Phase: log vp < 4; Vapor and Parti...,180.22,MP<25: liquid; MP>25:solid,2.88,,5.04,,5.731408
1,O=C(NS(=O)(=O)c1ccc(N2CCOCC2)c([N+](=O)[O-])c1...,-5.56,,Bioavailable,0.839,Bioavailable (High Confidence),Absorbed,0.981,Absorbed (High Confidence),-4.98,...,Proper Value: -4 to 0.5 log mol/L,-10.59,Vapor (Gas) Phase: log vp < 4; Vapor and Parti...,220.45,MP<25: liquid; MP>25:solid,2.8,,6.3,,6.181618


In [8]:
df.columns.tolist()

['SMILES',
 '[Absorption/Caco-2 (logPaap)] Predictions',
 '[Absorption/Caco-2 (logPaap)] Interpretation',
 '[Absorption/Human Oral Bioavailability 20%] Predictions',
 '[Absorption/Human Oral Bioavailability 20%] Probability',
 '[Absorption/Human Oral Bioavailability 20%] Interpretation',
 '[Absorption/Human Intestinal Absorption] Predictions',
 '[Absorption/Human Intestinal Absorption] Probability',
 '[Absorption/Human Intestinal Absorption] Interpretation',
 '[Absorption/Madin-Darby Canine Kidney] Predictions',
 '[Absorption/Madin-Darby Canine Kidney] Interpretation',
 '[Absorption/Human Oral Bioavailability 50%] Predictions',
 '[Absorption/Human Oral Bioavailability 50%] Probability',
 '[Absorption/Human Oral Bioavailability 50%] Interpretation',
 '[Absorption/P-Glycoprotein Inhibitor] Predictions',
 '[Absorption/P-Glycoprotein Inhibitor] Probability',
 '[Absorption/P-Glycoprotein Inhibitor] Interpretation',
 '[Absorption/P-Glycoprotein Substrate] Predictions',
 '[Absorption/P-Glycop

In [4]:
# Turn the first row of df into a plain {column name: value} dict.
entry = df.iloc[0].to_dict()
# Evaluate that single entry with the helper imported from agentD.analysis_utils.
# NOTE(review): judging by the output shown below, the helper returns a dict of
# rule-based drug-likeness flags -- confirm against its source.
report = rule_based_evaluation(entry)

In [5]:
# Display the evaluation result computed for the first row.
report

{'smiles': 'O=C(NS(=O)(=O)c1ccc(N2CCN(CCO)CC2)cc1)c1ccc(N2CCOCC2)c([N+](=O)[O-])c1',
 'lipinski_rule_of_5': True,
 'veber_rule': False,
 'ghose_filter': False,
 'rule_of_3': False,
 'pfizer_rule_of_4': False,
 'oprea_lead_like': False,
 'fda_criteria': False,
 'fda_approval_category': 'Low'}