In [3]:
import pandas as pd
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score
from sklearn.model_selection import KFold
import numpy as np
import os
import subprocess
from chemprop.train import run_training
from chemprop.data import get_data
from chemprop.args import TrainArgs, PredictArgs

def save_temp_data(data, file_path, target_column):
    # Ensure SMILES strings are correctly formatted
    df = pd.DataFrame([(d.smiles[0].strip('[]').strip("'"), d.targets[0]) for d in data], columns=['smiles', target_column])
    df.to_csv(file_path, index=False)

def debug_csv(file_path):
    # Print out the contents of the CSV file for debugging
    if os.path.exists(file_path):
        df = pd.read_csv(file_path)
        print(f"Contents of {file_path}:")
        print(df.head())
        print(f"Number of rows: {len(df)}")
    else:
        print(f"File {file_path} does not exist.")

def train_chemprop(train_data_path, save_dir, target_column, num_folds=5, num_trials=2):
    metrics = {'auc': [], 'f1': [], 'accuracy': []}

    for trial in range(num_trials):
        print(f"Starting trial {trial+1}/{num_trials}...")
        
        # Load the dataset
        data = get_data(path=train_data_path, skip_none_targets=True)

        # Extract features and targets
        targets = np.array([d.targets for d in data]).flatten()
        features = [d.smiles for d in data]

        kf = KFold(n_splits=num_folds, shuffle=True, random_state=trial)

        fold_metrics = {'auc': [], 'f1': [], 'accuracy': []}

        for fold, (train_idx, val_idx) in enumerate(kf.split(features)):
            print(f"  Starting fold {fold+1}/{num_folds}...")

            # Split data into training and validation sets
            train_data = [data[i] for i in train_idx]
            val_data = [data[i] for i in val_idx]

            # Save temporary train and validation data
            temp_train_path = f'temp_train_trial_{trial}_fold_{fold}.csv'
            temp_val_path = f'temp_val_trial_{trial}_fold_{fold}.csv'

            save_temp_data(train_data, temp_train_path, target_column)
            save_temp_data(val_data, temp_val_path, target_column)

            # Debug CSV files
            debug_csv(temp_train_path)
            debug_csv(temp_val_path)

            # Define the command to run training
            command = [
                'chemprop_train',
                '--data_path', temp_train_path,
                '--dataset_type', 'classification',
                '--save_dir', os.path.join(save_dir, f'trial_{trial}_fold_{fold}'),
                '--target_columns', target_column,
                '--epochs', '30',
                '--batch_size', '50',
            ]

            try:
                # Run the training command
                subprocess.run(command, check=True)

                # Define the command to run prediction
                predict_command = [
                    'chemprop_predict',
                    '--test_path', temp_val_path,
                    '--checkpoint_dir', os.path.join(save_dir, f'trial_{trial}_fold_{fold}'),
                    '--preds_path', f'preds_trial_{trial}_fold_{fold}.csv'
                ]

                # Run the prediction command
                subprocess.run(predict_command, check=True)

                # Debug prediction file
                debug_csv(f'preds_trial_{trial}_fold_{fold}.csv')

                # Calculate metrics
                preds_df = pd.read_csv(f'preds_trial_{trial}_fold_{fold}.csv')
                # Check the column names and choose the correct one for predictions
                print(f"Prediction file columns: {preds_df.columns}")

                # Replace 'preds' with the actual column name for predictions
                pred_column_name = 'preds'  # Change if necessary based on actual column name
                if pred_column_name not in preds_df.columns:
                    raise KeyError(f"Column '{pred_column_name}' not found in predictions file.")

                # Convert predictions to float
                preds = preds_df[pred_column_name].astype(float)
                true_labels = [d.targets[0] for d in val_data]
                
                # Binarize predictions
                threshold = 0.5
                binary_preds = [1 if x > threshold else 0 for x in preds]

                auc = roc_auc_score(true_labels, preds)
                f1 = f1_score(true_labels, binary_preds)
                accuracy = accuracy_score(true_labels, binary_preds)
                
                fold_metrics['auc'].append(auc)
                fold_metrics['f1'].append(f1)
                fold_metrics['accuracy'].append(accuracy)

            except subprocess.CalledProcessError as e:
                print(f"Error during training or prediction for trial {trial} fold {fold}: {e}")
                continue
            except KeyError as e:
                print(f"KeyError: {e}")
                continue
            except TypeError as e:
                print(f"TypeError: {e}")
                continue

            # Clean up temporary files
            os.remove(temp_train_path)
            os.remove(temp_val_path)
            os.remove(f'preds_trial_{trial}_fold_{fold}.csv')

        # Record the average metrics for this trial
        metrics['auc'].append(np.mean(fold_metrics['auc']))
        metrics['f1'].append(np.mean(fold_metrics['f1']))
        metrics['accuracy'].append(np.mean(fold_metrics['accuracy']))

    # Calculate the average of each metric over all trials
    avg_metrics = {key: np.mean(values) for key, values in metrics.items()}

    return avg_metrics

def main():
    data_path = 'data/TG2_cleaned_smiles_activity.csv'  # Update this with your data path
    save_dir = 'acheckp'  # Update this with your checkpoint directory
    target_column = 'Activity'  # Update this with your target column name

    metrics = train_chemprop(data_path, save_dir, target_column)

    print('Average metrics over 50 trials of 5-fold cross-validation:')
    print(f"AUC: {metrics['auc']}")
    print(f"F1 Score: {metrics['f1']}")
    print(f"Accuracy: {metrics['accuracy']}")

if __name__ == "__main__":
    main()


Starting trial 1/2...


336it [00:00, 230162.69it/s]
100%|██████████| 336/336 [00:00<00:00, 133026.82it/s]
100%|██████████| 336/336 [00:00<00:00, 100807.31it/s]

  Starting fold 1/5...
Contents of temp_train_trial_0_fold_0.csv:
                                              smiles  Activity
0  COC(=O)c1ccc(COC(=O)N2CCN(C(=O)CNC(=O)CBr)CC2)cc1       1.0
1  O=C(CBr)NCC(=O)N1CCN(C(=O)C23CC4CC(CC(C4)C2)C3...       1.0
2  C=CC(=O)NCC(=O)N1CCN(S(=O)(=O)c2cccc3c(N(C)C)c...       1.0
3  O=C(CBr)NCC(=O)N1CCN(C(=O)OCC23CC4CC(CC(C4)C2)...       1.0
4  COC(=O)c1ccc2cc(COC(=O)N3CCN(C(=O)CNC(=O)CCl)C...       1.0
Number of rows: 268
Contents of temp_val_trial_0_fold_0.csv:
                                              smiles  Activity
0                  NC(=O)C1OC1C(=O)Nc1ccc(Br)c(Cl)c1       1.0
1         C=c1[nH]c2ccc(I)cc2c1=Cc1cc(Br)c(O)c(Br)c1       1.0
2  O=C(CCOc1ccccc1)N1CCN(C(=O)n2sc3c(ccc[n+]3[O-]...       1.0
3  COC(=O)C=CCCC(NC(=O)c1ccc([N+](=O)[O-])cc1C(=O...       1.0
4  COC(=O)C=CCCC(NC(=O)c1cc([N+](=O)[O-])ccc1C(=O...       1.0
Number of rows: 68



Command line
python /Library/Frameworks/Python.framework/Versions/3.10/bin/chemprop_train --data_path temp_train_trial_0_fold_0.csv --dataset_type classification --save_dir acheckp/trial_0_fold_0 --target_columns Activity --epochs 30 --batch_size 50
Args
{'activation': 'ReLU',
 'adding_bond_types': True,
 'adding_h': False,
 'aggregation': 'mean',
 'aggregation_norm': 100,
 'atom_constraints': [],
 'atom_descriptor_scaling': True,
 'atom_descriptors': None,
 'atom_descriptors_path': None,
 'atom_descriptors_size': 0,
 'atom_features_size': 0,
 'atom_messages': False,
 'atom_targets': [],
 'batch_size': 50,
 'bias': False,
 'bias_solvent': False,
 'bond_constraints': [],
 'bond_descriptor_scaling': True,
 'bond_descriptors': None,
 'bond_descriptors_path': None,
 'bond_descriptors_size': 0,
 'bond_features_size': 0,
 'bond_targets': [],
 'cache_cutoff': 10000,
 'checkpoint_dir': None,
 'checkpoint_frzn': None,
 'checkpoint_path': None,
 'checkpoint_paths': None,
 'class_balance': False

Loading training args
Setting molecule featurization parameters to default.
Loading data
Validating SMILES
Test size = 66


68it [00:00, 225821.59it/s]
100%|██████████| 68/68 [00:00<00:00, 157315.32it/s]
[22:27:05] SMILES Parse Error: syntax error while parsing: CC[S+](CC)CC(=O)CCC(NC(=O)C(Cc1ccccc1)NC(=O)OCc1ccccc1)C(=O)O.[Br-
[22:27:05] SMILES Parse Error: Failed parsing SMILES 'CC[S+](CC)CC(=O)CCC(NC(=O)C(Cc1ccccc1)NC(=O)OCc1ccccc1)C(=O)O.[Br-' for input: 'CC[S+](CC)CC(=O)CCC(NC(=O)C(Cc1ccccc1)NC(=O)OCc1ccccc1)C(=O)O.[Br-'
[22:27:05] SMILES Parse Error: syntax error while parsing: Cc1c(C)[n+](C)c(SCC(=O)CCC(NC(=O)CNC(=O)OCc2ccccc2)C(=O)O)n1Cc1ccccc1.[Br-
[22:27:05] SMILES Parse Error: Failed parsing SMILES 'Cc1c(C)[n+](C)c(SCC(=O)CCC(NC(=O)CNC(=O)OCc2ccccc2)C(=O)O)n1Cc1ccccc1.[Br-' for input: 'Cc1c(C)[n+](C)c(SCC(=O)CCC(NC(=O)CNC(=O)OCc2ccccc2)C(=O)O)n1Cc1ccccc1.[Br-'
  0%|          | 0/1 [00:00<?, ?it/s]
  0%|          | 0/2 [00:00<?, ?it/s][A

Loading pretrained parameter "encoder.encoder.0.cached_zero_vector".
Loading pretrained parameter "encoder.encoder.0.W_i.weight".
Loading pretrained parameter "encoder.encoder.0.W_h.weight".
Loading pretrained parameter "encoder.encoder.0.W_o.weight".
Loading pretrained parameter "encoder.encoder.0.W_o.bias".
Loading pretrained parameter "readout.1.weight".
Loading pretrained parameter "readout.1.bias".
Loading pretrained parameter "readout.4.weight".
Loading pretrained parameter "readout.4.bias".



 50%|█████     | 1/2 [00:04<00:04,  4.55s/it][A
100%|██████████| 1/1 [00:14<00:00, 14.65s/it][A


Saving predictions to preds_trial_0_fold_0.csv
Elapsed time = 0:00:15
Contents of preds_trial_0_fold_0.csv:
                                              smiles            Activity
0                  NC(=O)C1OC1C(=O)Nc1ccc(Br)c(Cl)c1   0.578286349773407
1         C=c1[nH]c2ccc(I)cc2c1=Cc1cc(Br)c(O)c(Br)c1  0.2517865002155304
2  O=C(CCOc1ccccc1)N1CCN(C(=O)n2sc3c(ccc[n+]3[O-]...   0.832227885723114
3  COC(=O)C=CCCC(NC(=O)c1ccc([N+](=O)[O-])cc1C(=O...  0.9359205365180969
4  COC(=O)C=CCCC(NC(=O)c1cc([N+](=O)[O-])ccc1C(=O...  0.9444336295127869
Number of rows: 68
Prediction file columns: Index(['smiles', 'Activity'], dtype='object')
KeyError: "Column 'preds' not found in predictions file."
  Starting fold 2/5...
Contents of temp_train_trial_0_fold_1.csv:
                                              smiles  Activity
0  COC(=O)c1ccc(COC(=O)N2CCN(C(=O)CNC(=O)CBr)CC2)cc1       1.0
1  C=CC(=O)NCC(=O)N1CCN(S(=O)(=O)c2cccc3c(N(C)C)c...       1.0
2  O=C(CBr)NCC(=O)N1CCN(C(=O)OCC23CC4CC(CC(C4)C2)..

Command line
python /Library/Frameworks/Python.framework/Versions/3.10/bin/chemprop_train --data_path temp_train_trial_0_fold_1.csv --dataset_type classification --save_dir acheckp/trial_0_fold_1 --target_columns Activity --epochs 30 --batch_size 50
Args
{'activation': 'ReLU',
 'adding_bond_types': True,
 'adding_h': False,
 'aggregation': 'mean',
 'aggregation_norm': 100,
 'atom_constraints': [],
 'atom_descriptor_scaling': True,
 'atom_descriptors': None,
 'atom_descriptors_path': None,
 'atom_descriptors_size': 0,
 'atom_features_size': 0,
 'atom_messages': False,
 'atom_targets': [],
 'batch_size': 50,
 'bias': False,
 'bias_solvent': False,
 'bond_constraints': [],
 'bond_descriptor_scaling': True,
 'bond_descriptors': None,
 'bond_descriptors_path': None,
 'bond_descriptors_size': 0,
 'bond_features_size': 0,
 'bond_targets': [],
 'cache_cutoff': 10000,
 'checkpoint_dir': None,
 'checkpoint_frzn': None,
 'checkpoint_path': None,
 'checkpoint_paths': None,
 'class_balance': False,

Loading training args
Setting molecule featurization parameters to default.
Loading data
Validating SMILES
Test size = 64


67it [00:00, 155775.15it/s]
100%|██████████| 67/67 [00:00<00:00, 158857.19it/s]
[22:27:36] SMILES Parse Error: syntax error while parsing: CC[S+](CC)CC(=O)CCC(NC(=O)CNC(=O)OCc1ccccc1)C(=O)O.[Br-
[22:27:36] SMILES Parse Error: Failed parsing SMILES 'CC[S+](CC)CC(=O)CCC(NC(=O)CNC(=O)OCc1ccccc1)C(=O)O.[Br-' for input: 'CC[S+](CC)CC(=O)CCC(NC(=O)CNC(=O)OCc1ccccc1)C(=O)O.[Br-'
[22:27:36] SMILES Parse Error: syntax error while parsing: CCCn1c(C)c(C)[n+](C)c1SCC(=O)CCC(NC(=O)CNC(=O)OCc1ccccc1)C(=O)O.[Br-
[22:27:36] SMILES Parse Error: Failed parsing SMILES 'CCCn1c(C)c(C)[n+](C)c1SCC(=O)CCC(NC(=O)CNC(=O)OCc1ccccc1)C(=O)O.[Br-' for input: 'CCCn1c(C)c(C)[n+](C)c1SCC(=O)CCC(NC(=O)CNC(=O)OCc1ccccc1)C(=O)O.[Br-'
[22:27:36] SMILES Parse Error: syntax error while parsing: Cc1c(C)[n+](C)c(SCC(=O)CCC(NC(=O)C(Cc2ccccc2)NC(=O)OCc2ccccc2)C(=O)O)n1-c1ccccc1.[Br-
[22:27:36] SMILES Parse Error: Failed parsing SMILES 'Cc1c(C)[n+](C)c(SCC(=O)CCC(NC(=O)C(Cc2ccccc2)NC(=O)OCc2ccccc2)C(=O)O)n1-c1ccccc1.[Br-' for i

Loading pretrained parameter "encoder.encoder.0.cached_zero_vector".
Loading pretrained parameter "encoder.encoder.0.W_i.weight".
Loading pretrained parameter "encoder.encoder.0.W_h.weight".
Loading pretrained parameter "encoder.encoder.0.W_o.weight".
Loading pretrained parameter "encoder.encoder.0.W_o.bias".
Loading pretrained parameter "readout.1.weight".
Loading pretrained parameter "readout.1.bias".
Loading pretrained parameter "readout.4.weight".
Loading pretrained parameter "readout.4.bias".



 50%|█████     | 1/2 [00:04<00:04,  4.20s/it][A
  0%|          | 0/1 [00:10<?, ?it/s]        [A
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.10/bin/chemprop_predict", line 8, in <module>
    sys.exit(chemprop_predict())
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/chemprop/train/make_predictions.py", line 506, in chemprop_predict
    make_predictions(args=PredictArgs().parse_args())
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/chemprop/utils.py", line 591, in wrap
    result = func(*args, **kwargs)
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/chemprop/train/make_predictions.py", line 462, in make_predictions
    preds, unc = predict_and_save(
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/chemprop/train/make_predictions.py", line 160, in predict_and_save
    estimator = 

KeyboardInterrupt: 

In [2]:
import pandas as pd
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score
import os
import numpy as np

def evaluate_predictions(predictions_path, test_path):
    # Load predictions and testing data
    preds_df = pd.read_csv(predictions_path)
    test_df = pd.read_csv(test_path)
    # preds_df["Activity2"] = preds_df["Activity"]
    # preds_df.drop(["Activity"], axis = 1)

    

    # Ensure the data has 'smiles' and 'preds' columns for predictions and 'smiles' and 'Activity' for testing
    if 'smiles' not in preds_df.columns or 'Activity2' not in preds_df.columns:
        raise ValueError("Prediction file must contain 'smiles' and 'Activity2' columns.")
    if 'smiles' not in test_df.columns or 'Activity' not in test_df.columns:
        raise ValueError("Testing file must contain 'smiles' and 'Activity' columns.")

    # Filter out rows with 'Invalid Smiles' in predictions
    preds_df = preds_df[preds_df['Activity2'] != 'Invalid Smiles']
    
    # Convert valid predictions to numeric
    preds_df['Activity2'] = pd.to_numeric(preds_df['Activity2'], errors='coerce')

    # Merge predictions with the testing data on 'smiles'
    merged_df = pd.merge(test_df, preds_df, on='smiles', how='inner')

    # Check if we have any valid data after merging
    if merged_df.empty:
        raise ValueError("No valid predictions found after merging.")

    # Extract true labels and predicted values
    true_labels = merged_df['Activity']
    preds = merged_df['Activity2'].astype(float)

    # Binarize predictions
    threshold = 0.5
    binary_preds = (preds > threshold).astype(int)

    # Calculate metrics
    print(true_labels)
    print(binary_preds)
    auc = roc_auc_score(true_labels, binary_preds)
    f1 = f1_score(true_labels, binary_preds)
    accuracy = accuracy_score(true_labels, binary_preds)

    return auc, f1, accuracy

def main():
    results = []

    # Loop through trials and folds
    num_trials = 3
    num_folds = 5

    for trial in range(num_trials):
        for fold in range(num_folds):
            pred_file = f'preds_trial_{trial}_fold_{fold}.csv'
            test_file = f'temp_val_trial_{trial}_fold_{fold}.csv'

            if os.path.exists(pred_file) and os.path.exists(test_file):
                print(f"Evaluating trial {trial}, fold {fold}...")

                auc, f1, accuracy = evaluate_predictions(pred_file, test_file)
                results.append({
                        'trial': trial,
                        'fold': fold,
                        'AUC': auc,
                        'F1': f1,
                        'Accuracy': accuracy
                })
    
    # Convert results to DataFrame
    results_df = pd.DataFrame(results)

    # Calculate average metrics
    avg_results = results_df.groupby(['trial']).mean().reset_index()
    avg_results.to_csv('average_metrics.csv', index=False)

    print("Evaluation complete. Results saved to 'average_metrics.csv'.")

if __name__ == "__main__":
    main()



KeyError: 'trial'