In [1]:
# import packages
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from typing import List
from rdkit import DataStructs, Chem
from rdkit.Chem import MolFromSmiles, AllChem
from rdkit.DataStructs.cDataStructs import ExplicitBitVect
from sklearn import gaussian_process
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Kernel


In [2]:
def select_next_batch(file1, file2, output_csv, strategy, top_n=None):
    """
    Rank the compounds of file1 that also occur in file2 and save the best ones.

    The two CSV files are inner-joined on the 'SMILES' column, the joined rows
    are ranked according to the chosen strategy, and the first `top_n` rows are
    written to `output_csv` (without the index).

    Parameters:
    - file1 (str): Path to the first CSV file. Must contain 'SMILES' and the
      ranking column ('Uncertainty' or 'Predicted_Score').
    - file2 (str): Path to the second CSV file. Must contain 'SMILES'.
    - output_csv (str): Path to the output CSV file to save the selected compounds.
    - strategy (str): 'uncertain' ranks by 'Uncertainty' (largest first);
      'greedy' ranks by 'Predicted_Score' (smallest first).
    - top_n (int or None): Number of top compounds to select. If None, selects all (default is None).

    Raises:
    - ValueError: If `strategy` is not 'uncertain' or 'greedy'. This is checked
      before any file is read so a typo fails immediately.
    """
    # Resolve the ranking rule first: fail fast instead of after loading two files.
    if strategy == 'uncertain':
        sort_column, ascending = 'Uncertainty', False
    elif strategy == 'greedy':
        sort_column, ascending = 'Predicted_Score', True
    else:
        raise ValueError("Invalid strategy. Use 'uncertain' or 'greedy'.")

    # Load the two CSV files
    df1 = pd.read_csv(file1)
    df2 = pd.read_csv(file2)

    # Keep only the rows whose SMILES occurs in both files
    merged_df = pd.merge(df1, df2, how='inner', on='SMILES')

    # When both inputs carry a 'Name' column pandas emits 'Name_x'/'Name_y';
    # keep the one from file1. errors='ignore' tolerates inputs without 'Name'.
    merged_df = (
        merged_df
        .drop(columns=['Name_y'], errors='ignore')
        .rename(columns={'Name_x': 'Name'})
    )

    # The join on SMILES is many-to-many: several names sharing one SMILES
    # would otherwise appear once per partner row and crowd the selected batch
    # with repeats of the same compound.
    merged_df = merged_df.drop_duplicates()

    sorted_results = merged_df.sort_values(by=sort_column, ascending=ascending)

    # Select the top N compounds, or all if top_n is None
    top_compounds = sorted_results if top_n is None else sorted_results.head(top_n)

    # Save the selected compounds to a new CSV file
    top_compounds.to_csv(output_csv, index=False)

In [3]:
def select_scores(input_csv_path, cmpds_csv_path, output_csv_path):
    """
    Look up the scores of a batch of compounds and save them as a training table.

    Both CSV files are inner-joined on 'Name'. The 'Name', 'SMILES' (taken from
    the first file) and 'Score' columns of the joined rows are written to
    `output_csv_path`. Both inputs must provide a 'SMILES' column, because the
    join suffix 'SMILES_x' is selected explicitly.

    Parameters:
    - input_csv_path (str): CSV with at least 'Name', 'SMILES' and 'Score'.
    - cmpds_csv_path (str): CSV with at least 'Name' and 'SMILES'.
    - output_csv_path (str): Destination CSV ('Name', 'SMILES', 'Score').
    """
    scored_library = pd.read_csv(input_csv_path)
    batch = pd.read_csv(cmpds_csv_path)

    # Inner join keeps only the compounds named in both files
    joined = scored_library.merge(batch, how='inner', on='Name')

    # Diagnostic output: which columns survived the join
    print("Columns of merged DataFrame:")
    print(joined.columns)

    wanted_columns = ['Name', 'SMILES_x', 'Score']
    subset = joined[wanted_columns]

    # Diagnostic output: the rows about to be saved (still with the join suffix)
    print("\nSelected DataFrame:")
    print(subset)

    # Strip the join suffix on the way out
    subset.rename(columns={'SMILES_x': 'SMILES'}).to_csv(output_csv_path, index=False)

    print(f"Selected compounds saved to {output_csv_path}")

In [4]:
def append_descriptors_to_csv(compounds_csv_path, descriptors_csv_path):
    """
    Add the descriptor columns of each compound to a compounds CSV, in place.

    The compounds table is left-joined to the descriptors table on 'Name'.
    Columns that exist in both tables keep the compounds-table version; the
    descriptor-side copy (suffixed '_descriptor' by the join) is discarded.
    The result overwrites `compounds_csv_path`.

    Parameters:
    - compounds_csv_path (str): CSV with at least 'Name' and 'Score'. It is
      read and then overwritten with the joined table.
    - descriptors_csv_path (str): CSV with 'Name' plus the descriptor columns.

    Raises:
    - KeyError: If the joined table has no 'Score' column.
    """
    import warnings

    # Read the main compounds CSV file
    compounds_df = pd.read_csv(compounds_csv_path)

    # Read the descriptors CSV file
    descriptors_df = pd.read_csv(descriptors_csv_path)

    # A left join keeps compounds that have no descriptor row and fills their
    # descriptor columns with NaN; surface that instead of failing later.
    unmatched = ~compounds_df['Name'].isin(descriptors_df['Name'])
    if unmatched.any():
        warnings.warn(
            f"{int(unmatched.sum())} compound(s) in {compounds_csv_path} have no "
            f"row in {descriptors_csv_path}; their descriptor columns will be NaN."
        )

    # Merge the two DataFrames based on the 'Name' column
    merged_df = pd.merge(compounds_df, descriptors_df, on='Name', how='left', suffixes=('', '_descriptor'))

    # The table is expected to carry the target column.
    if 'Score' not in merged_df.columns:
        raise KeyError('Score')

    # Discard the descriptor-side copies of overlapping columns. Descriptor
    # columns that did not collide are already in place after the join, so no
    # reordering is needed.
    duplicate_columns = [col for col in merged_df.columns if '_descriptor' in col]
    merged_df = merged_df.drop(columns=duplicate_columns)

    # Save the updated DataFrame to the same CSV file
    merged_df.to_csv(compounds_csv_path, index=False)

    print(f"Descriptors appended to {compounds_csv_path}")

Set up Gaussian Process Regressor model

In [5]:
# Function to calculate the tanimoto similarity for the Gaussian process kernel prediction
def tanimoto_similarity(a, b):
    """Computes the Tanimoto similarity for all pairs of rows.

    For rows x of `a` and y of `b` the similarity is
    <x, y> / (sum(x) + sum(y) - <x, y>). For 0/1 fingerprints this is
    |x AND y| / |x OR y|.

    Args:
      a: Array-like with shape [batch_size_a, num_features].
      b: Array-like with shape [batch_size_b, num_features].

    Returns:
      Float numpy array with shape [batch_size_a, batch_size_b]. Pairs whose
      denominator is zero (both rows all-zero) are assigned similarity 1.0, so
      a row is always fully similar to itself and no NaN is produced.
    """
    # Work in float so bool/int fingerprints are summed and multiplied
    # arithmetically rather than with logical/integer semantics.
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)

    aa = np.sum(a, axis=1, keepdims=True)
    bb = np.sum(b, axis=1, keepdims=True)
    ab = np.matmul(a, b.T)
    denominator = aa + bb.T - ab

    # Pre-fill with 1.0 and divide only where the denominator is non-zero;
    # this avoids 0/0 -> NaN for pairs of empty fingerprints.
    similarity = np.ones_like(denominator, dtype=float)
    np.divide(ab, denominator, out=similarity, where=denominator != 0)
    return similarity

In [6]:
class TanimotoKernel(gaussian_process.kernels.NormalizedKernelMixin,
                     gaussian_process.kernels.StationaryKernelMixin,
                     gaussian_process.kernels.Kernel):
    """Gaussian process kernel whose value is the pairwise Tanimoto similarity.

    The kernel defines no hyperparameters; evaluating it simply delegates to
    `tanimoto_similarity`.
    """

    def __init__(self):
        """Takes no arguments; an explicit __init__ is required by get_params()."""

    def __call__(self, X, Y=None, eval_gradient=False):  # pylint: disable=invalid-name
        """Evaluates the kernel on every pair of rows.

        Args:
          X: Numpy array with shape [batch_size_a, num_features].
          Y: Numpy array with shape [batch_size_b, num_features]. When omitted,
            X is compared against itself.
          eval_gradient: Must be False; gradients are not available.

        Returns:
          Numpy array with shape [batch_size_a, batch_size_b].

        Raises:
          NotImplementedError: If eval_gradient is True.
        """
        if eval_gradient:
            raise NotImplementedError
        right_operand = X if Y is None else Y
        return tanimoto_similarity(X, right_operand)

In [7]:
def train_gpr_model(csv_file_path, alpha=1e-10, normalize_y=False):
    """
    Fit a Gaussian process regressor with the Tanimoto kernel on a training CSV.

    Every column other than 'Name', 'SMILES' and 'Score' is used as a feature;
    'Score' is the regression target.

    Parameters:
    - csv_file_path (str): Training CSV with 'Name', 'SMILES', 'Score' and the
      feature columns.
    - alpha (float): Passed to GaussianProcessRegressor; value added to the
      diagonal of the kernel matrix during fitting. The default matches
      scikit-learn's default. A larger value can help when several training
      rows have identical features, which makes the kernel matrix singular.
    - normalize_y (bool): Passed to GaussianProcessRegressor; the default
      (False) matches scikit-learn's default.

    Returns:
    - The fitted GaussianProcessRegressor.
    """
    # Load data from CSV
    data = pd.read_csv(csv_file_path)

    # Everything that is not an identifier or the target is a feature column
    X_train = data.drop(columns=['Name', 'SMILES', 'Score']).to_numpy()

    # Target values which in this case are the docking scores for the training data
    y_train = data['Score']

    # TanimotoKernel exposes no hyperparameters, so there is nothing for the
    # optimizer to tune; optimizer restarts would be skipped and are not requested.
    gpr = GaussianProcessRegressor(
        kernel=TanimotoKernel(),
        alpha=alpha,
        normalize_y=normalize_y,
    )

    # fit() returns the estimator itself
    return gpr.fit(X_train, y_train)

In [8]:
def remove_train_compounds(input_csv_path, compounds_to_remove_csv_path, output_csv_path):
    """
    Write the compounds of one CSV that are not named in another CSV.

    Rows of `input_csv_path` whose 'Name' appears in the 'Name' column of
    `compounds_to_remove_csv_path` are discarded. Only the 'Name' and 'SMILES'
    columns of the remaining rows are saved to `output_csv_path`.
    """
    candidate_pool = pd.read_csv(input_csv_path)
    names_to_exclude = pd.read_csv(compounds_to_remove_csv_path)['Name']

    # Boolean mask of the rows that survive the exclusion
    is_remaining = ~candidate_pool['Name'].isin(names_to_exclude)

    candidate_pool.loc[is_remaining, ['Name', 'SMILES']].to_csv(output_csv_path, index=False)

    print(f"Compounds removed and remaining Name and SMILES saved to {output_csv_path}")

In [9]:
def predict_and_save_results(gpr, csv_file, output_csv):
    """
    Predict a score and its standard deviation for every row of a CSV.

    Every column other than 'Name' and 'SMILES' is passed to
    `gpr.predict(..., return_std=True)` as a feature. The output CSV holds
    'Name', 'SMILES', 'Predicted_Score' (the first returned array) and
    'Uncertainty' (the second returned array).

    Parameters:
    - gpr: Fitted model exposing `predict(X, return_std=True)`.
    - csv_file (str): CSV with 'Name', 'SMILES' and the feature columns.
    - output_csv (str): Destination CSV for the predictions.
    """
    library = pd.read_csv(csv_file)

    # Feature matrix: all non-identifier columns
    features = library.drop(columns=['Name', 'SMILES']).to_numpy()

    predicted_mean, predicted_std = gpr.predict(features, return_std=True)

    # Build the report from the identifiers plus the two prediction columns
    report = library[['Name', 'SMILES']].assign(
        Predicted_Score=predicted_mean,
        Uncertainty=predicted_std,
    )
    report.to_csv(output_csv, index=False)

In [10]:
def append_train_compounds(file1_path, file2_path, output_path):
    """
    Concatenates two CSV files with the same format and saves the result to a new CSV file.

    Rows that are exact duplicates of an earlier row are written only once.
    This keeps the function safe to re-run when `output_path` is the same file
    as one of the inputs: a second run would otherwise append rows that are
    already present.

    Parameters:
    - file1_path (str): Path to the first CSV file.
    - file2_path (str): Path to the second CSV file.
    - output_path (str): Path to save the concatenated CSV file. May be the
      same path as one of the inputs; both inputs are fully read before writing.
    """
    # Load the data from both CSV files
    data1 = pd.read_csv(file1_path)
    data2 = pd.read_csv(file2_path)

    # Stack file1 on top of file2, then keep the first occurrence of each
    # identical row so repeated runs do not accumulate copies.
    concatenated_data = pd.concat([data1, data2], ignore_index=True).drop_duplicates()

    # Save the concatenated DataFrame to a new CSV file
    concatenated_data.to_csv(output_path, index=False)

    print(f"Concatenated compounds saved to {output_path}")

In [11]:
def calculate_recall(predictions_file, binders_file, top_n=2000, on_column='Name'):
    """
    Fraction of the reference compounds found among the top-ranked predictions.

    The predictions are ranked by 'Predicted_Score' in ascending order and the
    first `top_n` rows are kept. Recall is the number of distinct `on_column`
    values of the binders file that occur in that selection, divided by the
    number of distinct `on_column` values in the binders file.

    Parameters:
    - predictions_file (str): CSV with 'Predicted_Score' and `on_column`.
    - binders_file (str): CSV with `on_column`, listing the reference compounds.
    - top_n (int): How many of the lowest-scoring predictions to consider.
    - on_column (str): Identifier column used to match the two files.

    Returns:
    - float: Recall in [0, 1]. Returns 0.0 when the binders file lists no
      compounds, instead of dividing by zero.
    """
    # Load the predicted results and keep the top_n lowest predicted scores
    predicted_results = pd.read_csv(predictions_file)
    top_compounds = predicted_results.sort_values(by='Predicted_Score', ascending=True).head(top_n)

    # Distinct identifiers of the reference compounds
    binders = set(pd.read_csv(binders_file)[on_column])
    if not binders:
        # Nothing to retrieve: recall is undefined, report 0.0.
        return 0.0

    # True positives: reference compounds that made it into the selection
    retrieved = binders.intersection(set(top_compounds[on_column]))

    # true positives + false negatives is exactly the number of binders
    return len(retrieved) / len(binders)

_________________________________________________________________________________________________________

In [13]:
# Unblind the scores for these 100 compounds and save as training set.
input_csv_path = '../../../../7nsw_all_hybrid.csv'
cmpds_csv_path = 'round0_100_ecfp.csv'
output_csv_path = 'round0_100_train_cmpds.csv'

select_scores(input_csv_path, cmpds_csv_path, output_csv_path)

Columns of merged DataFrame:
Index(['Name', 'SMILES_x', 'Score', 'SMILES_y', 'Morgan_bit0', 'Morgan_bit1',
       'Morgan_bit2', 'Morgan_bit3', 'Morgan_bit4', 'Morgan_bit5',
       ...
       'Morgan_bit2038', 'Morgan_bit2039', 'Morgan_bit2040', 'Morgan_bit2041',
       'Morgan_bit2042', 'Morgan_bit2043', 'Morgan_bit2044', 'Morgan_bit2045',
       'Morgan_bit2046', 'Morgan_bit2047'],
      dtype='object', length=2052)

Selected DataFrame:
               Name                                           SMILES_x  \
0   NCGC00373731-01            c1cc(ccc1C[NH+]2CC(C2)Oc3ccc(cc3F)F)C#N   
1   NCGC00374118-01                                    C[NH2+]C1CCOCC1   
2   NCGC00374337-01                   c1ccc(cc1)[C@H](C[NH3+])N2CCOCC2   
3   NCGC00374953-01           c1cc(cnc1)c2c3cnccn3c(n2)C(=O)NCc4ccncc4   
4   NCGC00375044-01    c1ccc(cc1)COC(=O)N2CCn3c(c(cn3)C(=O)N4CCOCC4)C2   
..              ...                                                ...   
95  NCGC00473936-01  Cc1cc(ccc1Cl)OCC(=

In [14]:
# Extract the descriptors and append them to the training set.
compounds_csv_path = 'round0_100_train_cmpds.csv'
descriptors_csv_path = '../../../../docked_ecfp.csv'

append_descriptors_to_csv(compounds_csv_path, descriptors_csv_path)

Descriptors appended to round0_100_train_cmpds.csv


In [15]:
# Train our model using this training file.
csv_file_path = 'round0_100_train_cmpds.csv'
trained_gpr = train_gpr_model(csv_file_path)

In [16]:
# Remove the training set from the test set
# Save the remainder of compounds as next rounds test set
input_csv_path = '../../../../docked_ecfp.csv' #the whole library
compounds_to_remove_csv_path = 'round0_100_train_cmpds.csv' #initially chose from tsne
output_csv_path = 'round0_100_test_cmpds.csv' 

remove_train_compounds(input_csv_path, compounds_to_remove_csv_path, output_csv_path) #removing training set from overall library

Compounds removed and remaining Name and SMILES saved to round0_100_test_cmpds.csv


In [17]:
# Predict on the entire library
csv_file_to_predict = '../../../../docked_ecfp.csv'
output_csv_file = 'round0_100_predicted_results.csv'
predict_and_save_results(trained_gpr, csv_file_to_predict, output_csv_file)

In [20]:
# Pick the next 100 train compounds based on highest uncertainty
file1 = 'round0_100_predicted_results.csv' #predicted from the model
file2 = 'round0_100_test_cmpds.csv' #compounds - training set
output_csv = 'round1_100_cmpds.csv' #next
strategy = 'uncertain'  # or 'greedy'
top_n = 100  # or None for all

select_next_batch(file1, file2, output_csv, strategy, top_n)

In [21]:
# Calculate recall for the round 0 model's predictions
predictions_file_path = 'round0_100_predicted_results.csv'
binders_file_path = '../../../../binders_docking.csv'
# Calculate Recall
recall_value = calculate_recall(predictions_file_path, binders_file_path, top_n=2000)
print("Recall:", recall_value)

Recall: 0.108


ROUND 1

In [22]:
# Unblind the scores for these 100 compounds and save as training set.
input_csv_path = '../../../../7nsw_all_hybrid.csv'
cmpds_csv_path = 'round1_100_cmpds.csv'
output_csv_path = 'round1_100_train_cmpds.csv'

select_scores(input_csv_path, cmpds_csv_path, output_csv_path)

Columns of merged DataFrame:
Index(['Name', 'SMILES_x', 'Score', 'SMILES_y', 'Predicted_Score',
       'Uncertainty'],
      dtype='object')

Selected DataFrame:
               Name                                           SMILES_x  \
0   NCGC00354182-02                         C1CCC(CC1)(C[NH3+])N2CCCC2   
1   NCGC00178726-02  C[N@](C1CCCCC1)[S@@+](=O)([C@H]2CC[S@+](=O)(C2...   
2   NCGC00027933-02  Cc1c2c([nH]n1)OC(=C([C@@H]2c3ccc(cc3)Cl)C#N)[N...   
3   NCGC00373865-01                         c1cc(cc(c1)c2ccsc2)C[NH3+]   
4   NCGC00373889-01                    Cc1ccc(c(c1)C)c2ccc(cc2)C[NH3+]   
..              ...                                                ...   
95  NCGC00463865-01                        CC[C@@]1(CCC12CC[NH2+]CC2)O   
96  NCGC00468623-01                   C1CC1c2nnc(o2)[C@@H]3CC(CN3)(F)F   
97  NCGC00468643-01                       Cc1nnc(o1)[C@@H]2CC(CN2)(F)F   
98  NCGC00470467-01                                  Cc1nc2c(cccn2n1)N   
99  NCGC00475356-01     

In [23]:
# Extract the descriptors and append them to the training set.
compounds_csv_path = 'round1_100_train_cmpds.csv'
descriptors_csv_path = '../../../../docked_ecfp.csv'

append_descriptors_to_csv(compounds_csv_path, descriptors_csv_path)

Descriptors appended to round1_100_train_cmpds.csv


In [24]:
# Concatenate the train sets together.
# NOTE: output_path is the same file as file2_path, so the round 1 batch file
# is overwritten with the cumulative training set.
file1_path = 'round0_100_train_cmpds.csv'
file2_path = 'round1_100_train_cmpds.csv'
output_path = 'round1_100_train_cmpds.csv'

append_train_compounds(file1_path, file2_path, output_path)

Concatenated compounds saved to round1_100_train_cmpds.csv


In [25]:
# Retrain the model using this training file.
csv_file_path = 'round1_100_train_cmpds.csv'
trained_gpr = train_gpr_model(csv_file_path)

In [26]:
# Remove the training set from the test set
# Save the remainder of compounds as next rounds test set
input_csv_path = 'round0_100_test_cmpds.csv'
compounds_to_remove_csv_path = 'round1_100_train_cmpds.csv'
output_csv_path = 'round1_100_test_cmpds.csv'

remove_train_compounds(input_csv_path, compounds_to_remove_csv_path, output_csv_path)

Compounds removed and remaining Name and SMILES saved to round1_100_test_cmpds.csv


In [27]:
# Predict on the entire library
csv_file_to_predict = '../../../../docked_ecfp.csv'
output_csv_file = 'round1_100_predicted_results.csv'
predict_and_save_results(trained_gpr, csv_file_to_predict, output_csv_file)

In [29]:
# Pick the next 100 train compounds based on highest uncertainty
file1 = 'round1_100_predicted_results.csv'
file2 = 'round1_100_test_cmpds.csv'
output_csv = 'round2_100_cmpds.csv'
strategy = 'uncertain'  # or 'greedy'
top_n = 100  # or None for all

select_next_batch(file1, file2, output_csv, strategy, top_n)

In [30]:
# Calculate recall for the round 1 model's predictions
predictions_file_path = 'round1_100_predicted_results.csv'
binders_file_path = '../../../../binders_docking.csv'
# Calculate Recall
recall_value = calculate_recall(predictions_file_path, binders_file_path, top_n=2000)
print("Recall:", recall_value)

Recall: 0.12


ROUND 2

In [31]:
# Unblind the scores for these 100 compounds and save as training set.
input_csv_path = '../../../../7nsw_all_hybrid.csv'
cmpds_csv_path = 'round2_100_cmpds.csv'
output_csv_path = 'round2_100_train_cmpds.csv'

select_scores(input_csv_path, cmpds_csv_path, output_csv_path)

Columns of merged DataFrame:
Index(['Name', 'SMILES_x', 'Score', 'SMILES_y', 'Predicted_Score',
       'Uncertainty'],
      dtype='object')

Selected DataFrame:
               Name                                           SMILES_x  \
0   NCGC00019120-02                              C1CCc2n[nH]c(=O)n2CC1   
1   NCGC00374351-01                         c1c[nH+]ccc1NCC[NH+]2CCCC2   
2   NCGC00374355-01                          c1cc(ccc1NCC[NH+]2CCCC2)O   
3   NCGC00375303-01         C1CNC(=O)C[C@H]2[C@@H]1C[N@@H+](CC2)C3COC3   
4   NCGC00375645-01                 CC(C)C1=NOC2(C1)CC[NH+](CC2)C3COC3   
..              ...                                                ...   
95  NCGC00477678-01  Cn1cccc1C[N@@H+]2CCC[C@@]3(C2)CN[S@@+](=O)(c4c...   
96  NCGC00478120-01                CC(C)CC[NH+]1CCC2(CC1)CC(=NO2)C(C)C   
97  NCGC00478706-01               Cn1cccc1C[N@@H+](C)Cc2ccccc2C(F)(F)F   
98  NCGC00478735-01                         CC(C)CC[N@@H+](C)Cc1cccn1C   
99  NCGC00478932-01     

In [32]:
# Extract the descriptors and append them to the training set.
compounds_csv_path = 'round2_100_train_cmpds.csv'
descriptors_csv_path = '../../../../docked_ecfp.csv'

append_descriptors_to_csv(compounds_csv_path, descriptors_csv_path)

Descriptors appended to round2_100_train_cmpds.csv


In [33]:
# Concatenate the train sets together.
# NOTE: output_path is the same file as file2_path, so the round 2 batch file
# is overwritten with the cumulative training set.
file1_path = 'round1_100_train_cmpds.csv'
file2_path = 'round2_100_train_cmpds.csv'
output_path = 'round2_100_train_cmpds.csv'

append_train_compounds(file1_path, file2_path, output_path)

Concatenated compounds saved to round2_100_train_cmpds.csv


In [34]:
# Retrain the model using this training file.
csv_file_path = 'round2_100_train_cmpds.csv'
trained_gpr = train_gpr_model(csv_file_path)

In [35]:
# Remove the training set from the test set
# Save the remainder of compounds as next rounds test set
input_csv_path = 'round1_100_test_cmpds.csv'
compounds_to_remove_csv_path = 'round2_100_train_cmpds.csv'
output_csv_path = 'round2_100_test_cmpds.csv'

remove_train_compounds(input_csv_path, compounds_to_remove_csv_path, output_csv_path)

Compounds removed and remaining Name and SMILES saved to round2_100_test_cmpds.csv


In [36]:
# Predict on the entire library
csv_file_to_predict = '../../../../docked_ecfp.csv'
output_csv_file = 'round2_100_predicted_results.csv'
predict_and_save_results(trained_gpr, csv_file_to_predict, output_csv_file)

In [38]:
# Pick the next 100 train compounds based on highest uncertainty
file1 = 'round2_100_predicted_results.csv'
file2 = 'round2_100_test_cmpds.csv'
output_csv = 'round3_100_cmpds.csv'
strategy = 'uncertain'  # or 'greedy'
top_n = 100  # or None for all

select_next_batch(file1, file2, output_csv, strategy, top_n)

In [39]:
# Calculate recall for the round 2 model's predictions
predictions_file_path = 'round2_100_predicted_results.csv'
binders_file_path = '../../../../binders_docking.csv'
# Calculate Recall
recall_value = calculate_recall(predictions_file_path, binders_file_path, top_n=2000)
print("Recall:", recall_value)

Recall: 0.1255


ROUND 3

In [40]:
# Unblind the scores for these 100 compounds and save as training set.
input_csv_path = '../../../../7nsw_all_hybrid.csv'
cmpds_csv_path = 'round3_100_cmpds.csv'
output_csv_path = 'round3_100_train_cmpds.csv'

select_scores(input_csv_path, cmpds_csv_path, output_csv_path)

Columns of merged DataFrame:
Index(['Name', 'SMILES_x', 'Score', 'SMILES_y', 'Predicted_Score',
       'Uncertainty'],
      dtype='object')

Selected DataFrame:
               Name                                           SMILES_x  \
0   NCGC00375112-01                CC(=O)N1CCn2cc([nH+]c2C1)C(=O)NCC=C   
1   NCGC00376202-01  C1[C@@H]2[C@H](C[N@@]1[S@+](=O)(CCC(F)(F)F)[O-...   
2   NCGC00377570-01           CC(C)N1C[C@H]2CC[C@@H](C1)[NH+]2CC3CCCC3   
3   NCGC00377777-01   CC(C)[NH+]1C[C@H]2CC[C@@H](C1)N2[S@+](=O)(C)[O-]   
4   NCGC00377776-01             CC(C)[NH+]1C[C@H]2CC[C@@H](C1)N2C(=O)C   
..              ...                                                ...   
95  NCGC00473808-01      Cn1c2ccc(cc2c3c1CCN(C3)C(=O)Nc4ccc(cc4Cl)Cl)F   
96  NCGC00474500-01  COc1ccc(c(c1)c2nc3ccc(cc3o2)c4ccc(c(c4OC)OC)OC)OC   
97  NCGC00475126-01                 CCCCCNC(=O)CCN(CC)C(=O)c1cc(oc1C)C   
98  NCGC00475184-01                   CCCCCCN(CCC(=O)NCCC)C(=O)c1ccoc1   
99  NCGC00479943-01  CC(

In [41]:
# Extract the descriptors and append them to the training set.
compounds_csv_path = 'round3_100_train_cmpds.csv'
descriptors_csv_path = '../../../../docked_ecfp.csv'

append_descriptors_to_csv(compounds_csv_path, descriptors_csv_path)

Descriptors appended to round3_100_train_cmpds.csv


In [42]:
# Concatenate the train sets together.
# NOTE: output_path is the same file as file2_path, so the round 3 batch file
# is overwritten with the cumulative training set.
file1_path = 'round2_100_train_cmpds.csv'
file2_path = 'round3_100_train_cmpds.csv'
output_path = 'round3_100_train_cmpds.csv'

append_train_compounds(file1_path, file2_path, output_path)

Concatenated compounds saved to round3_100_train_cmpds.csv


In [43]:
# Retrain the model using this training file.
csv_file_path = 'round3_100_train_cmpds.csv'
trained_gpr = train_gpr_model(csv_file_path)

In [44]:
# Remove the training set from the test set
# Save the remainder of compounds as next rounds test set
input_csv_path = 'round2_100_test_cmpds.csv'
compounds_to_remove_csv_path = 'round3_100_train_cmpds.csv'
output_csv_path = 'round3_100_test_cmpds.csv'

remove_train_compounds(input_csv_path, compounds_to_remove_csv_path, output_csv_path)

Compounds removed and remaining Name and SMILES saved to round3_100_test_cmpds.csv


In [45]:
# Predict on the entire library
csv_file_to_predict = '../../../../docked_ecfp.csv'
output_csv_file = 'round3_100_predicted_results.csv'
predict_and_save_results(trained_gpr, csv_file_to_predict, output_csv_file)

In [47]:
# Pick the next 100 train compounds based on lowest predicted docking score (greedy)
file1 = 'round3_100_predicted_results.csv'
file2 = 'round3_100_test_cmpds.csv'
output_csv = 'round4_100_cmpds.csv'
strategy = 'greedy'  # or 'uncertain'
top_n = 100  # or None for all

select_next_batch(file1, file2, output_csv, strategy, top_n)

In [48]:
# Calculate recall for the round 3 model's predictions
predictions_file_path = 'round3_100_predicted_results.csv'
binders_file_path = '../../../../binders_docking.csv'
# Calculate Recall
recall_value = calculate_recall(predictions_file_path, binders_file_path, top_n=2000)
print("Recall:", recall_value)

Recall: 0.1255


ROUND 4

In [49]:
# Unblind the scores for these 100 compounds and save as training set.
input_csv_path = '../../../../7nsw_all_hybrid.csv'
cmpds_csv_path = 'round4_100_cmpds.csv'
output_csv_path = 'round4_100_train_cmpds.csv'

select_scores(input_csv_path, cmpds_csv_path, output_csv_path)

Columns of merged DataFrame:
Index(['Name', 'SMILES_x', 'Score', 'SMILES_y', 'Predicted_Score',
       'Uncertainty'],
      dtype='object')

Selected DataFrame:
               Name                                           SMILES_x  \
0   NCGC00373726-01               c1ccc(cc1)C[NH+]2CC(C2)Oc3ccc(cc3F)F   
1   NCGC00373725-01               c1ccnc(c1)C(=O)N2CC(C2)Oc3ccc(cc3F)F   
2   NCGC00373728-01          CN(C)c1ccc(cc1)C[NH+]2CC(C2)Oc3ccc(cc3F)F   
3   NCGC00374771-01     CCCC(=O)N1C[C@@]2(CCC[N@@H+]2C3COC3)Cc4c1cccc4   
4   NCGC00375507-01  C[S@@+](=O)([N@@]1C[C@@]2(CC[N@@H+](C2)C3CCCCC...   
..              ...                                                ...   
95  NCGC00478603-01            Cc1ccc(cc1)C(=O)N(C)Cc2ccc(cc2)c3ccccc3   
96  NCGC00478602-01              Cc1ccccc1C(=O)N(C)Cc2ccc(cc2)c3ccccc3   
97  NCGC00478601-01     CN(Cc1ccc(cc1)c2ccccc2)C(=O)c3ccc(cc3)C(F)(F)F   
98  NCGC00478615-01           CN(Cc1ccc(cc1)c2ccccc2)C(=O)c3ccc(nc3)Cl   
99  NCGC00478608-01     

In [50]:
# Extract the descriptors and append them to the training set.
compounds_csv_path = 'round4_100_train_cmpds.csv'
descriptors_csv_path = '../../../../docked_ecfp.csv'

append_descriptors_to_csv(compounds_csv_path, descriptors_csv_path)

Descriptors appended to round4_100_train_cmpds.csv


In [51]:
# Concatenate the train sets together.
# NOTE: output_path is the same file as file2_path, so the round 4 batch file
# is overwritten with the cumulative training set.
file1_path = 'round3_100_train_cmpds.csv'
file2_path = 'round4_100_train_cmpds.csv'
output_path = 'round4_100_train_cmpds.csv'

append_train_compounds(file1_path, file2_path, output_path)

Concatenated compounds saved to round4_100_train_cmpds.csv


In [52]:
# Retrain the model using this training file.
csv_file_path = 'round4_100_train_cmpds.csv'
trained_gpr = train_gpr_model(csv_file_path)

In [53]:
# Remove the training set from the test set
# Save the remainder of compounds as next rounds test set
input_csv_path = 'round3_100_test_cmpds.csv'
compounds_to_remove_csv_path = 'round4_100_train_cmpds.csv'
output_csv_path = 'round4_100_test_cmpds.csv'

remove_train_compounds(input_csv_path, compounds_to_remove_csv_path, output_csv_path)

Compounds removed and remaining Name and SMILES saved to round4_100_test_cmpds.csv


In [54]:
# Predict on the entire library
csv_file_to_predict = '../../../../docked_ecfp.csv'
output_csv_file = 'round4_100_predicted_results.csv'
predict_and_save_results(trained_gpr, csv_file_to_predict, output_csv_file)

In [55]:
# Pick the next 100 train compounds based on lowest predicted docking score (greedy)
file1 = 'round4_100_predicted_results.csv'
file2 = 'round4_100_test_cmpds.csv'
output_csv = 'round5_100_cmpds.csv'
strategy = 'greedy'  # or 'uncertain'
top_n = 100  # or None for all

select_next_batch(file1, file2, output_csv, strategy, top_n)

In [56]:
# Calculate recall for the round 4 model's predictions
predictions_file_path = 'round4_100_predicted_results.csv'
binders_file_path = '../../../../binders_docking.csv'
# Calculate Recall
recall_value = calculate_recall(predictions_file_path, binders_file_path, top_n=2000)
print("Recall:", recall_value)

Recall: 0.1115


ROUND 5

In [12]:
# Unblind the scores for these 100 compounds and save as training set.
input_csv_path = '../../../../7nsw_all_hybrid.csv'
cmpds_csv_path = 'round5_100_cmpds.csv'
output_csv_path = 'round5_100_train_cmpds.csv'

select_scores(input_csv_path, cmpds_csv_path, output_csv_path)

Columns of merged DataFrame:
Index(['Name', 'SMILES_x', 'Score', 'SMILES_y', 'Predicted_Score',
       'Uncertainty'],
      dtype='object')

Selected DataFrame:
               Name                                           SMILES_x  \
0   NCGC00393688-01  COc1ccc(cc1)C[N@@H+]2C[C@@H]3C[C@H]2C(=O)N(c4c...   
1   NCGC00393713-01  c1ccc(cc1)CN2c3ccccc3O[C@H]4C[C@@H](C2=O)[N@@H...   
2   NCGC00393646-01  c1ccc2c(c1)N(C(=O)[C@@H]3C[C@H](O2)C[N@H+]3Cc4...   
3   NCGC00393689-01  c1ccc2c(c1)N(C(=O)[C@@H]3C[C@H](O2)C[N@H+]3Cc4...   
4   NCGC00393793-01  COCCN1c2ccccc2O[C@H]3C[C@@H](C1=O)[N@@H+](C3)C...   
..              ...                                                ...   
95  NCGC00478614-01                      CC(=O)N(C)Cc1ccc(cc1)c2ccccc2   
96  NCGC00478617-01               CC(C)(C)C(=O)N(C)Cc1ccc(cc1)c2ccccc2   
97  NCGC00478604-01           CN(Cc1ccc(cc1)c2ccccc2)C(=O)Cc3ccc(cc3)F   
98  NCGC00478609-01           CN(Cc1ccc(cc1)c2ccccc2)C(=O)c3cccc(c3)OC   
99  NCGC00478621-01     

In [13]:
# Extract the descriptors and append them to the training set.
compounds_csv_path = 'round5_100_train_cmpds.csv'
descriptors_csv_path = '../../../../docked_ecfp.csv'

append_descriptors_to_csv(compounds_csv_path, descriptors_csv_path)

Descriptors appended to round5_100_train_cmpds.csv


In [14]:
# Concatenate the train sets together.
# NOTE: output_path is the same file as file2_path, so the round 5 batch file
# is overwritten with the cumulative training set.
file1_path = 'round4_100_train_cmpds.csv'
file2_path = 'round5_100_train_cmpds.csv'
output_path = 'round5_100_train_cmpds.csv'

append_train_compounds(file1_path, file2_path, output_path)

Concatenated compounds saved to round5_100_train_cmpds.csv


In [15]:
# Retrain the model using this training file.
csv_file_path = 'round5_100_train_cmpds.csv'
trained_gpr = train_gpr_model(csv_file_path)

In [16]:
# Remove the training set from the test set
# Save the remainder of compounds as next rounds test set
input_csv_path = 'round4_100_test_cmpds.csv'
compounds_to_remove_csv_path = 'round5_100_train_cmpds.csv'
output_csv_path = 'round5_100_test_cmpds.csv'

remove_train_compounds(input_csv_path, compounds_to_remove_csv_path, output_csv_path)

Compounds removed and remaining Name and SMILES saved to round5_100_test_cmpds.csv


In [17]:
# Predict on the entire library
csv_file_to_predict = '../../../../docked_ecfp.csv'
output_csv_file = 'round5_100_predicted_results.csv'
predict_and_save_results(trained_gpr, csv_file_to_predict, output_csv_file)

In [12]:
# Pick the next 100 train compounds based on lowest predicted docking score (greedy)
file1 = 'round5_100_predicted_results.csv'
file2 = 'round5_100_test_cmpds.csv'
output_csv = 'round6_100_cmpds.csv'
strategy = 'greedy'  # or 'uncertain'
top_n = 100  # or None for all

select_next_batch(file1, file2, output_csv, strategy, top_n)

In [14]:
# Calculate recall for the round 5 model's predictions
predictions_file_path = 'round5_100_predicted_results.csv'
binders_file_path = '../../../../../binders_docking.csv'
# Calculate Recall
recall_value = calculate_recall(predictions_file_path, binders_file_path, top_n=2000)
print("Recall:", recall_value)

Recall: 0.1485


ROUND 6

In [15]:
# Unblind the scores for these 100 compounds and save as training set.
input_csv_path = '../../../../../7nsw_all_hybrid.csv'
cmpds_csv_path = 'round6_100_cmpds.csv'
output_csv_path = 'round6_100_train_cmpds.csv'

select_scores(input_csv_path, cmpds_csv_path, output_csv_path)

Columns of merged DataFrame:
Index(['Name', 'SMILES_x', 'Score', 'SMILES_y', 'Predicted_Score',
       'Uncertainty'],
      dtype='object')

Selected DataFrame:
               Name                                           SMILES_x  \
0   NCGC00373734-01         c1ccc2c(c1)cccc2C[NH+]3CC(C3)Oc4ccc(cc4F)F   
1   NCGC00373736-01               Cn1c(ccn1)C[NH+]2CC(C2)Oc3ccc(cc3F)F   
2   NCGC00377045-01  c1ccc(cc1)c2ccc(cc2)C[NH+]3[C@@H]4CC[C@H]3CN(C...   
3   NCGC00393686-01  c1ccc2c(c1)N(C(=O)[C@@H]3C[C@H](O2)C[N@H+]3CC4...   
4   NCGC00393670-01  c1ccc2c(c1)N(C(=O)[C@@H]3C[C@H](O2)CN3C(=O)C4C...   
..              ...                                                ...   
95  NCGC00477628-01  c1ccc2c(c1)NC[C@]3(CCC[N@H+](C3)Cc4ccc(cc4)C(F...   
96  NCGC00477840-01  c1ccc(cc1)Cn2c(=O)nc([nH]2)C3CC[NH+](CC3)Cc4cc...   
97  NCGC00478655-01               CN(Cc1cccc(c1)C(F)(F)F)C(=O)c2ccccc2   
98  NCGC00478616-01          CN(Cc1ccc(cc1)c2ccccc2)C(=O)Cc3cccc(c3)Cl   
99  NCGC00478840-01     

In [13]:
# Extract the descriptors and append them to the training set.
compounds_csv_path = 'round6_100_train_cmpds.csv'
descriptors_csv_path = '../../../../../docked_ecfp.csv'

append_descriptors_to_csv(compounds_csv_path, descriptors_csv_path)

Descriptors appended to round6_100_train_cmpds.csv


In [14]:
# Concatenate the train sets together.
# NOTE: output_path is the same file as file2_path, so the round 6 batch file
# is overwritten with the cumulative training set.
file1_path = 'round5_100_train_cmpds.csv'
file2_path = 'round6_100_train_cmpds.csv'
output_path = 'round6_100_train_cmpds.csv'

append_train_compounds(file1_path, file2_path, output_path)

Concatenated compounds saved to round6_100_train_cmpds.csv


In [15]:
# Retrain the model using this training file.
csv_file_path = 'round6_100_train_cmpds.csv'
trained_gpr = train_gpr_model(csv_file_path)

In [16]:
# Remove the training set from the test set
# Save the remainder of compounds as next rounds test set
input_csv_path = 'round5_100_test_cmpds.csv'
compounds_to_remove_csv_path = 'round6_100_train_cmpds.csv'
output_csv_path = 'round6_100_test_cmpds.csv'

remove_train_compounds(input_csv_path, compounds_to_remove_csv_path, output_csv_path)

Compounds removed and remaining Name and SMILES saved to round6_100_test_cmpds.csv


In [17]:
# Predict on the entire library
csv_file_to_predict = '../../../../../docked_ecfp.csv'
output_csv_file = 'round6_100_predicted_results.csv'
predict_and_save_results(trained_gpr, csv_file_to_predict, output_csv_file)

In [18]:
# Pick the next 100 train compounds based on lowest predicted docking score (greedy)
file1 = 'round6_100_predicted_results.csv'
file2 = 'round6_100_test_cmpds.csv'
output_csv = 'round7_100_cmpds.csv'
strategy = 'greedy'  # or 'uncertain'
top_n = 100  # or None for all

select_next_batch(file1, file2, output_csv, strategy, top_n)

In [19]:
# Calculate recall for the round 6 model's predictions
predictions_file_path = 'round6_100_predicted_results.csv'
binders_file_path = '../../../../../binders_docking.csv'
# Calculate Recall
recall_value = calculate_recall(predictions_file_path, binders_file_path, top_n=2000)
print("Recall:", recall_value)

Recall: 0.1445
