In [9]:
# import packages
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from typing import List
from rdkit import DataStructs, Chem
from rdkit.Chem import MolFromSmiles, AllChem
from rdkit.DataStructs.cDataStructs import ExplicitBitVect
from sklearn import gaussian_process
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Kernel


In [25]:
def merge_and_select_shared_compounds(file1, file2, output_csv, strategy, top_n=None):
    """
    Keep the compounds of file1 that are also listed in file2, rank them, and save the top ones.

    Compounds are matched on both 'Name' and 'SMILES'. Matching on 'SMILES' alone
    pairs every sample of file1 with every same-structure sample of file2, which
    duplicates rows and lets compounds that are absent from file2 back into the
    selection under their file1 name.

    Parameters:
    - file1 (str): Path to the first CSV file. Must contain 'Name', 'SMILES' and the
      ranking column ('Uncertainty' or 'Predicted_Score', depending on strategy).
    - file2 (str): Path to the second CSV file. Must contain 'Name' and 'SMILES'.
    - output_csv (str): Path to the output CSV file to save the selected compounds.
    - strategy (str): Strategy for selecting compounds:
        'uncertain' -> highest 'Uncertainty' first,
        'greedy'    -> lowest 'Predicted_Score' first.
    - top_n (int or None): Number of top compounds to select. If None, selects all (default is None).

    Raises:
    - ValueError: If strategy is neither 'uncertain' nor 'greedy'.
    - KeyError: If a required column is missing from one of the files.
    """
    # (column to rank by, ascending?) for each supported strategy.
    sort_spec = {
        'uncertain': ('Uncertainty', False),
        'greedy': ('Predicted_Score', True),
    }
    # Fail fast on a bad strategy, before any file is read.
    if strategy not in sort_spec:
        raise ValueError("Invalid strategy. Use 'uncertain' or 'greedy'.")
    sort_column, ascending = sort_spec[strategy]

    # Load the two CSV files
    df1 = pd.read_csv(file1)
    df2 = pd.read_csv(file2)

    # Inner join on the full compound identity so each file1 row matches only
    # its own entry in file2 (no cross-product between samples sharing a SMILES).
    merged_df = pd.merge(df1, df2, how='inner', on=['Name', 'SMILES'])

    sorted_results = merged_df.sort_values(by=sort_column, ascending=ascending)

    # Select the top N compounds, or all if top_n is None
    top_compounds = sorted_results if top_n is None else sorted_results.head(top_n)

    # Save the selected compounds to a new CSV file
    top_compounds.to_csv(output_csv, index=False)

In [11]:
def select_scores(input_csv_path, cmpds_csv_path, output_csv_path):
    """Look up the scores of a chosen set of compounds and save them.

    Parameters:
    - input_csv_path (str): CSV with at least 'Name', 'SMILES' and 'Score'.
    - cmpds_csv_path (str): CSV listing the chosen compounds; must contain
      'Name' and 'SMILES' (its SMILES column is discarded).
    - output_csv_path (str): Destination CSV; gets columns Name, SMILES, Score.

    The two files are inner-joined on 'Name'. The merged column index and the
    selected rows are printed for inspection before saving.
    """
    scored_library = pd.read_csv(input_csv_path)
    chosen_compounds = pd.read_csv(cmpds_csv_path)

    # Inner join: only compounds present in both files survive.
    joined = scored_library.merge(chosen_compounds, on='Name', how='inner')

    print("Columns of merged DataFrame:")
    print(joined.columns)

    # 'SMILES_x' is the SMILES column coming from the scored library (left side).
    subset = joined.loc[:, ['Name', 'SMILES_x', 'Score']]

    print("\nSelected DataFrame:")
    print(subset)

    subset.rename(columns={'SMILES_x': 'SMILES'}).to_csv(output_csv_path, index=False)

    print(f"Selected compounds saved to {output_csv_path}")

In [12]:
def append_descriptors_to_csv(compounds_csv_path, descriptors_csv_path):
    """Add descriptor columns to a compounds CSV, rewriting that file in place.

    Parameters:
    - compounds_csv_path (str): CSV of compounds; must contain 'Name'. It is
      overwritten with the merged result.
    - descriptors_csv_path (str): CSV of descriptors; must contain 'Name'.

    Each compound row is left-joined to its descriptor row on 'Name', so every
    compound is kept even if it has no descriptors (those cells become NaN).
    Descriptor-file columns that the compounds file already has (e.g. 'SMILES')
    are not duplicated: the compounds file's values win. New descriptor columns
    are appended after the existing columns, in descriptor-file order.
    """
    # Read the main compounds CSV file
    compounds_df = pd.read_csv(compounds_csv_path)

    # Read the descriptors CSV file
    descriptors_df = pd.read_csv(descriptors_csv_path)

    # Drop overlapping columns from the descriptor side *before* merging.
    # This replaces the former "suffix, reorder, then drop by substring" dance,
    # which listed the suffixed columns twice and could also delete unrelated
    # input columns whose name merely contained '_descriptor'.
    already_present = [col for col in descriptors_df.columns
                       if col != 'Name' and col in compounds_df.columns]
    new_descriptors = descriptors_df.drop(columns=already_present)

    merged_df = pd.merge(compounds_df, new_descriptors, on='Name', how='left')

    # Save the updated DataFrame to the same CSV file
    merged_df.to_csv(compounds_csv_path, index=False)

    print(f"Descriptors appended to {compounds_csv_path}")

Set up Gaussian Process Regressor model

In [13]:
# Pairwise Tanimoto similarity, used as the covariance function of the Gaussian process kernel
def tanimoto_similarity(a, b):
    """Computes the Tanimoto similarity for all pairs.

    For rows x of `a` and y of `b` the similarity is
    <x, y> / (sum(x) + sum(y) - <x, y>).

    When both rows are all-zero the ratio is 0/0; that pair is defined as 1.0
    (two empty fingerprints are identical) instead of NaN, so a single empty
    fingerprint cannot put NaNs into the kernel matrix.

    Args:
      a: Array-like with shape [batch_size_a, num_features].
      b: Array-like with shape [batch_size_b, num_features].

    Returns:
      Numpy float array with shape [batch_size_a, batch_size_b].
    """
    a = np.asarray(a)
    b = np.asarray(b)
    aa = np.sum(a, axis=1, keepdims=True)
    bb = np.sum(b, axis=1, keepdims=True)
    ab = np.matmul(a, b.T)
    union = aa + bb.T - ab
    # Pre-fill with 1.0; entries with an empty union are skipped by `where`
    # and therefore keep that value.
    similarity = np.ones(union.shape, dtype=np.float64)
    np.true_divide(ab, union, out=similarity, where=union != 0)
    return similarity

In [14]:
class TanimotoKernel(gaussian_process.kernels.NormalizedKernelMixin,
                     gaussian_process.kernels.StationaryKernelMixin,
                     gaussian_process.kernels.Kernel):
    """Gaussian process kernel whose value is the pairwise Tanimoto similarity.

    The kernel takes no constructor arguments and delegates the computation to
    tanimoto_similarity().
    """

    def __init__(self):
        """Takes no arguments; defined explicitly because get_params() requires it."""

    def __call__(self, X, Y=None, eval_gradient=False):  # pylint: disable=invalid-name
        """Evaluates the kernel between the rows of X and the rows of Y.

        Args:
          X: Numpy array with shape [batch_size_a, num_features].
          Y: Numpy array with shape [batch_size_b, num_features]. When omitted,
            the kernel is evaluated between X and itself.
          eval_gradient: Must be False; gradients are not implemented.

        Returns:
          Numpy array with shape [batch_size_a, batch_size_b].

        Raises:
          NotImplementedError: If eval_gradient is True.
        """
        if eval_gradient:
            raise NotImplementedError
        other = X if Y is None else Y
        return tanimoto_similarity(X, other)

In [15]:
def train_gpr_model(csv_file_path):
    """Fit a Gaussian process regressor with the Tanimoto kernel on a training CSV.

    Parameters:
    - csv_file_path (str): CSV with 'Name', 'SMILES' and 'Score' columns; every
      remaining column is used as an input feature.

    Returns:
    - The fitted GaussianProcessRegressor.
    """
    training_table = pd.read_csv(csv_file_path)

    # Features: everything except the identifier columns and the target.
    features = np.array(training_table.drop(columns=['Name', 'SMILES', 'Score']))

    # Target: the 'Score' column of the training data.
    targets = training_table['Score']

    # NOTE(review): TanimotoKernel declares no tunable hyperparameters, so the
    # 100 optimizer restarts presumably have nothing to optimize -- confirm
    # against the scikit-learn docs before changing this value.
    regressor = GaussianProcessRegressor(kernel=TanimotoKernel(), n_restarts_optimizer=100)
    return regressor.fit(features, targets)

In [16]:
def remove_train_compounds(input_csv_path, compounds_to_remove_csv_path, output_csv_path):
    """Write the compounds of one CSV that are not named in another CSV.

    Parameters:
    - input_csv_path (str): CSV of candidate compounds ('Name', 'SMILES', ...).
    - compounds_to_remove_csv_path (str): CSV whose 'Name' column lists the
      compounds to exclude.
    - output_csv_path (str): Destination CSV; only 'Name' and 'SMILES' of the
      remaining compounds are written.
    """
    library = pd.read_csv(input_csv_path)
    excluded = pd.read_csv(compounds_to_remove_csv_path)

    # True for rows whose Name does not appear in the exclusion list.
    keep_mask = ~library['Name'].isin(excluded['Name'])

    library.loc[keep_mask, ['Name', 'SMILES']].to_csv(output_csv_path, index=False)

    print(f"Compounds removed and remaining Name and SMILES saved to {output_csv_path}")

In [17]:
def predict_and_save_results(gpr, csv_file, output_csv):
    """Predict scores for every compound in a CSV and save them with their uncertainty.

    Parameters:
    - gpr: Fitted model exposing predict(X, return_std=True) -> (mean, std).
    - csv_file (str): CSV with 'Name' and 'SMILES'; every other column is
      passed to the model as a feature.
    - output_csv (str): Destination CSV with columns Name, SMILES,
      Predicted_Score, Uncertainty.
    """
    library = pd.read_csv(csv_file)

    # Feature matrix: all columns except the two identifier columns.
    features = np.array(library.drop(columns=['Name', 'SMILES']))

    # Predictive mean and standard deviation for every row.
    mean_pred, std_pred = gpr.predict(features, return_std=True)

    results = library[['Name', 'SMILES']].assign(
        Predicted_Score=mean_pred,
        Uncertainty=std_pred,
    )
    results.to_csv(output_csv, index=False)

In [18]:
def append_train_compounds(file1_path, file2_path, output_path):
    """
    Concatenates two CSV files with the same format and saves the result to a new CSV file.

    Rows that are exact duplicates (identical in every column) are written only
    once. This keeps the call idempotent when output_path is the same file as one
    of the inputs and the call is repeated, and it avoids feeding identical
    training rows to the model.

    Parameters:
    - file1_path (str): Path to the first CSV file.
    - file2_path (str): Path to the second CSV file.
    - output_path (str): Path to save the concatenated CSV file.
    """
    # Load the data from both CSV files
    data1 = pd.read_csv(file1_path)
    data2 = pd.read_csv(file2_path)

    # Stack file1 on top of file2, then keep the first occurrence of each row.
    stacked = pd.concat([data1, data2], ignore_index=True)
    concatenated_data = stacked.drop_duplicates()

    dropped = len(stacked) - len(concatenated_data)
    if dropped:
        print(f"Dropped {dropped} duplicate row(s) while concatenating")

    # Save the concatenated DataFrame to a new CSV file
    concatenated_data.to_csv(output_path, index=False)

    print(f"Concatenated compounds saved to {output_path}")

In [19]:
def calculate_recall(predictions_file, binders_file, top_n=2000, on_column='Name'):
    """Recall of the binders among the top_n best-predicted compounds.

    Parameters:
    - predictions_file (str): CSV with 'Predicted_Score' and the on_column column.
    - binders_file (str): CSV listing the reference compounds in on_column.
    - top_n (int): How many of the lowest-'Predicted_Score' rows count as selected.
    - on_column (str): Column used to identify compounds in both files.

    Returns:
    - float: true positives / (true positives + false negatives), i.e. the
      fraction of distinct reference compounds found in the selection.

    Raises:
    - ZeroDivisionError: If the binders file lists no compounds.
    """
    # Lowest predicted score first; the first top_n rows form the selection.
    ranked = pd.read_csv(predictions_file).sort_values(by='Predicted_Score', ascending=True)
    selection = ranked.head(top_n)

    reference = pd.read_csv(binders_file)

    reference_ids = set(reference[on_column])
    selected_ids = set(selection[on_column])

    hits = reference_ids & selected_ids      # true positives
    misses = reference_ids - hits            # false negatives

    return len(hits) / (len(hits) + len(misses))

_________________________________________________________________________________________________________

In [14]:
# Unblind the scores for these 1000 compounds and save as training set.
# select_scores inner-joins the two files on 'Name' and writes Name, SMILES, Score.
input_csv_path = '../../../../7nsw_all_hybrid.csv'
cmpds_csv_path = 'round0_1000_ecfp.csv'
output_csv_path = 'round0_1000_train_cmpds.csv'

select_scores(input_csv_path, cmpds_csv_path, output_csv_path)

Columns of merged DataFrame:
Index(['Name', 'SMILES_x', 'Score', 'SMILES_y', 'Morgan_bit0', 'Morgan_bit1',
       'Morgan_bit2', 'Morgan_bit3', 'Morgan_bit4', 'Morgan_bit5',
       ...
       'Morgan_bit2038', 'Morgan_bit2039', 'Morgan_bit2040', 'Morgan_bit2041',
       'Morgan_bit2042', 'Morgan_bit2043', 'Morgan_bit2044', 'Morgan_bit2045',
       'Morgan_bit2046', 'Morgan_bit2047'],
      dtype='object', length=2052)

Selected DataFrame:
                Name                                           SMILES_x  \
0    NCGC00373269-01          CC(C)C1=NO[C@@]2(C1)CCCN(C2)C(=O)c3ccccc3   
1    NCGC00373335-01  COc1ccc(cc1)C2=NO[C@@]3(C2)CCCN(C3)C(=O)c4ccc(...   
2    NCGC00373405-01          CC(C)C1=NO[C@]2(C1)CCN(C2)C(=O)c3ccccc3OC   
3    NCGC00373685-01                 c1cc2c(cc1OC3C[NH+](C3)CC4CC4)OCO2   
4    NCGC00373953-01                    c1ccc(c(c1)C=O)Oc2ccc(c(c2)Cl)F   
..               ...                                                ...   
995  NCGC00479866-01     c1cn2cc

In [15]:
# Extract the descriptors and append them to the training set.
# Note: append_descriptors_to_csv writes its result back to compounds_csv_path (in place).
compounds_csv_path = 'round0_1000_train_cmpds.csv'
descriptors_csv_path = '../../../../docked_ecfp.csv'

append_descriptors_to_csv(compounds_csv_path, descriptors_csv_path)

Descriptors appended to round0_1000_train_cmpds.csv


In [16]:
# Train our model using this training file.
# Every column other than 'Name', 'SMILES' and 'Score' is used as a model feature.
csv_file_path = 'round0_1000_train_cmpds.csv'
trained_gpr = train_gpr_model(csv_file_path)

In [28]:
# Remove the training set from the test set.
# Save the remaining compounds ('Name' and 'SMILES' only) as the next round's test set.
input_csv_path = '../../../../docked_ecfp.csv'
compounds_to_remove_csv_path = 'round0_1000_train_cmpds.csv'
output_csv_path = 'round0_1000_test_cmpds.csv'

remove_train_compounds(input_csv_path, compounds_to_remove_csv_path, output_csv_path)

Compounds removed and remaining Name and SMILES saved to round0_1000_test_cmpds.csv


In [18]:
# Predict on the entire library (the full descriptor file, not only the remaining test set).
csv_file_to_predict = '../../../../docked_ecfp.csv'
output_csv_file = 'round0_1000_predicted_results.csv'
predict_and_save_results(trained_gpr, csv_file_to_predict, output_csv_file)

In [29]:
# Pick the next 1000 training compounds: among the compounds shared by the predictions
# and the remaining test set, take those with the lowest predicted score.
file1 = 'round0_1000_predicted_results.csv'
file2 = 'round0_1000_test_cmpds.csv'
output_csv = 'round1_1000_cmpds.csv'
strategy = 'greedy'  # or 'uncertain'
top_n = 1000  # or None for all

merge_and_select_shared_compounds(file1, file2, output_csv, strategy, top_n)

In [32]:
# Calculate recall from the round 0 predictions file: the fraction of the compounds
# listed in the binders file that are among the 2000 lowest predicted scores.
predictions_file_path = 'round0_1000_predicted_results.csv'
binders_file_path = '../../../../binders_docking.csv'
# Calculate Recall
recall_value = calculate_recall(predictions_file_path, binders_file_path, top_n=2000)
print("Recall:", recall_value)

Recall: 0.1785


ROUND 1

In [33]:
# Unblind the scores for these 1000 compounds and save as training set.
# select_scores inner-joins the two files on 'Name' and writes Name, SMILES, Score.
input_csv_path = '../../../../7nsw_all_hybrid.csv'
cmpds_csv_path = 'round1_1000_cmpds.csv'
output_csv_path = 'round1_1000_train_cmpds.csv'

select_scores(input_csv_path, cmpds_csv_path, output_csv_path)

Columns of merged DataFrame:
Index(['Name', 'SMILES_x', 'Score', 'SMILES_y', 'Predicted_Score',
       'Uncertainty'],
      dtype='object')

Selected DataFrame:
                Name                                           SMILES_x  \
0    NCGC00373364-01  c1ccnc(c1)C2=NO[C@]3(C2)CCC[N@H+](C3)Cc4ccc(cc4)F   
1    NCGC00373572-01                 c1ccc(cc1)C[NH+]2CCC(CC2)N3CCOC3=O   
2    NCGC00373621-01                c1cc(ccc1C[NH+]2CCC(CC2)N3CCNC3=O)F   
3    NCGC00373614-01         c1ccc2c(c1)CN(C2=O)C3CC[NH+](CC3)CC4CCCCC4   
4    NCGC00373664-01             c1ccc(cc1)C(=O)N2CC(C2)Oc3ccc(c(c3)F)F   
..               ...                                                ...   
995  NCGC00480209-01       c1ccc(cc1)CC2=NOC3(C2)CCN(CC3)C(=O)c4ccccc4F   
996  NCGC00480585-01       COc1ccc(cc1)CC(=O)N2[C@@H]3CCC[C@H]2CC(=O)C3   
997  NCGC00480588-01  c1ccc2c(c1)ccc(n2)C(=O)N3[C@@H]4CCC[C@H]3CC(=O)C4   
998  NCGC00480597-01         Cc1ccc(cc1)C(=O)N2[C@@H]3CCC[C@H]2CC(=O)C3   
999  NCGC0048

In [34]:
# Extract the descriptors and append them to the training set.
# Note: append_descriptors_to_csv writes its result back to compounds_csv_path (in place).
compounds_csv_path = 'round1_1000_train_cmpds.csv'
descriptors_csv_path = '../../../../docked_ecfp.csv'

append_descriptors_to_csv(compounds_csv_path, descriptors_csv_path)

Descriptors appended to round1_1000_train_cmpds.csv


In [35]:
# Concatenate the train sets together.
# Note: output_path is the same file as file2_path, so the round 1 file is
# overwritten with the combined (round 0 + round 1) training set.
file1_path = 'round0_1000_train_cmpds.csv'
file2_path = 'round1_1000_train_cmpds.csv'
output_path = 'round1_1000_train_cmpds.csv'

append_train_compounds(file1_path, file2_path, output_path)

Concatenated compounds saved to round1_1000_train_cmpds.csv


In [37]:
# Retrain the model using this training file.
# Every column other than 'Name', 'SMILES' and 'Score' is used as a model feature.
csv_file_path = 'round1_1000_train_cmpds.csv'
trained_gpr = train_gpr_model(csv_file_path)

In [38]:
# Remove the training set from the previous round's test set.
# Save the remaining compounds ('Name' and 'SMILES' only) as the next round's test set.
input_csv_path = 'round0_1000_test_cmpds.csv'
compounds_to_remove_csv_path = 'round1_1000_train_cmpds.csv'
output_csv_path = 'round1_1000_test_cmpds.csv'

remove_train_compounds(input_csv_path, compounds_to_remove_csv_path, output_csv_path)

Compounds removed and remaining Name and SMILES saved to round1_1000_test_cmpds.csv


In [45]:
# Predict on the entire library (the full descriptor file, not only the remaining test set).
csv_file_to_predict = '../../../../docked_ecfp.csv'
output_csv_file = 'round1_1000_predicted_results.csv'
predict_and_save_results(trained_gpr, csv_file_to_predict, output_csv_file)

In [47]:
# Pick the next 1000 training compounds: among the compounds shared by the predictions
# and the remaining test set, take those with the lowest predicted score.
file1 = 'round1_1000_predicted_results.csv'
file2 = 'round1_1000_test_cmpds.csv'
output_csv = 'round2_1000_cmpds.csv'
strategy = 'greedy'  # or 'uncertain'
top_n = 1000  # or None for all

merge_and_select_shared_compounds(file1, file2, output_csv, strategy, top_n)

In [48]:
# Calculate recall from the round 1 predictions file: the fraction of the compounds
# listed in the binders file that are among the 2000 lowest predicted scores.
predictions_file_path = 'round1_1000_predicted_results.csv'
binders_file_path = '../../../../binders_docking.csv'
# Calculate Recall
recall_value = calculate_recall(predictions_file_path, binders_file_path, top_n=2000)
print("Recall:", recall_value)

Recall: 0.291


ROUND 2

In [49]:
# Unblind the scores for these 1000 compounds and save as training set.
# select_scores inner-joins the two files on 'Name' and writes Name, SMILES, Score.
input_csv_path = '../../../../7nsw_all_hybrid.csv'
cmpds_csv_path = 'round2_1000_cmpds.csv'
output_csv_path = 'round2_1000_train_cmpds.csv'

select_scores(input_csv_path, cmpds_csv_path, output_csv_path)

Columns of merged DataFrame:
Index(['Name', 'SMILES_x', 'Score', 'SMILES_y', 'Predicted_Score',
       'Uncertainty'],
      dtype='object')

Selected DataFrame:
                Name                                           SMILES_x  \
0    NCGC00373265-01          CC(C)C1=NO[C@]2(C1)CCC[N@H+](C2)Cc3ccccn3   
1    NCGC00373272-01  CC(C)C1=NO[C@@]2(C1)CCC[N@@](C2)[S@@+](=O)(c3c...   
2    NCGC00373287-01      c1ccc(cc1)C2=NO[C@]3(C2)CCCN(C3)C(=O)C4CCCCC4   
3    NCGC00373391-01  CC(C)C1=NO[C@]2(C1)CC[N@@](C2)[S@@+](=O)(c3ccc...   
4    NCGC00373394-01           CC(C)C1=NO[C@]2(C1)CC[N@H+](C2)Cc3ccccc3   
..               ...                                                ...   
995  NCGC00480138-01   CC(=O)N1CC2(C1)Cc3ccccc3[N@](C2)[S@+](=O)(C)[O-]   
996  NCGC00480137-01  COC(=O)N1CC2(C1)Cc3ccccc3[N@](C2)[S@+](=O)(C)[O-]   
997  NCGC00480211-01    c1ccc(cc1)CC2=NOC3(C2)CCN(CC3)C(=O)c4cccc(c4)Cl   
998  NCGC00480232-01  C[S@@+](=O)([N@]1CC2(Cc3c1cccc3)CN(C2)C(=O)Cc4...   
999  NCGC0048

In [50]:
# Extract the descriptors and append them to the training set.
# Note: append_descriptors_to_csv writes its result back to compounds_csv_path (in place).
compounds_csv_path = 'round2_1000_train_cmpds.csv'
descriptors_csv_path = '../../../../docked_ecfp.csv'

append_descriptors_to_csv(compounds_csv_path, descriptors_csv_path)

Descriptors appended to round2_1000_train_cmpds.csv


In [51]:
# Concatenate the train sets together.
# Note: output_path is the same file as file2_path, so the round 2 file is
# overwritten with the combined training set (previous rounds + round 2).
file1_path = 'round1_1000_train_cmpds.csv'
file2_path = 'round2_1000_train_cmpds.csv'
output_path = 'round2_1000_train_cmpds.csv'

append_train_compounds(file1_path, file2_path, output_path)

Concatenated compounds saved to round2_1000_train_cmpds.csv


In [53]:
# Retrain the model using this training file.
# Every column other than 'Name', 'SMILES' and 'Score' is used as a model feature.
csv_file_path = 'round2_1000_train_cmpds.csv'
trained_gpr = train_gpr_model(csv_file_path)

In [54]:
# Remove the training set from the previous round's test set.
# Save the remaining compounds ('Name' and 'SMILES' only) as the next round's test set.
input_csv_path = 'round1_1000_test_cmpds.csv'
compounds_to_remove_csv_path = 'round2_1000_train_cmpds.csv'
output_csv_path = 'round2_1000_test_cmpds.csv'

remove_train_compounds(input_csv_path, compounds_to_remove_csv_path, output_csv_path)

Compounds removed and remaining Name and SMILES saved to round2_1000_test_cmpds.csv


In [55]:
# Predict on the entire library (the full descriptor file, not only the remaining test set).
csv_file_to_predict = '../../../../docked_ecfp.csv'
output_csv_file = 'round2_1000_predicted_results.csv'
predict_and_save_results(trained_gpr, csv_file_to_predict, output_csv_file)

In [56]:
# Pick the next 1000 training compounds: among the compounds shared by the predictions
# and the remaining test set, take those with the lowest predicted score.
file1 = 'round2_1000_predicted_results.csv'
file2 = 'round2_1000_test_cmpds.csv'
output_csv = 'round3_1000_cmpds.csv'
strategy = 'greedy'  # or 'uncertain'
top_n = 1000  # or None for all

merge_and_select_shared_compounds(file1, file2, output_csv, strategy, top_n)

In [58]:
# Calculate recall from the round 2 predictions file: the fraction of the compounds
# listed in the binders file that are among the 2000 lowest predicted scores.
predictions_file_path = 'round2_1000_predicted_results.csv'
binders_file_path = '../../../../binders_docking.csv'
# Calculate Recall
recall_value = calculate_recall(predictions_file_path, binders_file_path, top_n=2000)
print("Recall:", recall_value)

Recall: 0.3515


ROUND 3

In [60]:
# Unblind the scores for these 1000 compounds and save as training set.
# select_scores inner-joins the two files on 'Name' and writes Name, SMILES, Score.
input_csv_path = '../../../../7nsw_all_hybrid.csv'
cmpds_csv_path = 'round3_1000_cmpds.csv'
output_csv_path = 'round3_1000_train_cmpds.csv'

select_scores(input_csv_path, cmpds_csv_path, output_csv_path)

Columns of merged DataFrame:
Index(['Name', 'SMILES_x', 'Score', 'SMILES_y', 'Predicted_Score',
       'Uncertainty'],
      dtype='object')

Selected DataFrame:
                Name                                           SMILES_x  \
0    NCGC00373273-01  CC(C)C1=NO[C@@]2(C1)CCC[N@@](C2)[S@@+](=O)(c3c...   
1    NCGC00373284-01  COc1ccc(cc1)C[N@@H+]2CCC[C@@]3(C2)CC(=NO3)c4cc...   
2    NCGC00373286-01     c1ccc(cc1)C2=NO[C@]3(C2)CCC[N@H+](C3)CC4CCCCC4   
3    NCGC00373283-01    c1ccc(cc1)CC(=O)N2CCC[C@@]3(C2)CC(=NO3)c4ccccc4   
4    NCGC00373296-01      c1ccnc(c1)C2=NO[C@]3(C2)CCC[N@H+](C3)CC4CCCC4   
..               ...                                                ...   
995  NCGC00479622-01   Cc1ccc(cc1)n2c(=O)n3c(n2)CN(CC3)C(=O)c4ccc(cc4)F   
996  NCGC00479729-01                CN(Cc1cccc(c1)C(F)(F)F)C(=O)C2CCCC2   
997  NCGC00479741-01                 CN(Cc1ccc(cc1)F)C(=O)Cc2ccc(cc2)OC   
998  NCGC00479864-01      c1cn2cc(cc(c2n1)C(=O)N3CCC(CC3)N4CCCCCC4=O)Br   
999  NCGC0048

In [62]:
# Extract the descriptors and append them to the training set.
# Note: append_descriptors_to_csv writes its result back to compounds_csv_path (in place).
compounds_csv_path = 'round3_1000_train_cmpds.csv'
descriptors_csv_path = '../../../../docked_ecfp.csv'

append_descriptors_to_csv(compounds_csv_path, descriptors_csv_path)

Descriptors appended to round3_1000_train_cmpds.csv


In [63]:
# Concatenate the train sets together.
# Note: output_path is the same file as file2_path, so the round 3 file is
# overwritten with the combined training set (previous rounds + round 3).
file1_path = 'round2_1000_train_cmpds.csv'
file2_path = 'round3_1000_train_cmpds.csv'
output_path = 'round3_1000_train_cmpds.csv'

append_train_compounds(file1_path, file2_path, output_path)

Concatenated compounds saved to round3_1000_train_cmpds.csv


In [65]:
# Retrain the model using this training file.
# Every column other than 'Name', 'SMILES' and 'Score' is used as a model feature.
csv_file_path = 'round3_1000_train_cmpds.csv'
trained_gpr = train_gpr_model(csv_file_path)

In [66]:
# Remove the training set from the previous round's test set.
# Save the remaining compounds ('Name' and 'SMILES' only) as the next round's test set.
input_csv_path = 'round2_1000_test_cmpds.csv'
compounds_to_remove_csv_path = 'round3_1000_train_cmpds.csv'
output_csv_path = 'round3_1000_test_cmpds.csv'

remove_train_compounds(input_csv_path, compounds_to_remove_csv_path, output_csv_path)

Compounds removed and remaining Name and SMILES saved to round3_1000_test_cmpds.csv


In [67]:
# Predict on the entire library (the full descriptor file, not only the remaining test set).
csv_file_to_predict = '../../../../docked_ecfp.csv'
output_csv_file = 'round3_1000_predicted_results.csv'
predict_and_save_results(trained_gpr, csv_file_to_predict, output_csv_file)

In [68]:
# Pick the next 1000 training compounds: among the compounds shared by the predictions
# and the remaining test set, take those with the lowest predicted score.
file1 = 'round3_1000_predicted_results.csv'
file2 = 'round3_1000_test_cmpds.csv'
output_csv = 'round4_1000_cmpds.csv'
strategy = 'greedy'  # or 'uncertain'
top_n = 1000  # or None for all

merge_and_select_shared_compounds(file1, file2, output_csv, strategy, top_n)

In [69]:
# Calculate recall from the round 3 predictions file: the fraction of the compounds
# listed in the binders file that are among the 2000 lowest predicted scores.
predictions_file_path = 'round3_1000_predicted_results.csv'
binders_file_path = '../../../../binders_docking.csv'
# Calculate Recall
recall_value = calculate_recall(predictions_file_path, binders_file_path, top_n=2000)
print("Recall:", recall_value)

Recall: 0.3995


ROUND 4

In [70]:
# Unblind the scores for these 1000 compounds and save as training set.
# select_scores inner-joins the two files on 'Name' and writes Name, SMILES, Score.
input_csv_path = '../../../../7nsw_all_hybrid.csv'
cmpds_csv_path = 'round4_1000_cmpds.csv'
output_csv_path = 'round4_1000_train_cmpds.csv'

select_scores(input_csv_path, cmpds_csv_path, output_csv_path)

Columns of merged DataFrame:
Index(['Name', 'SMILES_x', 'Score', 'SMILES_y', 'Predicted_Score',
       'Uncertainty'],
      dtype='object')

Selected DataFrame:
                Name                                           SMILES_x  \
0    NCGC00038620-02    Cc1ccc(c(c1)Cl)NC(=O)CN2C(=O)C3(CCC(CC3)C)NC2=O   
1    NCGC00373271-01  CC(C)C1=NO[C@@]2(C1)CCC[N@@](C2)[S@@+](=O)(c3c...   
2    NCGC00373276-01  c1ccc(cc1)C2=NO[C@]3(C2)CCC[N@](C3)[S@@+](=O)(...   
3    NCGC00373285-01  c1ccc(cc1)C2=NO[C@]3(C2)CCC[N@H+](C3)Cc4ccc(cc...   
4    NCGC00373329-01  c1cc(cc(c1)C(F)(F)F)C2=NO[C@]3(C2)CCC[N@H+](C3...   
..               ...                                                ...   
995  NCGC00480253-01        c1cc(ccc1C(=O)N2CCn3c(nnc3C4CC4)C2)C(F)(F)F   
996  NCGC00480215-01  c1ccc(cc1)CC2=NOC3(C2)CCN(CC3)[S@@+](=O)(c4ccc...   
997  NCGC00480343-01  Cc1nc2c(c(n1)c3ccc(cc3)Cl)CCN(CC2)C(=O)Cc4ccc(...   
998  NCGC00480665-01           Cc1cc2ncc(cn2n1)C(=O)N3CCC(CC3)Cc4ccccc4   
999  NCGC0048

In [72]:
# Extract the descriptors and append them to the training set.
# Note: append_descriptors_to_csv writes its result back to compounds_csv_path (in place).
compounds_csv_path = 'round4_1000_train_cmpds.csv'
descriptors_csv_path = '../../../../docked_ecfp.csv'

append_descriptors_to_csv(compounds_csv_path, descriptors_csv_path)

Descriptors appended to round4_1000_train_cmpds.csv


In [74]:
# Concatenate the train sets together.
# Note: output_path is the same file as file2_path, so the round 4 file is
# overwritten with the combined training set (previous rounds + round 4).
file1_path = 'round3_1000_train_cmpds.csv'
file2_path = 'round4_1000_train_cmpds.csv'
output_path = 'round4_1000_train_cmpds.csv'

append_train_compounds(file1_path, file2_path, output_path)

Concatenated compounds saved to round4_1000_train_cmpds.csv


In [77]:
# Retrain the model using this training file.
# Every column other than 'Name', 'SMILES' and 'Score' is used as a model feature.
csv_file_path = 'round4_1000_train_cmpds.csv'
trained_gpr = train_gpr_model(csv_file_path)

In [78]:
# Remove the training set from the previous round's test set.
# Save the remaining compounds ('Name' and 'SMILES' only) as the next round's test set.
input_csv_path = 'round3_1000_test_cmpds.csv'
compounds_to_remove_csv_path = 'round4_1000_train_cmpds.csv'
output_csv_path = 'round4_1000_test_cmpds.csv'

remove_train_compounds(input_csv_path, compounds_to_remove_csv_path, output_csv_path)

Compounds removed and remaining Name and SMILES saved to round4_1000_test_cmpds.csv


In [79]:
# Predict on the entire library (the full descriptor file, not only the remaining test set).
csv_file_to_predict = '../../../../docked_ecfp.csv'
output_csv_file = 'round4_1000_predicted_results.csv'
predict_and_save_results(trained_gpr, csv_file_to_predict, output_csv_file)

In [80]:
# Pick the next 1000 training compounds: among the compounds shared by the predictions
# and the remaining test set, take those with the lowest predicted score.
file1 = 'round4_1000_predicted_results.csv'
file2 = 'round4_1000_test_cmpds.csv'
output_csv = 'round5_1000_cmpds.csv'
strategy = 'greedy'  # or 'uncertain'
top_n = 1000  # or None for all

merge_and_select_shared_compounds(file1, file2, output_csv, strategy, top_n)

In [81]:
# Calculate recall from the round 4 predictions file: the fraction of the compounds
# listed in the binders file that are among the 2000 lowest predicted scores.
predictions_file_path = 'round4_1000_predicted_results.csv'
binders_file_path = '../../../../binders_docking.csv'
# Calculate Recall
recall_value = calculate_recall(predictions_file_path, binders_file_path, top_n=2000)
print("Recall:", recall_value)

FileNotFoundError: [Errno 2] No such file or directory: '../../../../binders_docking.csv'