In [1]:
# import packages
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from typing import List
from rdkit import DataStructs, Chem
from rdkit.Chem import MolFromSmiles, AllChem
from rdkit.DataStructs.cDataStructs import ExplicitBitVect
from sklearn import gaussian_process
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Kernel

In [2]:
def random_strategy(docked_file, output_csv, subset_size=None, random_seed=None):
    """
    Write a random sample of rows from a docking-results CSV to a new CSV.

    Parameters:
    - docked_file (str): CSV file holding the pool of compounds to draw from.
    - output_csv (str): Destination CSV; written without the index column.
    - subset_size (int or None): Number of rows to draw with DataFrame.sample. When None, every row is
      written through unchanged (default is None).
    - random_seed (int or None): Forwarded to pandas as ``random_state``. None leaves the draw unseeded,
      so repeated calls pick different rows (default is None).
    """
    pool = pd.read_csv(docked_file)

    # Without a requested size the whole table is passed through untouched.
    chosen = pool if subset_size is None else pool.sample(n=subset_size, random_state=random_seed)

    chosen.to_csv(output_csv, index=False)

In [3]:
def select_scores(input_csv_path, cmpds_csv_path, output_csv_path):
    """
    Look up ("unblind") the scores of a chosen set of compounds.

    Keeps the rows of the scored table whose 'Name' also appears in the
    chosen-compounds table and writes their 'Name', 'SMILES' and 'Score'
    columns to a new CSV.

    Parameters:
    - input_csv_path (str): CSV that must contain 'Name', 'SMILES' and 'Score' columns.
    - cmpds_csv_path (str): CSV listing the chosen compounds. Only its 'Name' column is used;
      any other columns (SMILES, fingerprint bits, ...) are ignored.
    - output_csv_path (str): Destination CSV with columns Name, SMILES, Score (no index column).

    Raises:
    - KeyError: if a required column is missing from either input file.
    """
    # Load the full scored table and the table of chosen compounds
    all_compounds_df = pd.read_csv(input_csv_path)
    cmpds_df = pd.read_csv(cmpds_csv_path)

    # Join on the key column only. Merging the full right-hand table would make
    # pandas rename every shared column ('SMILES' -> 'SMILES_x', 'Score' -> 'Score_x'),
    # so the result's column names would depend on which columns the
    # chosen-compounds file happens to carry.
    merged_df = pd.merge(all_compounds_df, cmpds_df[['Name']], how='inner', on='Name')

    # SMILES and Score always come from the scored table
    selected_df = merged_df[['Name', 'SMILES', 'Score']]

    # Save the resulting DataFrame to a new CSV file
    selected_df.to_csv(output_csv_path, index=False)

    print(f"Selected compounds saved to {output_csv_path}")


In [4]:
def append_descriptors_to_csv(compounds_csv_path, descriptors_csv_path):
    """
    Add descriptor columns to a compounds CSV, rewriting that CSV in place.

    Every column of the descriptors file that the compounds file does not
    already have is appended to the right of the existing columns, matched
    row-by-row on 'Name'. Columns present in both files (e.g. 'SMILES') keep
    the value from the compounds file. Compounds without a match in the
    descriptors file are kept, with missing values in the new columns
    (left join).

    Parameters:
    - compounds_csv_path (str): CSV with a 'Name' column; overwritten with the merged table.
    - descriptors_csv_path (str): CSV with a 'Name' column plus descriptor columns.

    Raises:
    - KeyError: if either file has no 'Name' column.
    """
    # Read both tables
    compounds_df = pd.read_csv(compounds_csv_path)
    descriptors_df = pd.read_csv(descriptors_csv_path)

    # Bring in only the columns that are actually new. Merging everything with a
    # '_descriptor' suffix and then dropping columns whose name contains
    # '_descriptor' would also delete genuine columns that merely have that text
    # in their name.
    already_present = set(compounds_df.columns)
    new_columns = [col for col in descriptors_df.columns
                   if col != 'Name' and col not in already_present]

    # Left join keeps every compound row, in its original order
    merged_df = pd.merge(compounds_df, descriptors_df[['Name'] + new_columns], on='Name', how='left')

    # Save the updated DataFrame to the same CSV file
    merged_df.to_csv(compounds_csv_path, index=False)

    print(f"Descriptors appended to {compounds_csv_path}")

Set up Gaussian Process Regressor model

In [5]:
# Function to calculate the tanimoto similarity for the Gaussian process kernel prediction
def tanimoto_similarity(a, b):
    """Computes the Tanimoto similarity for all pairs of rows.

    For a row x of `a` and a row y of `b` the value is
    sum(x * y) / (sum(x) + sum(y) - sum(x * y)); for 0/1 fingerprints this is
    the number of shared on-bits divided by the number of bits set in either.

    When that denominator is zero (both rows sum to zero, e.g. two empty
    fingerprints) the similarity is defined as 1.0 instead of producing NaN,
    so a row compared with itself always scores 1.

    Args:
      a: Numpy array with shape [batch_size_a, num_features].
      b: Numpy array with shape [batch_size_b, num_features].

    Returns:
      Numpy array with shape [batch_size_a, batch_size_b].
    """
    aa = np.sum(a, axis=1, keepdims=True)
    bb = np.sum(b, axis=1, keepdims=True)
    ab = np.matmul(a, b.T)
    denom = aa + bb.T - ab
    # Silence the 0/0 warning here; those entries are overwritten just below.
    with np.errstate(divide='ignore', invalid='ignore'):
        sim = np.true_divide(ab, denom)
    sim[denom == 0] = 1.0
    return sim

In [6]:
class TanimotoKernel(gaussian_process.kernels.NormalizedKernelMixin,
                     gaussian_process.kernels.StationaryKernelMixin,
                     gaussian_process.kernels.Kernel):
  """Gaussian process kernel whose entries are pairwise Tanimoto similarities.

  The kernel takes no constructor arguments and does not implement gradients.
  """

  def __init__(self):
    """No-op constructor; an explicit __init__ is required by get_params()."""

  def __call__(self, X, Y=None, eval_gradient=False):  # pylint: disable=invalid-name
    """Evaluates the kernel between the rows of X and the rows of Y.

    Args:
      X: Numpy array with shape [batch_size_a, num_features].
      Y: Numpy array with shape [batch_size_b, num_features]. When omitted,
        X is compared against itself.
      eval_gradient: Must be falsy; gradients are not available.

    Returns:
      Numpy array with shape [batch_size_a, batch_size_b].

    Raises:
      NotImplementedError: If eval_gradient is True.
    """
    if eval_gradient:
      raise NotImplementedError
    other = X if Y is None else Y
    return tanimoto_similarity(X, other)

In [7]:
def train_gpr_model(csv_file_path):
    """
    Fit a Gaussian process regressor with the Tanimoto kernel on a training CSV.

    Parameters:
    - csv_file_path (str): CSV with 'Name', 'SMILES' and 'Score' columns; every other column is
      used as an input feature and 'Score' is the regression target.

    Returns:
    - The fitted GaussianProcessRegressor.
    """
    training_table = pd.read_csv(csv_file_path)

    # Everything except the identifier, structure and target columns is a feature.
    X_train = np.array(training_table.drop(columns=['Name', 'SMILES', 'Score']))

    # Docking scores of the training compounds are the regression target.
    y_train = training_table['Score']

    model = GaussianProcessRegressor(kernel=TanimotoKernel(), n_restarts_optimizer=100)
    return model.fit(X_train, y_train)

In [8]:
def remove_train_compounds(input_csv_path, compounds_to_remove_csv_path, output_csv_path):
    """
    Write the rows of one CSV whose 'Name' does not occur in another CSV.

    Parameters:
    - input_csv_path (str): CSV holding the pool of compounds (needs a 'Name' column).
    - compounds_to_remove_csv_path (str): CSV whose 'Name' column lists the compounds to exclude.
    - output_csv_path (str): Destination CSV for the remaining rows (no index column).
    """
    pool_df = pd.read_csv(input_csv_path)
    excluded_names = pd.read_csv(compounds_to_remove_csv_path)['Name']

    # True for every pool row that is NOT named in the exclusion list.
    keep_mask = ~pool_df['Name'].isin(excluded_names)

    pool_df[keep_mask].to_csv(output_csv_path, index=False)

    print(f"Compounds removed and remaining compounds saved to {output_csv_path}")

In [9]:
def predict_and_save_results(gpr, csv_file, output_csv):
    """
    Run a fitted regressor over a CSV of compounds and save its predictions.

    Parameters:
    - gpr: Fitted model exposing ``predict(X, return_std=True)`` that returns (mean, std).
    - csv_file (str): CSV with 'Name' and 'SMILES' columns; every other column is fed to the
      model as a feature.
    - output_csv (str): Destination CSV with columns Name, SMILES, Predicted_Score, Uncertainty
      (no index column).
    """
    data = pd.read_csv(csv_file)

    # Feature matrix: all columns apart from the identifier and the structure string.
    X_test = np.array(data.drop(columns=['Name', 'SMILES']))

    # Predictive mean and standard deviation for every row.
    y_pred, sigma = gpr.predict(X_test, return_std=True)

    output_data = data[['Name', 'SMILES']].assign(Predicted_Score=y_pred, Uncertainty=sigma)
    output_data.to_csv(output_csv, index=False)

In [10]:
def append_train_compounds(file1_path, file2_path, output_path):
    """
    Stack the rows of two CSV files and write the combined table to a CSV.

    Rows of the first file come first, followed by the rows of the second.
    Both inputs are read completely before anything is written, so
    ``output_path`` may be the same file as one of the inputs.

    Parameters:
    - file1_path (str): Path to the first CSV file.
    - file2_path (str): Path to the second CSV file.
    - output_path (str): Path to save the concatenated CSV file (no index column).
    """
    frames = [pd.read_csv(path) for path in (file1_path, file2_path)]

    # ignore_index renumbers the rows 0..n-1 across both inputs.
    pd.concat(frames, ignore_index=True).to_csv(output_path, index=False)

    print(f"Concatenated compounds saved to {output_path}")

_________________________________________________________________________________________________________

In [11]:
# Round 0: choose a random subset of compounds to start with.
docked_file_path = '../../../docked_ecfp.csv'  # pool of compounds to draw from
output_csv_path = 'round0_100_ecfp.csv'  # where the drawn rows are written
subset_size_to_select = 100  # Set the desired subset size

# Call the function
# NOTE(review): no random_seed is passed, so the draw is unseeded and a re-run
# selects a different subset (and overwrites output_csv_path).
random_strategy(docked_file_path, output_csv_path, subset_size=subset_size_to_select)


In [12]:
# Unblind the scores for these 100 randomly chosen compounds and save as training set.
input_csv_path = '../../../7nsw_all_hybrid.csv'  # table the scores are looked up in
cmpds_csv_path = 'round0_100_ecfp.csv'  # the subset drawn in the previous step
output_csv_path = 'round0_100_train_cmpds.csv'  # receives Name, SMILES, Score for the subset

select_scores(input_csv_path, cmpds_csv_path, output_csv_path)

Selected compounds saved to round0_100_train_cmpds.csv


In [13]:
!head -n 2 round0_100_train_cmpds.csv

Name,SMILES,Score
NCGC00373801-01,c1cc(cc(c1)c2cccs2)C[NH3+],-9.1241


In [14]:
# Extract the descriptors and append them to the training set.
# Note: append_descriptors_to_csv rewrites compounds_csv_path in place.
compounds_csv_path = 'round0_100_train_cmpds.csv'
descriptors_csv_path = '../../../docked_ecfp.csv'  # source of the descriptor columns

append_descriptors_to_csv(compounds_csv_path, descriptors_csv_path)

Descriptors appended to round0_100_train_cmpds.csv


In [15]:
!head -n 2 round0_100_train_cmpds.csv

Name,SMILES,Score,Morgan_bit0,Morgan_bit1,Morgan_bit2,Morgan_bit3,Morgan_bit4,Morgan_bit5,Morgan_bit6,Morgan_bit7,Morgan_bit8,Morgan_bit9,Morgan_bit10,Morgan_bit11,Morgan_bit12,Morgan_bit13,Morgan_bit14,Morgan_bit15,Morgan_bit16,Morgan_bit17,Morgan_bit18,Morgan_bit19,Morgan_bit20,Morgan_bit21,Morgan_bit22,Morgan_bit23,Morgan_bit24,Morgan_bit25,Morgan_bit26,Morgan_bit27,Morgan_bit28,Morgan_bit29,Morgan_bit30,Morgan_bit31,Morgan_bit32,Morgan_bit33,Morgan_bit34,Morgan_bit35,Morgan_bit36,Morgan_bit37,Morgan_bit38,Morgan_bit39,Morgan_bit40,Morgan_bit41,Morgan_bit42,Morgan_bit43,Morgan_bit44,Morgan_bit45,Morgan_bit46,Morgan_bit47,Morgan_bit48,Morgan_bit49,Morgan_bit50,Morgan_bit51,Morgan_bit52,Morgan_bit53,Morgan_bit54,Morgan_bit55,Morgan_bit56,Morgan_bit57,Morgan_bit58,Morgan_bit59,Morgan_bit60,Morgan_bit61,Morgan_bit62,Morgan_bit63,Morgan_bit64,Morgan_bit65,Morgan_bit66,Morgan_bit67,Morgan_bit68,Morgan_bit69,Morgan_bit70,Morgan_bit71,Morgan_bit72,Morgan_bit73,Morgan_bit74,Morgan_bit75,Morg

In [16]:
# Train our model using this training file.
# The fitted regressor is kept in `trained_gpr` and used by the prediction cell below.
csv_file_path = 'round0_100_train_cmpds.csv'
trained_gpr = train_gpr_model(csv_file_path)


In [17]:
# Remove the training data from the overall Nanoluc library
# Save the remainder of compounds as test set
input_csv_path = '../../../docked_ecfp.csv'  # full library
compounds_to_remove_csv_path = 'round0_100_train_cmpds.csv'  # rows are matched on 'Name'
output_csv_path = 'round0_100_test_cmpds.csv'

remove_train_compounds(input_csv_path, compounds_to_remove_csv_path, output_csv_path)


Compounds removed and remaining compounds saved to round0_100_test_cmpds.csv


In [18]:
# Predict on the test set
# Writes Name, SMILES, Predicted_Score and Uncertainty for every test compound.
csv_file_to_predict = 'round0_100_test_cmpds.csv'
output_csv_file = 'round0_100_predicted_results.csv'
predict_and_save_results(trained_gpr, csv_file_to_predict, output_csv_file)


In [19]:
# Choose the next random subset of compounds from the test set
# (drawn from the remaining test compounds, not from the prediction file).
docked_file_path = 'round0_100_test_cmpds.csv'
output_csv_path = 'round1_100_cmpds.csv'
subset_size_to_select = 100  # Set the desired subset size

# Call the function
# NOTE(review): no random_seed is passed, so a re-run selects a different subset.
random_strategy(docked_file_path, output_csv_path, subset_size=subset_size_to_select)


In [20]:
!head -n 2 round1_100_cmpds.csv

Name,SMILES,Morgan_bit0,Morgan_bit1,Morgan_bit2,Morgan_bit3,Morgan_bit4,Morgan_bit5,Morgan_bit6,Morgan_bit7,Morgan_bit8,Morgan_bit9,Morgan_bit10,Morgan_bit11,Morgan_bit12,Morgan_bit13,Morgan_bit14,Morgan_bit15,Morgan_bit16,Morgan_bit17,Morgan_bit18,Morgan_bit19,Morgan_bit20,Morgan_bit21,Morgan_bit22,Morgan_bit23,Morgan_bit24,Morgan_bit25,Morgan_bit26,Morgan_bit27,Morgan_bit28,Morgan_bit29,Morgan_bit30,Morgan_bit31,Morgan_bit32,Morgan_bit33,Morgan_bit34,Morgan_bit35,Morgan_bit36,Morgan_bit37,Morgan_bit38,Morgan_bit39,Morgan_bit40,Morgan_bit41,Morgan_bit42,Morgan_bit43,Morgan_bit44,Morgan_bit45,Morgan_bit46,Morgan_bit47,Morgan_bit48,Morgan_bit49,Morgan_bit50,Morgan_bit51,Morgan_bit52,Morgan_bit53,Morgan_bit54,Morgan_bit55,Morgan_bit56,Morgan_bit57,Morgan_bit58,Morgan_bit59,Morgan_bit60,Morgan_bit61,Morgan_bit62,Morgan_bit63,Morgan_bit64,Morgan_bit65,Morgan_bit66,Morgan_bit67,Morgan_bit68,Morgan_bit69,Morgan_bit70,Morgan_bit71,Morgan_bit72,Morgan_bit73,Morgan_bit74,Morgan_bit75,Morgan_bit

ROUND 1

In [21]:
# Unblind the scores for these 100 randomly chosen compounds and save as training set.
input_csv_path = '../../../7nsw_all_hybrid.csv'
cmpds_csv_path = 'round1_100_cmpds.csv'
output_csv_path = 'round1_100_train_cmpds.csv'

select_scores(input_csv_path, cmpds_csv_path, output_csv_path)

Selected compounds saved to round1_100_train_cmpds.csv


In [22]:
!head -n 2 round1_100_train_cmpds.csv

Name,SMILES,Score
NCGC00373360-01,CC(C)CC[N@@H+]1CCC[C@@]2(C1)CC(=NO2)c3cccnc3,-12.2036


In [23]:
# Extract the descriptors and append them to the training set.
compounds_csv_path = 'round1_100_train_cmpds.csv'
descriptors_csv_path = '../../../docked_ecfp.csv'

append_descriptors_to_csv(compounds_csv_path, descriptors_csv_path)

Descriptors appended to round1_100_train_cmpds.csv


In [24]:
!head -n 2 round1_100_train_cmpds.csv

Name,SMILES,Score,Morgan_bit0,Morgan_bit1,Morgan_bit2,Morgan_bit3,Morgan_bit4,Morgan_bit5,Morgan_bit6,Morgan_bit7,Morgan_bit8,Morgan_bit9,Morgan_bit10,Morgan_bit11,Morgan_bit12,Morgan_bit13,Morgan_bit14,Morgan_bit15,Morgan_bit16,Morgan_bit17,Morgan_bit18,Morgan_bit19,Morgan_bit20,Morgan_bit21,Morgan_bit22,Morgan_bit23,Morgan_bit24,Morgan_bit25,Morgan_bit26,Morgan_bit27,Morgan_bit28,Morgan_bit29,Morgan_bit30,Morgan_bit31,Morgan_bit32,Morgan_bit33,Morgan_bit34,Morgan_bit35,Morgan_bit36,Morgan_bit37,Morgan_bit38,Morgan_bit39,Morgan_bit40,Morgan_bit41,Morgan_bit42,Morgan_bit43,Morgan_bit44,Morgan_bit45,Morgan_bit46,Morgan_bit47,Morgan_bit48,Morgan_bit49,Morgan_bit50,Morgan_bit51,Morgan_bit52,Morgan_bit53,Morgan_bit54,Morgan_bit55,Morgan_bit56,Morgan_bit57,Morgan_bit58,Morgan_bit59,Morgan_bit60,Morgan_bit61,Morgan_bit62,Morgan_bit63,Morgan_bit64,Morgan_bit65,Morgan_bit66,Morgan_bit67,Morgan_bit68,Morgan_bit69,Morgan_bit70,Morgan_bit71,Morgan_bit72,Morgan_bit73,Morgan_bit74,Morgan_bit75,Morg

In [25]:
!wc -l round0_100_train_cmpds.csv

101 round0_100_train_cmpds.csv


In [26]:
!wc -l round1_100_train_cmpds.csv

101 round1_100_train_cmpds.csv


In [27]:
# Concatenate the train sets together.
# NOTE(review): output_path is the same file as file2_path, so the round-1 file is
# replaced by the cumulative training set. Re-running this cell would append the
# round-0 rows a second time.
file1_path = 'round0_100_train_cmpds.csv'
file2_path = 'round1_100_train_cmpds.csv'
output_path = 'round1_100_train_cmpds.csv'

append_train_compounds(file1_path, file2_path, output_path)

Concatenated compounds saved to round1_100_train_cmpds.csv


In [28]:
!wc -l round1_100_train_cmpds.csv

201 round1_100_train_cmpds.csv


In [29]:
# Retrain the model using this training file.
csv_file_path = 'round1_100_train_cmpds.csv'
trained_gpr = train_gpr_model(csv_file_path)

In [30]:
!wc -l ../../../docked_ecfp.csv

41905 ../../../docked_ecfp.csv


In [31]:
!wc -l round0_100_test_cmpds.csv

41805 round0_100_test_cmpds.csv


In [32]:
# Remove the training set from the test set
# Save the remainder of compounds as the next round's test set
input_csv_path = 'round0_100_test_cmpds.csv'
compounds_to_remove_csv_path = 'round1_100_train_cmpds.csv'
output_csv_path = 'round1_100_test_cmpds.csv'

remove_train_compounds(input_csv_path, compounds_to_remove_csv_path, output_csv_path)


Compounds removed and remaining compounds saved to round1_100_test_cmpds.csv


In [33]:
!wc -l round1_100_test_cmpds.csv

41705 round1_100_test_cmpds.csv


In [34]:
# Predict on the test set
csv_file_to_predict = 'round1_100_test_cmpds.csv'
output_csv_file = 'round1_100_predicted_results.csv'
predict_and_save_results(trained_gpr, csv_file_to_predict, output_csv_file)

In [35]:
# Choose the next random subset of compounds from the test set
docked_file_path = 'round1_100_test_cmpds.csv'
output_csv_path = 'round2_100_cmpds.csv'
subset_size_to_select = 100  # Set the desired subset size

# Call the function
random_strategy(docked_file_path, output_csv_path, subset_size=subset_size_to_select)

ROUND 2

In [36]:
# Unblind the scores for these 100 randomly chosen compounds and save as training set.
input_csv_path = '../../../7nsw_all_hybrid.csv'
cmpds_csv_path = 'round2_100_cmpds.csv'
output_csv_path = 'round2_100_train_cmpds.csv'

select_scores(input_csv_path, cmpds_csv_path, output_csv_path)

Selected compounds saved to round2_100_train_cmpds.csv


In [37]:
# Extract the descriptors and append them to the training set.
compounds_csv_path = 'round2_100_train_cmpds.csv'
descriptors_csv_path = '../../../docked_ecfp.csv'

append_descriptors_to_csv(compounds_csv_path, descriptors_csv_path)

Descriptors appended to round2_100_train_cmpds.csv


In [38]:
# Concatenate the train sets together
file1_path = 'round1_100_train_cmpds.csv'
file2_path = 'round2_100_train_cmpds.csv'
output_path = 'round2_100_train_cmpds.csv'

append_train_compounds(file1_path, file2_path, output_path)

Concatenated compounds saved to round2_100_train_cmpds.csv


In [39]:
# Retrain the model using this training file.
csv_file_path = 'round2_100_train_cmpds.csv'
trained_gpr = train_gpr_model(csv_file_path)

In [40]:
# Remove the training set from the test set
# Save the remainder of compounds as the next round's test set
input_csv_path = 'round1_100_test_cmpds.csv'
compounds_to_remove_csv_path = 'round2_100_train_cmpds.csv'
output_csv_path = 'round2_100_test_cmpds.csv'

remove_train_compounds(input_csv_path, compounds_to_remove_csv_path, output_csv_path)


Compounds removed and remaining compounds saved to round2_100_test_cmpds.csv


In [41]:
# Predict on the test set
csv_file_to_predict = 'round2_100_test_cmpds.csv'
output_csv_file = 'round2_100_predicted_results.csv'
predict_and_save_results(trained_gpr, csv_file_to_predict, output_csv_file)

In [42]:
# Choose the next random subset of compounds from the test set
docked_file_path = 'round2_100_test_cmpds.csv'
output_csv_path = 'round3_100_cmpds.csv'
subset_size_to_select = 100  # Set the desired subset size

# Call the function
random_strategy(docked_file_path, output_csv_path, subset_size=subset_size_to_select)

ROUND 3

In [43]:
# Unblind the scores for these 100 randomly chosen compounds and save as training set.
input_csv_path = '../../../7nsw_all_hybrid.csv'
cmpds_csv_path = 'round3_100_cmpds.csv'
output_csv_path = 'round3_100_train_cmpds.csv'

select_scores(input_csv_path, cmpds_csv_path, output_csv_path)

Selected compounds saved to round3_100_train_cmpds.csv


In [44]:
# Extract the descriptors and append them to the training set.
compounds_csv_path = 'round3_100_train_cmpds.csv'
descriptors_csv_path = '../../../docked_ecfp.csv'

append_descriptors_to_csv(compounds_csv_path, descriptors_csv_path)

Descriptors appended to round3_100_train_cmpds.csv


In [45]:
# Concatenate the train sets together
file1_path = 'round2_100_train_cmpds.csv'
file2_path = 'round3_100_train_cmpds.csv'
output_path = 'round3_100_train_cmpds.csv'

append_train_compounds(file1_path, file2_path, output_path)

Concatenated compounds saved to round3_100_train_cmpds.csv


In [46]:
# Retrain the model using this training file.
csv_file_path = 'round3_100_train_cmpds.csv'
trained_gpr = train_gpr_model(csv_file_path)

In [47]:
# Remove the training set from the test set
# Save the remainder of compounds as the next round's test set
input_csv_path = 'round2_100_test_cmpds.csv'
compounds_to_remove_csv_path = 'round3_100_train_cmpds.csv'
output_csv_path = 'round3_100_test_cmpds.csv'

remove_train_compounds(input_csv_path, compounds_to_remove_csv_path, output_csv_path)


Compounds removed and remaining compounds saved to round3_100_test_cmpds.csv


In [48]:
# Predict on the test set
csv_file_to_predict = 'round3_100_test_cmpds.csv'
output_csv_file = 'round3_100_predicted_results.csv'
predict_and_save_results(trained_gpr, csv_file_to_predict, output_csv_file)

In [49]:
# Choose the next random subset of compounds from the test set
docked_file_path = 'round3_100_test_cmpds.csv'
output_csv_path = 'round4_100_cmpds.csv'
subset_size_to_select = 100  # Set the desired subset size

# Call the function
random_strategy(docked_file_path, output_csv_path, subset_size=subset_size_to_select)

ROUND 4

In [50]:
# Unblind the scores for these 100 randomly chosen compounds and save as training set.
input_csv_path = '../../../7nsw_all_hybrid.csv'
cmpds_csv_path = 'round4_100_cmpds.csv'
output_csv_path = 'round4_100_train_cmpds.csv'

select_scores(input_csv_path, cmpds_csv_path, output_csv_path)

Selected compounds saved to round4_100_train_cmpds.csv


In [51]:
# Extract the descriptors and append them to the training set.
compounds_csv_path = 'round4_100_train_cmpds.csv'
descriptors_csv_path = '../../../docked_ecfp.csv'

append_descriptors_to_csv(compounds_csv_path, descriptors_csv_path)

Descriptors appended to round4_100_train_cmpds.csv


In [52]:
# Concatenate the train sets together
file1_path = 'round3_100_train_cmpds.csv'
file2_path = 'round4_100_train_cmpds.csv'
output_path = 'round4_100_train_cmpds.csv'

append_train_compounds(file1_path, file2_path, output_path)

Concatenated compounds saved to round4_100_train_cmpds.csv


In [53]:
# Retrain the model using this training file.
csv_file_path = 'round4_100_train_cmpds.csv'
trained_gpr = train_gpr_model(csv_file_path)

In [54]:
# Remove the training set from the test set
# Save the remainder of compounds as the next round's test set
input_csv_path = 'round3_100_test_cmpds.csv'
compounds_to_remove_csv_path = 'round4_100_train_cmpds.csv'
output_csv_path = 'round4_100_test_cmpds.csv'

remove_train_compounds(input_csv_path, compounds_to_remove_csv_path, output_csv_path)


Compounds removed and remaining compounds saved to round4_100_test_cmpds.csv


In [55]:
# Predict on the test set
csv_file_to_predict = 'round4_100_test_cmpds.csv'
output_csv_file = 'round4_100_predicted_results.csv'
predict_and_save_results(trained_gpr, csv_file_to_predict, output_csv_file)

In [56]:
# Choose the next random subset of compounds from the test set
docked_file_path = 'round4_100_test_cmpds.csv'
output_csv_path = 'round5_100_cmpds.csv'
subset_size_to_select = 100  # Set the desired subset size

# Call the function
random_strategy(docked_file_path, output_csv_path, subset_size=subset_size_to_select)

ROUND 5

In [57]:
# Unblind the scores for these 100 randomly chosen compounds and save as training set.
input_csv_path = '../../../7nsw_all_hybrid.csv'
cmpds_csv_path = 'round5_100_cmpds.csv'
output_csv_path = 'round5_100_train_cmpds.csv'

select_scores(input_csv_path, cmpds_csv_path, output_csv_path)

Selected compounds saved to round5_100_train_cmpds.csv


In [58]:
# Extract the descriptors and append them to the training set.
compounds_csv_path = 'round5_100_train_cmpds.csv'
descriptors_csv_path = '../../../docked_ecfp.csv'

append_descriptors_to_csv(compounds_csv_path, descriptors_csv_path)

Descriptors appended to round5_100_train_cmpds.csv


In [59]:
# Concatenate the train sets together
file1_path = 'round4_100_train_cmpds.csv'
file2_path = 'round5_100_train_cmpds.csv'
output_path = 'round5_100_train_cmpds.csv'

append_train_compounds(file1_path, file2_path, output_path)

Concatenated compounds saved to round5_100_train_cmpds.csv


In [60]:
# Retrain the model using this training file.
csv_file_path = 'round5_100_train_cmpds.csv'
trained_gpr = train_gpr_model(csv_file_path)

In [61]:
# Remove the training set from the test set
# Save the remainder of compounds as the next round's test set
input_csv_path = 'round4_100_test_cmpds.csv'
compounds_to_remove_csv_path = 'round5_100_train_cmpds.csv'
output_csv_path = 'round5_100_test_cmpds.csv'

remove_train_compounds(input_csv_path, compounds_to_remove_csv_path, output_csv_path)


Compounds removed and remaining compounds saved to round5_100_test_cmpds.csv


In [62]:
# Predict on the test set
csv_file_to_predict = 'round5_100_test_cmpds.csv'
output_csv_file = 'round5_100_predicted_results.csv'
predict_and_save_results(trained_gpr, csv_file_to_predict, output_csv_file)

In [63]:
# Choose the next random subset of compounds from the test set
docked_file_path = 'round5_100_test_cmpds.csv'
output_csv_path = 'round6_100_cmpds.csv'
subset_size_to_select = 100  # Set the desired subset size

# Call the function
random_strategy(docked_file_path, output_csv_path, subset_size=subset_size_to_select)

ROUND 6

In [64]:
# Unblind the scores for these 100 randomly chosen compounds and save as training set.
input_csv_path = '../../../7nsw_all_hybrid.csv'
cmpds_csv_path = 'round6_100_cmpds.csv'
output_csv_path = 'round6_100_train_cmpds.csv'

select_scores(input_csv_path, cmpds_csv_path, output_csv_path)

Selected compounds saved to round6_100_train_cmpds.csv


In [65]:
# Extract the descriptors and append them to the training set.
compounds_csv_path = 'round6_100_train_cmpds.csv'
descriptors_csv_path = '../../../docked_ecfp.csv'

append_descriptors_to_csv(compounds_csv_path, descriptors_csv_path)

Descriptors appended to round6_100_train_cmpds.csv


In [66]:
# Concatenate the train sets together
file1_path = 'round5_100_train_cmpds.csv'
file2_path = 'round6_100_train_cmpds.csv'
output_path = 'round6_100_train_cmpds.csv'

append_train_compounds(file1_path, file2_path, output_path)

Concatenated compounds saved to round6_100_train_cmpds.csv


In [67]:
# Retrain the model using this training file.
csv_file_path = 'round6_100_train_cmpds.csv'
trained_gpr = train_gpr_model(csv_file_path)

In [68]:
# Remove the training set from the test set
# Save the remainder of compounds as the next round's test set
input_csv_path = 'round5_100_test_cmpds.csv'
compounds_to_remove_csv_path = 'round6_100_train_cmpds.csv'
output_csv_path = 'round6_100_test_cmpds.csv'

remove_train_compounds(input_csv_path, compounds_to_remove_csv_path, output_csv_path)


Compounds removed and remaining compounds saved to round6_100_test_cmpds.csv


In [69]:
# Predict on the test set
csv_file_to_predict = 'round6_100_test_cmpds.csv'
output_csv_file = 'round6_100_predicted_results.csv'
predict_and_save_results(trained_gpr, csv_file_to_predict, output_csv_file)

In [70]:
# Choose the next random subset of compounds from the test set
docked_file_path = 'round6_100_test_cmpds.csv'
output_csv_path = 'round7_100_cmpds.csv'
subset_size_to_select = 100  # Set the desired subset size

# Call the function
random_strategy(docked_file_path, output_csv_path, subset_size=subset_size_to_select)

ROUND 7

In [11]:
# Unblind the scores for these 100 randomly chosen compounds and save as training set.
input_csv_path = '../../../7nsw_all_hybrid.csv'
cmpds_csv_path = 'round7_100_cmpds.csv'
output_csv_path = 'round7_100_train_cmpds.csv'

select_scores(input_csv_path, cmpds_csv_path, output_csv_path)

Selected compounds saved to round7_100_train_cmpds.csv


In [12]:
# Extract the descriptors and append them to the training set.
compounds_csv_path = 'round7_100_train_cmpds.csv'
descriptors_csv_path = '../../../docked_ecfp.csv'

append_descriptors_to_csv(compounds_csv_path, descriptors_csv_path)

Descriptors appended to round7_100_train_cmpds.csv


In [13]:
# Concatenate the train sets together
file1_path = 'round6_100_train_cmpds.csv'
file2_path = 'round7_100_train_cmpds.csv'
output_path = 'round7_100_train_cmpds.csv'

append_train_compounds(file1_path, file2_path, output_path)

Concatenated compounds saved to round7_100_train_cmpds.csv


In [14]:
# Retrain the model using this training file.
csv_file_path = 'round7_100_train_cmpds.csv'
trained_gpr = train_gpr_model(csv_file_path)

In [15]:
# Remove the training set from the test set
# Save the remainder of compounds as the next round's test set
input_csv_path = 'round6_100_test_cmpds.csv'
compounds_to_remove_csv_path = 'round7_100_train_cmpds.csv'
output_csv_path = 'round7_100_test_cmpds.csv'

remove_train_compounds(input_csv_path, compounds_to_remove_csv_path, output_csv_path)


Compounds removed and remaining compounds saved to round7_100_test_cmpds.csv


In [16]:
# Predict on the test set
csv_file_to_predict = 'round7_100_test_cmpds.csv'
output_csv_file = 'round7_100_predicted_results.csv'
predict_and_save_results(trained_gpr, csv_file_to_predict, output_csv_file)

In [17]:
# Choose the next random subset of compounds from the test set
docked_file_path = 'round7_100_test_cmpds.csv'
output_csv_path = 'round8_100_cmpds.csv'
subset_size_to_select = 100  # Set the desired subset size

# Call the function
random_strategy(docked_file_path, output_csv_path, subset_size=subset_size_to_select)

ROUND 8

In [18]:
# Unblind the scores for these 100 randomly chosen compounds and save as training set.
input_csv_path = '../../../7nsw_all_hybrid.csv'
cmpds_csv_path = 'round8_100_cmpds.csv'
output_csv_path = 'round8_100_train_cmpds.csv'

select_scores(input_csv_path, cmpds_csv_path, output_csv_path)

Selected compounds saved to round8_100_train_cmpds.csv


In [19]:
# Extract the descriptors and append them to the training set.
compounds_csv_path = 'round8_100_train_cmpds.csv'
descriptors_csv_path = '../../../docked_ecfp.csv'

append_descriptors_to_csv(compounds_csv_path, descriptors_csv_path)

Descriptors appended to round8_100_train_cmpds.csv


In [20]:
# Concatenate the train sets together
file1_path = 'round7_100_train_cmpds.csv'
file2_path = 'round8_100_train_cmpds.csv'
output_path = 'round8_100_train_cmpds.csv'

append_train_compounds(file1_path, file2_path, output_path)

Concatenated compounds saved to round8_100_train_cmpds.csv


In [21]:
# Retrain the model using this training file.
csv_file_path = 'round8_100_train_cmpds.csv'
trained_gpr = train_gpr_model(csv_file_path)

In [22]:
# Remove the training set from the test set
# Save the remainder of compounds as the next round's test set
input_csv_path = 'round7_100_test_cmpds.csv'
compounds_to_remove_csv_path = 'round8_100_train_cmpds.csv'
output_csv_path = 'round8_100_test_cmpds.csv'

remove_train_compounds(input_csv_path, compounds_to_remove_csv_path, output_csv_path)


Compounds removed and remaining compounds saved to round8_100_test_cmpds.csv


In [23]:
# Predict on the test set
csv_file_to_predict = 'round8_100_test_cmpds.csv'
output_csv_file = 'round8_100_predicted_results.csv'
predict_and_save_results(trained_gpr, csv_file_to_predict, output_csv_file)

In [24]:
# Choose the next random subset of compounds from the test set
docked_file_path = 'round8_100_test_cmpds.csv'
output_csv_path = 'round9_100_cmpds.csv'
subset_size_to_select = 100  # Set the desired subset size

# Call the function
random_strategy(docked_file_path, output_csv_path, subset_size=subset_size_to_select)

ROUND 9

In [25]:
# Unblind the scores for these 100 randomly chosen compounds and save as training set.
input_csv_path = '../../../7nsw_all_hybrid.csv'
cmpds_csv_path = 'round9_100_cmpds.csv'
output_csv_path = 'round9_100_train_cmpds.csv'

select_scores(input_csv_path, cmpds_csv_path, output_csv_path)

Selected compounds saved to round9_100_train_cmpds.csv


In [26]:
# Extract the descriptors and append them to the training set.
compounds_csv_path = 'round9_100_train_cmpds.csv'
descriptors_csv_path = '../../../docked_ecfp.csv'

append_descriptors_to_csv(compounds_csv_path, descriptors_csv_path)

Descriptors appended to round9_100_train_cmpds.csv


In [27]:
# Concatenate the train sets together
file1_path = 'round8_100_train_cmpds.csv'
file2_path = 'round9_100_train_cmpds.csv'
output_path = 'round9_100_train_cmpds.csv'

append_train_compounds(file1_path, file2_path, output_path)

Concatenated compounds saved to round9_100_train_cmpds.csv


In [28]:
# Retrain the model using this training file.
csv_file_path = 'round9_100_train_cmpds.csv'
trained_gpr = train_gpr_model(csv_file_path)

In [29]:
# Remove the training set from the test set
# Save the remainder of compounds as the next round's test set
input_csv_path = 'round8_100_test_cmpds.csv'
compounds_to_remove_csv_path = 'round9_100_train_cmpds.csv'
output_csv_path = 'round9_100_test_cmpds.csv'

remove_train_compounds(input_csv_path, compounds_to_remove_csv_path, output_csv_path)


Compounds removed and remaining compounds saved to round9_100_test_cmpds.csv


In [None]:
# Predict on the test set
csv_file_to_predict = 'round9_100_test_cmpds.csv'
output_csv_file = 'round9_100_predicted_results.csv'
predict_and_save_results(trained_gpr, csv_file_to_predict, output_csv_file)

In [None]:
# Choose the next random subset of compounds from the test set
docked_file_path = 'round9_100_test_cmpds.csv'
output_csv_path = 'round10_100_cmpds.csv'
subset_size_to_select = 100  # Set the desired subset size

# Call the function
random_strategy(docked_file_path, output_csv_path, subset_size=subset_size_to_select)