In [1]:
# import packages
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from typing import List
from rdkit import DataStructs, Chem
from rdkit.Chem import MolFromSmiles, AllChem
from rdkit.DataStructs.cDataStructs import ExplicitBitVect
from sklearn import gaussian_process
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Kernel

I want to pull out the docking scores for the round0 compounds

In [2]:
def select_scores(input_csv_path, cmpds_csv_path, output_csv_path):
    """
    Looks up the scores of a list of compounds and saves them to a CSV file.

    Rows of the main CSV whose 'Name' also appears in the compounds CSV are
    kept (inner join on 'Name'); the 'Name', 'SMILES' and 'Score' columns of
    the main CSV are written to the output file without the index.

    Parameters:
    - input_csv_path (str): Path to the main CSV; must contain 'Name', 'SMILES' and 'Score'.
    - cmpds_csv_path (str): Path to the CSV listing the compounds to select; only its
      'Name' column is used.
    - output_csv_path (str): Path to the output CSV file.
    """
    # Load the main CSV file and the list of compounds to select
    all_compounds_df = pd.read_csv(input_csv_path)
    cmpds_df = pd.read_csv(cmpds_csv_path)

    # Join on the names only. Merging the full compounds frame would suffix
    # every shared column ('SMILES_x', 'Score_x', ...), so the selection below
    # would depend on which extra columns the compounds file happens to carry.
    merged_df = pd.merge(all_compounds_df, cmpds_df[['Name']], how='inner', on='Name')

    # Keep the identifier, structure and score taken from the main file
    selected_df = merged_df[['Name', 'SMILES', 'Score']]

    # Save the resulting DataFrame to a new CSV file
    selected_df.to_csv(output_csv_path, index=False)

    print(f"Selected compounds saved to {output_csv_path}")


In [6]:
# Round 0: look up the docking scores of the round-0 compounds in 7nsw_all_hybrid.csv
# and save Name/SMILES/Score as the round-0 training file.
input_csv_path = '7nsw_all_hybrid.csv'
cmpds_csv_path = 'ecfp_euclidean_set/round0_100_ecfp.csv'
output_csv_path = 'ecfp_euclidean_set/round0_100_train_cmpds.csv'

select_scores(input_csv_path, cmpds_csv_path, output_csv_path)

Selected compounds saved to ecfp_euclidean_set/round0_100_train_cmpds.csv


Then I would like to extract the descriptors and append them to the training file.

In [3]:
def append_descriptors_to_csv(compounds_csv_path, descriptors_csv_path):
    """
    Appends descriptor columns to a compounds CSV, matching rows on 'Name'.

    Every compound row is kept (left join). Descriptor columns are added after
    the existing compound columns; columns that the compounds file already has
    (e.g. 'SMILES') are not duplicated and keep the compounds file's values.
    The compounds CSV is overwritten in place.

    Parameters:
    - compounds_csv_path (str): Path to the compounds CSV; read, then overwritten.
    - descriptors_csv_path (str): Path to the CSV holding 'Name' plus descriptor columns.
    """
    # Read the compounds and descriptors CSV files
    compounds_df = pd.read_csv(compounds_csv_path)
    descriptors_df = pd.read_csv(descriptors_csv_path)

    # Bring over only the join key plus the columns the compounds file lacks.
    # This replaces the previous suffix-then-drop approach, which also removed
    # any genuine column whose name merely contained '_descriptor'.
    new_columns = [col for col in descriptors_df.columns
                   if col == 'Name' or col not in compounds_df.columns]

    # Left join so that compounds without descriptors are kept (with NaNs)
    merged_df = pd.merge(compounds_df, descriptors_df[new_columns], on='Name', how='left')

    # Save the updated DataFrame to the same CSV file
    merged_df.to_csv(compounds_csv_path, index=False)

    print(f"Descriptors appended to {compounds_csv_path}")

In [9]:
# Append the ecfp descriptor columns from docked_ecfp.csv to the round-0 training file
# (the training file is rewritten in place).
compounds_csv_path = 'ecfp_euclidean_set/round0_100_train_cmpds.csv'
descriptors_csv_path = 'docked_ecfp.csv'

append_descriptors_to_csv(compounds_csv_path, descriptors_csv_path)


Descriptors appended to ecfp_euclidean_set/round0_100_train_cmpds.csv


In [4]:
def count_headers(csv_file):
    """
    Counts the fields in the first row of a CSV file.

    Parameters:
    - csv_file (str): Path to the CSV file.

    Returns:
    - int: Number of fields in the first row, or 0 when the file has no rows
      or its first row is empty.
    """
    with open(csv_file, 'r') as handle:
        # next(..., None) yields None for an empty file instead of raising
        header = next(csv.reader(handle), None)

    return len(header) if header else 0

# Sanity check: count the columns of the round-0 training file after the
# descriptors have been appended.
csv_file_path = 'ecfp_euclidean_set/round0_100_train_cmpds.csv'
num_headers = count_headers(csv_file_path)

if num_headers > 0:
    print(f"The CSV file has {num_headers} headers.")
else:
    print("The CSV file is either empty or has no headers.")

The CSV file has 2051 headers.


Now we can start training our model using this training file.

_________________________________________________________________________________________________________

Set up Gaussian Process Regressor model

In [5]:
# Function to calculate the tanimoto similarity for the Gaussian process kernel prediction
def tanimoto_similarity(a, b):
    """Computes the Tanimoto similarity for all pairs.

    For each pair of rows the similarity is ab / (aa + bb - ab), where ab is
    the dot product of the two rows and aa, bb are their row sums.

    Args:
      a: Numpy array with shape [batch_size_a, num_features].
      b: Numpy array with shape [batch_size_b, num_features].

    Returns:
      Numpy array with shape [batch_size_a, batch_size_b]. Entries whose
      denominator is zero (for 0/1 fingerprints: both rows have no bits set)
      are defined as 1.0 instead of NaN.
    """
    aa = np.sum(a, axis=1, keepdims=True)
    bb = np.sum(b, axis=1, keepdims=True)
    ab = np.matmul(a, b.T)
    denominator = aa + bb.T - ab

    # A zero denominator would give 0/0 = NaN (plus a RuntimeWarning), and a
    # single NaN makes the whole kernel matrix unusable. Divide by 1 there and
    # overwrite the result: two empty fingerprints are treated as identical.
    undefined = denominator == 0
    similarity = np.true_divide(ab, np.where(undefined, 1, denominator))
    similarity[undefined] = 1.0
    return similarity

In [6]:
class TanimotoKernel(gaussian_process.kernels.NormalizedKernelMixin,
                     gaussian_process.kernels.StationaryKernelMixin,
                     gaussian_process.kernels.Kernel):
  """Gaussian process kernel whose value is the pairwise Tanimoto similarity.

  The kernel has no hyperparameters and does not provide gradients.
  """
  # NOTE(review): StationaryKernelMixin is inherited although Tanimoto
  # similarity is not a function of X - Y alone; confirm nothing relies on
  # is_stationary() before changing the bases.

  def __init__(self):
    """Takes no arguments; defined explicitly because get_params() requires it."""

  def __call__(self, X, Y=None, eval_gradient=False):  # pylint: disable=invalid-name
    """Evaluates the kernel between the rows of X and the rows of Y.

    Args:
      X: Numpy array with shape [batch_size_a, num_features].
      Y: Numpy array with shape [batch_size_b, num_features], or None to
        compare X with itself.
      eval_gradient: Must be falsy; gradients are not supported.

    Returns:
      Numpy array with shape [batch_size_a, batch_size_b].

    Raises:
      NotImplementedError: If eval_gradient is True.
    """
    if eval_gradient:
      raise NotImplementedError
    other = X if Y is None else Y
    return tanimoto_similarity(X, other)

Now I will read in my round0 train data using the ecfp fingerprints and the associated docking score. 

In [7]:
def train_gpr_model(csv_file_path):
    """
    Fits a Gaussian process regressor with the Tanimoto kernel on a training CSV.

    Parameters:
    - csv_file_path (str): Path to the training CSV. It must contain 'Name',
      'SMILES' and 'Score'; every other column is used as a feature.

    Returns:
    - GaussianProcessRegressor: The fitted model.
    """
    training_frame = pd.read_csv(csv_file_path)

    # Targets are the docking scores; features are all remaining columns
    # once the identifier, structure and score columns are removed.
    targets = training_frame['Score']
    features = np.array(training_frame.drop(columns=['Name', 'SMILES', 'Score']))

    # fit() returns the estimator itself, so the fitted model is returned directly
    model = GaussianProcessRegressor(kernel=TanimotoKernel())
    return model.fit(features, targets)

In [8]:
# Train the round-0 model on the round-0 training file.
csv_file_path = 'ecfp_euclidean_set/round0_100_train_cmpds.csv'
trained_gpr = train_gpr_model(csv_file_path)


Now that my model is trained on this data, we need to remove the training compounds from the master list (called "all_ecfp4.csv" here; the cell below reads `docked_ecfp.csv`) and save the remaining compounds to a new CSV.

In [9]:
def remove_train_compounds(input_csv_path, compounds_to_remove_csv_path, output_csv_path):
    """
    Writes a copy of a compounds CSV without the compounds named in another CSV.

    Parameters:
    - input_csv_path (str): Path to the CSV to filter; must contain a 'Name' column.
    - compounds_to_remove_csv_path (str): Path to the CSV whose 'Name' column lists
      the compounds to remove.
    - output_csv_path (str): Path to save the remaining compounds.
    """
    library = pd.read_csv(input_csv_path)
    names_to_drop = pd.read_csv(compounds_to_remove_csv_path)['Name']

    # Keep only the rows whose name is absent from the removal list
    keep_mask = ~library['Name'].isin(names_to_drop)
    library[keep_mask].to_csv(output_csv_path, index=False)

    print(f"Compounds removed and remaining compounds saved to {output_csv_path}")


In [10]:
# Remove the compounds picked in round 0 from the master descriptor file (docked_ecfp.csv)
# and save the remaining compounds as the round-0 test set.
input_csv_path = 'docked_ecfp.csv'
compounds_to_remove_csv_path = 'ecfp_euclidean_set/round0_100_train_cmpds.csv'
output_csv_path = 'ecfp_euclidean_set/round0_100_test_cmpds.csv'

remove_train_compounds(input_csv_path, compounds_to_remove_csv_path, output_csv_path)


Compounds removed and remaining compounds saved to ecfp_euclidean_set/round0_100_test_cmpds.csv


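I also want to save out the predictions (y_pred) and their standard deviations (sigma) to a csv file for sorting. The covariance (cov) is not computed or saved by the function below.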

In [11]:
def predict_and_save_results(gpr, csv_file, output_csv):
    """
    Predicts scores for the compounds in a CSV file and saves them with their uncertainty.

    Parameters:
    - gpr: Fitted model exposing predict(X, return_std=True), which returns the
      predicted means and their standard deviations.
    - csv_file (str): Path to the CSV to predict on. It must contain 'Name' and
      'SMILES'; every other column except an optional 'Score' is used as a feature.
    - output_csv (str): Path to the output CSV, written with the columns
      'Name', 'SMILES', 'Predicted_Score' and 'Uncertainty'.
    """
    # Load data from CSV
    data = pd.read_csv(csv_file)

    # Features are everything except the identifier and structure columns.
    # A 'Score' column, if the file carries one, is a label rather than a
    # feature (train_gpr_model excludes it too), so it is dropped when present.
    feature_frame = data.drop(columns=['Name', 'SMILES']).drop(columns=['Score'], errors='ignore')
    X_test = np.array(feature_frame)

    # Predict the mean and the standard deviation (not the full covariance)
    y_pred, sigma = gpr.predict(X_test, return_std=True)

    # Save identifiers together with the predictions and their uncertainty
    output_data = data[['Name', 'SMILES']].assign(Predicted_Score=y_pred, Uncertainty=sigma)
    output_data.to_csv(output_csv, index=False)

In [12]:
# Predict on the round-0 test set and save the predictions with their uncertainties.
csv_file_to_predict = 'ecfp_euclidean_set/round0_100_test_cmpds.csv'
output_csv_file = 'ecfp_euclidean_set/round0_100_predicted_results.csv'
predict_and_save_results(trained_gpr, csv_file_to_predict, output_csv_file)


In [13]:
!head -n 10 'ecfp_euclidean_set/round0_100_predicted_results.csv'

Name,SMILES,Predicted_Score,Uncertainty
NCGC00174717-02,[NH3+]Cc1c[nH]nc1-c1ccccc1,-8.760971334260105,0.9363772544332654
NCGC00305123-02,Cc1nnc2ccc(NCc3ccc(F)cc3)nn12,-8.276844428724662,0.9377730601317923
NCGC00048081-02,COc1ccccc1Cn1cnc2cccnc21,-8.858221919961371,0.9386759166851402
NCGC00019120-02,O=c1[nH]nc2n1CCCCC2,-7.01805490468938,0.9571019498888149
NCGC00054421-02,Cc1nnc2ccc(N3CCCC3)nn12,-8.02363231716192,0.951261887333017
NCGC00326711-02,CC[NH2+]Cc1ccc(-c2ccccc2)o1,-8.093806158380634,0.9427606245983814
NCGC00245606-02,Cc1nnc2ccc(N3CCOCC3)nn12,-8.115666878392155,0.9406714350304407
NCGC00112026-02,COc1ccc(CNC(=O)c2ccc3c(c2)N=C(C)c2c(C)ccc(C)c2S3)cc1,-8.694782244726067,0.9238847930737877
NCGC00338944-02,C[NH2+][C@H]1CCCc2ccccc21,-9.429385888176553,0.9442430926249455


In [14]:
def uncertain_strategy(predictions_file, output_csv, top_n=None):
    """
    Ranks predicted results by uncertainty, most uncertain first, and saves the top N.

    Parameters:
    - predictions_file (str): Path to the CSV file with predicted results; must
      contain an 'Uncertainty' column.
    - output_csv (str): Path to the output CSV file to save the selected compounds.
    - top_n (int or None): Number of top compounds to select. If None, all rows
      are saved in ranked order (default is None).
    """
    # Highest uncertainty first
    ranked = pd.read_csv(predictions_file).sort_values(by='Uncertainty', ascending=False)

    # None means "keep everything"; otherwise keep the first top_n rows
    selection = ranked if top_n is None else ranked.head(top_n)

    selection.to_csv(output_csv, index=False)

In [15]:
def greedy_strategy(predictions_file, output_csv, top_n=None):
    """
    Ranks predicted results by predicted score, lowest first, and saves the top N.

    Parameters:
    - predictions_file (str): Path to the CSV file with predicted results; must
      contain a 'Predicted_Score' column.
    - output_csv (str): Path to the output CSV file to save the selected compounds.
    - top_n (int or None): Number of top compounds to select. If None, all rows
      are saved in ranked order (default is None).
    """
    # Ascending sort: the lowest predicted score comes first
    ranked = pd.read_csv(predictions_file).sort_values(by='Predicted_Score', ascending=True)

    # None means "keep everything"; otherwise keep the first top_n rows
    selection = ranked if top_n is None else ranked.head(top_n)

    selection.to_csv(output_csv, index=False)

In [16]:
# Pick the compounds with the highest predicted uncertainty as the round-1 candidates.
# NOTE(review): top_n is read interactively (100 in the recorded run), so this
# cell cannot run unattended.
predictions_file = 'ecfp_euclidean_set/round0_100_predicted_results.csv'
output_csv_file = 'ecfp_euclidean_set/round1_100_cmpds.csv'
top_n = int(input("Enter the number of top compounds to select: "))
uncertain_strategy(predictions_file, output_csv_file, top_n=top_n)

Enter the number of top compounds to select:  100


In [17]:
def append_train_compounds(file1_path, file2_path, output_path):
    """
    Stacks the rows of two CSV files (first file on top) and saves them to a CSV file.

    Both inputs are read completely before anything is written, so
    output_path may be the same path as one of the inputs.

    Parameters:
    - file1_path (str): Path to the first CSV file.
    - file2_path (str): Path to the second CSV file.
    - output_path (str): Path to save the concatenated CSV file.
    """
    frames = [pd.read_csv(path) for path in (file1_path, file2_path)]

    # ignore_index renumbers the rows; the index is not written out anyway
    combined = pd.concat(frames, ignore_index=True)
    combined.to_csv(output_path, index=False)

    print(f"Concatenated compounds saved to {output_path}")

ROUND 1

Extract (unblind) the docking scores for these round1 compounds.

In [18]:
# Round 1: look up the docking scores of the round-1 compounds in 7nsw_all_hybrid.csv
# and save Name/SMILES/Score as the round-1 training file.
input_csv_path = '7nsw_all_hybrid.csv'
cmpds_csv_path = 'ecfp_euclidean_set/round1_100_cmpds.csv'
output_csv_path = 'ecfp_euclidean_set/round1_100_train_cmpds.csv'

select_scores(input_csv_path, cmpds_csv_path, output_csv_path)

Selected compounds saved to ecfp_euclidean_set/round1_100_train_cmpds.csv


Append the ecfp descriptors to the csv file above that has Name, SMILES and Score.

In [19]:
# Append the ecfp descriptor columns from docked_ecfp.csv to the round-1 training file
# (the training file is rewritten in place).
compounds_csv_path = 'ecfp_euclidean_set/round1_100_train_cmpds.csv'
descriptors_csv_path = 'docked_ecfp.csv'

append_descriptors_to_csv(compounds_csv_path, descriptors_csv_path)


Descriptors appended to ecfp_euclidean_set/round1_100_train_cmpds.csv


In [20]:
!wc -l ecfp_euclidean_set/round0_100_train_cmpds.csv

101 ecfp_euclidean_set/round0_100_train_cmpds.csv


In [21]:
!wc -l ecfp_euclidean_set/round1_100_train_cmpds.csv

101 ecfp_euclidean_set/round1_100_train_cmpds.csv


Append the new round of training compounds to the previous round's training set.

In [22]:
# Put the round-0 training set in front of the round-1 training compounds.
# NOTE: output_path is the same file as file2_path, so the round-1 file is
# overwritten with the cumulative set; re-running this cell would add the
# round-0 rows a second time.
file1_path = 'ecfp_euclidean_set/round0_100_train_cmpds.csv'
file2_path = 'ecfp_euclidean_set/round1_100_train_cmpds.csv'
output_path = 'ecfp_euclidean_set/round1_100_train_cmpds.csv'

append_train_compounds(file1_path, file2_path, output_path)

Concatenated compounds saved to ecfp_euclidean_set/round1_100_train_cmpds.csv


In [23]:
!wc -l ecfp_euclidean_set/round1_100_train_cmpds.csv

201 ecfp_euclidean_set/round1_100_train_cmpds.csv


Retrain the ML model on the combined training set, which now includes this new set of 100 compounds.

In [24]:
# Retrain on the cumulative training file (rounds 0-1).
csv_file_path = 'ecfp_euclidean_set/round1_100_train_cmpds.csv'
trained_gpr = train_gpr_model(csv_file_path)

In [25]:
!wc -l docked_ecfp.csv

41905 docked_ecfp.csv


In [26]:
!wc -l ecfp_euclidean_set/round0_100_test_cmpds.csv

41805 ecfp_euclidean_set/round0_100_test_cmpds.csv


Remove the training set from the test set.

In [27]:
# Remove every compound listed in the (cumulative) round-1 training file from the
# round-0 test set; the remainder is saved as the round-1 test set.
input_csv_path = 'ecfp_euclidean_set/round0_100_test_cmpds.csv'
compounds_to_remove_csv_path = 'ecfp_euclidean_set/round1_100_train_cmpds.csv'
output_csv_path = 'ecfp_euclidean_set/round1_100_test_cmpds.csv'

remove_train_compounds(input_csv_path, compounds_to_remove_csv_path, output_csv_path)


Compounds removed and remaining compounds saved to ecfp_euclidean_set/round1_100_test_cmpds.csv


In [28]:
!wc -l ecfp_euclidean_set/round1_100_test_cmpds.csv

41705 ecfp_euclidean_set/round1_100_test_cmpds.csv


Predict on the remaining test set.

In [29]:
# Predict on the round-1 test set with the retrained model and save the results.
csv_file_to_predict = 'ecfp_euclidean_set/round1_100_test_cmpds.csv'
output_csv_file = 'ecfp_euclidean_set/round1_100_predicted_results.csv'
predict_and_save_results(trained_gpr, csv_file_to_predict, output_csv_file)


Perform uncertain selection strategy on these predictions.

In [31]:
# Pick the compounds with the highest predicted uncertainty as the round-2 candidates.
# NOTE(review): top_n is read interactively (100 in the recorded run), so this
# cell cannot run unattended.
predictions_file = 'ecfp_euclidean_set/round1_100_predicted_results.csv'
output_csv_file = 'ecfp_euclidean_set/round2_100_cmpds.csv'
top_n = int(input("Enter the number of top compounds to select: "))
uncertain_strategy(predictions_file, output_csv_file, top_n=top_n)

Enter the number of top compounds to select:  100


ROUND 2

Unblind the scores for the round 2 compounds.

In [32]:
# Round 2: look up the docking scores of the round-2 compounds in 7nsw_all_hybrid.csv
# and save Name/SMILES/Score as the round-2 training file.
input_csv_path = '7nsw_all_hybrid.csv'
cmpds_csv_path = 'ecfp_euclidean_set/round2_100_cmpds.csv'
output_csv_path = 'ecfp_euclidean_set/round2_100_train_cmpds.csv'

select_scores(input_csv_path, cmpds_csv_path, output_csv_path)

Selected compounds saved to ecfp_euclidean_set/round2_100_train_cmpds.csv


Append the ecfp descriptors for model training.

In [34]:
# Append the ecfp descriptor columns from docked_ecfp.csv to the round-2 training file
# (the training file is rewritten in place).
compounds_csv_path = 'ecfp_euclidean_set/round2_100_train_cmpds.csv'
descriptors_csv_path = 'docked_ecfp.csv'

append_descriptors_to_csv(compounds_csv_path, descriptors_csv_path)


Descriptors appended to ecfp_euclidean_set/round2_100_train_cmpds.csv


In [35]:
# Put the cumulative round-1 training set in front of the round-2 training compounds.
# NOTE: output_path is the same file as file2_path, so the round-2 file is
# overwritten with the cumulative set; re-running this cell would add the
# earlier rows a second time.
file1_path = 'ecfp_euclidean_set/round1_100_train_cmpds.csv'
file2_path = 'ecfp_euclidean_set/round2_100_train_cmpds.csv'
output_path = 'ecfp_euclidean_set/round2_100_train_cmpds.csv'

append_train_compounds(file1_path, file2_path, output_path)

Concatenated compounds saved to ecfp_euclidean_set/round2_100_train_cmpds.csv


In [36]:
! wc -l ecfp_euclidean_set/round2_100_train_cmpds.csv

301 ecfp_euclidean_set/round2_100_train_cmpds.csv


Retrain the model.

In [37]:
# Retrain on the cumulative training file (rounds 0-2).
csv_file_path = 'ecfp_euclidean_set/round2_100_train_cmpds.csv'
trained_gpr = train_gpr_model(csv_file_path)

Remove the training compounds from the test set.

In [38]:
# Remove every compound listed in the (cumulative) round-2 training file from the
# round-1 test set; the remainder is saved as the round-2 test set.
input_csv_path = 'ecfp_euclidean_set/round1_100_test_cmpds.csv'
compounds_to_remove_csv_path = 'ecfp_euclidean_set/round2_100_train_cmpds.csv'
output_csv_path = 'ecfp_euclidean_set/round2_100_test_cmpds.csv'

remove_train_compounds(input_csv_path, compounds_to_remove_csv_path, output_csv_path)


Compounds removed and remaining compounds saved to ecfp_euclidean_set/round2_100_test_cmpds.csv


In [39]:
! wc -l ecfp_euclidean_set/round2_100_test_cmpds.csv

41605 ecfp_euclidean_set/round2_100_test_cmpds.csv


In [40]:
# Predict on the round-2 test set with the retrained model and save the results.
csv_file_to_predict = 'ecfp_euclidean_set/round2_100_test_cmpds.csv'
output_csv_file = 'ecfp_euclidean_set/round2_100_predicted_results.csv'

predict_and_save_results(trained_gpr, csv_file_to_predict, output_csv_file)


In [41]:
# Pick the compounds with the highest predicted uncertainty as the round-3 candidates.
# NOTE(review): top_n is read interactively (100 in the recorded run), so this
# cell cannot run unattended.
predictions_file = 'ecfp_euclidean_set/round2_100_predicted_results.csv'
output_csv_file = 'ecfp_euclidean_set/round3_100_cmpds.csv'
top_n = int(input("Enter the number of top compounds to select: "))
uncertain_strategy(predictions_file, output_csv_file, top_n=top_n)

Enter the number of top compounds to select:  100


ROUND 3

In [42]:
# Round 3: look up the docking scores of the round-3 compounds in 7nsw_all_hybrid.csv
# and save Name/SMILES/Score as the round-3 training file.
input_csv_path = '7nsw_all_hybrid.csv'
cmpds_csv_path = 'ecfp_euclidean_set/round3_100_cmpds.csv'
output_csv_path = 'ecfp_euclidean_set/round3_100_train_cmpds.csv'

select_scores(input_csv_path, cmpds_csv_path, output_csv_path)

Selected compounds saved to ecfp_euclidean_set/round3_100_train_cmpds.csv


In [43]:
# Append the ecfp descriptor columns from docked_ecfp.csv to the round-3 training file
# (the training file is rewritten in place).
compounds_csv_path = 'ecfp_euclidean_set/round3_100_train_cmpds.csv'
descriptors_csv_path = 'docked_ecfp.csv'

append_descriptors_to_csv(compounds_csv_path, descriptors_csv_path)


Descriptors appended to ecfp_euclidean_set/round3_100_train_cmpds.csv


In [44]:
# Put the cumulative round-2 training set in front of the round-3 training compounds.
# NOTE: output_path is the same file as file2_path, so the round-3 file is
# overwritten with the cumulative set; re-running this cell would add the
# earlier rows a second time.
file1_path = 'ecfp_euclidean_set/round2_100_train_cmpds.csv'
file2_path = 'ecfp_euclidean_set/round3_100_train_cmpds.csv'
output_path = 'ecfp_euclidean_set/round3_100_train_cmpds.csv'

append_train_compounds(file1_path, file2_path, output_path)

Concatenated compounds saved to ecfp_euclidean_set/round3_100_train_cmpds.csv


In [45]:
! wc -l ecfp_euclidean_set/round3_100_train_cmpds.csv

401 ecfp_euclidean_set/round3_100_train_cmpds.csv


In [46]:
# Retrain on the cumulative training file (rounds 0-3).
csv_file_path = 'ecfp_euclidean_set/round3_100_train_cmpds.csv'
trained_gpr = train_gpr_model(csv_file_path)

In [47]:
# Remove every compound listed in the (cumulative) round-3 training file from the
# round-2 test set; the remainder is saved as the round-3 test set.
input_csv_path = 'ecfp_euclidean_set/round2_100_test_cmpds.csv'
compounds_to_remove_csv_path = 'ecfp_euclidean_set/round3_100_train_cmpds.csv'
output_csv_path = 'ecfp_euclidean_set/round3_100_test_cmpds.csv'

remove_train_compounds(input_csv_path, compounds_to_remove_csv_path, output_csv_path)


Compounds removed and remaining compounds saved to ecfp_euclidean_set/round3_100_test_cmpds.csv


In [48]:
! wc -l ecfp_euclidean_set/round3_100_test_cmpds.csv

41505 ecfp_euclidean_set/round3_100_test_cmpds.csv


In [49]:
# Predict on the round-3 test set with the retrained model and save the results.
csv_file_to_predict = 'ecfp_euclidean_set/round3_100_test_cmpds.csv'
output_csv_file = 'ecfp_euclidean_set/round3_100_predicted_results.csv'

predict_and_save_results(trained_gpr, csv_file_to_predict, output_csv_file)


In [50]:
# Greedy selection: pick the compounds with the lowest predicted docking score
# as the round-4 candidates.
# NOTE(review): top_n is read interactively (100 in the recorded run), so this
# cell cannot run unattended.
predictions_file = 'ecfp_euclidean_set/round3_100_predicted_results.csv'
output_csv_file = 'ecfp_euclidean_set/round4_100_cmpds.csv'
top_n = int(input("Enter the number of top compounds to select: "))
greedy_strategy(predictions_file, output_csv_file, top_n=top_n)

Enter the number of top compounds to select:  100


ROUND 4

In [51]:
# Round 4: look up the docking scores of the round-4 compounds in 7nsw_all_hybrid.csv
# and save Name/SMILES/Score as the round-4 training file.
input_csv_path = '7nsw_all_hybrid.csv'
cmpds_csv_path = 'ecfp_euclidean_set/round4_100_cmpds.csv'
output_csv_path = 'ecfp_euclidean_set/round4_100_train_cmpds.csv'

select_scores(input_csv_path, cmpds_csv_path, output_csv_path)

Selected compounds saved to ecfp_euclidean_set/round4_100_train_cmpds.csv


In [52]:
# Append the ecfp descriptor columns from docked_ecfp.csv to the round-4 training file
# (the training file is rewritten in place).
compounds_csv_path = 'ecfp_euclidean_set/round4_100_train_cmpds.csv'
descriptors_csv_path = 'docked_ecfp.csv'

append_descriptors_to_csv(compounds_csv_path, descriptors_csv_path)


Descriptors appended to ecfp_euclidean_set/round4_100_train_cmpds.csv


In [53]:
# Put the cumulative round-3 training set in front of the round-4 training compounds.
# NOTE: output_path is the same file as file2_path, so the round-4 file is
# overwritten with the cumulative set; re-running this cell would add the
# earlier rows a second time.
file1_path = 'ecfp_euclidean_set/round3_100_train_cmpds.csv'
file2_path = 'ecfp_euclidean_set/round4_100_train_cmpds.csv'
output_path = 'ecfp_euclidean_set/round4_100_train_cmpds.csv'

append_train_compounds(file1_path, file2_path, output_path)

Concatenated compounds saved to ecfp_euclidean_set/round4_100_train_cmpds.csv


In [54]:
! wc -l ecfp_euclidean_set/round4_100_train_cmpds.csv

501 ecfp_euclidean_set/round4_100_train_cmpds.csv


In [55]:
# Retrain on the cumulative training file (rounds 0-4).
csv_file_path = 'ecfp_euclidean_set/round4_100_train_cmpds.csv'
trained_gpr = train_gpr_model(csv_file_path)

In [56]:
# Remove every compound listed in the (cumulative) round-4 training file from the
# round-3 test set; the remainder is saved as the round-4 test set.
input_csv_path = 'ecfp_euclidean_set/round3_100_test_cmpds.csv'
compounds_to_remove_csv_path = 'ecfp_euclidean_set/round4_100_train_cmpds.csv'
output_csv_path = 'ecfp_euclidean_set/round4_100_test_cmpds.csv'

remove_train_compounds(input_csv_path, compounds_to_remove_csv_path, output_csv_path)


Compounds removed and remaining compounds saved to ecfp_euclidean_set/round4_100_test_cmpds.csv


In [57]:
# Predict on the round-4 test set with the retrained model and save the results.
csv_file_to_predict = 'ecfp_euclidean_set/round4_100_test_cmpds.csv'
output_csv_file = 'ecfp_euclidean_set/round4_100_predicted_results.csv'

predict_and_save_results(trained_gpr, csv_file_to_predict, output_csv_file)


In [58]:
# Greedy selection: pick the compounds with the lowest predicted docking score
# as the round-5 candidates.
# NOTE(review): top_n is read interactively (100 in the recorded run), so this
# cell cannot run unattended.
predictions_file = 'ecfp_euclidean_set/round4_100_predicted_results.csv'
output_csv_file = 'ecfp_euclidean_set/round5_100_cmpds.csv'
top_n = int(input("Enter the number of top compounds to select: "))
greedy_strategy(predictions_file, output_csv_file, top_n=top_n)

Enter the number of top compounds to select:  100
