In [1]:
import os
import sys
import numpy as np
import pandas as pd
import rdkit

sys.path.append("../")

from src.utils.constants import DATA_DIR, RXN_VARS, CATEGORICAL_MAPPINGS

In [2]:
# Candidate combinations (space we are exploring)
candidate_reactions_path = os.path.join(DATA_DIR, "Reaction_Data_Inputs_021424.csv")
candidate_reactions = pd.read_csv(candidate_reactions_path, index_col=0)
candidate_reactions = candidate_reactions.drop_duplicates()

In [3]:
# Attempted combinations (space we have already explored, is a subset of candidate combinations)
attempted_reactions_path = os.path.join(DATA_DIR, "2D_Dataset.csv")
attempted_reactions = pd.read_csv(attempted_reactions_path)
attempted_reactions = attempted_reactions[["Metal", "Halide", "Ligand", "Type"]]

# Drop rows with NaN values
attempted_reactions = attempted_reactions.dropna(inplace=False, how="any", axis=0)

# Drop rows with Type = 0.0
attempted_reactions = attempted_reactions[attempted_reactions["Type"] != 0.0]
attempted_reactions = attempted_reactions.drop_duplicates()

In [4]:
# Change column name of attempted_reactions to match candidate_reactions
attempted_reactions = attempted_reactions.rename(
    columns={"Metal": "Ion", "Ligand": "SMILES"}
)

In [5]:
# Add Rxn_Name column to attempted_reactions
attempted_reactions.insert(0, "Rxn_Name", "")

# Add all other columns to attempted_reactions that is not in candidate_reactions
for col in candidate_reactions.columns:
    if col not in attempted_reactions.columns:
        attempted_reactions[col] = np.nan

In [6]:
# Fill in Rxn_Name of attempted_reactions
for i, row in attempted_reactions.iterrows():
    metal = row["Ion"]
    halide = row["Halide"]
    ligand = row["SMILES"]

    # Convert SMILES to canonical form
    ligand = rdkit.Chem.CanonSmiles(ligand)

    # Get reaction name to retrieve info from candidate reactions
    rxn_name = metal + " + " + halide + " + " + ligand
    attempted_reactions.at[i, "Rxn_Name"] = rxn_name

In [7]:
# Fill in the rest of the columns of attempted_reactions with info from candidate_reactions
for i, row in attempted_reactions.iterrows():
    rxn_name = row["Rxn_Name"]
    candidate_row = candidate_reactions[candidate_reactions["Rxn_Name"] == rxn_name]

    for col in candidate_row.columns:
        attempted_reactions.at[i, col] = candidate_row[col].values[0]

In [8]:
labelled_reactions = attempted_reactions
unlabelled_reactions = candidate_reactions[~candidate_reactions["Rxn_Name"].isin(labelled_reactions["Rxn_Name"])]
total_reactions = candidate_reactions

In [9]:
unlabelled_reactions = unlabelled_reactions.drop_duplicates()
labelled_reactions = labelled_reactions.drop_duplicates()
total_reactions = total_reactions.drop_duplicates()

In [10]:
print(labelled_reactions.shape, unlabelled_reactions.shape, total_reactions.shape)

(272, 51) (1664, 50) (1935, 50)


In [11]:
def pandas_to_numpy_data(labelled_reactions, unlabelled_reactions, total_reactions):
    labelled_X_quant = []
    labelled_X_qual = []

    unlabelled_X_quant = []
    unlabelled_X_qual = []

    X_quant_labels = []
    X_qual_labels = []

    # 'feature' is the name used in src.utils.constants, 'feature_name' is the name used in the dataframe 
    for feature, (feature_type, feature_name) in RXN_VARS.items():
        if feature_type == "discrete" or feature_type == "continuous":
            # Normalize quantitative features to [0, 1]
            min_value = total_reactions[feature_name].min()
            max_value = total_reactions[feature_name].max()

            labelled_feature_values = labelled_reactions[feature_name].values 
            unlabelled_feature_values = unlabelled_reactions[feature_name].values
            
            labelled_feature_values = (labelled_feature_values - min_value) / (max_value - min_value)
            unlabelled_feature_values = (unlabelled_feature_values - min_value) / (max_value - min_value)
    
            labelled_X_quant.append(labelled_feature_values)
            unlabelled_X_quant.append(unlabelled_feature_values)
            X_quant_labels.append(feature_name)

        else: # feature_type == "categorical"
            labelled_feature_values = labelled_reactions[feature_name].values
            unlabelled_feature_values = unlabelled_reactions[feature_name].values

            labelled_feature_values = [CATEGORICAL_MAPPINGS[feature][val] for val in labelled_feature_values]
            unlabelled_feature_values = [CATEGORICAL_MAPPINGS[feature][val] for val in unlabelled_feature_values]
            
            labelled_X_qual.append(labelled_feature_values)
            unlabelled_X_qual.append(unlabelled_feature_values)
            X_qual_labels.append(feature_name)


    labelled_X = labelled_X_qual + labelled_X_quant
    unlabelled_X = unlabelled_X_qual + unlabelled_X_quant

    X_labels = X_qual_labels + X_quant_labels

    return labelled_X, unlabelled_X, X_labels

In [12]:
pandas_to_numpy_data(labelled_reactions, unlabelled_reactions, total_reactions)

[2, 0, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 0, 0, 2, 2, 0, 1, 0, 2, 0, 0, 0, 1, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 2, 2, 2, 1, 0, 2, 2, 2, 2, 0, 1, 1, 0, 0, 0, 2, 0, 2, 2, 2, 2, 2, 2, 0, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 0, 0, 1, 1, 2, 2, 0, 2, 2, 2, 2, 0, 2, 2, 2, 0, 0, 2, 0, 1, 2, 2, 2, 1, 0, 2, 0, 1, 1, 2, 1, 0, 1, 0, 0, 2, 2, 1, 2, 1, 1, 0, 1, 2, 0, 2, 0, 2, 2, 2, 2, 1, 0, 2, 2, 2, 2, 2, 2, 2, 2, 1, 0, 0, 1, 2, 2, 2, 2, 2, 0, 0, 0, 1, 2, 2, 2, 2, 2, 0, 1, 0, 2, 0, 2, 2, 2, 2, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 0, 1, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 0, 1, 1, 0, 1, 0, 0, 2, 1, 0, 0, 2]
[0.88888889 0.88888889 0.88888889 0.88888889 0.88888889 0.88888889
 0.88888889 0.88888889 0.88888889 0.88888889 0.88888889 0.88888889
 0.88888889 0.88888889 0.88888889 0.88888889 0.88