In [1]:
import os
import sys
import numpy as np
import pandas as pd
import rdkit.Chem

sys.path.append("../")
sys.path.append(os.getcwd())

from src.utils.constants import DATA_DIR, RXN_VARS, CATEGORICAL_MAPPINGS

In [2]:
# Locations of the two input tables inside DATA_DIR.
candidate_reactions_path = os.path.join(DATA_DIR, "Reaction_Data_Inputs_021424.csv")
attempted_reactions_path = os.path.join(DATA_DIR, "2D_Dataset.csv")

# Candidate combinations: the space being explored (first CSV column is used as the index).
candidate_reactions = pd.read_csv(candidate_reactions_path, index_col=0)

# Attempted combinations: the part of that space that has already been explored.
attempted_reactions = pd.read_csv(attempted_reactions_path)

In [3]:
# Remove rows that are exact duplicates of an earlier row, then display the
# de-duplicated candidate table as the cell output.
candidate_reactions = candidate_reactions.drop_duplicates()
candidate_reactions

Unnamed: 0,Rxn_Name,index,SMILES,MW,IE,AMW,frac_sp3,heavy_atom,NH_OH,aromatic_heter,...,Ion,Form 2D Perovskites?,Metal Ionization Energy (kJ/mol),Metal Electron Affinity (kJ/mol),Metal Pearson Hardness (eV),Metal Mulliken Electronegativity (eV; ionic),Metal Pauling Electronegativity (atomic),No.Electrons,Valence d electron count,Ionic Radius
0,Mn++ + Cl + CSCCN,19,CSCCN,0,0,0,1.0,5,2,0,...,Mn++,1,3248.0,1509.0,9.011586,24.651015,1.55,23,5,0.97
1,Mn++ + Cl + CSCCCN,19,CSCCCN,0,0,0,1.0,6,2,0,...,Mn++,1,3248.0,1509.0,9.011586,24.651015,1.55,23,5,0.97
2,Mn++ + Cl + C1CSCCN1,19,C1CSCCN1,0,0,0,1.0,6,1,0,...,Mn++,1,3248.0,1509.0,9.011586,24.651015,1.55,23,5,0.97
3,Mn++ + Cl + NCC1CCSCC1,19,NCC1CCSCC1,144,8,8,1.0,8,2,0,...,Mn++,1,3248.0,1509.0,9.011586,24.651015,1.55,23,5,0.97
4,Mn++ + Cl + COCCN,19,COCCN,0,0,0,1.0,5,2,0,...,Mn++,1,3248.0,1509.0,9.011586,24.651015,1.55,23,5,0.97
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2074,Pb++ + I + CN,43,CN,0,0,0,1.0,2,2,0,...,Pb++,1,3081.5,1450.5,8.451925,23.485053,2.33,80,10,1.33
2075,Pb++ + I + N,43,N,137,8,7,0.0,1,3,0,...,Pb++,1,3081.5,1450.5,8.451925,23.485053,2.33,80,10,1.33
2076,Pb++ + I + NCCCCCN,43,NCCCCCN,137,8,7,1.0,7,4,0,...,Pb++,1,3081.5,1450.5,8.451925,23.485053,2.33,80,10,1.33
2077,Pb++ + I + Nc1ccc(-c2ccccc2)cc1,43,Nc1ccc(-c2ccccc2)cc1,137,8,7,0.0,13,2,0,...,Pb++,1,3081.5,1450.5,8.451925,23.485053,2.33,80,10,1.33


In [4]:
# Min-max normalize continuous and discrete variables to [0, 1].
# Each RXN_VARS value is unpacked as a (data_type, field_name) pair; categorical
# fields are left untouched here.
for (data_type, field_name) in RXN_VARS.values():
    if data_type == "categorical":
        continue

    column = candidate_reactions[field_name]
    col_min = column.min()
    col_range = column.max() - col_min

    if col_range == 0:
        # Constant column: (x - min) / (max - min) would be 0/0 and turn every
        # value into NaN. Map the constant to 0.0 instead; subtracting the
        # minimum keeps genuinely missing entries as NaN rather than hiding them.
        candidate_reactions[field_name] = (column - col_min).astype(float)
    else:
        candidate_reactions[field_name] = (column - col_min) / col_range

In [5]:
# Keep only the columns needed to identify a reaction and its outcome, discard
# rows with any missing value, and drop rows whose Type is 0.0.
attempted_reactions = (
    attempted_reactions[["Metal", "Halide", "Ligand", "Type"]]
    .dropna(axis=0, how="any")
    .loc[lambda frame: frame["Type"] != 0.0]
)

In [12]:
# Display the filtered attempted-reactions table for inspection.
attempted_reactions

Unnamed: 0,Metal,Halide,Ligand,Type
0,Pb++,I,NCCCC,100
1,Pb++,Br,NCCCC,100
2,Pb++,I,NCCCCC,100
3,Pb++,I,NCCCCCC,100
4,Pb++,I,NCCCCCCC,100
...,...,...,...,...
288,Pb++,I,NCCCc1ncc[nH]1,110
289,Pb++,Cl,c1cncc[n+]1,110
290,Cu++,Br,NCC[S](O)(=O)=O,100
291,Pb++,Br,CNCCN,110


In [6]:
# Prepare dataset for training.
# For every attempted reaction, look up its row in `candidate_reactions` by
# reaction name and record a binary label. `all_rows`, `all_y` and
# `reaction_ids` are filled together so that entry i of each refers to the
# same reaction.
all_rows = []
all_y = []
reaction_ids = []
for (index, row) in attempted_reactions.iterrows():
    metal = row["Metal"]
    halide = row["Halide"]
    ligand = row["Ligand"]
    phase = row["Type"]

    # Convert SMILES to canonical form so it matches the candidate table's names
    ligand = rdkit.Chem.CanonSmiles(ligand)

    # Get reaction name to retrieve info from candidate reactions
    rxn_name = metal + " + " + halide + " + " + ligand
    all_info_row = candidate_reactions.loc[candidate_reactions['Rxn_Name'] == rxn_name]

    # The reaction must match exactly one candidate row. Zero matches (unknown
    # reaction) or several matches (duplicates) are reported and skipped.
    if len(all_info_row) != 1:
        print(rxn_name)
        print(all_info_row)
        print(len(all_info_row))
        print()
        continue

    all_rows.append(all_info_row)
    # Label is 1 when Type equals 110, otherwise 0
    all_y.append(int(phase == 110.0))
    # Record the id only for reactions that are actually kept; appending it
    # before the check above would leave reaction_ids longer than X and Y.
    reaction_ids.append(rxn_name)

all_x = pd.concat(all_rows, axis=0)
all_y = np.array(all_y)

# Features, labels and ids must stay aligned row-for-row.
assert len(all_x) == len(all_y) == len(reaction_ids)

In [7]:
# Build one feature vector per reaction. Categorical variables are replaced by
# their class label from CATEGORICAL_MAPPINGS and placed first; all remaining
# variables follow unchanged. Feature names and the class count of each
# categorical variable are recorded alongside every row.
all_features = []
all_feature_names = []
all_qual_feature_num = []

for (_, rxn_row) in all_x.iterrows():
    cat_values, cat_names, cat_sizes = [], [], []
    num_values, num_names = [], []

    for (var_name, var_spec) in RXN_VARS.items():
        # var_spec[0] is the data type, var_spec[1] the column holding the value
        value = rxn_row[var_spec[1]]

        if var_spec[0] != "categorical":
            num_values.append(value)
            num_names.append(var_name)
            continue

        mapping = CATEGORICAL_MAPPINGS[var_name]
        cat_values.append(mapping[value])
        cat_names.append(var_name)
        cat_sizes.append(len(mapping))

    all_features.append(cat_values + num_values)
    all_feature_names.append(cat_names + num_names)
    all_qual_feature_num.append(cat_sizes)

In [8]:
# Sanity check: print the type of `all_y`.
print(type(all_y))

<class 'numpy.ndarray'>


In [9]:
# Validate the entire feature matrix, not only its first row: a NaN in any
# row would otherwise be written to the dataset unnoticed.
nan_mask = np.isnan(np.asarray(all_features, dtype=float))
assert not nan_mask.any(), f"{int(nan_mask.sum())} NaN value(s) found in all_features"

In [10]:
 # Save data dict to numpy file
dataset_path = os.path.join(DATA_DIR, "dataset.npz")
print("Saving dataset to", dataset_path)

# Arrays written to the archive, keyed by the name they are stored under.
arrays_to_save = {
    "X": all_features,
    "Y": all_y,
    "X_labels": all_feature_names,
    "X_qual_num_classes": all_qual_feature_num,
    "reaction_ids": reaction_ids,
}
np.savez(dataset_path, **arrays_to_save)

Saving dataset to /Users/junhalee/Desktop/perovskite-phase-pred/data/dataset.npz


In [11]:
# Reload the saved archive and inspect its contents as a round-trip check.
npz_file = np.load(os.path.join(DATA_DIR, "dataset.npz"))
print(npz_file.files)

# Show the container type of the stored feature matrix, then the matrix itself.
print(type(npz_file['X']))
saved_features = npz_file['X']
print(saved_features)

['X', 'Y', 'X_labels', 'X_qual_num_classes', 'reaction_ids']
<class 'numpy.ndarray'>
[[2.         2.         0.88888889 ... 1.         1.         1.        ]
 [0.         2.         0.88888889 ... 1.         1.         1.        ]
 [2.         2.         0.88888889 ... 1.         1.         1.        ]
 ...
 [0.         0.         0.         ... 0.44871795 0.07017544 0.8       ]
 [0.         2.         0.88888889 ... 1.         1.         1.        ]
 [2.         2.         0.88888889 ... 1.         1.         1.        ]]
