In [1]:
import numpy as np
import pandas as pd
import rdkit
import os
from new_header_keys import rxn_vars, categorical_mappings

In [2]:
# Read the per-reaction descriptor table (first CSV column becomes the index)
# and discard rows that are exact duplicates of an earlier row.
all_info = (
    pd.read_csv('Reaction_Data_Inputs_021424.csv', index_col=0)
    .drop_duplicates()
)
all_info

Unnamed: 0,Rxn_Name,index,SMILES,MW,IE,AMW,frac_sp3,heavy_atom,NH_OH,aromatic_heter,...,Ion,Form 2D Perovskites?,Metal Ionization Energy (kJ/mol),Metal Electron Affinity (kJ/mol),Metal Pearson Hardness (eV),Metal Mulliken Electronegativity (eV; ionic),Metal Pauling Electronegativity (atomic),No.Electrons,Valence d electron count,Ionic Radius
0,Mn++ + Cl + CSCCN,19,CSCCN,0,0,0,1.0,5,2,0,...,Mn++,1,3248.0,1509.0,9.011586,24.651015,1.55,23,5,0.97
1,Mn++ + Cl + CSCCCN,19,CSCCCN,0,0,0,1.0,6,2,0,...,Mn++,1,3248.0,1509.0,9.011586,24.651015,1.55,23,5,0.97
2,Mn++ + Cl + C1CSCCN1,19,C1CSCCN1,0,0,0,1.0,6,1,0,...,Mn++,1,3248.0,1509.0,9.011586,24.651015,1.55,23,5,0.97
3,Mn++ + Cl + NCC1CCSCC1,19,NCC1CCSCC1,144,8,8,1.0,8,2,0,...,Mn++,1,3248.0,1509.0,9.011586,24.651015,1.55,23,5,0.97
4,Mn++ + Cl + COCCN,19,COCCN,0,0,0,1.0,5,2,0,...,Mn++,1,3248.0,1509.0,9.011586,24.651015,1.55,23,5,0.97
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2074,Pb++ + I + CN,43,CN,0,0,0,1.0,2,2,0,...,Pb++,1,3081.5,1450.5,8.451925,23.485053,2.33,80,10,1.33
2075,Pb++ + I + N,43,N,137,8,7,0.0,1,3,0,...,Pb++,1,3081.5,1450.5,8.451925,23.485053,2.33,80,10,1.33
2076,Pb++ + I + NCCCCCN,43,NCCCCCN,137,8,7,1.0,7,4,0,...,Pb++,1,3081.5,1450.5,8.451925,23.485053,2.33,80,10,1.33
2077,Pb++ + I + Nc1ccc(-c2ccccc2)cc1,43,Nc1ccc(-c2ccccc2)cc1,137,8,7,0.0,13,2,0,...,Pb++,1,3081.5,1450.5,8.451925,23.485053,2.33,80,10,1.33


In [3]:
# Min-max scale every non-categorical reaction variable of `all_info` in place:
# scaled = (value - column minimum) / (column maximum - column minimum).
for (data_type, field_name) in rxn_vars.values():
    if data_type == "categorical":
        # Categorical fields are not rescaled here.
        continue
    column = all_info[field_name]
    lowest = column.min()
    value_range = column.max() - lowest
    if value_range == 0:
        # A constant column would otherwise compute 0 / 0 and turn the whole
        # feature into NaN; dividing by 1 maps every entry to 0.0 instead.
        value_range = 1
    all_info[field_name] = (column - lowest) / value_range
    

In [4]:
# Load the labelled experiments, keeping only the four columns used below.
dataset = pd.read_csv("2D_Dataset.csv")[["Metal", "Halide", "Ligand", "Type"]]

# Discard rows with any missing value, then rows whose Type equals 0.
dataset = dataset.dropna(how="any", axis=0)
dataset = dataset.loc[dataset["Type"] != 0.0]
dataset

Unnamed: 0,Metal,Halide,Ligand,Type
0,Pb++,I,NCCCC,100
1,Pb++,Br,NCCCC,100
2,Pb++,I,NCCCCC,100
3,Pb++,I,NCCCCCC,100
4,Pb++,I,NCCCCCCC,100
...,...,...,...,...
288,Pb++,I,NCCCc1ncc[nH]1,110
289,Pb++,Cl,c1cncc[n+]1,110
290,Cu++,Br,NCC[S](O)(=O)=O,100
291,Pb++,Br,CNCCN,110


In [5]:
# Pair each labelled experiment in `dataset` with its descriptor row in
# `all_info`, looked up through the "Rxn_Name" column.
#   all_rows: one single-row DataFrame per matched experiment
#   all_y:    1 when the experiment's Type equals 110, otherwise 0
all_rows = []
all_y = []
for (index, row) in dataset.iterrows():
    metal = row["Metal"]
    halide = row["Halide"]
    ligand = row["Ligand"]
    phase = row["Type"]
    # Canonicalize the ligand SMILES before building the lookup key.
    ligand = rdkit.Chem.CanonSmiles(ligand)
    rxn_name = metal + " + " + halide + " + " + ligand
    all_info_row = all_info.loc[all_info['Rxn_Name'] == rxn_name]
    # Explicit check instead of try/assert: assert statements are removed when
    # Python runs with -O, which would let unmatched or ambiguous lookups
    # through and leave all_rows and all_y with different lengths.
    if len(all_info_row) != 1:
        # Report the experiment that has zero or several descriptor rows and
        # skip it, so features and labels stay aligned.
        print(rxn_name)
        print(all_info_row)
        print(len(all_info_row))
        print()
        continue
    all_rows.append(all_info_row)
    all_y.append(int(phase == 110.0))
all_y = np.array(all_y)

In [6]:
# Stack the matched single-row frames vertically into one feature table.
all_x = pd.concat(objs=all_rows, axis="index")

In [7]:
# Turn every row of `all_x` into a flat feature list: label-encoded
# categorical variables first, then the remaining (quantitative) variables,
# both in the iteration order of `rxn_vars`.
all_features = []
all_feature_names = []
all_qual_feature_num = []
# The two lists below are never filled in this cell; they are only written
# out (empty) by the save cell.
all_precursor_names = []
all_amounts = []

for (index, row) in all_x.iterrows():

    qual_features = []
    qual_feature_names = []
    qual_feature_num = []

    quant_features = []
    quant_feature_names = []

    for (key, items) in rxn_vars.items():
        # items[0] is the variable's data type, items[1] its column name.
        data_point = row[items[1]]
        if items[0] == "categorical":
            categorical_mapping = categorical_mappings[key]
            # Raises KeyError if the value has no entry in the mapping.
            class_label = categorical_mapping[data_point]
            num_classes = len(categorical_mapping)
            qual_features.append(class_label)
            qual_feature_names.append(key)
            qual_feature_num.append(num_classes)

        else:
            quant_features.append(data_point)
            quant_feature_names.append(key)
    all_features.append(qual_features + quant_features)
    # Names and class counts are rebuilt for every row; the save cell keeps
    # only the last entry of each list.
    all_feature_names.append(qual_feature_names + quant_feature_names)
    all_qual_feature_num.append(qual_feature_num)

# Convert once after the loop; the original re-ran this conversion on every
# iteration because the statement was indented into the loop body.
all_precursor_names = np.array(all_precursor_names)

In [8]:
# Create the output folder used by the save cell; no error if it already exists.
os.makedirs(name="data", exist_ok=True)

In [9]:
# Sanity check before saving: True if ANY feature row contains a NaN.
# The previous version inspected only all_features[0], so NaNs in later rows
# went unnoticed (and an empty list raised IndexError).
np.any(np.isnan(all_features))

False

In [10]:
# Persist the assembled arrays under ./data, one .npy file each.
np.save(file="./data/all_features.npy", arr=all_features)
# Only the last row's name list / class-count list is written out.
np.save(file="./data/all_feature_names.npy", arr=all_feature_names[-1])
np.save(file="./data/all_qual_input_dims.npy", arr=all_qual_feature_num[-1])
np.save(file="./data/all_precursor_names.npy", arr=all_precursor_names)
np.save(file="./data/all_amounts.npy", arr=all_amounts)
np.save(file="./data/all_y.npy", arr=all_y)