In [88]:
import os
# NOTE(review): this binds the pathlib *module* to the name `Path`;
# `from pathlib import Path` was probably intended — confirm before using it.
import pathlib as Path
import re

import joblib
import numpy as np
import pandas as pd
import selfies as sf
from sklearn.preprocessing import StandardScaler

In [89]:
# Location of the dataset CSV. Set the MOF_DATASET_CSV environment variable to
# run the notebook on another machine; the original absolute path is kept as
# the fallback so existing setups keep working unchanged.
MOF_DATASET_CSV = os.environ.get(
    "MOF_DATASET_CSV",
    r"C:\Users\dalja\OneDrive\Desktop\APS360 Project\MOF_CORE_final_dataset.csv",
)
mof_dataset = pd.read_csv(MOF_DATASET_CSV)

In [90]:
# Show the column names available in the loaded dataset.
mof_dataset.columns

Index(['MOFid', 'metal', 'linker', 'logKH_CO2', 'pure_uptake_CO2_298.00_15000',
       'pure_uptake_methane_298.00_6500000', 'LCD', 'PLD', 'LFPD', 'cm3_g',
       'ASA_m2_cm3', 'ASA_m2_g', 'NASA_m2_cm3', 'NASA_m2_g', 'AV_VF',
       'AV_cm3_g', 'NAV_cm3_g', 'All_Metals', 'Has_OMS', 'Open_Metal_Sites'],
      dtype='object')

In [91]:
# Column groups used to split the dataset into geometry features,
# precursor (metal / linker / OMS) columns, and prediction targets.
GEOMETRY_COLUMNS = [
    'LCD', 'PLD', 'LFPD', 'cm3_g',
    'ASA_m2_cm3', 'ASA_m2_g', 'NASA_m2_cm3', 'NASA_m2_g',
    'AV_VF', 'AV_cm3_g', 'NAV_cm3_g',
]
PRECURSOR_COLUMNS = ['MOFid', 'metal', 'linker', 'Has_OMS', 'Open_Metal_Sites']
TARGET_COLUMNS = [
    'logKH_CO2',
    'pure_uptake_CO2_298.00_15000',
    'pure_uptake_methane_298.00_6500000',
]

geometry_info = mof_dataset[GEOMETRY_COLUMNS]

precursors_info = mof_dataset[PRECURSOR_COLUMNS]

target_info = mof_dataset[TARGET_COLUMNS]

In [92]:

def standardize_frame(frame):
    """Fit a StandardScaler on ``frame`` and return ``(scaler, scaled_frame)``.

    The scaled values are wrapped back into a DataFrame that keeps the
    original column names and index, so rows stay aligned with the source.
    """
    scaler = StandardScaler()
    scaled = pd.DataFrame(
        scaler.fit_transform(frame),
        columns=frame.columns,
        index=frame.index,
    )
    return scaler, scaled


# NOTE(review): both scalers are fit on the full dataset. If a train/test split
# is made later, consider fitting on the training rows only — TODO confirm.

# Standardize geometry features
geometry_scaler, geometry_info_norm = standardize_frame(geometry_info)

# Standardize targets
target_scaler, target_info_norm = standardize_frame(target_info)

# Save scalers
joblib.dump(geometry_scaler, 'geometry_scaler.pkl')
joblib.dump(target_scaler, 'target_scaler.pkl')

['target_scaler.pkl']

In [94]:
# Per-column minimum, then maximum, of the standardized geometry features.
print(geometry_info_norm.min(), geometry_info_norm.max(), sep='\n')

LCD           -0.930531
PLD           -1.071692
LFPD          -1.083744
cm3_g         -2.542419
ASA_m2_cm3    -1.413214
ASA_m2_g      -1.001030
NASA_m2_cm3   -0.472329
NASA_m2_g     -0.431001
AV_VF         -2.643680
AV_cm3_g      -0.961112
NAV_cm3_g     -0.025221
dtype: float64
LCD            13.329026
PLD             9.864606
LFPD           13.678638
cm3_g           6.537135
ASA_m2_cm3      2.608635
ASA_m2_g        5.548906
NASA_m2_cm3     5.653545
NASA_m2_g       9.040795
AV_VF           3.609937
AV_cm3_g       29.896164
NAV_cm3_g      57.070888
dtype: float64


In [95]:
# Per-column minimum, then maximum, of the standardized targets.
print(target_info_norm.min(), target_info_norm.max(), sep='\n')

logKH_CO2                            -5.013828
pure_uptake_CO2_298.00_15000         -1.299537
pure_uptake_methane_298.00_6500000   -1.258191
dtype: float64
logKH_CO2                              2.949122
pure_uptake_CO2_298.00_15000           4.699299
pure_uptake_methane_298.00_6500000    11.045829
dtype: float64


In [96]:
# List every metal string longer than 12 characters and report how many there are.
long_metal_strings = [metal_str for metal_str in precursors_info['metal'] if len(metal_str) > 12]
for metal_str in long_metal_strings:
    print(metal_str)
count = len(long_metal_strings)

print(count)

[Na][Y][W][Cs]
[W][Na][K][Rb][Y]
[Co][K][W][Cu]
[Co][K][W][Cr]
[Na][Mo][Al][Eu]
5


In [97]:
def parse_metal_string(metal_str):
    """Return the contents of every non-empty ``[...]`` group in ``metal_str``.

    Example: ``'[Co][K][W][Cu]'`` -> ``['Co', 'K', 'W', 'Cu']``.
    A string without any bracketed group yields an empty list.
    """
    return [match.group(1) for match in re.finditer(r'\[([^\]]+)\]', metal_str)]

def build_metal_vocabulary(df, metal_column='metal'):
    """Build a metal -> integer index mapping from a column of metal strings.

    Args:
        df: DataFrame containing the metal column.
        metal_column: Name of the column holding strings such as '[Co][Cu]'.

    Returns:
        metal_to_idx: Dict mapping '<NONE>' (padding) to 0 and each distinct
            metal, in sorted order, to 1..N.
        max_metals: Largest number of metals parsed from any single row
            (0 when the column is empty).
    """
    parsed_rows = [parse_metal_string(str(value)) for value in df[metal_column]]

    unique_metals = sorted({metal for row in parsed_rows for metal in row})
    max_metals = max((len(row) for row in parsed_rows), default=0)

    # Index 0 is reserved for the padding token; real metals start at 1.
    metal_to_idx = {
        '<NONE>': 0,
        **{metal: position for position, metal in enumerate(unique_metals, start=1)},
    }

    print(f"Found {len(unique_metals)} unique metals")
    print(f"Vocabulary size: {len(metal_to_idx)} (including padding)")
    print(f"Maximum metals per MOF: {max_metals}")

    return metal_to_idx, max_metals

def encode_metals(df, metal_to_idx, max_length, metal_column='metal', oms_column='Open_Metal_Sites'):
    """
    Convert metal strings to integer sequences with OMS flags.

    Rows are written by position, so ``df`` may carry any index (for example
    after filtering or a train/test split); output row ``k`` always corresponds
    to the ``k``-th row of ``df``.

    Args:
        df: DataFrame with metal and OMS columns
        metal_to_idx: Metal vocabulary dictionary; metals missing from it are
            encoded as 0 (the padding index)
        max_length: Sequence length (pad/truncate to this)
        metal_column: Name of metal column
        oms_column: Name of Open Metal Sites column

    Returns:
        metal_ids: int32 array of shape (n_mofs, max_length)
        metal_oms_flags: float32 array of shape (n_mofs, max_length); 1.0 where
            the metal at that position is listed in the row's OMS entry
    """
    n_mofs = len(df)
    metal_ids = np.zeros((n_mofs, max_length), dtype=np.int32)
    metal_oms_flags = np.zeros((n_mofs, max_length), dtype=np.float32)

    # Enumerate positionally instead of using DataFrame.iterrows(): iterrows
    # yields index *labels*, which are not valid row positions for the NumPy
    # arrays unless the frame has a default 0..n-1 index.
    rows = zip(df[metal_column], df[oms_column])
    for row_pos, (metal_value, oms_value) in enumerate(rows):
        # Parse metals (truncate if too long)
        metals = parse_metal_string(str(metal_value))[:max_length]

        # Parse OMS: strip brackets and split on commas; missing values
        # stringify to 'nan' and are treated as "no open metal sites".
        oms_str = str(oms_value)
        if oms_str != 'nan' and oms_str.strip():
            oms_metals = {
                token.strip()
                for token in oms_str.replace('[', '').replace(']', '').split(',')
            }
        else:
            oms_metals = set()

        for col_pos, metal in enumerate(metals):
            metal_ids[row_pos, col_pos] = metal_to_idx.get(metal, 0)
            if metal in oms_metals:
                metal_oms_flags[row_pos, col_pos] = 1.0

    return metal_ids, metal_oms_flags



In [129]:
# Usage:
# Step 1: Build vocabulary
metal_vocab, max_metals = build_metal_vocabulary(precursors_info, metal_column='metal')

# Step 2: Encode all metals
# max_length may also be a fixed length such as 5 instead of max_metals.
metal_ids, metal_oms_flags = encode_metals(
    precursors_info,
    metal_vocab,
    max_length=max_metals,
    metal_column='metal',
    oms_column='Open_Metal_Sites',
)

# Step 3: Add to dataset
# assign() works on a copy, so precursors_info itself is left unmodified and
# no SettingWithCopyWarning is triggered. Each row stores one 1-D array.
precursors_info_encoded = precursors_info.assign(
    metal_ids=list(metal_ids),
    metal_oms_flags=list(metal_oms_flags),
)

Found 54 unique metals
Vocabulary size: 55 (including padding)
Maximum metals per MOF: 5


In [130]:
# Show the metal -> index mapping, then persist it to 'metal_vocabulary.pkl' with joblib.
print(metal_vocab)
joblib.dump(metal_vocab, 'metal_vocabulary.pkl')

{'<NONE>': 0, 'Ag': 1, 'Al': 2, 'Au': 3, 'Ba': 4, 'Be': 5, 'Bi': 6, 'Ca': 7, 'Cd': 8, 'Ce': 9, 'Co': 10, 'Cr': 11, 'Cs': 12, 'Cu': 13, 'Dy': 14, 'Er': 15, 'Eu': 16, 'Fe': 17, 'Ga': 18, 'Gd': 19, 'Hf': 20, 'Ho': 21, 'In': 22, 'Ir': 23, 'K': 24, 'La': 25, 'Li': 26, 'Lu': 27, 'Mg': 28, 'Mn': 29, 'Mo': 30, 'Na': 31, 'Nb': 32, 'Nd': 33, 'Ni': 34, 'Pb': 35, 'Pd': 36, 'Pr': 37, 'Pt': 38, 'Rb': 39, 'Re': 40, 'Rh': 41, 'Ru': 42, 'Sm': 43, 'Sn': 44, 'Sr': 45, 'Tb': 46, 'Th': 47, 'Tm': 48, 'U': 49, 'V': 50, 'W': 51, 'Y': 52, 'Yb': 53, 'Zn': 54}


['metal_vocabulary.pkl']

In [131]:
# Preview the encoded metal ID sequences followed by their OMS flag sequences.
print(
    precursors_info_encoded['metal_ids'],
    precursors_info_encoded['metal_oms_flags'],
    sep='\n',
)

0       [29, 0, 0, 0, 0]
1       [10, 0, 0, 0, 0]
2       [13, 0, 0, 0, 0]
3       [25, 0, 0, 0, 0]
4        [9, 0, 0, 0, 0]
              ...       
3328     [8, 0, 0, 0, 0]
3329    [34, 8, 0, 0, 0]
3330    [34, 8, 0, 0, 0]
3331    [54, 0, 0, 0, 0]
3332    [49, 0, 0, 0, 0]
Name: metal_ids, Length: 3333, dtype: object
0       [0.0, 0.0, 0.0, 0.0, 0.0]
1       [0.0, 0.0, 0.0, 0.0, 0.0]
2       [0.0, 0.0, 0.0, 0.0, 0.0]
3       [1.0, 0.0, 0.0, 0.0, 0.0]
4       [1.0, 0.0, 0.0, 0.0, 0.0]
                  ...            
3328    [0.0, 0.0, 0.0, 0.0, 0.0]
3329    [1.0, 0.0, 0.0, 0.0, 0.0]
3330    [1.0, 1.0, 0.0, 0.0, 0.0]
3331    [1.0, 0.0, 0.0, 0.0, 0.0]
3332    [1.0, 0.0, 0.0, 0.0, 0.0]
Name: metal_oms_flags, Length: 3333, dtype: object


In [132]:
def encode_linker(df, linker_column='linker', return_vocab=False):
    """Encode SELFIES linker strings as padded label and one-hot arrays.

    The vocabulary is the sorted SELFIES alphabet of the column plus the
    '[nop]' padding token, and every linker is padded to the symbol length of
    the longest linker in the column.

    Args:
        df: DataFrame containing the linker column.
        linker_column: Name of the column holding SELFIES strings.
        return_vocab: When True, also return the symbol -> index mapping and
            the padding length, so the encoding can be reproduced or decoded
            later. Defaults to False, which keeps the two-value return.

    Returns:
        labels: Array of shape (n_linkers, pad_to_len) with integer symbol indices.
        onehot_encodings: Array of shape (n_linkers, pad_to_len, vocab_size).
        symbol_to_idx, pad_to_len: Only when ``return_vocab`` is True.

    Raises:
        ValueError: If the linker column is empty.
    """
    linkers = df[linker_column].tolist()
    if not linkers:
        # Without this guard the same exception type is raised further down
        # by max() on an empty sequence, but with a far less helpful message.
        raise ValueError(f"Column {linker_column!r} contains no linkers to encode")

    # Build vocabulary
    alphabet = sf.get_alphabet_from_selfies(linkers)
    alphabet.add('[nop]')  # Padding token
    linker_vocab = sorted(alphabet)

    # Determine padding length
    pad_to_len = max(sf.len_selfies(s) for s in linkers)

    # Create symbol to index mapping
    symbol_to_idx = {symbol: idx for idx, symbol in enumerate(linker_vocab)}

    print(f"Linker vocabulary size: {len(linker_vocab)}")
    print(f"Padding length: {pad_to_len}")

    # Encode each linker
    onehot_encodings = []
    labels = []
    for linker in linkers:
        label, onehot_encoding = sf.selfies_to_encoding(
            selfies=linker,
            vocab_stoi=symbol_to_idx,
            pad_to_len=pad_to_len,
            enc_type="both"
        )
        labels.append(label)
        onehot_encodings.append(onehot_encoding)

    labels = np.array(labels)
    onehot_encodings = np.array(onehot_encodings)

    if return_vocab:
        return labels, onehot_encodings, symbol_to_idx, pad_to_len
    return labels, onehot_encodings

In [133]:
# encode_linker returns (integer labels, one-hot encodings), so `linker_ids`
# holds the one-hot array here, not integer IDs; the integer indices are in `labels`.
labels, linker_ids = encode_linker(precursors_info_encoded, linker_column='linker')

Linker vocabulary size: 86
Padding length: 198


In [138]:
# Inspect the first linker's integer label sequence and its shape.
print(labels[0], labels[0].shape, sep='\n')

[46 36 19 46 19 46 27 58 23 46 36 19 46 19 46 27 58 23 46 36 19 46 19 46
 27 58 23 66  3 85 85 85 85 85 85 85 85 85 85 85 85 85 85 85 85 85 85 85
 85 85 85 85 85 85 85 85 85 85 85 85 85 85 85 85 85 85 85 85 85 85 85 85
 85 85 85 85 85 85 85 85 85 85 85 85 85 85 85 85 85 85 85 85 85 85 85 85
 85 85 85 85 85 85 85 85 85 85 85 85 85 85 85 85 85 85 85 85 85 85 85 85
 85 85 85 85 85 85 85 85 85 85 85 85 85 85 85 85 85 85 85 85 85 85 85 85
 85 85 85 85 85 85 85 85 85 85 85 85 85 85 85 85 85 85 85 85 85 85 85 85
 85 85 85 85 85 85 85 85 85 85 85 85 85 85 85 85 85 85 85 85 85 85 85 85
 85 85 85 85 85 85]
(198,)


In [140]:
# Inspect the first linker's one-hot encoding and its shape.
print(linker_ids[0], linker_ids[0].shape, sep='\n')

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 1]
 [0 0 0 ... 0 0 1]
 [0 0 0 ... 0 0 1]]
(198, 86)


In [None]:
precursors_info_encoded = 