In [2]:
# Packages

import selfies as sf
import pathlib as Path
import pandas as pd
import numpy as np

In [3]:
# Help functions

def txts2dataframe(folder_path, column_name):
    folder = Path.Path(folder_path)
    data = []
    for txt_file in folder.glob("*.txt"):
        with txt_file.open("r") as f:
            content = f.read()
            data.append({f"{column_name[0]}": txt_file.stem, f"{column_name[1]}": content})
    return pd.DataFrame(data)

def csv2dataframe(file_path):
    return pd.read_csv(file_path)

def attach_labels(base_df, folder):
    folder = Path.Path(folder)
    merged = base_df.copy()
    for f in sorted(folder.glob("*.csv")):
        df = pd.read_csv(f).iloc[:, :2]    
        df.columns = ["MOFid", f.stem]     
        merged = pd.merge(merged, df, on="MOFid", how="left")
    return merged

def smile2selfie(smile):
    try:
        selfie = sf.encoder(smile)
        return selfie
    except:
        return np.nan

def check4duplicate(df1, df2):
    duplicates = pd.merge(df1, df2, on=['metal', 'linker'], how='inner')
    return duplicates

def match_cif_files(cif_directory, dataset_mof_ids):
    
    cif_dir = Path.Path(cif_directory)

    cif_files = list(cif_dir.rglob('*.cif'))
    print(f"Found {len(cif_files)} total CIF files")
    
    cif_dict = {}
    for cif_file in cif_files:
        mof_id = cif_file.stem.replace('_pacman', '')
        cif_dict[mof_id] = cif_file

    matched = {}
    for mof_id in dataset_mof_ids:
        if mof_id in cif_dict:
            matched[mof_id] = cif_dict[mof_id]
    
    print(f"Found {len(matched)}/{len(dataset_mof_ids)} matches")
    return matched

def merge_mof_datasets(original_csv, metadata_csv, output_path=None):

    original_df = pd.read_csv(original_csv)
    metadata_df = pd.read_csv(metadata_csv)
    
    # Strip _pacman from filename to get MOFid
    metadata_df['MOFid'] = metadata_df['filename'].str.replace('_pacman', '')
    
    # Drop unnecessary columns from metadata
    # Keeping: geometric features + metal info
    # Dropping: metadata/database tracking columns
    columns_to_drop = [
        'filename',  # Already have MOFid
        'Extension', 'FSR_overlap', 'from_CSD', 'CR',
        'CSD_overlap_inCoRE', 'CSD_of_WoS_inCoRE', 'CSD_overlap_inCCDC',
        'date_CSD', 'DOI_public', 'Note', 
        'Matched_CSD_of_CoRE', 'Possible_List_CSD_of_CoRE'
    ]
    
    # Drop columns that exist
    columns_to_drop = [col for col in columns_to_drop if col in metadata_df.columns]
    metadata_df = metadata_df.drop(columns=columns_to_drop)
    
    # Drop any unnamed/empty columns
    metadata_df = metadata_df.loc[:, ~metadata_df.columns.str.contains('^Unnamed')]
    
    print(f"Dropping columns: {columns_to_drop}")
    print(f"\nKept metadata features: {list(metadata_df.columns)}")
    
    # Merge on MOFid
    merged_df = original_df.merge(metadata_df, on='MOFid', how='inner')
    
    print(f"\nMerged {len(merged_df)}/{len(original_df)} MOFs")
    
    # Save if output path provided
    if output_path:
        merged_df.to_csv(output_path, index=False)
        print(f"Saved to {output_path}")
    
    return merged_df

In [9]:
# Loading data from Professor Moosavi's research group

df_mofs_moosavi_precursor = txts2dataframe(r"C:\Users\dalja\OneDrive\Desktop\APS360 Project\MOF_dataset_Moosavi\precursors\core", ["MOFid", "precursors"])

In [10]:
# Seeing what the loaded data looks like

df_mofs_moosavi_precursor

Unnamed: 0,MOFid,precursors
0,00958972.2016.1250260_1436516_clean,[Cu].c1cc(ccn1)CNC(=O)Nc1cccnc1
1,00958972.2016.1250260_1436519_clean,[Cd].n1cccc(NC(=O)NCc2cccnc2)c1
2,ABAVIJ_clean,[Co].c1cnccc1C(=O)[O]
3,ABAYIO_clean,[Mn].c1(C(=O)[O])cc(C(=O)[O])cc(C(=O)[O])c1
4,ABAYOU_clean,[Co].C(=O)(c1cc(C(=O)[O])cc(C(=O)[O])c1)[O]
...,...,...
9001,ZUTBUN_clean,[Ga].[O]P(=O)([O])[O]
9002,ZUVTEP_clean,[La].[O]C(=O)COCC(=O)[O]
9003,ZUWXUM_clean,[In].C(=O)(c1cc(C(=O)[O])cc(c1)C(=O)[O])[O]
9004,ZUXPOZ_clean,[U].c1cc(ccc1C(=O)[O])C(=O)[O]


In [11]:
# Separating metal and linker
# Smiles to selfies

df_mofs_moosavi = pd.DataFrame()

df_mofs_moosavi['MOFid'] = df_mofs_moosavi_precursor['MOFid']
df_mofs_moosavi['metal'] = df_mofs_moosavi_precursor['precursors'].apply(lambda x: x.split('.')[0])

df_mofs_moosavi['linker'] = df_mofs_moosavi_precursor['precursors'].apply(lambda x: smile2selfie(x.split('.')[1]))

df_mofs_moosavi

Unnamed: 0,MOFid,metal,linker
0,00958972.2016.1250260_1436516_clean,[Cu],[C][=C][C][=Branch1][=Branch1][=C][C][=N][Ring...
1,00958972.2016.1250260_1436519_clean,[Cd],[N][=C][C][=C][C][Branch1][S][N][C][=Branch1][...
2,ABAVIJ_clean,[Co],[C][=C][N][=C][C][=C][Ring1][=Branch1][C][=Bra...
3,ABAYIO_clean,[Mn],[C][Branch1][=Branch1][C][=Branch1][C][=O][OH0...
4,ABAYOU_clean,[Co],[C][=Branch1][C][=O][Branch2][Ring1][#Branch1]...
...,...,...,...
9001,ZUTBUN_clean,[Ga],[OH0][P][=Branch1][C][=O][Branch1][C][OH0][OH0]
9002,ZUVTEP_clean,[La],[OH0][C][=Branch1][C][=O][C][O][C][C][=Branch1...
9003,ZUWXUM_clean,[In],[C][=Branch1][C][=O][Branch2][Ring1][#Branch1]...
9004,ZUXPOZ_clean,[U],[C][=C][C][=Branch1][O][=C][C][=C][Ring1][=Bra...


In [12]:
# Attaching labels to the dataset with columns MOFid, metal, and linker(in selfie)

df_mofs_moosavi = attach_labels(df_mofs_moosavi, r"C:\Users\dalja\OneDrive\Desktop\APS360 Project\MOF_dataset_Moosavi\labels\core")


In [13]:
# dropping NaNs from the dataset (Nans are generated from converting Smile to Selfies; Some SMILES are not valid)
df_mofs_moosavi_clean = df_mofs_moosavi.dropna().reset_index(drop=True)
df_mofs_moosavi_clean

Unnamed: 0,MOFid,metal,linker,density,Di,logKH_CO2,pure_uptake_CO2_298.00_15000,pure_uptake_methane_298.00_6500000
0,00958972.2016.1250260_1436516_clean,[Cu],[C][=C][C][=Branch1][=Branch1][=C][C][=N][Ring...,1.102640,4.71855,-3.578895,2.651632,3.493840
1,00958972.2016.1250260_1436519_clean,[Cd],[N][=C][C][=C][C][Branch1][S][N][C][=Branch1][...,1.254080,5.14924,-2.131282,2.567136,3.423210
2,ABAYIO_clean,[Mn],[C][Branch1][=Branch1][C][=Branch1][C][=O][OH0...,0.949126,11.39486,-4.353803,0.452906,8.424232
3,ABAYOU_clean,[Co],[C][=Branch1][C][=O][Branch2][Ring1][#Branch1]...,0.979267,11.27344,-4.388713,0.428606,8.341120
4,ABETIN_clean,[Cu],[C][=C][C][=Branch1][=Branch1][=C][C][=N][Ring...,0.597233,9.47418,-4.611687,0.374226,17.698172
...,...,...,...,...,...,...,...,...
3647,ZURLAB_clean,[Cd],[C][=Branch2][Ring1][#Branch2][=N][C][=Branch1...,1.582500,4.89339,-4.362978,0.493877,1.824221
3648,ZURQOS_clean,[Ni][Cd],[C][Branch1][=N][C][=C][C][=C][Branch1][Ring1]...,1.306180,4.62363,-3.028101,3.650447,4.786686
3649,ZURROT_clean,[Ni][Cd],[C][Branch1][=N][C][=C][C][=C][Branch1][Ring1]...,1.077740,4.90807,-3.104099,5.325700,5.502994
3650,ZUSBOG_clean,[Zn],[C][=Branch1][C][=O][Branch2][Ring1][Branch2][...,1.167820,4.86162,-3.538818,2.509926,5.918989


In [14]:
# Saving processed MOF dataset into csv file
# Later this will be merged with ASR csv file containing metadata in CoRE MOF 2019 database available online.

df_mofs_moosavi_clean.to_csv(r"C:\Users\dalja\OneDrive\Desktop\APS360 Project\MOF_processed data\df_mofs_moosavi_clean.csv", index=False)

In [15]:
# Checking for available CIF files on local environment downloaded from CoRE MOF 2019 databse online. CIF files contain geometric / structural information for MOFs.

matched_cif_files = match_cif_files(r"C:\Users\dalja\OneDrive\Desktop\APS360 Project\MOF_CORE2019\CoREMOF2019_public_v2_20241118", df_mofs_moosavi_clean['MOFid'])

print(matched_cif_files)

Found 19081 total CIF files
Found 3333/3652 matches
{'ABAYIO_clean': WindowsPath('C:/Users/dalja/OneDrive/Desktop/APS360 Project/MOF_CORE2019/CoREMOF2019_public_v2_20241118/CR/ASR/ABAYIO_clean_pacman.cif'), 'ABAYOU_clean': WindowsPath('C:/Users/dalja/OneDrive/Desktop/APS360 Project/MOF_CORE2019/CoREMOF2019_public_v2_20241118/CR/ASR/ABAYOU_clean_pacman.cif'), 'ABETIN_clean': WindowsPath('C:/Users/dalja/OneDrive/Desktop/APS360 Project/MOF_CORE2019/CoREMOF2019_public_v2_20241118/CR/ASR/ABETIN_clean_pacman.cif'), 'ABEXEM_clean': WindowsPath('C:/Users/dalja/OneDrive/Desktop/APS360 Project/MOF_CORE2019/CoREMOF2019_public_v2_20241118/CR/ASR/ABEXEM_clean_pacman.cif'), 'ABEXIQ_clean': WindowsPath('C:/Users/dalja/OneDrive/Desktop/APS360 Project/MOF_CORE2019/CoREMOF2019_public_v2_20241118/CR/ASR/ABEXIQ_clean_pacman.cif'), 'ABEXOW_clean': WindowsPath('C:/Users/dalja/OneDrive/Desktop/APS360 Project/MOF_CORE2019/CoREMOF2019_public_v2_20241118/CR/ASR/ABEXOW_clean_pacman.cif'), 'ABEXUC_clean': Windows

In [16]:
# Merging two datasets; one from Professor Moosavi's group & one from CoRE MOF 2019 Dataset, merged based on MOFID that matches with cif

df_mofs_v1 = merge_mof_datasets(r'C:\Users\dalja\OneDrive\Desktop\APS360 Project\MOF_processed data\df_mofs_moosavi_clean.csv',
r'C:\Users\dalja\OneDrive\Desktop\APS360 Project\MOF_CORE2019\ASR_internal_20241119.csv')

# Dropping unnecessary columns; these are duplicates; Professor Moosavi's dataset has these columns already, which are identical to cm3_g and PLD
df_mofs_v1.drop(columns=['density'], inplace=True)

df_mofs_v1.drop(columns=['Di'], inplace=True)

Dropping columns: ['filename', 'Extension', 'FSR_overlap', 'from_CSD', 'CR', 'CSD_overlap_inCoRE', 'CSD_of_WoS_inCoRE', 'CSD_overlap_inCCDC', 'date_CSD', 'DOI_public', 'Note', 'Matched_CSD_of_CoRE', 'Possible_List_CSD_of_CoRE']

Kept metadata features: ['LCD', 'PLD', 'LFPD', 'cm3_g', 'ASA_m2_cm3', 'ASA_m2_g', 'NASA_m2_cm3', 'NASA_m2_g', 'AV_VF', 'AV_cm3_g', 'NAV_cm3_g', 'All_Metals', 'Has_OMS', 'Open_Metal_Sites', 'MOFid']

Merged 3333/3652 MOFs


In [4]:
df_mofs_qmof_precursor = txts2dataframe(r"C:\Users\dalja\OneDrive\Desktop\APS360 Project\MOF_dataset_Moosavi\precursors\qmof", ["MOFid", "precursors"])


In [5]:
DF_MOFS_QMOF = pd.DataFrame()
DF_MOFS_QMOF['MOFid'] = df_mofs_qmof_precursor['MOFid']
DF_MOFS_QMOF['metal'] = df_mofs_qmof_precursor['precursors'].apply(lambda x: x.split('.')[0])
DF_MOFS_QMOF['linker'] = df_mofs_qmof_precursor['precursors'].apply(lambda x: smile2selfie(x.split('.')[1]))
DF_MOFS_QMOF = attach_labels(DF_MOFS_QMOF, r"C:\Users\dalja\OneDrive\Desktop\APS360 Project\MOF_dataset_Moosavi\labels\qmof")


In [6]:
DF_MOFS_QMOF_clean = DF_MOFS_QMOF.dropna().reset_index(drop=True)
DF_MOFS_QMOF_clean

Unnamed: 0,MOFid,metal,linker,bandgap
0,qmof-000741d,[Sc],[O-1][C][=O],3.565085
1,qmof-0009829,[Al],[O-1][C][=Branch1][C][=O][C][#C][C][C][C][Bran...,2.101297
2,qmof-004080b,[Zn],[N][C][C][C][=C][Ring1][Ring2][C][=Branch1][O]...,1.487983
3,qmof-004947a,[Zn],[C][C][=N][N][=C][Branch1][Ring2][NH0][Ring1][...,4.282851
4,qmof-004e466,[Cu],[OH0][P][=Branch1][C][=O][Branch2][Ring1][Bran...,0.958525
...,...,...,...,...
7075,qmof-ffe6994,[Zn],[O-1][C][=Branch1][C][=O][C][=C][N][=C][C][=Br...,2.887756
7076,qmof-ffed86d,[Cd],[O-1][C][=Branch1][C][=O][C][=C][C][=C][Branch...,2.177468
7077,qmof-fff0df1,[Cu],[O][=C][Branch1][=Branch2][C][=C][C][=N][C][=C...,0.500609
7078,qmof-fffd0e2,[Zn],[Br][C][=Branch1][=Branch2][=C][C][=C][N][=N][...,2.166898


In [None]:
check4duplicate(DF_MOFS_CORE_clean, DF_MOFS_QMOF_clean) 

#These duplicates can be removed from one of the datasets to ensure uniqueness.

In [None]:
# DF_MOFS_QMOF_clean.to_csv(r"C:\Users\dalja\OneDrive\Desktop\APS360 Project\MOF_QMOF_processed_data.csv", index=False)