# Calculating the RACs for some MOFs ...

In [1]:
from molSimplify.Informatics.MOF.MOF_descriptors import get_MOF_descriptors
import os
from glob import glob
from pathlib import Path
import pandas as pd 

The first step is to get some MOF CIFs as primitive structures. For this, we can use some of your favorites and the following utility function.

In [2]:
from pymatgen.io.cif import CifParser
def get_primitive(datapath, writepath):
    """Read a CIF file and write its primitive structure to a new CIF file.

    Parameters
    ----------
    datapath : str
        Path of the CIF file to read.
    writepath : str
        Path the primitive-structure CIF is written to.
    """
    # Only the first structure parsed from the file is used.
    s = CifParser(datapath, occupancy_tolerance=1).get_structures()[0]
    sprim = s.get_primitive_structure()
    # Pass both arguments by keyword: the positional order of `fmt` and
    # `filename` in `Structure.to` differs between pymatgen releases, and
    # naming them keeps "cif" from being interpreted as the output filename.
    sprim.to(fmt="cif", filename=writepath)

In [5]:
# Resolve the Downloads folder from the current user's home directory instead
# of hard-coding one machine's absolute path.
downloads_dir = Path.home() / 'Downloads'
get_primitive(str(downloads_dir / 'sion17.cif'), str(downloads_dir / 'sion17_prim.cif'))

In [4]:
# Write a "<name>_primitive.cif" next to each example structure.
for mof_name in ('UiO66', 'hkust1'):
    source_cif = os.path.join('structures', mof_name + '.cif')
    primitive_cif = os.path.join('structures', mof_name + '_primitive.cif')
    get_primitive(source_cif, primitive_cif)

Now, we can get the RACs ..

In [3]:
# Folder passed to get_MOF_descriptors as `path`, and the subfolder for the xyz files.
output_dir = os.path.join('structures', 'output')
xyz_dir = os.path.join(output_dir, 'xyz')
# Create both folders up front so the cell also runs on a fresh checkout.
os.makedirs(xyz_dir, exist_ok=True)

featurization_list = []
# sorted() makes the row order independent of the filesystem listing order.
for cif_file in sorted(glob(os.path.join('structures', '*_primitive.cif'))):
    full_names, full_descriptors = get_MOF_descriptors(
        cif_file,  # input structure
        3,  # scope
        path=output_dir,  # stuff will be dumped here
        xyzpath=os.path.join(xyz_dir, Path(cif_file).stem + '.xyz'))
    # One record per structure: descriptor name -> value, plus the source file.
    featurization = dict(zip(full_names, full_descriptors))
    featurization['filename'] = cif_file
    featurization_list.append(featurization)


('cell vectors: ', 'alpha, beta, gamma = 90.0, 90.0 ,90.0')
[31.978, 0, 0]
[4.0298227572742804e-16, 6.5812, 0]
[1.0324997163611333e-15, 1.0324997163611333e-15, 16.862]


In [4]:
# Display the per-structure feature dictionaries collected in the loop above.
featurization_list

[{0: 0,
  'filename': '/Users/kevinmaikjablonka/Dropbox (LSMO)/to_featurize/2020-5-12-cleanup/cifs/AlPMOF.cif'}]

In [5]:
# One row per featurized structure, one column per dictionary key.
df = pd.DataFrame(featurization_list)

In [6]:
# Read the lc_descriptors.csv file from the output folder handed to
# get_MOF_descriptors; the relative path avoids a machine-specific absolute one.
df_all_lc = pd.read_csv(os.path.join('structures', 'output', 'lc_descriptors.csv'))

In [7]:
# Display the table loaded from lc_descriptors.csv.
df_all_lc

Unnamed: 0,D_func-I-0-all,D_func-I-1-all,D_func-I-2-all,D_func-I-3-all,D_func-S-0-all,D_func-S-1-all,D_func-S-2-all,D_func-S-3-all,D_func-T-0-all,D_func-T-1-all,...,lc-Z-3-all,lc-alpha-0-all,lc-alpha-1-all,lc-alpha-2-all,lc-alpha-3-all,lc-chi-0-all,lc-chi-1-all,lc-chi-2-all,lc-chi-3-all,name
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,96.0,28.09,59.89,87.98,119.78,11.8336,8.772,20.6056,17.544,hkust1_primitive
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,96.0,28.09,59.89,87.98,119.78,11.8336,8.772,20.6056,17.544,hkust1_primitive
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,96.0,28.09,59.89,87.98,119.78,11.8336,8.772,20.6056,17.544,hkust1_primitive
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,96.0,28.09,59.89,87.98,119.78,11.8336,8.772,20.6056,17.544,hkust1_primitive
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,96.0,28.09,59.89,87.98,119.78,11.8336,8.772,20.6056,17.544,hkust1_primitive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,96.0,28.09,59.89,87.98,119.78,11.8336,8.772,20.6056,17.544,UiO66_primitive
86,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,96.0,28.09,59.89,87.98,119.78,11.8336,8.772,20.6056,17.544,UiO66_primitive
87,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,96.0,28.09,59.89,87.98,119.78,11.8336,8.772,20.6056,17.544,UiO66_primitive
88,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,96.0,28.09,59.89,87.98,119.78,11.8336,8.772,20.6056,17.544,UiO66_primitive


In [8]:
# List every column name of df.
list(df.columns)

['f-chi-0-all',
 'f-chi-1-all',
 'f-chi-2-all',
 'f-chi-3-all',
 'f-Z-0-all',
 'f-Z-1-all',
 'f-Z-2-all',
 'f-Z-3-all',
 'f-I-0-all',
 'f-I-1-all',
 'f-I-2-all',
 'f-I-3-all',
 'f-T-0-all',
 'f-T-1-all',
 'f-T-2-all',
 'f-T-3-all',
 'f-S-0-all',
 'f-S-1-all',
 'f-S-2-all',
 'f-S-3-all',
 'mc-chi-0-all',
 'mc-chi-1-all',
 'mc-chi-2-all',
 'mc-chi-3-all',
 'mc-Z-0-all',
 'mc-Z-1-all',
 'mc-Z-2-all',
 'mc-Z-3-all',
 'mc-I-0-all',
 'mc-I-1-all',
 'mc-I-2-all',
 'mc-I-3-all',
 'mc-T-0-all',
 'mc-T-1-all',
 'mc-T-2-all',
 'mc-T-3-all',
 'mc-S-0-all',
 'mc-S-1-all',
 'mc-S-2-all',
 'mc-S-3-all',
 'D_mc-chi-0-all',
 'D_mc-chi-1-all',
 'D_mc-chi-2-all',
 'D_mc-chi-3-all',
 'D_mc-Z-0-all',
 'D_mc-Z-1-all',
 'D_mc-Z-2-all',
 'D_mc-Z-3-all',
 'D_mc-I-0-all',
 'D_mc-I-1-all',
 'D_mc-I-2-all',
 'D_mc-I-3-all',
 'D_mc-T-0-all',
 'D_mc-T-1-all',
 'D_mc-T-2-all',
 'D_mc-T-3-all',
 'D_mc-S-0-all',
 'D_mc-S-1-all',
 'D_mc-S-2-all',
 'D_mc-S-3-all',
 'sum-f-chi-0-all',
 'sum-f-chi-1-all',
 'sum-f-chi-2-al

In [5]:
# (rows, columns) of df before any column selection.
df.shape

(2, 177)

Let's get the 156 RACs reported in the [paper](https://chemrxiv.org/articles/Understanding_the_Diversity_of_the_Metal-Organic_Framework_Ecosystem/12251186). SBU RACs are redundant with mc RACs and are not included.

In [6]:
# Substrings that mark the column names to retain.
rac_markers = ('mc', 'lc', 'f-lig', 'func')
keep = [
    column
    for column in df.columns.values
    if any(marker in column for marker in rac_markers)
]
# 'filename' goes first, followed by the selected columns in their original order.
selected_columns = ['filename'] + keep
df = df[selected_columns]

In [7]:
# (rows, columns) of df after the column selection above.
df.shape

(2, 157)