## Note
Feeding the cell angles to the model in radians (i.e. multiplied by PI / 180) or in degrees makes no difference, because the conversion only multiplies each angle by a constant.

In [1]:
from tqdm import tqdm
from ase.io import read, write
from ase import Atoms
# from rdkit.Chem import Draw
# from rdkit.Chem import AllChem as Chem
from collections import Counter

# import cairosvg
import pybel
import numpy as np
import pandas as pd
import os

ImportError: DLL load failed: The specified module could not be found.

## Util functions

In [94]:
PI = np.pi


def get_cell_parameters(cifname):
    """Extract unit-cell and composition features from a CIF file.

    The structure is loaded with ``ase.io.read`` for the atom-level features,
    and the CIF text is scanned for the six ``_cell_*`` tags.

    Relies on module-level lookup tables ``charge_dict`` and ``radius_dict``
    (element symbol -> value), which are not defined in this part of the
    notebook and must exist before the function is called.

    Args:
        cifname: Path to the CIF file.

    Returns:
        A 10-tuple ``(a, b, c, alpha_r, beta_r, gamma_r, n_atoms,
        mol_avg_mass, charges, avg_radius)``. The lengths are returned as
        written in the file; the angles are multiplied by ``PI / 180``.

    Raises:
        ValueError: If any of the six cell tags is absent from the file.
    """
    mol = read(cifname)
    n_atoms = mol.get_global_number_of_atoms()
    mol_avg_mass = mol.get_masses().mean()

    # Per-element features, weighted by how often each symbol occurs.
    atom_count = Counter(mol.get_chemical_symbols())
    charges = calc_charges(atom_count, charge_dict)
    # The original returned `avg_radius` without ever assigning it.
    avg_radius = calc_avg_radius(n_atoms, atom_count, radius_dict)

    tags = (
        "_cell_length_a",
        "_cell_length_b",
        "_cell_length_c",
        "_cell_angle_alpha",
        "_cell_angle_beta",
        "_cell_angle_gamma",
    )
    values = {}
    # `with` guarantees the file handle is closed once scanning is done.
    with open(cifname, 'r') as cif_file:
        for line in cif_file:
            for tag in tags:
                # Substring match; a later occurrence overrides an earlier one.
                if tag in line:
                    values[tag] = float(line.split()[1])

    missing = [tag for tag in tags if tag not in values]
    if missing:
        raise ValueError(
            f"{cifname}: missing cell parameter(s): {', '.join(missing)}"
        )

    a, b, c = (values[tag] for tag in tags[:3])
    alpha_r, beta_r, gamma_r = (values[tag] * PI / 180.0 for tag in tags[3:])

    return a, b, c, alpha_r, beta_r, gamma_r, n_atoms, mol_avg_mass, charges, avg_radius

In [68]:
# Some files are converted to an invalid SMILES string, so they cannot be saved as an image.
def smiles_img_from_cif(cifname, dest):
    """Draw the first molecule of a CIF file to an image file via SMILES.

    Args:
        cifname: Path of the CIF file to read with pybel.
        dest: Output path handed to ``Draw.MolToFile``.
    """
    first_mol = next(pybel.readfile("cif", cifname))
    smi_record = first_mol.write(format="smi")
    # Keep only the text before the last tab of the record
    # (presumably the tab separates the SMILES from the title field).
    last_tab = smi_record.rfind('\t')
    # NOTE(review): RDKit documents MolFromSmiles as returning None for an
    # unparseable SMILES; that value is passed straight on to MolToFile.
    rdkit_mol = Chem.MolFromSmiles(smi_record[:last_tab])
    Draw.MolToFile(rdkit_mol, dest)

In [2]:
def smiles_from_cif(cifname):
    """Return the SMILES string for the first molecule of a CIF file.

    Args:
        cifname: Path of the CIF file to read with pybel.

    Returns:
        The pybel ``smi`` output truncated at its last tab character.
    """
    first_mol = next(pybel.readfile("cif", cifname))
    smi_record = first_mol.write(format="smi")
    # Everything from the last tab onwards is discarded.
    last_tab = smi_record.rfind('\t')
    return smi_record[:last_tab]

In [69]:
def svg_png_from_cif(cifname, dest_svg, dest_png):
    """Write an SVG depiction of a CIF structure and a 300x300 PNG of it.

    Requires ``cairosvg`` to be importable in the notebook; its import is
    commented out in the import cell.

    Args:
        cifname: Path of the CIF file to read with pybel.
        dest_svg: Path the (trimmed) SVG text is written to.
        dest_png: Path the rasterised PNG is written to.
    """
    cif = next(pybel.readfile("cif", cifname))
    svg = cif.write(format="svg")

    # Drop the trailing <text> element and re-add the closing tag removed
    # with it. If there is no <text> element, leave the SVG untouched: slicing
    # with rfind's -1 would cut the last character and append a stray </svg>.
    text_start = svg.rfind('<text')
    if text_start != -1:
        svg = svg[:text_start] + '</svg>\n'

    with open(dest_svg, 'w') as file:
        file.write(svg)

    # Open the SVG in a context manager so the handle is closed after
    # conversion (the original leaked the handle passed as file_obj).
    with open(dest_svg) as svg_file:
        cairosvg.svg2png(file_obj=svg_file, write_to=dest_png, output_width=300, output_height=300)

In [70]:
def count_atom(formula):
    """Count the atoms of each element in a flat chemical formula string.

    An element symbol starts at a letter that is not lowercase (or at the
    first letter seen) and is extended by following lowercase letters. The
    non-letter characters after it form its count; no count means 1.

    Counts of an element that appears more than once are summed, so
    ``"CH3CH2OH"`` gives ``{"C": 2, "H": 6, "O": 1}``.

    Args:
        formula: Formula such as ``"Fe2O3"``; no parentheses or charges.

    Returns:
        Dict mapping element symbol to its total count. A formula that
        contains no letters yields an empty dict.

    Raises:
        ValueError: If the characters following a symbol are not a valid
            integer (e.g. the formula contains parentheses).
    """
    count = {}
    atom = ""
    number = ""
    for char in formula:
        if not char.isalpha():
            number += char
            continue
        if atom and not char.islower():
            # A new symbol starts: store the finished one, adding to any
            # earlier occurrence instead of overwriting it.
            count[atom] = count.get(atom, 0) + int(number or "1")
            atom, number = char, ""
        else:
            atom += char

    # Store the last symbol; skip if the formula contained no symbol at all.
    if atom:
        count[atom] = count.get(atom, 0) + int(number or "1")

    return count

In [71]:
def calc_charges(atom_count, charge_dict):
    """Return the count-weighted sum of per-element charges.

    Args:
        atom_count: Mapping of element symbol to number of atoms.
        charge_dict: Mapping of element symbol to its charge value; must
            contain every symbol present in ``atom_count``.

    Returns:
        Sum over all symbols of ``count * charge_dict[symbol]``
        (0 for an empty ``atom_count``).
    """
    charge_sum = 0
    for symbol in atom_count:
        charge_sum = charge_sum + atom_count[symbol] * charge_dict[symbol]
    return charge_sum

In [108]:
def calc_avg_radius(n_atoms, atom_count, radius_dict):
    """Return the mean per-atom radius of a structure.

    Args:
        n_atoms: Number the summed radius is divided by.
        atom_count: Mapping of element symbol to number of atoms.
        radius_dict: Mapping of element symbol to its radius value; must
            contain every symbol present in ``atom_count``.

    Returns:
        Sum of ``count * radius_dict[symbol]`` divided by ``n_atoms``.
    """
    radius_sum = 0
    for element, occurrences in atom_count.items():
        radius_sum = radius_sum + occurrences * radius_dict[element]
    return radius_sum / n_atoms

In [16]:
def calc_atoms_vol(cifname, vol_dict):
    """Sum the per-element volume over every atom of a CIF structure.

    Args:
        cifname: Path of the CIF file, loaded with ``ase.io.read``.
        vol_dict: Mapping of element symbol to its volume value; must
            contain every symbol present in the structure.

    Returns:
        Sum over all symbols of ``count * vol_dict[symbol]``.
    """
    structure = read(cifname)
    # Tally how many atoms of each element the structure contains.
    per_element = Counter(structure.get_chemical_symbols())
    vol_sum = 0
    for element in per_element:
        vol_sum = vol_sum + per_element[element] * vol_dict[element]
    return vol_sum

In [6]:
def calc_atoms_area(cifname, area_dict):
    """Sum the per-element area over every atom of a CIF structure.

    Args:
        cifname: Path of the CIF file, loaded with ``ase.io.read``.
        area_dict: Mapping of element symbol to its area value; must
            contain every symbol present in the structure.

    Returns:
        Sum over all symbols of ``count * area_dict[symbol]``.
    """
    structure = read(cifname)
    # Tally how many atoms of each element the structure contains.
    per_element = Counter(structure.get_chemical_symbols())
    area_sum = 0
    for element, occurrences in per_element.items():
        area_sum = area_sum + occurrences * area_dict[element]
    return area_sum

In [59]:
def get_desc_xyz(xyzname):
    """Return selected pybel descriptors and the atom count of an XYZ file.

    Args:
        xyzname: Path of the XYZ file; only its first molecule is used.

    Returns:
        Tuple ``(HBA1, HBA2, HBD, MW, n_atoms)`` where the first four values
        come from ``calcdesc()`` and ``n_atoms`` is ``len(mol.atoms)``.
    """
    molecule = next(pybel.readfile("xyz", xyzname))
    descriptors = molecule.calcdesc()
    atom_total = len(molecule.atoms)
    hba1, hba2, hbd, mol_weight = (
        descriptors[key] for key in ('HBA1', 'HBA2', 'HBD', 'MW')
    )
    return hba1, hba2, hbd, mol_weight, atom_total

In [71]:
def get_desc_cif(cifname):
    """Return selected pybel descriptors for the first molecule of a CIF file.

    Args:
        cifname: Path of the CIF file to read with pybel.

    Returns:
        Tuple ``(HBA1, HBA2, HBD, nF, logP, MR, TPSA)`` taken from the
        molecule's ``calcdesc()`` result.
    """
    molecule = next(pybel.readfile("cif", cifname))
    descriptors = molecule.calcdesc()
    wanted = ('HBA1', 'HBA2', 'HBD', 'nF', 'logP', 'MR', 'TPSA')
    return tuple(descriptors[key] for key in wanted)

# Smiles from CIF

## Train

In [8]:
# Number of directory entries in the training CIF folder.
len_train = len(os.listdir('mof_cif_train'))
# Bare expression so the notebook displays the value as the cell output.
len_train

68613

In [None]:
# Convert a slice of the training CIF files to SMILES strings.
# range(948, 20001) together with the i+1 below covers the files
# mof_unit_949.cif .. mof_unit_20001.cif (19053 files); tqdm only adds the
# progress bar. Results are appended one by one, so train_smiles keeps the
# entries converted so far if the loop is interrupted.
train_smiles = []
for i in tqdm(range(948,20001)):
    name = f'mof_unit_{i+1}'
    smiles = smiles_from_cif(f'mof_cif_train/{name}.cif')
    train_smiles.append(smiles)


  0%|                                                                                        | 0/19053 [00:00<?, ?it/s][A
  0%|                                                                     | 1/19053 [3:09:30<60176:56:51, 11370.83s/it][A
  0%|                                                                      | 2/19053 [3:09:31<24778:13:59, 4682.25s/it][A
  0%|                                                                      | 3/19053 [3:09:31<13463:59:17, 2544.38s/it][A
  0%|                                                                       | 4/19053 [3:09:31<8148:26:46, 1539.94s/it][A
  0%|                                                                        | 5/19053 [3:09:31<5210:04:53, 984.69s/it][A
  0%|                                                                        | 6/19053 [3:09:33<3441:16:00, 650.42s/it][A
  0%|                                                                        | 7/19053 [3:09:33<2317:15:59, 438.00s/it][A
  0%|          

  1%|▌                                                                         | 134/19053 [3:15:06<6:27:33,  1.23s/it][A
  1%|▌                                                                         | 135/19053 [3:15:07<6:54:54,  1.32s/it][A
  1%|▌                                                                        | 136/19053 [3:15:12<12:18:46,  2.34s/it][A
  1%|▌                                                                         | 137/19053 [3:15:13<9:25:04,  1.79s/it][A
  1%|▌                                                                         | 138/19053 [3:15:13<6:59:51,  1.33s/it][A
  1%|▌                                                                         | 139/19053 [3:15:14<5:55:29,  1.13s/it][A
  1%|▌                                                                         | 140/19053 [3:15:14<4:58:01,  1.06it/s][A
  1%|▌                                                                         | 141/19053 [3:15:14<4:02:51,  1.30it/s][A
  1%|▌          

  1%|▉                                                                     | 269/19053 [4:46:39<1040:39:58, 199.45s/it][A
  1%|█                                                                      | 270/19053 [4:46:39<729:20:09, 139.79s/it][A
  1%|█                                                                       | 271/19053 [4:46:39<511:08:49, 97.97s/it][A
  1%|█                                                                       | 272/19053 [4:46:40<358:14:49, 68.67s/it][A
  1%|█                                                                       | 273/19053 [4:46:43<255:55:16, 49.06s/it][A
  1%|█                                                                       | 274/19053 [4:46:43<179:18:10, 34.37s/it][A
  1%|█                                                                       | 275/19053 [4:46:43<125:45:42, 24.11s/it][A
  1%|█                                                                        | 276/19053 [4:46:44<88:56:01, 17.05s/it][A
  1%|█          

  2%|█▌                                                                       | 407/19053 [5:16:46<85:05:51, 16.43s/it][A
  2%|█▌                                                                       | 408/19053 [5:16:46<60:16:26, 11.64s/it][A
  2%|█▌                                                                       | 409/19053 [5:16:48<44:56:43,  8.68s/it][A
  2%|█▌                                                                       | 410/19053 [5:16:48<32:02:49,  6.19s/it][A
  2%|█▌                                                                       | 411/19053 [5:16:48<22:40:31,  4.38s/it][A
  2%|█▌                                                                       | 412/19053 [5:16:48<16:06:29,  3.11s/it][A
  2%|█▌                                                                       | 413/19053 [5:16:49<11:54:11,  2.30s/it][A
  2%|█▌                                                                        | 414/19053 [5:16:49<8:33:10,  1.65s/it][A
  2%|█▌         

  3%|██                                                                       | 541/19053 [5:20:52<43:31:47,  8.47s/it][A
  3%|██                                                                       | 542/19053 [5:20:52<30:42:58,  5.97s/it][A
  3%|██                                                                       | 543/19053 [5:20:54<23:30:19,  4.57s/it][A
  3%|██                                                                       | 544/19053 [5:20:54<17:03:38,  3.32s/it][A
  3%|██                                                                       | 545/19053 [5:20:55<13:10:11,  2.56s/it][A
  3%|██                                                                        | 546/19053 [5:20:55<9:49:55,  1.91s/it][A
  3%|██                                                                        | 547/19053 [5:20:56<7:32:42,  1.47s/it][A
  3%|██▏                                                                       | 548/19053 [5:20:56<6:18:30,  1.23s/it][A
  3%|██▏        

  4%|██▌                                                                       | 675/19053 [5:31:22<4:03:28,  1.26it/s][A
  4%|██▋                                                                       | 676/19053 [5:31:22<3:02:44,  1.68it/s][A
  4%|██▋                                                                       | 677/19053 [5:31:23<2:27:56,  2.07it/s][A
  4%|██▋                                                                       | 678/19053 [5:31:23<2:45:34,  1.85it/s][A
  4%|██▋                                                                       | 679/19053 [5:31:24<2:26:01,  2.10it/s][A
  4%|██▋                                                                       | 680/19053 [5:31:28<7:55:05,  1.55s/it][A
  4%|██▌                                                                      | 681/19053 [5:31:32<12:38:02,  2.48s/it][A
  4%|██▌                                                                      | 682/19053 [5:31:35<12:15:50,  2.40s/it][A
  4%|██▋        

  4%|███▏                                                                      | 810/19053 [5:51:09<3:46:11,  1.34it/s][A
  4%|███▏                                                                      | 811/19053 [5:51:09<3:10:04,  1.60it/s][A
  4%|███▏                                                                      | 812/19053 [5:51:11<3:51:24,  1.31it/s][A
  4%|███▏                                                                      | 813/19053 [5:51:11<4:00:33,  1.26it/s][A
  4%|███▏                                                                      | 814/19053 [5:51:12<3:10:55,  1.59it/s][A
  4%|███▏                                                                      | 815/19053 [5:51:14<5:42:08,  1.13s/it][A
  4%|███▏                                                                      | 816/19053 [5:51:14<4:13:16,  1.20it/s][A
  4%|███▏                                                                      | 817/19053 [5:51:14<3:19:19,  1.52it/s][A
  4%|███▏       

  5%|███▋                                                                      | 944/19053 [6:09:48<9:50:30,  1.96s/it][A
  5%|███▋                                                                      | 945/19053 [6:09:50<9:26:25,  1.88s/it][A
  5%|███▋                                                                      | 946/19053 [6:09:50<7:52:58,  1.57s/it][A
  5%|███▋                                                                      | 947/19053 [6:09:51<5:41:23,  1.13s/it][A
  5%|███▋                                                                     | 948/19053 [6:09:55<10:50:38,  2.16s/it][A
  5%|███▋                                                                      | 949/19053 [6:09:55<8:03:41,  1.60s/it][A
  5%|███▋                                                                      | 950/19053 [6:09:56<5:54:17,  1.17s/it][A
  5%|███▋                                                                      | 951/19053 [6:09:59<8:43:45,  1.74s/it][A
  5%|███▋       

  6%|████▏                                                                    | 1078/19053 [6:22:27<4:03:51,  1.23it/s][A
  6%|████▏                                                                    | 1079/19053 [6:22:27<3:16:02,  1.53it/s][A
  6%|████▏                                                                    | 1080/19053 [6:22:27<2:32:41,  1.96it/s][A
  6%|████▏                                                                    | 1081/19053 [6:22:28<2:51:45,  1.74it/s][A
  6%|████                                                                   | 1082/19053 [6:24:00<140:15:17, 28.10s/it][A
  6%|████                                                                    | 1083/19053 [6:24:00<98:26:56, 19.72s/it][A
  6%|████                                                                    | 1084/19053 [6:24:00<69:10:00, 13.86s/it][A
  6%|████                                                                    | 1085/19053 [6:24:18<73:57:19, 14.82s/it][A
  6%|████       

  6%|████▋                                                                    | 1210/19053 [6:42:33<1:55:58,  2.56it/s][A
  6%|████▋                                                                    | 1211/19053 [6:42:34<3:27:40,  1.43it/s][A
  6%|████▋                                                                    | 1212/19053 [6:42:34<2:35:15,  1.92it/s][A
  6%|████▋                                                                    | 1213/19053 [6:42:35<2:27:23,  2.02it/s][A
  6%|████▌                                                                   | 1214/19053 [6:42:41<11:08:39,  2.25s/it][A
  6%|████▋                                                                    | 1215/19053 [6:42:42<8:41:06,  1.75s/it][A
  6%|████▋                                                                    | 1216/19053 [6:42:42<6:19:28,  1.28s/it][A
  6%|████▋                                                                    | 1217/19053 [6:42:43<5:59:15,  1.21s/it][A
  6%|████▌      

  7%|█████                                                                   | 1343/19053 [6:54:52<15:33:57,  3.16s/it][A
  7%|█████                                                                   | 1344/19053 [6:54:52<11:24:16,  2.32s/it][A
  7%|█████▏                                                                   | 1345/19053 [6:54:53<8:31:33,  1.73s/it][A
  7%|█████▏                                                                   | 1346/19053 [6:54:53<6:17:22,  1.28s/it][A
  7%|█████▏                                                                   | 1347/19053 [6:54:53<4:40:36,  1.05it/s][A
  7%|█████                                                                   | 1348/19053 [6:54:59<12:30:08,  2.54s/it][A
  7%|█████▏                                                                   | 1349/19053 [6:55:00<9:04:43,  1.85s/it][A
  7%|█████                                                                   | 1350/19053 [6:55:03<11:37:30,  2.36s/it][A
  7%|█████      

  8%|█████▋                                                                   | 1477/19053 [7:01:49<4:21:12,  1.12it/s][A
  8%|█████▋                                                                   | 1478/19053 [7:01:52<7:21:50,  1.51s/it][A
  8%|█████▌                                                                  | 1479/19053 [7:02:01<17:25:39,  3.57s/it][A
  8%|█████▌                                                                  | 1480/19053 [7:02:05<18:25:52,  3.78s/it][A
  8%|█████▌                                                                  | 1481/19053 [7:02:05<13:20:24,  2.73s/it][A
  8%|█████▋                                                                   | 1483/19053 [7:02:05<7:33:04,  1.55s/it][A
  8%|█████▋                                                                   | 1484/19053 [7:02:06<5:55:41,  1.21s/it][A
  8%|█████▋                                                                   | 1485/19053 [7:02:06<4:43:37,  1.03it/s][A
  8%|█████▋     

  8%|██████                                                                  | 1611/19053 [7:13:23<26:31:48,  5.48s/it][A
  8%|██████                                                                  | 1612/19053 [7:13:48<55:27:27, 11.45s/it][A
  8%|██████                                                                  | 1613/19053 [7:13:48<39:10:27,  8.09s/it][A
  8%|██████                                                                  | 1614/19053 [7:14:37<97:38:13, 20.16s/it][A
  8%|██████                                                                  | 1615/19053 [7:14:37<68:29:51, 14.14s/it][A
  8%|██████                                                                  | 1616/19053 [7:14:37<48:32:12, 10.02s/it][A
  8%|██████                                                                 | 1617/19053 [7:15:47<135:24:53, 27.96s/it][A
  8%|██████                                                                  | 1618/19053 [7:15:49<98:15:47, 20.29s/it][A
  8%|██████     

  9%|██████▌                                                                | 1746/19053 [7:44:34<186:00:24, 38.69s/it][A
  9%|██████▌                                                                | 1747/19053 [7:44:35<131:10:55, 27.29s/it][A
  9%|██████▌                                                                 | 1748/19053 [7:44:35<92:08:53, 19.17s/it][A
  9%|██████▌                                                                 | 1749/19053 [7:44:36<66:24:17, 13.82s/it][A
  9%|██████▌                                                                 | 1750/19053 [7:44:37<46:47:17,  9.73s/it][A
  9%|██████▌                                                                 | 1751/19053 [7:44:50<52:31:09, 10.93s/it][A
  9%|██████▌                                                                 | 1752/19053 [7:44:51<37:41:43,  7.84s/it][A
  9%|██████▌                                                                 | 1753/19053 [7:44:55<33:02:58,  6.88s/it][A
  9%|██████▋    

 10%|███████▏                                                                 | 1878/19053 [8:05:54<9:54:30,  2.08s/it][A
 10%|███████▏                                                                 | 1879/19053 [8:05:55<7:25:15,  1.56s/it][A
 10%|███████▏                                                                 | 1880/19053 [8:05:55<5:52:26,  1.23s/it][A
 10%|███████                                                                 | 1881/19053 [8:06:32<56:32:13, 11.85s/it][A
 10%|███████                                                                 | 1882/19053 [8:06:32<40:05:35,  8.41s/it][A
 10%|███████                                                                 | 1883/19053 [8:06:47<49:37:55, 10.41s/it][A
 10%|███████                                                                 | 1884/19053 [8:06:48<35:05:14,  7.36s/it][A
 10%|███████                                                                 | 1885/19053 [8:06:48<24:55:23,  5.23s/it][A
 10%|██████▉    

 11%|███████▌                                                                | 2012/19053 [8:26:31<16:28:33,  3.48s/it][A
 11%|███████▌                                                                | 2013/19053 [8:26:31<12:02:58,  2.55s/it][A
 11%|███████▌                                                                | 2014/19053 [8:26:41<22:34:54,  4.77s/it][A
 11%|███████▌                                                                | 2015/19053 [8:26:42<16:04:39,  3.40s/it][A
 11%|███████▌                                                                | 2016/19053 [8:26:42<11:27:51,  2.42s/it][A
 11%|███████▋                                                                 | 2017/19053 [8:26:42<9:09:16,  1.93s/it][A
 11%|███████▋                                                                | 2018/19053 [8:26:46<11:42:45,  2.48s/it][A
 11%|███████▋                                                                 | 2019/19053 [8:26:46<8:33:27,  1.81s/it][A
 11%|███████▋   

 11%|████████▏                                                                | 2145/19053 [8:40:11<3:32:54,  1.32it/s][A
 11%|████████▏                                                                | 2146/19053 [8:40:11<2:49:16,  1.66it/s][A
 11%|████████▏                                                                | 2147/19053 [8:40:14<5:51:22,  1.25s/it][A
 11%|████████▏                                                                | 2148/19053 [8:40:14<4:28:05,  1.05it/s][A
 11%|████████▏                                                                | 2149/19053 [8:40:15<4:20:35,  1.08it/s][A
 11%|████████▏                                                                | 2150/19053 [8:40:15<3:32:09,  1.33it/s][A
 11%|████████▏                                                                | 2151/19053 [8:40:15<2:44:48,  1.71it/s][A
 11%|████████▏                                                                | 2152/19053 [8:40:16<2:25:06,  1.94it/s][A
 11%|████████▏  

 12%|████████▌                                                               | 2277/19053 [12:50:51<1:50:27,  2.53it/s][A
 12%|████████▌                                                               | 2278/19053 [12:50:52<1:30:54,  3.08it/s][A
 12%|████████▌                                                               | 2279/19053 [12:50:53<3:13:04,  1.45it/s][A
 12%|████████▌                                                               | 2280/19053 [12:50:54<4:09:39,  1.12it/s][A
 12%|████████▌                                                               | 2281/19053 [12:50:55<3:04:16,  1.52it/s][A
 12%|████████▌                                                               | 2282/19053 [12:50:55<2:32:18,  1.84it/s][A
 12%|████████▋                                                               | 2283/19053 [12:50:55<2:15:47,  2.06it/s][A
 12%|████████▋                                                               | 2284/19053 [12:50:56<2:07:59,  2.18it/s][A
 12%|████████▋  

 13%|█████████                                                               | 2411/19053 [13:01:40<5:35:33,  1.21s/it][A
 13%|█████████                                                               | 2412/19053 [13:01:41<4:32:59,  1.02it/s][A
 13%|█████████                                                               | 2413/19053 [13:01:45<8:28:24,  1.83s/it][A
 13%|█████████▏                                                              | 2415/19053 [13:01:45<5:20:23,  1.16s/it][A
 13%|█████████▏                                                              | 2416/19053 [13:01:46<4:34:52,  1.01it/s][A
 13%|█████████▏                                                              | 2417/19053 [13:01:47<4:51:10,  1.05s/it][A
 13%|█████████▏                                                              | 2418/19053 [13:01:47<3:44:24,  1.24it/s][A
 13%|█████████▏                                                              | 2419/19053 [13:01:47<2:56:56,  1.57it/s][A
 13%|█████████▏ 

 13%|█████████▎                                                            | 2546/19053 [14:02:29<100:05:14, 21.83s/it][A
 13%|█████████▍                                                             | 2547/19053 [14:02:29<70:27:36, 15.37s/it][A
 13%|█████████▍                                                             | 2548/19053 [14:02:37<60:10:45, 13.13s/it][A
 13%|█████████▍                                                             | 2549/19053 [14:02:38<42:29:02,  9.27s/it][A
 13%|█████████▌                                                             | 2550/19053 [14:02:38<29:54:00,  6.52s/it][A
 13%|█████████▌                                                             | 2551/19053 [14:02:38<21:09:26,  4.62s/it][A
 13%|█████████▌                                                             | 2552/19053 [14:02:38<15:07:58,  3.30s/it][A
 13%|█████████▌                                                             | 2553/19053 [14:02:42<15:34:22,  3.40s/it][A
 13%|█████████▌ 

 14%|██████████▏                                                             | 2681/19053 [14:05:49<4:55:16,  1.08s/it][A
 14%|██████████▏                                                             | 2682/19053 [14:05:49<3:51:38,  1.18it/s][A
 14%|██████████▏                                                             | 2683/19053 [14:05:49<2:53:30,  1.57it/s][A
 14%|██████████▏                                                             | 2684/19053 [14:05:50<4:06:05,  1.11it/s][A
 14%|██████████▏                                                             | 2685/19053 [14:05:51<3:32:51,  1.28it/s][A
 14%|██████████▏                                                             | 2686/19053 [14:05:51<2:39:42,  1.71it/s][A
 14%|██████████▏                                                             | 2687/19053 [14:05:52<2:23:32,  1.90it/s][A
 14%|██████████▏                                                             | 2688/19053 [14:05:52<1:57:39,  2.32it/s][A
 14%|██████████▏

 15%|██████████▋                                                             | 2814/19053 [14:39:12<7:51:32,  1.74s/it][A
 15%|██████████▏                                                          | 2815/19053 [14:46:00<558:02:38, 123.72s/it][A
 15%|██████████▎                                                           | 2816/19053 [14:46:15<410:12:41, 90.95s/it][A
 15%|██████████▎                                                           | 2817/19053 [14:46:15<287:25:50, 63.73s/it][A
 15%|██████████▎                                                           | 2818/19053 [14:46:15<201:26:12, 44.67s/it][A
 15%|██████████▎                                                           | 2819/19053 [14:46:16<141:23:06, 31.35s/it][A
 15%|██████████▌                                                            | 2820/19053 [14:46:16<99:17:34, 22.02s/it][A
 15%|██████████▌                                                            | 2821/19053 [14:46:22<77:19:48, 17.15s/it][A
 15%|██████████▌

 15%|███████████▏                                                            | 2948/19053 [14:56:00<4:38:00,  1.04s/it][A
 15%|███████████▏                                                            | 2949/19053 [14:56:00<3:39:42,  1.22it/s][A
 15%|███████████▏                                                            | 2950/19053 [14:56:00<2:48:50,  1.59it/s][A
 15%|███████████▏                                                            | 2951/19053 [14:56:01<3:21:59,  1.33it/s][A
 15%|███████████▏                                                            | 2952/19053 [14:56:02<3:46:43,  1.18it/s][A
 15%|███████████▏                                                            | 2953/19053 [14:56:03<3:07:26,  1.43it/s][A
 16%|███████████▏                                                            | 2954/19053 [14:56:03<2:25:26,  1.84it/s][A
 16%|███████████▏                                                            | 2955/19053 [14:56:04<3:39:28,  1.22it/s][A
 16%|███████████

 16%|███████████▍                                                           | 3082/19053 [15:11:41<27:56:28,  6.30s/it][A
 16%|███████████▍                                                           | 3083/19053 [15:11:43<22:02:13,  4.97s/it][A
 16%|███████████▍                                                           | 3085/19053 [15:11:43<12:06:46,  2.73s/it][A
 16%|███████████▋                                                            | 3086/19053 [15:11:43<9:16:15,  2.09s/it][A
 16%|███████████▌                                                           | 3087/19053 [15:11:48<11:49:46,  2.67s/it][A
 16%|███████████▌                                                           | 3088/19053 [15:11:49<10:23:08,  2.34s/it][A
 16%|███████████▋                                                            | 3089/19053 [15:11:49<7:50:01,  1.77s/it][A
 16%|███████████▋                                                            | 3090/19053 [15:11:51<7:52:59,  1.78s/it][A
 16%|███████████

 17%|████████████▏                                                           | 3218/19053 [15:22:12<2:47:43,  1.57it/s][A
 17%|████████████▏                                                           | 3219/19053 [15:22:12<2:12:01,  2.00it/s][A
 17%|████████████▏                                                           | 3220/19053 [15:22:12<2:01:55,  2.16it/s][A
 17%|████████████▏                                                           | 3221/19053 [15:22:13<1:40:27,  2.63it/s][A
 17%|████████████▏                                                           | 3222/19053 [15:22:13<1:52:06,  2.35it/s][A
 17%|████████████▏                                                           | 3223/19053 [15:22:13<1:29:32,  2.95it/s][A
 17%|████████████▏                                                           | 3224/19053 [15:22:14<1:33:03,  2.83it/s][A
 17%|████████████▏                                                           | 3225/19053 [15:22:20<9:20:09,  2.12s/it][A
 17%|███████████

 18%|████████████▍                                                          | 3353/19053 [15:40:48<78:42:55, 18.05s/it][A
 18%|████████████▍                                                          | 3354/19053 [15:40:51<59:36:21, 13.67s/it][A
 18%|████████████▌                                                          | 3355/19053 [15:40:52<42:49:18,  9.82s/it][A
 18%|████████████▌                                                          | 3356/19053 [15:40:59<38:37:12,  8.86s/it][A
 18%|████████████▌                                                          | 3357/19053 [15:41:00<29:12:42,  6.70s/it][A
 18%|████████████▌                                                          | 3358/19053 [15:41:01<20:44:38,  4.76s/it][A
 18%|████████████▌                                                          | 3359/19053 [15:41:01<14:48:41,  3.40s/it][A
 18%|████████████▌                                                          | 3360/19053 [15:41:16<30:48:45,  7.07s/it][A
 18%|███████████

 18%|████████████▊                                                         | 3492/19053 [16:13:49<118:56:18, 27.52s/it][A
 18%|█████████████                                                          | 3493/19053 [16:13:49<83:28:16, 19.31s/it][A
 18%|█████████████                                                          | 3494/19053 [16:13:50<58:48:29, 13.61s/it][A
 18%|█████████████                                                          | 3495/19053 [16:13:50<41:22:33,  9.57s/it][A
 18%|█████████████                                                          | 3496/19053 [16:13:50<29:57:48,  6.93s/it][A
 18%|█████████████                                                          | 3497/19053 [16:14:01<34:40:06,  8.02s/it][A
 18%|█████████████                                                          | 3498/19053 [16:14:04<28:14:38,  6.54s/it][A
 18%|█████████████                                                          | 3499/19053 [16:14:05<20:59:27,  4.86s/it][A
 18%|███████████

 19%|█████████████▌                                                         | 3630/19053 [16:36:08<18:55:46,  4.42s/it][A
 19%|█████████████▌                                                         | 3631/19053 [16:36:09<14:40:33,  3.43s/it][A
 19%|█████████████▌                                                         | 3632/19053 [16:36:09<10:27:28,  2.44s/it][A
 19%|█████████████▋                                                          | 3633/19053 [16:36:09<7:34:26,  1.77s/it][A
 19%|█████████████▋                                                          | 3634/19053 [16:36:09<5:30:38,  1.29s/it][A
 19%|█████████████▋                                                          | 3635/19053 [16:36:10<4:02:33,  1.06it/s][A
 19%|█████████████▋                                                          | 3636/19053 [16:36:10<3:04:21,  1.39it/s][A
 19%|█████████████▋                                                          | 3637/19053 [16:36:10<2:18:22,  1.86it/s][A
 19%|███████████

 20%|██████████████▏                                                         | 3766/19053 [17:25:50<3:45:15,  1.13it/s][A
 20%|██████████████▏                                                         | 3767/19053 [17:25:51<4:13:24,  1.01it/s][A
 20%|██████████████▏                                                         | 3768/19053 [17:25:54<6:44:52,  1.59s/it][A
 20%|██████████████▏                                                         | 3769/19053 [17:25:57<7:40:27,  1.81s/it][A
 20%|██████████████                                                         | 3770/19053 [17:26:06<17:09:50,  4.04s/it][A
 20%|██████████████                                                         | 3771/19053 [17:26:06<12:26:51,  2.93s/it][A
 20%|██████████████                                                         | 3772/19053 [17:26:09<11:35:30,  2.73s/it][A
 20%|██████████████▎                                                         | 3773/19053 [17:26:09<8:24:49,  1.98s/it][A
 20%|███████████

 20%|██████████████▋                                                         | 3900/19053 [17:49:53<2:33:55,  1.64it/s][A
 20%|██████████████▋                                                         | 3901/19053 [17:49:53<2:01:54,  2.07it/s][A
 20%|██████████████▋                                                         | 3902/19053 [17:49:53<1:44:34,  2.41it/s][A
 20%|██████████████▋                                                         | 3903/19053 [17:49:53<1:22:26,  3.06it/s][A
 20%|██████████████▊                                                         | 3904/19053 [17:49:55<2:34:25,  1.63it/s][A
 20%|██████████████▊                                                         | 3905/19053 [17:49:55<2:07:47,  1.98it/s][A
 21%|██████████████▊                                                         | 3906/19053 [17:49:55<1:44:03,  2.43it/s][A
 21%|██████████████▊                                                         | 3907/19053 [17:49:55<1:24:48,  2.98it/s][A
 21%|███████████

 21%|██████████████▍                                                     | 4035/19053 [18:41:11<2263:51:19, 542.67s/it][A
 21%|██████████████▍                                                     | 4036/19053 [18:41:13<1586:13:21, 380.26s/it][A
 21%|██████████████▍                                                     | 4037/19053 [18:41:13<1110:26:22, 266.22s/it][A
 21%|██████████████▌                                                      | 4038/19053 [18:41:17<781:48:49, 187.45s/it][A
 21%|██████████████▋                                                      | 4039/19053 [18:41:39<575:41:45, 138.04s/it][A
 21%|██████████████▊                                                       | 4040/19053 [18:41:41<405:17:01, 97.18s/it][A
 21%|██████████████▊                                                       | 4041/19053 [18:41:43<285:37:34, 68.50s/it][A
 21%|██████████████▊                                                       | 4042/19053 [18:41:43<200:10:37, 48.01s/it][A
 21%|███████████

 22%|███████████████▊                                                        | 4169/19053 [18:53:59<6:51:30,  1.66s/it][A
 22%|███████████████▊                                                        | 4170/19053 [18:53:59<5:00:45,  1.21s/it][A
 22%|███████████████▊                                                        | 4171/19053 [18:53:59<3:38:49,  1.13it/s][A
 22%|███████████████▊                                                        | 4172/19053 [18:53:59<2:46:42,  1.49it/s][A
 22%|███████████████▊                                                        | 4173/19053 [18:53:59<2:08:14,  1.93it/s][A
 22%|███████████████▌                                                       | 4174/19053 [18:54:08<11:51:52,  2.87s/it][A
 22%|███████████████▊                                                        | 4175/19053 [18:54:08<8:30:58,  2.06s/it][A
 22%|███████████████▊                                                        | 4176/19053 [18:54:09<7:06:49,  1.72s/it][A
 22%|███████████

 23%|████████████████▎                                                       | 4304/19053 [18:58:57<5:48:00,  1.42s/it][A
 23%|████████████████▎                                                       | 4305/19053 [18:58:57<4:19:24,  1.06s/it][A
 23%|███████████████▊                                                      | 4306/19053 [19:00:29<116:06:04, 28.34s/it][A
 23%|████████████████                                                       | 4307/19053 [19:00:29<81:39:41, 19.94s/it][A
 23%|████████████████                                                       | 4308/19053 [19:00:30<57:26:19, 14.02s/it][A
 23%|████████████████                                                       | 4309/19053 [19:00:30<40:24:32,  9.87s/it][A
 23%|████████████████                                                       | 4311/19053 [19:00:33<24:45:47,  6.05s/it][A
 23%|████████████████                                                       | 4312/19053 [19:00:33<18:48:18,  4.59s/it][A
 23%|███████████

---

# Images from CIF

## Train

In [8]:
# Count the entries in the training CIF directory; the count is used as the
# number of iterations when rendering images from the CIF files.
train_dir = 'mof_cif_train'
len_train = len(os.listdir(train_dir))
len_train

68613

In [None]:
# Hand every training CIF to svg_png_from_cif together with the SVG and PNG
# destination paths, showing progress with tqdm.
for i in tqdm(range(len_train)):
    # File names are numbered from 1 while the loop index starts at 0.
    name = f'mof_unit_{i+1}'
    cif_path = f'mof_cif_train/{name}.cif'
    svg_path = f'mof_images/svg/train/{name}.svg'
    png_path = f'mof_images/png/train/{name}.png'
    svg_png_from_cif(cif_path, dest_svg=svg_path, dest_png=png_path)


  0%|                                                                                        | 0/19053 [00:00<?, ?it/s][A
  0%|                                                                     | 1/19053 [3:09:30<60176:56:51, 11370.83s/it][A
  0%|                                                                      | 2/19053 [3:09:31<24778:13:59, 4682.25s/it][A
  0%|                                                                      | 3/19053 [3:09:31<13463:59:17, 2544.38s/it][A
  0%|                                                                       | 4/19053 [3:09:31<8148:26:46, 1539.94s/it][A
  0%|                                                                        | 5/19053 [3:09:31<5210:04:53, 984.69s/it][A
  0%|                                                                        | 6/19053 [3:09:33<3441:16:00, 650.42s/it][A
  0%|                                                                        | 7/19053 [3:09:33<2317:15:59, 438.00s/it][A
  0%|          

  1%|▌                                                                         | 134/19053 [3:15:06<6:27:33,  1.23s/it][A
  1%|▌                                                                         | 135/19053 [3:15:07<6:54:54,  1.32s/it][A
  1%|▌                                                                        | 136/19053 [3:15:12<12:18:46,  2.34s/it][A
  1%|▌                                                                         | 137/19053 [3:15:13<9:25:04,  1.79s/it][A
  1%|▌                                                                         | 138/19053 [3:15:13<6:59:51,  1.33s/it][A
  1%|▌                                                                         | 139/19053 [3:15:14<5:55:29,  1.13s/it][A
  1%|▌                                                                         | 140/19053 [3:15:14<4:58:01,  1.06it/s][A
  1%|▌                                                                         | 141/19053 [3:15:14<4:02:51,  1.30it/s][A
  1%|▌          

  1%|▉                                                                     | 269/19053 [4:46:39<1040:39:58, 199.45s/it][A
  1%|█                                                                      | 270/19053 [4:46:39<729:20:09, 139.79s/it][A
  1%|█                                                                       | 271/19053 [4:46:39<511:08:49, 97.97s/it][A
  1%|█                                                                       | 272/19053 [4:46:40<358:14:49, 68.67s/it][A
  1%|█                                                                       | 273/19053 [4:46:43<255:55:16, 49.06s/it][A
  1%|█                                                                       | 274/19053 [4:46:43<179:18:10, 34.37s/it][A
  1%|█                                                                       | 275/19053 [4:46:43<125:45:42, 24.11s/it][A
  1%|█                                                                        | 276/19053 [4:46:44<88:56:01, 17.05s/it][A
  1%|█          

  2%|█▌                                                                       | 407/19053 [5:16:46<85:05:51, 16.43s/it][A
  2%|█▌                                                                       | 408/19053 [5:16:46<60:16:26, 11.64s/it][A
  2%|█▌                                                                       | 409/19053 [5:16:48<44:56:43,  8.68s/it][A
  2%|█▌                                                                       | 410/19053 [5:16:48<32:02:49,  6.19s/it][A
  2%|█▌                                                                       | 411/19053 [5:16:48<22:40:31,  4.38s/it][A
  2%|█▌                                                                       | 412/19053 [5:16:48<16:06:29,  3.11s/it][A
  2%|█▌                                                                       | 413/19053 [5:16:49<11:54:11,  2.30s/it][A
  2%|█▌                                                                        | 414/19053 [5:16:49<8:33:10,  1.65s/it][A
  2%|█▌         

  3%|██                                                                       | 541/19053 [5:20:52<43:31:47,  8.47s/it][A
  3%|██                                                                       | 542/19053 [5:20:52<30:42:58,  5.97s/it][A
  3%|██                                                                       | 543/19053 [5:20:54<23:30:19,  4.57s/it][A
  3%|██                                                                       | 544/19053 [5:20:54<17:03:38,  3.32s/it][A
  3%|██                                                                       | 545/19053 [5:20:55<13:10:11,  2.56s/it][A
  3%|██                                                                        | 546/19053 [5:20:55<9:49:55,  1.91s/it][A
  3%|██                                                                        | 547/19053 [5:20:56<7:32:42,  1.47s/it][A
  3%|██▏                                                                       | 548/19053 [5:20:56<6:18:30,  1.23s/it][A
  3%|██▏        

  4%|██▌                                                                       | 675/19053 [5:31:22<4:03:28,  1.26it/s][A
  4%|██▋                                                                       | 676/19053 [5:31:22<3:02:44,  1.68it/s][A
  4%|██▋                                                                       | 677/19053 [5:31:23<2:27:56,  2.07it/s][A
  4%|██▋                                                                       | 678/19053 [5:31:23<2:45:34,  1.85it/s][A
  4%|██▋                                                                       | 679/19053 [5:31:24<2:26:01,  2.10it/s][A
  4%|██▋                                                                       | 680/19053 [5:31:28<7:55:05,  1.55s/it][A
  4%|██▌                                                                      | 681/19053 [5:31:32<12:38:02,  2.48s/it][A
  4%|██▌                                                                      | 682/19053 [5:31:35<12:15:50,  2.40s/it][A
  4%|██▋        

  4%|███▏                                                                      | 810/19053 [5:51:09<3:46:11,  1.34it/s][A
  4%|███▏                                                                      | 811/19053 [5:51:09<3:10:04,  1.60it/s][A
  4%|███▏                                                                      | 812/19053 [5:51:11<3:51:24,  1.31it/s][A
  4%|███▏                                                                      | 813/19053 [5:51:11<4:00:33,  1.26it/s][A
  4%|███▏                                                                      | 814/19053 [5:51:12<3:10:55,  1.59it/s][A
  4%|███▏                                                                      | 815/19053 [5:51:14<5:42:08,  1.13s/it][A
  4%|███▏                                                                      | 816/19053 [5:51:14<4:13:16,  1.20it/s][A
  4%|███▏                                                                      | 817/19053 [5:51:14<3:19:19,  1.52it/s][A
  4%|███▏       

  5%|███▋                                                                      | 944/19053 [6:09:48<9:50:30,  1.96s/it][A
  5%|███▋                                                                      | 945/19053 [6:09:50<9:26:25,  1.88s/it][A
  5%|███▋                                                                      | 946/19053 [6:09:50<7:52:58,  1.57s/it][A
  5%|███▋                                                                      | 947/19053 [6:09:51<5:41:23,  1.13s/it][A
  5%|███▋                                                                     | 948/19053 [6:09:55<10:50:38,  2.16s/it][A
  5%|███▋                                                                      | 949/19053 [6:09:55<8:03:41,  1.60s/it][A
  5%|███▋                                                                      | 950/19053 [6:09:56<5:54:17,  1.17s/it][A
  5%|███▋                                                                      | 951/19053 [6:09:59<8:43:45,  1.74s/it][A
  5%|███▋       

  6%|████▏                                                                    | 1078/19053 [6:22:27<4:03:51,  1.23it/s][A
  6%|████▏                                                                    | 1079/19053 [6:22:27<3:16:02,  1.53it/s][A
  6%|████▏                                                                    | 1080/19053 [6:22:27<2:32:41,  1.96it/s][A
  6%|████▏                                                                    | 1081/19053 [6:22:28<2:51:45,  1.74it/s][A
  6%|████                                                                   | 1082/19053 [6:24:00<140:15:17, 28.10s/it][A
  6%|████                                                                    | 1083/19053 [6:24:00<98:26:56, 19.72s/it][A
  6%|████                                                                    | 1084/19053 [6:24:00<69:10:00, 13.86s/it][A
  6%|████                                                                    | 1085/19053 [6:24:18<73:57:19, 14.82s/it][A
  6%|████       

  6%|████▋                                                                    | 1210/19053 [6:42:33<1:55:58,  2.56it/s][A
  6%|████▋                                                                    | 1211/19053 [6:42:34<3:27:40,  1.43it/s][A
  6%|████▋                                                                    | 1212/19053 [6:42:34<2:35:15,  1.92it/s][A
  6%|████▋                                                                    | 1213/19053 [6:42:35<2:27:23,  2.02it/s][A
  6%|████▌                                                                   | 1214/19053 [6:42:41<11:08:39,  2.25s/it][A
  6%|████▋                                                                    | 1215/19053 [6:42:42<8:41:06,  1.75s/it][A
  6%|████▋                                                                    | 1216/19053 [6:42:42<6:19:28,  1.28s/it][A
  6%|████▋                                                                    | 1217/19053 [6:42:43<5:59:15,  1.21s/it][A
  6%|████▌      

  7%|█████                                                                   | 1343/19053 [6:54:52<15:33:57,  3.16s/it][A
  7%|█████                                                                   | 1344/19053 [6:54:52<11:24:16,  2.32s/it][A
  7%|█████▏                                                                   | 1345/19053 [6:54:53<8:31:33,  1.73s/it][A
  7%|█████▏                                                                   | 1346/19053 [6:54:53<6:17:22,  1.28s/it][A
  7%|█████▏                                                                   | 1347/19053 [6:54:53<4:40:36,  1.05it/s][A
  7%|█████                                                                   | 1348/19053 [6:54:59<12:30:08,  2.54s/it][A
  7%|█████▏                                                                   | 1349/19053 [6:55:00<9:04:43,  1.85s/it][A
  7%|█████                                                                   | 1350/19053 [6:55:03<11:37:30,  2.36s/it][A
  7%|█████      

  8%|█████▋                                                                   | 1477/19053 [7:01:49<4:21:12,  1.12it/s][A
  8%|█████▋                                                                   | 1478/19053 [7:01:52<7:21:50,  1.51s/it][A
  8%|█████▌                                                                  | 1479/19053 [7:02:01<17:25:39,  3.57s/it][A
  8%|█████▌                                                                  | 1480/19053 [7:02:05<18:25:52,  3.78s/it][A
  8%|█████▌                                                                  | 1481/19053 [7:02:05<13:20:24,  2.73s/it][A
  8%|█████▋                                                                   | 1483/19053 [7:02:05<7:33:04,  1.55s/it][A
  8%|█████▋                                                                   | 1484/19053 [7:02:06<5:55:41,  1.21s/it][A
  8%|█████▋                                                                   | 1485/19053 [7:02:06<4:43:37,  1.03it/s][A
  8%|█████▋     

  8%|██████                                                                  | 1611/19053 [7:13:23<26:31:48,  5.48s/it][A
  8%|██████                                                                  | 1612/19053 [7:13:48<55:27:27, 11.45s/it][A
  8%|██████                                                                  | 1613/19053 [7:13:48<39:10:27,  8.09s/it][A
  8%|██████                                                                  | 1614/19053 [7:14:37<97:38:13, 20.16s/it][A
  8%|██████                                                                  | 1615/19053 [7:14:37<68:29:51, 14.14s/it][A
  8%|██████                                                                  | 1616/19053 [7:14:37<48:32:12, 10.02s/it][A
  8%|██████                                                                 | 1617/19053 [7:15:47<135:24:53, 27.96s/it][A
  8%|██████                                                                  | 1618/19053 [7:15:49<98:15:47, 20.29s/it][A
  8%|██████     

  9%|██████▌                                                                | 1746/19053 [7:44:34<186:00:24, 38.69s/it][A
  9%|██████▌                                                                | 1747/19053 [7:44:35<131:10:55, 27.29s/it][A
  9%|██████▌                                                                 | 1748/19053 [7:44:35<92:08:53, 19.17s/it][A
  9%|██████▌                                                                 | 1749/19053 [7:44:36<66:24:17, 13.82s/it][A
  9%|██████▌                                                                 | 1750/19053 [7:44:37<46:47:17,  9.73s/it][A
  9%|██████▌                                                                 | 1751/19053 [7:44:50<52:31:09, 10.93s/it][A
  9%|██████▌                                                                 | 1752/19053 [7:44:51<37:41:43,  7.84s/it][A
  9%|██████▌                                                                 | 1753/19053 [7:44:55<33:02:58,  6.88s/it][A
  9%|██████▋    

 10%|███████▏                                                                 | 1878/19053 [8:05:54<9:54:30,  2.08s/it][A
 10%|███████▏                                                                 | 1879/19053 [8:05:55<7:25:15,  1.56s/it][A
 10%|███████▏                                                                 | 1880/19053 [8:05:55<5:52:26,  1.23s/it][A
 10%|███████                                                                 | 1881/19053 [8:06:32<56:32:13, 11.85s/it][A
 10%|███████                                                                 | 1882/19053 [8:06:32<40:05:35,  8.41s/it][A
 10%|███████                                                                 | 1883/19053 [8:06:47<49:37:55, 10.41s/it][A
 10%|███████                                                                 | 1884/19053 [8:06:48<35:05:14,  7.36s/it][A
 10%|███████                                                                 | 1885/19053 [8:06:48<24:55:23,  5.23s/it][A
 10%|██████▉    

 11%|███████▌                                                                | 2012/19053 [8:26:31<16:28:33,  3.48s/it][A
 11%|███████▌                                                                | 2013/19053 [8:26:31<12:02:58,  2.55s/it][A
 11%|███████▌                                                                | 2014/19053 [8:26:41<22:34:54,  4.77s/it][A
 11%|███████▌                                                                | 2015/19053 [8:26:42<16:04:39,  3.40s/it][A
 11%|███████▌                                                                | 2016/19053 [8:26:42<11:27:51,  2.42s/it][A
 11%|███████▋                                                                 | 2017/19053 [8:26:42<9:09:16,  1.93s/it][A
 11%|███████▋                                                                | 2018/19053 [8:26:46<11:42:45,  2.48s/it][A
 11%|███████▋                                                                 | 2019/19053 [8:26:46<8:33:27,  1.81s/it][A
 11%|███████▋   

 11%|████████▏                                                                | 2145/19053 [8:40:11<3:32:54,  1.32it/s][A
 11%|████████▏                                                                | 2146/19053 [8:40:11<2:49:16,  1.66it/s][A
 11%|████████▏                                                                | 2147/19053 [8:40:14<5:51:22,  1.25s/it][A
 11%|████████▏                                                                | 2148/19053 [8:40:14<4:28:05,  1.05it/s][A
 11%|████████▏                                                                | 2149/19053 [8:40:15<4:20:35,  1.08it/s][A
 11%|████████▏                                                                | 2150/19053 [8:40:15<3:32:09,  1.33it/s][A
 11%|████████▏                                                                | 2151/19053 [8:40:15<2:44:48,  1.71it/s][A
 11%|████████▏                                                                | 2152/19053 [8:40:16<2:25:06,  1.94it/s][A
 11%|████████▏  

 12%|████████▌                                                               | 2277/19053 [12:50:51<1:50:27,  2.53it/s][A
 12%|████████▌                                                               | 2278/19053 [12:50:52<1:30:54,  3.08it/s][A
 12%|████████▌                                                               | 2279/19053 [12:50:53<3:13:04,  1.45it/s][A
 12%|████████▌                                                               | 2280/19053 [12:50:54<4:09:39,  1.12it/s][A
 12%|████████▌                                                               | 2281/19053 [12:50:55<3:04:16,  1.52it/s][A
 12%|████████▌                                                               | 2282/19053 [12:50:55<2:32:18,  1.84it/s][A
 12%|████████▋                                                               | 2283/19053 [12:50:55<2:15:47,  2.06it/s][A
 12%|████████▋                                                               | 2284/19053 [12:50:56<2:07:59,  2.18it/s][A
 12%|████████▋  

 13%|█████████                                                               | 2411/19053 [13:01:40<5:35:33,  1.21s/it][A
 13%|█████████                                                               | 2412/19053 [13:01:41<4:32:59,  1.02it/s][A
 13%|█████████                                                               | 2413/19053 [13:01:45<8:28:24,  1.83s/it][A
 13%|█████████▏                                                              | 2415/19053 [13:01:45<5:20:23,  1.16s/it][A
 13%|█████████▏                                                              | 2416/19053 [13:01:46<4:34:52,  1.01it/s][A
 13%|█████████▏                                                              | 2417/19053 [13:01:47<4:51:10,  1.05s/it][A
 13%|█████████▏                                                              | 2418/19053 [13:01:47<3:44:24,  1.24it/s][A
 13%|█████████▏                                                              | 2419/19053 [13:01:47<2:56:56,  1.57it/s][A
 13%|█████████▏ 

 13%|█████████▎                                                            | 2546/19053 [14:02:29<100:05:14, 21.83s/it][A
 13%|█████████▍                                                             | 2547/19053 [14:02:29<70:27:36, 15.37s/it][A
 13%|█████████▍                                                             | 2548/19053 [14:02:37<60:10:45, 13.13s/it][A
 13%|█████████▍                                                             | 2549/19053 [14:02:38<42:29:02,  9.27s/it][A
 13%|█████████▌                                                             | 2550/19053 [14:02:38<29:54:00,  6.52s/it][A
 13%|█████████▌                                                             | 2551/19053 [14:02:38<21:09:26,  4.62s/it][A
 13%|█████████▌                                                             | 2552/19053 [14:02:38<15:07:58,  3.30s/it][A
 13%|█████████▌                                                             | 2553/19053 [14:02:42<15:34:22,  3.40s/it][A
 13%|█████████▌ 

 14%|██████████▏                                                             | 2681/19053 [14:05:49<4:55:16,  1.08s/it][A
 14%|██████████▏                                                             | 2682/19053 [14:05:49<3:51:38,  1.18it/s][A
 14%|██████████▏                                                             | 2683/19053 [14:05:49<2:53:30,  1.57it/s][A
 14%|██████████▏                                                             | 2684/19053 [14:05:50<4:06:05,  1.11it/s][A
 14%|██████████▏                                                             | 2685/19053 [14:05:51<3:32:51,  1.28it/s][A
 14%|██████████▏                                                             | 2686/19053 [14:05:51<2:39:42,  1.71it/s][A
 14%|██████████▏                                                             | 2687/19053 [14:05:52<2:23:32,  1.90it/s][A
 14%|██████████▏                                                             | 2688/19053 [14:05:52<1:57:39,  2.32it/s][A
 14%|██████████▏

 15%|██████████▋                                                             | 2814/19053 [14:39:12<7:51:32,  1.74s/it][A
 15%|██████████▏                                                          | 2815/19053 [14:46:00<558:02:38, 123.72s/it][A
 15%|██████████▎                                                           | 2816/19053 [14:46:15<410:12:41, 90.95s/it][A
 15%|██████████▎                                                           | 2817/19053 [14:46:15<287:25:50, 63.73s/it][A
 15%|██████████▎                                                           | 2818/19053 [14:46:15<201:26:12, 44.67s/it][A
 15%|██████████▎                                                           | 2819/19053 [14:46:16<141:23:06, 31.35s/it][A
 15%|██████████▌                                                            | 2820/19053 [14:46:16<99:17:34, 22.02s/it][A
 15%|██████████▌                                                            | 2821/19053 [14:46:22<77:19:48, 17.15s/it][A
 15%|██████████▌

 15%|███████████▏                                                            | 2948/19053 [14:56:00<4:38:00,  1.04s/it][A
 15%|███████████▏                                                            | 2949/19053 [14:56:00<3:39:42,  1.22it/s][A
 15%|███████████▏                                                            | 2950/19053 [14:56:00<2:48:50,  1.59it/s][A
 15%|███████████▏                                                            | 2951/19053 [14:56:01<3:21:59,  1.33it/s][A
 15%|███████████▏                                                            | 2952/19053 [14:56:02<3:46:43,  1.18it/s][A
 15%|███████████▏                                                            | 2953/19053 [14:56:03<3:07:26,  1.43it/s][A
 16%|███████████▏                                                            | 2954/19053 [14:56:03<2:25:26,  1.84it/s][A
 16%|███████████▏                                                            | 2955/19053 [14:56:04<3:39:28,  1.22it/s][A
 16%|███████████

 16%|███████████▍                                                           | 3082/19053 [15:11:41<27:56:28,  6.30s/it][A
 16%|███████████▍                                                           | 3083/19053 [15:11:43<22:02:13,  4.97s/it][A
 16%|███████████▍                                                           | 3085/19053 [15:11:43<12:06:46,  2.73s/it][A
 16%|███████████▋                                                            | 3086/19053 [15:11:43<9:16:15,  2.09s/it][A
 16%|███████████▌                                                           | 3087/19053 [15:11:48<11:49:46,  2.67s/it][A
 16%|███████████▌                                                           | 3088/19053 [15:11:49<10:23:08,  2.34s/it][A
 16%|███████████▋                                                            | 3089/19053 [15:11:49<7:50:01,  1.77s/it][A
 16%|███████████▋                                                            | 3090/19053 [15:11:51<7:52:59,  1.78s/it][A
 16%|███████████

 17%|████████████▏                                                           | 3218/19053 [15:22:12<2:47:43,  1.57it/s][A
 17%|████████████▏                                                           | 3219/19053 [15:22:12<2:12:01,  2.00it/s][A
 17%|████████████▏                                                           | 3220/19053 [15:22:12<2:01:55,  2.16it/s][A
 17%|████████████▏                                                           | 3221/19053 [15:22:13<1:40:27,  2.63it/s][A
 17%|████████████▏                                                           | 3222/19053 [15:22:13<1:52:06,  2.35it/s][A
 17%|████████████▏                                                           | 3223/19053 [15:22:13<1:29:32,  2.95it/s][A
 17%|████████████▏                                                           | 3224/19053 [15:22:14<1:33:03,  2.83it/s][A
 17%|████████████▏                                                           | 3225/19053 [15:22:20<9:20:09,  2.12s/it][A
 17%|███████████

 18%|████████████▍                                                          | 3353/19053 [15:40:48<78:42:55, 18.05s/it][A
 18%|████████████▍                                                          | 3354/19053 [15:40:51<59:36:21, 13.67s/it][A
 18%|████████████▌                                                          | 3355/19053 [15:40:52<42:49:18,  9.82s/it][A
 18%|████████████▌                                                          | 3356/19053 [15:40:59<38:37:12,  8.86s/it][A
 18%|████████████▌                                                          | 3357/19053 [15:41:00<29:12:42,  6.70s/it][A
 18%|████████████▌                                                          | 3358/19053 [15:41:01<20:44:38,  4.76s/it][A
 18%|████████████▌                                                          | 3359/19053 [15:41:01<14:48:41,  3.40s/it][A
 18%|████████████▌                                                          | 3360/19053 [15:41:16<30:48:45,  7.07s/it][A
 18%|███████████

 18%|████████████▊                                                         | 3492/19053 [16:13:49<118:56:18, 27.52s/it][A
 18%|█████████████                                                          | 3493/19053 [16:13:49<83:28:16, 19.31s/it][A
 18%|█████████████                                                          | 3494/19053 [16:13:50<58:48:29, 13.61s/it][A
 18%|█████████████                                                          | 3495/19053 [16:13:50<41:22:33,  9.57s/it][A
 18%|█████████████                                                          | 3496/19053 [16:13:50<29:57:48,  6.93s/it][A
 18%|█████████████                                                          | 3497/19053 [16:14:01<34:40:06,  8.02s/it][A
 18%|█████████████                                                          | 3498/19053 [16:14:04<28:14:38,  6.54s/it][A
 18%|█████████████                                                          | 3499/19053 [16:14:05<20:59:27,  4.86s/it][A
 18%|███████████

 19%|█████████████▌                                                         | 3630/19053 [16:36:08<18:55:46,  4.42s/it][A
 19%|█████████████▌                                                         | 3631/19053 [16:36:09<14:40:33,  3.43s/it][A
 19%|█████████████▌                                                         | 3632/19053 [16:36:09<10:27:28,  2.44s/it][A
 19%|█████████████▋                                                          | 3633/19053 [16:36:09<7:34:26,  1.77s/it][A
 19%|█████████████▋                                                          | 3634/19053 [16:36:09<5:30:38,  1.29s/it][A
 19%|█████████████▋                                                          | 3635/19053 [16:36:10<4:02:33,  1.06it/s][A
 19%|█████████████▋                                                          | 3636/19053 [16:36:10<3:04:21,  1.39it/s][A
 19%|█████████████▋                                                          | 3637/19053 [16:36:10<2:18:22,  1.86it/s][A
 19%|███████████

 20%|██████████████▏                                                         | 3766/19053 [17:25:50<3:45:15,  1.13it/s][A
 20%|██████████████▏                                                         | 3767/19053 [17:25:51<4:13:24,  1.01it/s][A
 20%|██████████████▏                                                         | 3768/19053 [17:25:54<6:44:52,  1.59s/it][A
 20%|██████████████▏                                                         | 3769/19053 [17:25:57<7:40:27,  1.81s/it][A
 20%|██████████████                                                         | 3770/19053 [17:26:06<17:09:50,  4.04s/it][A
 20%|██████████████                                                         | 3771/19053 [17:26:06<12:26:51,  2.93s/it][A
 20%|██████████████                                                         | 3772/19053 [17:26:09<11:35:30,  2.73s/it][A
 20%|██████████████▎                                                         | 3773/19053 [17:26:09<8:24:49,  1.98s/it][A
 20%|███████████

 20%|██████████████▋                                                         | 3900/19053 [17:49:53<2:33:55,  1.64it/s][A
 20%|██████████████▋                                                         | 3901/19053 [17:49:53<2:01:54,  2.07it/s][A
 20%|██████████████▋                                                         | 3902/19053 [17:49:53<1:44:34,  2.41it/s][A
 20%|██████████████▋                                                         | 3903/19053 [17:49:53<1:22:26,  3.06it/s][A
 20%|██████████████▊                                                         | 3904/19053 [17:49:55<2:34:25,  1.63it/s][A
 20%|██████████████▊                                                         | 3905/19053 [17:49:55<2:07:47,  1.98it/s][A
 21%|██████████████▊                                                         | 3906/19053 [17:49:55<1:44:03,  2.43it/s][A
 21%|██████████████▊                                                         | 3907/19053 [17:49:55<1:24:48,  2.98it/s][A
 21%|███████████

 21%|██████████████▍                                                     | 4035/19053 [18:41:11<2263:51:19, 542.67s/it][A
 21%|██████████████▍                                                     | 4036/19053 [18:41:13<1586:13:21, 380.26s/it][A
 21%|██████████████▍                                                     | 4037/19053 [18:41:13<1110:26:22, 266.22s/it][A
 21%|██████████████▌                                                      | 4038/19053 [18:41:17<781:48:49, 187.45s/it][A
 21%|██████████████▋                                                      | 4039/19053 [18:41:39<575:41:45, 138.04s/it][A
 21%|██████████████▊                                                       | 4040/19053 [18:41:41<405:17:01, 97.18s/it][A
 21%|██████████████▊                                                       | 4041/19053 [18:41:43<285:37:34, 68.50s/it][A
 21%|██████████████▊                                                       | 4042/19053 [18:41:43<200:10:37, 48.01s/it][A
 21%|███████████

 22%|███████████████▊                                                        | 4169/19053 [18:53:59<6:51:30,  1.66s/it][A
 22%|███████████████▊                                                        | 4170/19053 [18:53:59<5:00:45,  1.21s/it][A
 22%|███████████████▊                                                        | 4171/19053 [18:53:59<3:38:49,  1.13it/s][A
 22%|███████████████▊                                                        | 4172/19053 [18:53:59<2:46:42,  1.49it/s][A
 22%|███████████████▊                                                        | 4173/19053 [18:53:59<2:08:14,  1.93it/s][A
 22%|███████████████▌                                                       | 4174/19053 [18:54:08<11:51:52,  2.87s/it][A
 22%|███████████████▊                                                        | 4175/19053 [18:54:08<8:30:58,  2.06s/it][A
 22%|███████████████▊                                                        | 4176/19053 [18:54:09<7:06:49,  1.72s/it][A
 22%|███████████

 23%|████████████████▎                                                       | 4304/19053 [18:58:57<5:48:00,  1.42s/it][A
 23%|████████████████▎                                                       | 4305/19053 [18:58:57<4:19:24,  1.06s/it][A
 23%|███████████████▊                                                      | 4306/19053 [19:00:29<116:06:04, 28.34s/it][A
 23%|████████████████                                                       | 4307/19053 [19:00:29<81:39:41, 19.94s/it][A
 23%|████████████████                                                       | 4308/19053 [19:00:30<57:26:19, 14.02s/it][A
 23%|████████████████                                                       | 4309/19053 [19:00:30<40:24:32,  9.87s/it][A
 23%|████████████████                                                       | 4311/19053 [19:00:33<24:45:47,  6.05s/it][A
 23%|████████████████                                                       | 4312/19053 [19:00:33<18:48:18,  4.59s/it][A
 23%|███████████

## Pretest

In [9]:
# Number of entries in the pretest CIF directory; it drives the conversion loop.
# os.listdir counts every entry, so the directory is assumed to contain only
# the numbered CIF files -- TODO confirm.
len_pretest = len(os.listdir('mof_cif_pretest'))
len_pretest

68613

In [None]:
# Render each pretest structure to an SVG and a PNG. Files are numbered from 1,
# while the loop counter starts at 0, hence the i+1 in the name.
for i in tqdm(range(len_pretest)):
    name = f'mof_unit_pretest_{i+1}'
    svg_png_from_cif(
        f'mof_cif_pretest/{name}.cif',
        dest_svg=f'mof_images/svg/pretest/{name}.svg',
        dest_png=f'mof_images/png/pretest/{name}.png',
    )

  0%|                                                                            | 63/68613 [03:05<18:00:12,  1.06it/s]

---

# Charges

In [97]:
import pandas as pd
import numpy as np
from collections import Counter

In [110]:
# Load the per-element property table (mass, charge, radius) and display it.
df = pd.read_csv('atom_mass_charge.csv')
df

Unnamed: 0,Element,Energy level,Mass(u),Charge,Radius(pm)
0,H,1,1.0080,1,53
1,He,1,4.0026,0,31
2,Li,2,6.9400,1,167
3,Be,2,9.0122,2,112
4,B,2,10.8100,3,87
...,...,...,...,...,...
113,Fi,7,289.0000,0,-
114,Mc,7,290.0000,0,-
115,Lv,7,293.0000,0,-
116,Ts,7,294.0000,0,-


In [111]:
# Key the table by element symbol so each column can be turned into a
# symbol -> value lookup.
df = df.set_index('Element')
# Radii that are not tabulated are written as '-' in the CSV; map them to 0 so
# the column can be cast to float.
radius_df = df['Radius(pm)'].replace('-', 0).astype(float)
charge_df = df['Charge']

In [112]:
# Element symbol -> charge lookup, echoed for inspection.
charge_dict = charge_df.to_dict()
charge_dict

{'H': 1,
 'He': 0,
 'Li': 1,
 'Be': 2,
 'B': 3,
 'C': 4,
 'N': -3,
 'O': -2,
 'F': -1,
 'Ne': 0,
 'Na': 1,
 'Mg': 2,
 'Al': 3,
 'Si': 4,
 'P': -3,
 'S': -2,
 'Cl': -1,
 'Ar': 0,
 'K': 1,
 'Ca': 2,
 'Sc': 0,
 'Ti': 3,
 'V': 0,
 'Cr': 0,
 'Mn': 0,
 'Fe': 0,
 'Co': 0,
 'Ni': 0,
 'Cu': 0,
 'Zn': 0,
 'Ga': 3,
 'Ge': 4,
 'As': -3,
 'Se': -2,
 'Br': -1,
 'Kr': 0,
 'Rb': 1,
 'Sr': 2,
 'Y': 0,
 'Zr': 0,
 'Nb': 0,
 'Mo': 0,
 'Tc': 0,
 'Ru': 0,
 'Rh': 0,
 'Pd': 0,
 'Ag': 0,
 'Cd': 0,
 'In': 3,
 'Sn': 4,
 'Sb': -3,
 'Te': -2,
 'I': -1,
 'Xe': 0,
 'Cs': 1,
 'Ba': 2,
 'La': 0,
 'Ce': 0,
 'Pr': 0,
 'Nd': 0,
 'Pm': 0,
 'Sm': 0,
 'Eu': 0,
 'Gd': 0,
 'Tb': 0,
 'Dy': 0,
 'Ho': 0,
 'Er': 0,
 'Tm': 0,
 'Yb': 0,
 'Lu': 0,
 'Hf': 0,
 'Ta': 0,
 'W': 0,
 'Re': 0,
 'Os': 0,
 'Ir': 0,
 'Pt': 0,
 'Au': 0,
 'Hg': 0,
 'Pb': 4,
 'Bi': -3,
 'Po': -2,
 'At': -1,
 'Rn': 0,
 'Fr': 1,
 'Ra': 2,
 'Ac': 0,
 'Th': 0,
 'Pa': 0,
 'U': 0,
 'Np': 0,
 'Pu': 0,
 'Am': 0,
 'Cm': 0,
 'Bk': 0,
 'Cf': 0,
 'Es': 0,
 'Fm': 0,
 'Md': 0,

In [113]:
# Element symbol -> radius (pm) lookup, echoed for inspection.
# Elements whose radius was '-' in the CSV appear here as 0.0.
radius_dict = radius_df.to_dict()
radius_dict

{'H': 53.0,
 'He': 31.0,
 'Li': 167.0,
 'Be': 112.0,
 'B': 87.0,
 'C': 67.0,
 'N': 56.0,
 'O': 48.0,
 'F': 42.0,
 'Ne': 38.0,
 'Na': 190.0,
 'Mg': 145.0,
 'Al': 118.0,
 'Si': 111.0,
 'P': 98.0,
 'S': 88.0,
 'Cl': 79.0,
 'Ar': 71.0,
 'K': 243.0,
 'Ca': 194.0,
 'Sc': 184.0,
 'Ti': 156.0,
 'V': 171.0,
 'Cr': 166.0,
 'Mn': 161.0,
 'Fe': 156.0,
 'Co': 152.0,
 'Ni': 149.0,
 'Cu': 145.0,
 'Zn': 142.0,
 'Ga': 136.0,
 'Ge': 125.0,
 'As': 114.0,
 'Se': 103.0,
 'Br': 94.0,
 'Kr': 88.0,
 'Rb': 265.0,
 'Sr': 219.0,
 'Y': 212.0,
 'Zr': 206.0,
 'Nb': 198.0,
 'Mo': 190.0,
 'Tc': 183.0,
 'Ru': 178.0,
 'Rh': 173.0,
 'Pd': 169.0,
 'Ag': 165.0,
 'Cd': 161.0,
 'In': 156.0,
 'Sn': 145.0,
 'Sb': 133.0,
 'Te': 123.0,
 'I': 115.0,
 'Xe': 108.0,
 'Cs': 298.0,
 'Ba': 253.0,
 'La': 0.0,
 'Ce': 0.0,
 'Pr': 247.0,
 'Nd': 206.0,
 'Pm': 205.0,
 'Sm': 238.0,
 'Eu': 231.0,
 'Gd': 233.0,
 'Tb': 225.0,
 'Dy': 228.0,
 'Ho': 226.0,
 'Er': 226.0,
 'Tm': 222.0,
 'Yb': 222.0,
 'Lu': 217.0,
 'Hf': 208.0,
 'Ta': 200.0,
 'W': 19

In [121]:
# Parse a sample formula into per-element atom counts
# (count_atom is defined outside this section).
test_count = count_atom('C67H46N16Ni4O')
test_count

{'C': 67, 'H': 46, 'N': 16, 'Ni': 4, 'O': 1}

In [118]:
# Spot check of the charge helper on the sample counts
# (calc_charges is defined outside this section).
calc_charges(test_count, charge_dict)

314

In [122]:
# Spot check of the mean-radius helper on the sample formula. The atom total
# is taken from test_count itself rather than a hand-typed sum, so it cannot
# fall out of step with the formula being tested.
calc_avg_radius(sum(test_count.values()), test_count, radius_dict)

63.1865671641791

---

# Train

In [72]:
# Load the training table and display it.
train_df = pd.read_csv('train.csv')
train_df

Unnamed: 0,MOFname,volume [A^3],weight [u],surface_area [m^2/g],void_fraction,void_volume [cm^3/g],functional_groups,metal_linker,organic_linker1,organic_linker2,topology,CO2/N2_selectivity,heat_adsorption_CO2_P0.15bar_T298K [kcal/mol],CO2_working_capacity [mL/g]
0,mof_unit_1,1116.667429,875.240600,0.00,0.07899,0.0607,COOH-OEt,3,4,11,pcu,22.864166,6.786041,105.284502
1,mof_unit_2,2769.503842,2211.697211,603.61,0.13794,0.1040,F-OMe,10,44,57,etb,33.616780,7.147286,101.224774
2,mof_unit_3,1089.818728,773.687960,788.50,0.14874,0.1262,OMe-COOH,2,22,24,pcu,19.263726,6.347967,118.987011
3,mof_unit_4,2205.198301,1304.638720,1441.53,0.21814,0.2220,H-SO3H,9,17,24,sra,25.701377,6.190085,187.626004
4,mof_unit_5,1137.800963,901.736120,0.00,0.07778,0.0591,NHMe-OH,2,1,22,pcu,30.001838,6.478063,79.210001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68608,mof_unit_68609,1188.302573,1001.700216,0.00,0.00000,0.0000,Pr-F,3,4,24,pcu,24.131770,,-12.943652
68609,mof_unit_68610,1506.660363,1493.296496,0.00,0.01108,0.0000,SO3H,10,42,46,etb,6.071818,,-12.985582
68610,mof_unit_68611,2035.532738,1959.518320,0.00,0.00000,0.0000,OPr,4,14,22,acs,9.876134,,-13.187635
68611,mof_unit_68612,3985.426053,3638.677280,0.00,0.00000,0.0000,OPr-Me,4,4,15,acs,5.285051,inf,15.672698


## Extract data

In [123]:
# Directory holding the training CIF files (mof_unit_<n>.cif).
base_dir = 'mof_cif_train'

In [125]:
# Collect one feature row per training MOF from its CIF file.
# Paths are built from the MOFname column rather than from the row position,
# so each feature row stays matched to its table row even if train.csv is
# reordered or filtered.
train_cif_data = []
for mof_name in tqdm(train_df['MOFname']):
    cif_path = os.path.join(base_dir, f'{mof_name}.cif')
    # get_cell_parameters returns ten values in this order: a, b, c,
    # alpha, beta, gamma (radians), n_atoms, mean atomic mass, charges,
    # mean radius. They are stored as one list per MOF.
    train_cif_data.append(list(get_cell_parameters(cif_path)))

  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  setting_name, spacegroup))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  setting_name, spacegroup))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))


  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  setting_name, spacegroup))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))


  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
100%|████████████████████████████████████████████████████████████████████████████| 68613/68613 [47:43<00:00, 23.96it/s]


In [126]:
# Labels for the values returned by get_cell_parameters, in return order.
# The three angle columns hold radians (converted from the degrees in the CIF).
feature_columns = [
    '_cell_length_a', '_cell_length_b', '_cell_length_c',
    '_cell_angle_alpha', '_cell_angle_beta', '_cell_angle_gamma',
    'n_atoms', 'mol_avg_mass', 'charges', 'mol_avg_radius',
]
train_extra_feats = pd.DataFrame(train_cif_data, columns=feature_columns)

In [127]:
train_extra_feats

Unnamed: 0,_cell_length_a,_cell_length_b,_cell_length_c,_cell_angle_alpha,_cell_angle_beta,_cell_angle_gamma,n_atoms,mol_avg_mass,charges,mol_avg_radius
0,10.609882,10.643578,9.890832,1.569125,1.592480,1.575368,75,11.669907,92,59.653333
1,8.463295,17.684225,18.960098,1.746437,1.602488,1.691961,194,11.400559,250,60.515464
2,10.732110,9.552271,10.631996,1.556872,1.569806,1.577559,82,9.435293,120,59.585366
3,6.935530,17.504896,19.274980,1.911789,1.574891,1.580099,112,11.648598,204,63.500000
4,10.825925,9.699886,10.853274,1.565467,1.622999,1.595312,94,9.593000,90,58.425532
...,...,...,...,...,...,...,...,...,...,...
68608,10.718161,10.886490,10.193870,1.585497,1.609910,1.583947,119,8.417773,216,59.277311
68609,8.192620,12.576230,15.033794,1.661287,1.730445,1.700483,126,11.851548,126,62.460317
68610,11.237482,11.321902,18.608120,1.574297,1.572863,1.034849,204,9.605572,366,61.578431
68611,19.396341,11.081428,18.544746,1.578949,1.585477,1.569257,364,9.996454,652,62.054945


In [128]:
# Append the CIF-derived columns to the training table. concat aligns on the
# index; both frames carry the default 0..n-1 index, so rows pair by position.
train_extra = pd.concat((train_df, train_extra_feats), axis='columns')
train_extra

Unnamed: 0,MOFname,volume [A^3],weight [u],surface_area [m^2/g],void_fraction,void_volume [cm^3/g],functional_groups,metal_linker,organic_linker1,organic_linker2,...,_cell_length_a,_cell_length_b,_cell_length_c,_cell_angle_alpha,_cell_angle_beta,_cell_angle_gamma,n_atoms,mol_avg_mass,charges,mol_avg_radius
0,mof_unit_1,1116.667429,875.240600,0.00,0.07899,0.0607,COOH-OEt,3,4,11,...,10.609882,10.643578,9.890832,1.569125,1.592480,1.575368,75,11.669907,92,59.653333
1,mof_unit_2,2769.503842,2211.697211,603.61,0.13794,0.1040,F-OMe,10,44,57,...,8.463295,17.684225,18.960098,1.746437,1.602488,1.691961,194,11.400559,250,60.515464
2,mof_unit_3,1089.818728,773.687960,788.50,0.14874,0.1262,OMe-COOH,2,22,24,...,10.732110,9.552271,10.631996,1.556872,1.569806,1.577559,82,9.435293,120,59.585366
3,mof_unit_4,2205.198301,1304.638720,1441.53,0.21814,0.2220,H-SO3H,9,17,24,...,6.935530,17.504896,19.274980,1.911789,1.574891,1.580099,112,11.648598,204,63.500000
4,mof_unit_5,1137.800963,901.736120,0.00,0.07778,0.0591,NHMe-OH,2,1,22,...,10.825925,9.699886,10.853274,1.565467,1.622999,1.595312,94,9.593000,90,58.425532
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68608,mof_unit_68609,1188.302573,1001.700216,0.00,0.00000,0.0000,Pr-F,3,4,24,...,10.718161,10.886490,10.193870,1.585497,1.609910,1.583947,119,8.417773,216,59.277311
68609,mof_unit_68610,1506.660363,1493.296496,0.00,0.01108,0.0000,SO3H,10,42,46,...,8.192620,12.576230,15.033794,1.661287,1.730445,1.700483,126,11.851548,126,62.460317
68610,mof_unit_68611,2035.532738,1959.518320,0.00,0.00000,0.0000,OPr,4,14,22,...,11.237482,11.321902,18.608120,1.574297,1.572863,1.034849,204,9.605572,366,61.578431
68611,mof_unit_68612,3985.426053,3638.677280,0.00,0.00000,0.0000,OPr-Me,4,4,15,...,19.396341,11.081428,18.544746,1.578949,1.585477,1.569257,364,9.996454,652,62.054945


In [129]:
# Persist the enriched training table; index=False keeps the positional index
# out of the file.
train_extra.to_csv('train_extra_2.csv', index=False)

# Test

In [130]:
# Load the test table and display it.
test_df = pd.read_csv('test.csv')
test_df

Unnamed: 0,MOFname,volume [A^3],weight [u],surface_area [m^2/g],void_fraction,void_volume [cm^3/g],functional_groups,metal_linker,organic_linker1,organic_linker2,topology,CO2/N2_selectivity,heat_adsorption_CO2_P0.15bar_T298K [kcal/mol]
0,mof_unit_68614,1208.301332,797.70936,586.54,0.11392,0.1039,OEt,2,5,26,pcu,36.639791,7.005640
1,mof_unit_68615,4126.414623,3733.65779,852.49,0.21367,0.1422,H-I,4,6,17,acs,18.390691,5.119399
2,mof_unit_68616,1602.148373,747.21048,3155.73,0.33883,0.4375,CN-OH,3,11,17,pcu,13.062850,5.045400
3,mof_unit_68617,2436.629312,995.80232,3521.09,0.40464,0.5963,OMe,2,1,28,pcu,9.601198,5.106238
4,mof_unit_68618,3123.418006,1337.53800,2678.46,0.38959,0.5479,NO2-Pr,3,8,19,pcu,12.974954,5.287639
...,...,...,...,...,...,...,...,...,...,...,...,...,...
16995,mof_unit_85609,32660.944605,4723.68288,5720.14,0.77614,3.2318,OH-NO2,3,3,14,nbo,4.536626,3.146698
16996,mof_unit_85610,5070.998617,1499.13262,4017.28,0.59192,1.2058,Me-OMe,3,1,11,nbo,6.745508,3.658871
16997,mof_unit_85611,4669.804446,1322.04892,4288.76,0.54950,1.1689,Me-CN,2,7,23,pcu,4.666206,3.593052
16998,mof_unit_85612,4682.120862,1213.51148,4331.86,0.60643,1.4091,OH-HCO,3,7,25,pcu,4.823305,3.454497


## Extract data

In [134]:
# Directory holding the test CIF files. This rebinds the same base_dir name
# used for the training directory, so cell execution order matters.
base_dir = 'mof_cif_test'

In [135]:
# Sanity check: test MOF names continue the training numbering (68613 training
# rows), so this is the number carried by the last test MOF name.
68613+len(test_df)

85613

In [136]:
# Collect one feature row per test MOF from its CIF file.
# File names come from the MOFname column, which removes the hard-coded
# 68613 offset and keeps feature rows matched to their table rows.
test_cif_data = []
for mof_name in tqdm(test_df['MOFname']):
    cif_path = os.path.join(base_dir, f'{mof_name}.cif')
    # get_cell_parameters returns ten values in this order: a, b, c,
    # alpha, beta, gamma (radians), n_atoms, mean atomic mass, charges,
    # mean radius. They are stored as one list per MOF.
    test_cif_data.append(list(get_cell_parameters(cif_path)))

  (kinds[ind], kind))
  (kinds[ind], kind))
  setting_name, spacegroup))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
100%|████████████████████████████████████████████████████████████████████████████| 17000/17000 [11:29<00:00, 24.64it/s]


In [137]:
# Labels for the values returned by get_cell_parameters, in return order.
# The three angle columns hold radians (converted from the degrees in the CIF).
feature_columns = [
    '_cell_length_a', '_cell_length_b', '_cell_length_c',
    '_cell_angle_alpha', '_cell_angle_beta', '_cell_angle_gamma',
    'n_atoms', 'mol_avg_mass', 'charges', 'mol_avg_radius',
]
test_extra_feats = pd.DataFrame(test_cif_data, columns=feature_columns)

In [138]:
test_extra_feats

Unnamed: 0,_cell_length_a,_cell_length_b,_cell_length_c,_cell_angle_alpha,_cell_angle_beta,_cell_angle_gamma,n_atoms,mol_avg_mass,charges,mol_avg_radius
0,7.806778,9.084327,17.070225,1.556313,1.511002,1.575256,84,9.496619,128,59.761905
1,12.951336,13.273001,27.655520,1.656643,1.574169,2.081566,158,23.630859,314,70.329114
2,14.948982,10.697065,10.019137,1.569684,1.574747,1.571707,60,12.453633,94,62.433333
3,10.835044,9.587345,23.461267,1.565566,1.590575,1.570093,94,10.593745,160,60.787234
4,11.498125,16.343084,16.653784,1.517304,1.599094,1.587196,118,11.335136,94,58.711864
...,...,...,...,...,...,...,...,...,...,...
16995,44.634366,38.924145,21.601534,1.565134,1.055786,1.568475,416,11.355106,768,61.711538
16996,18.653876,18.775583,18.721517,1.905919,1.908772,1.910598,124,12.089839,196,62.201613
16997,17.736994,16.514905,15.943922,1.570403,1.586349,1.570638,112,11.804214,198,62.821429
16998,15.773696,17.587298,16.887609,1.571806,1.571627,1.605246,104,11.668462,184,60.884615


In [139]:
# Append the CIF-derived columns to the test table. concat aligns on the
# index; both frames carry the default 0..n-1 index, so rows pair by position.
test_extra = pd.concat((test_df, test_extra_feats), axis='columns')
test_extra

Unnamed: 0,MOFname,volume [A^3],weight [u],surface_area [m^2/g],void_fraction,void_volume [cm^3/g],functional_groups,metal_linker,organic_linker1,organic_linker2,...,_cell_length_a,_cell_length_b,_cell_length_c,_cell_angle_alpha,_cell_angle_beta,_cell_angle_gamma,n_atoms,mol_avg_mass,charges,mol_avg_radius
0,mof_unit_68614,1208.301332,797.70936,586.54,0.11392,0.1039,OEt,2,5,26,...,7.806778,9.084327,17.070225,1.556313,1.511002,1.575256,84,9.496619,128,59.761905
1,mof_unit_68615,4126.414623,3733.65779,852.49,0.21367,0.1422,H-I,4,6,17,...,12.951336,13.273001,27.655520,1.656643,1.574169,2.081566,158,23.630859,314,70.329114
2,mof_unit_68616,1602.148373,747.21048,3155.73,0.33883,0.4375,CN-OH,3,11,17,...,14.948982,10.697065,10.019137,1.569684,1.574747,1.571707,60,12.453633,94,62.433333
3,mof_unit_68617,2436.629312,995.80232,3521.09,0.40464,0.5963,OMe,2,1,28,...,10.835044,9.587345,23.461267,1.565566,1.590575,1.570093,94,10.593745,160,60.787234
4,mof_unit_68618,3123.418006,1337.53800,2678.46,0.38959,0.5479,NO2-Pr,3,8,19,...,11.498125,16.343084,16.653784,1.517304,1.599094,1.587196,118,11.335136,94,58.711864
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16995,mof_unit_85609,32660.944605,4723.68288,5720.14,0.77614,3.2318,OH-NO2,3,3,14,...,44.634366,38.924145,21.601534,1.565134,1.055786,1.568475,416,11.355106,768,61.711538
16996,mof_unit_85610,5070.998617,1499.13262,4017.28,0.59192,1.2058,Me-OMe,3,1,11,...,18.653876,18.775583,18.721517,1.905919,1.908772,1.910598,124,12.089839,196,62.201613
16997,mof_unit_85611,4669.804446,1322.04892,4288.76,0.54950,1.1689,Me-CN,2,7,23,...,17.736994,16.514905,15.943922,1.570403,1.586349,1.570638,112,11.804214,198,62.821429
16998,mof_unit_85612,4682.120862,1213.51148,4331.86,0.60643,1.4091,OH-HCO,3,7,25,...,15.773696,17.587298,16.887609,1.571806,1.571627,1.605246,104,11.668462,184,60.884615


In [140]:
# Persist the enriched test table; index=False keeps the positional index out
# of the file.
test_extra.to_csv('test_extra_2.csv', index=False)

---

# Atoms volume

In [2]:
# Load the per-element table that also carries radius in angstrom and atomic
# volume, then display it. This rebinds df.
df = pd.read_csv('atom_mass_vol.csv')
df

Unnamed: 0,Element,Energy level,Mass(u),Change,Radius(pm),Radius(A),Volume(A^3)
0,H,1,1.0080,1,53,0.53,0.623865524
1,He,1,4.0026,0,31,0.31,0.124838476
2,Li,2,6.9400,1,167,1.67,19.51698781
3,Be,2,9.0122,2,112,1.12,5.887317333
4,B,2,10.8100,3,87,0.87,2.759441143
...,...,...,...,...,...,...,...
113,Fi,7,289.0000,0,-,-,-
114,Mc,7,290.0000,0,-,-,-
115,Lv,7,293.0000,0,-,-,-
116,Ts,7,294.0000,0,-,-,-


In [3]:
# Key the table by element symbol so the volume column maps symbol -> value.
df = df.set_index('Element')
# Elements without a tabulated volume are marked '-' in the CSV; map them to 0
# so the column can be cast to float.
vol_df = df['Volume(A^3)'].replace('-', 0).astype(float)

In [4]:
# Element symbol -> atomic volume (A^3) lookup, echoed for inspection.
vol_dict = vol_df.to_dict()
vol_dict

{'H': 0.623865524,
 'He': 0.124838476,
 'Li': 19.51698781,
 'Be': 5.887317333,
 'B': 2.759441143,
 'C': 1.26034019,
 'N': 0.735914667,
 'O': 0.463433143,
 'F': 0.310464,
 'Ne': 0.22993981,
 'Na': 28.74247619,
 'Mg': 12.77519048,
 'Al': 6.885086476,
 'Si': 5.731025143,
 'P': 3.944042667,
 'S': 2.85569219,
 'Cl': 2.06606819,
 'Ar': 1.499817524,
 'K': 60.12875314,
 'Ca': 30.59627581,
 'Sc': 26.10458819,
 'Ti': 15.90879086,
 'V': 20.95326514,
 'Cr': 19.16847848,
 'Mn': 17.48803467,
 'Fe': 15.90879086,
 'Co': 14.71614781,
 'Ni': 13.86188152,
 'Cu': 12.77519048,
 'Zn': 11.99854019,
 'Ga': 10.54095848,
 'Ge': 8.18452381,
 'As': 6.208374857,
 'Se': 4.579046476,
 'Br': 3.480542476,
 'Kr': 2.85569219,
 'Rb': 77.98319048,
 'Sr': 44.01449486,
 'Y': 39.92739352,
 'Zr': 36.63237181,
 'Nb': 32.52811886,
 'Mo': 28.74247619,
 'Tc': 25.68127886,
 'Ru': 23.63324648,
 'Rh': 21.69709981,
 'Pd': 20.22662819,
 'Ag': 18.82414286,
 'Cd': 17.48803467,
 'In': 15.90879086,
 'Sn': 12.77519048,
 'Sb': 9.858669333,


In [11]:
# Spot check on the first training structure (calc_atoms_vol is defined outside
# this section). The printed result equals the per-element volumes weighted by
# the atom counts shown in the Counter output.
calc_atoms_vol('mof_cif_train/mof_unit_1.cif', vol_dict)

Counter({'C': 30, 'O': 21, 'H': 20, 'Zn': 2, 'N': 2})


85.488521897

---

# Area (A^2)

In [3]:
# Load the per-element table that also carries atomic surface area, then
# display it. This rebinds df.
df = pd.read_csv('atom_mass_vol_area.csv')
df

Unnamed: 0,Element,Energy level,Mass(u),Change,Radius(pm),Radius(A),Volume(A^3),Area(A^2)
0,H,1,1.0080,1,53,0.53,0.623865524,3.531314286
1,He,1,4.0026,0,31,0.31,0.124838476,1.208114286
2,Li,2,6.9400,1,167,1.67,19.51698781,35.06045714
3,Be,2,9.0122,2,112,1.12,5.887317333,15.7696
4,B,2,10.8100,3,87,0.87,2.759441143,9.515314286
...,...,...,...,...,...,...,...,...
113,Fi,7,289.0000,0,-,-,-,#VALUE!
114,Mc,7,290.0000,0,-,-,-,#VALUE!
115,Lv,7,293.0000,0,-,-,-,#VALUE!
116,Ts,7,294.0000,0,-,-,-,#VALUE!


In [4]:
# Key the table by element symbol so the area column maps symbol -> value.
df = df.set_index('Element')
# Rows with no usable area hold the text '#VALUE!' in the CSV (presumably a
# spreadsheet error marker); map them to 0 so the column can be cast to float.
area_df = df['Area(A^2)'].replace('#VALUE!', 0).astype(float)

In [5]:
# Element symbol -> atomic surface area (A^2) lookup, echoed for inspection.
area_dict = area_df.to_dict()
area_dict

{'H': 3.531314286,
 'He': 1.208114286,
 'Li': 35.06045714,
 'Be': 15.7696,
 'B': 9.515314286,
 'C': 5.643314286,
 'N': 3.9424,
 'O': 2.896457143,
 'F': 2.2176,
 'Ne': 1.815314286,
 'Na': 45.38285714,
 'Mg': 26.43142857,
 'Al': 17.50445714,
 'Si': 15.48925714,
 'P': 12.0736,
 'S': 9.735314286,
 'Cl': 7.845828571,
 'Ar': 6.337257143,
 'K': 74.23302857,
 'Ca': 47.31382857,
 'Sc': 42.56182857,
 'Ti': 30.59382857,
 'V': 36.76011429,
 'Cr': 34.64182857,
 'Mn': 32.5864,
 'Fe': 30.59382857,
 'Co': 29.04502857,
 'Ni': 27.90982857,
 'Cu': 26.43142857,
 'Zn': 25.34902857,
 'Ga': 23.25211429,
 'Ge': 19.64285714,
 'As': 16.33782857,
 'Se': 13.33702857,
 'Br': 11.10811429,
 'Kr': 9.735314286,
 'Rb': 88.28285714,
 'Sr': 60.29382857,
 'Y': 56.50102857,
 'Zr': 53.34811429,
 'Nb': 49.28502857,
 'Mo': 45.38285714,
 'Tc': 42.10045714,
 'Ru': 39.83131429,
 'Rh': 37.62502857,
 'Pd': 35.90525714,
 'Ag': 34.22571429,
 'Cd': 32.5864,
 'In': 30.59382857,
 'Sn': 26.43142857,
 'Sb': 22.2376,
 'Te': 19.01931429,
 

In [7]:
# Spot check on the first training structure (calc_atoms_area is defined
# outside this section).
calc_atoms_area('mof_cif_train/mof_unit_1.cif', area_dict)

  setting_name, spacegroup))


359.33417144299995

## Train

In [8]:
# Reload the enriched training table so the area feature can be appended to it.
# NOTE(review): the preview already shows an atoms_volume column, which the
# to_csv call earlier in this section does not write -- the file was presumably
# rewritten by a cell not shown here; confirm before re-running.
train_df = pd.read_csv('train_extra_2.csv')
train_df

Unnamed: 0,MOFname,volume [A^3],weight [u],surface_area [m^2/g],void_fraction,void_volume [cm^3/g],functional_groups,metal_linker,organic_linker1,organic_linker2,...,_cell_length_b,_cell_length_c,_cell_angle_alpha,_cell_angle_beta,_cell_angle_gamma,n_atoms,mol_avg_mass,charges,mol_avg_radius,atoms_volume
0,mof_unit_1,1116.667429,875.240600,0.00,0.07899,0.0607,COOH-OEt,3,4,11,...,10.643578,9.890832,1.569125,1.592480,1.575368,75,11.669907,92,59.653333,85.488522
1,mof_unit_2,2769.503842,2211.697211,603.61,0.13794,0.1040,F-OMe,10,44,57,...,17.684225,18.960098,1.746437,1.602488,1.691961,194,11.400559,250,60.515464,318.194213
2,mof_unit_3,1089.818728,773.687960,788.50,0.14874,0.1262,OMe-COOH,2,22,24,...,9.552271,10.631996,1.556872,1.569806,1.577559,82,9.435293,120,59.585366,92.531908
3,mof_unit_4,2205.198301,1304.638720,1441.53,0.21814,0.2220,H-SO3H,9,17,24,...,17.504896,19.274980,1.911789,1.574891,1.580099,112,11.648598,204,63.500000,185.375000
4,mof_unit_5,1137.800963,901.736120,0.00,0.07778,0.0591,NHMe-OH,2,1,22,...,9.699886,10.853274,1.565467,1.622999,1.595312,94,9.593000,90,58.425532,98.775910
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68608,mof_unit_68609,1188.302573,1001.700216,0.00,0.00000,0.0000,Pr-F,3,4,24,...,10.886490,10.193870,1.585497,1.609910,1.583947,119,8.417773,216,59.277311,124.277288
68609,mof_unit_68610,1506.660363,1493.296496,0.00,0.01108,0.0000,SO3H,10,42,46,...,12.576230,15.033794,1.661287,1.730445,1.700483,126,11.851548,126,62.460317,261.730396
68610,mof_unit_68611,2035.532738,1959.518320,0.00,0.00000,0.0000,OPr,4,14,22,...,11.321902,18.608120,1.574297,1.572863,1.034849,204,9.605572,366,61.578431,289.412179
68611,mof_unit_68612,3985.426053,3638.677280,0.00,0.00000,0.0000,OPr-Me,4,4,15,...,11.081428,18.544746,1.578949,1.585477,1.569257,364,9.996454,652,62.054945,541.832410


In [9]:
# Point base_dir back at the training CIF directory for the area extraction.
base_dir = 'mof_cif_train'

In [10]:
# Compute the atomic-area feature for every training MOF, one value per row.
# File names come from the MOFname column rather than the row position, so the
# values stay matched to their rows even if the CSV is reordered or filtered.
train_cif_data = []
for mof_name in tqdm(train_df['MOFname']):
    cif_path = os.path.join(base_dir, f'{mof_name}.cif')
    train_cif_data.append(calc_atoms_area(cif_path, area_dict))

  setting_name, spacegroup))
  setting_name, spacegroup))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  setting_name, spacegroup))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  setting_name, spacegroup))
  (kinds[ind], kind))


  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  setting_name, spacegroup))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))


  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))


  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
100%|████████████████████████████████████████████████████████████████████████████| 68613/68613 [50:08<00:00, 22.81it/s]


In [11]:
train_extra_feats = pd.DataFrame(train_cif_data, columns=['atoms_area'])

In [12]:
train_extra_feats

Unnamed: 0,atoms_area
0,359.334171
1,1020.568686
2,390.662171
3,638.296686
4,428.741029
...,...
68608,552.300571
68609,736.477029
68610,1072.591771
68611,1958.553143


In [13]:
# Place the freshly computed atoms_area column next to the existing
# training features; the last line displays the combined table.
train_extra = pd.concat(
    [train_df, train_extra_feats],
    axis='columns',
)
train_extra

Unnamed: 0,MOFname,volume [A^3],weight [u],surface_area [m^2/g],void_fraction,void_volume [cm^3/g],functional_groups,metal_linker,organic_linker1,organic_linker2,...,_cell_length_c,_cell_angle_alpha,_cell_angle_beta,_cell_angle_gamma,n_atoms,mol_avg_mass,charges,mol_avg_radius,atoms_volume,atoms_area
0,mof_unit_1,1116.667429,875.240600,0.00,0.07899,0.0607,COOH-OEt,3,4,11,...,9.890832,1.569125,1.592480,1.575368,75,11.669907,92,59.653333,85.488522,359.334171
1,mof_unit_2,2769.503842,2211.697211,603.61,0.13794,0.1040,F-OMe,10,44,57,...,18.960098,1.746437,1.602488,1.691961,194,11.400559,250,60.515464,318.194213,1020.568686
2,mof_unit_3,1089.818728,773.687960,788.50,0.14874,0.1262,OMe-COOH,2,22,24,...,10.631996,1.556872,1.569806,1.577559,82,9.435293,120,59.585366,92.531908,390.662171
3,mof_unit_4,2205.198301,1304.638720,1441.53,0.21814,0.2220,H-SO3H,9,17,24,...,19.274980,1.911789,1.574891,1.580099,112,11.648598,204,63.500000,185.375000,638.296686
4,mof_unit_5,1137.800963,901.736120,0.00,0.07778,0.0591,NHMe-OH,2,1,22,...,10.853274,1.565467,1.622999,1.595312,94,9.593000,90,58.425532,98.775910,428.741029
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68608,mof_unit_68609,1188.302573,1001.700216,0.00,0.00000,0.0000,Pr-F,3,4,24,...,10.193870,1.585497,1.609910,1.583947,119,8.417773,216,59.277311,124.277288,552.300571
68609,mof_unit_68610,1506.660363,1493.296496,0.00,0.01108,0.0000,SO3H,10,42,46,...,15.033794,1.661287,1.730445,1.700483,126,11.851548,126,62.460317,261.730396,736.477029
68610,mof_unit_68611,2035.532738,1959.518320,0.00,0.00000,0.0000,OPr,4,14,22,...,18.608120,1.574297,1.572863,1.034849,204,9.605572,366,61.578431,289.412179,1072.591771
68611,mof_unit_68612,3985.426053,3638.677280,0.00,0.00000,0.0000,OPr-Me,4,4,15,...,18.544746,1.578949,1.585477,1.569257,364,9.996454,652,62.054945,541.832410,1958.553143


In [14]:
train_extra.to_csv('train_extra_2.csv', index=False)

## Test

In [15]:
# Load the test feature table that the atoms_area column is appended to.
# NOTE(review): 'test_extra_2.csv' is also the file this section writes at
# its end (test_extra.to_csv), so re-running the section on its own output
# would append a second atoms_area column -- confirm the intended input file.
test_df = pd.read_csv('test_extra_2.csv')
test_df

Unnamed: 0,MOFname,volume [A^3],weight [u],surface_area [m^2/g],void_fraction,void_volume [cm^3/g],functional_groups,metal_linker,organic_linker1,organic_linker2,...,_cell_length_b,_cell_length_c,_cell_angle_alpha,_cell_angle_beta,_cell_angle_gamma,n_atoms,mol_avg_mass,charges,mol_avg_radius,atoms_volume
0,mof_unit_68614,1208.301332,797.70936,586.54,0.11392,0.1039,OEt,2,5,26,...,9.084327,17.070225,1.556313,1.511002,1.575256,84,9.496619,128,59.761905,95.052588
1,mof_unit_68615,4126.414623,3733.65779,852.49,0.21367,0.1422,H-I,4,6,17,...,13.273001,27.655520,1.656643,1.574169,2.081566,158,23.630859,314,70.329114,349.189783
2,mof_unit_68616,1602.148373,747.21048,3155.73,0.33883,0.4375,CN-OH,3,11,17,...,10.697065,10.019137,1.569684,1.574747,1.571707,60,12.453633,94,62.433333,78.343492
3,mof_unit_68617,2436.629312,995.80232,3521.09,0.40464,0.5963,OMe,2,1,28,...,9.587345,23.461267,1.565566,1.590575,1.570093,94,10.593745,160,60.787234,108.832173
4,mof_unit_68618,3123.418006,1337.53800,2678.46,0.38959,0.5479,NO2-Pr,3,8,19,...,16.343084,16.653784,1.517304,1.599094,1.587196,118,11.335136,94,58.711864,120.398097
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16995,mof_unit_85609,32660.944605,4723.68288,5720.14,0.77614,3.2318,OH-NO2,3,3,14,...,38.924145,21.601534,1.565134,1.055786,1.568475,416,11.355106,768,61.711538,517.114417
16996,mof_unit_85610,5070.998617,1499.13262,4017.28,0.59192,1.2058,Me-OMe,3,1,11,...,18.775583,18.721517,1.905919,1.908772,1.910598,124,12.089839,196,62.201613,175.008856
16997,mof_unit_85611,4669.804446,1322.04892,4288.76,0.54950,1.1689,Me-CN,2,7,23,...,16.514905,15.943922,1.570403,1.586349,1.570638,112,11.804214,198,62.821429,135.867491
16998,mof_unit_85612,4682.120862,1213.51148,4331.86,0.60643,1.4091,OH-HCO,3,7,25,...,17.587298,16.887609,1.571806,1.571627,1.605246,104,11.668462,184,60.884615,118.374717


In [16]:
base_dir = 'mof_cif_test'

In [17]:
# Compute the atoms_area feature for every test MOF.
# Each CIF path is built from the row's own MOFname (e.g. mof_unit_68614)
# rather than from a hard-coded 68613 offset, so entry k of test_cif_data
# always belongs to row k of test_df when the two are concatenated later.
test_cif_data = []
for mof_name in tqdm(test_df['MOFname']):
    cif_path = os.path.join(base_dir, f'{mof_name}.cif')
    test_cif_data.append(calc_atoms_area(cif_path, area_dict))

  setting_name, spacegroup))
  setting_name, spacegroup))
  (kinds[ind], kind))
  (kinds[ind], kind))
  setting_name, spacegroup))
  (kinds[ind], kind))
  (kinds[ind], kind))
  setting_name, spacegroup))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
  (kinds[ind], kind))
100%|████████████████████████████████████████████████████████████████████████████| 17000/17000 [12:29<00:00, 22.68it/s]


In [18]:
test_extra_feats = pd.DataFrame(test_cif_data, columns=['atoms_area'])

In [19]:
test_extra_feats

Unnamed: 0,atoms_area
0,401.948800
1,1114.600457
2,314.992229
3,462.158400
4,537.835886
...,...
16995,2125.174857
16996,663.584114
16997,579.618286
16998,510.525714


In [20]:
# Place the freshly computed atoms_area column next to the existing
# test features; the last line displays the combined table.
test_extra = pd.concat(
    [test_df, test_extra_feats],
    axis='columns',
)
test_extra

Unnamed: 0,MOFname,volume [A^3],weight [u],surface_area [m^2/g],void_fraction,void_volume [cm^3/g],functional_groups,metal_linker,organic_linker1,organic_linker2,...,_cell_length_c,_cell_angle_alpha,_cell_angle_beta,_cell_angle_gamma,n_atoms,mol_avg_mass,charges,mol_avg_radius,atoms_volume,atoms_area
0,mof_unit_68614,1208.301332,797.70936,586.54,0.11392,0.1039,OEt,2,5,26,...,17.070225,1.556313,1.511002,1.575256,84,9.496619,128,59.761905,95.052588,401.948800
1,mof_unit_68615,4126.414623,3733.65779,852.49,0.21367,0.1422,H-I,4,6,17,...,27.655520,1.656643,1.574169,2.081566,158,23.630859,314,70.329114,349.189783,1114.600457
2,mof_unit_68616,1602.148373,747.21048,3155.73,0.33883,0.4375,CN-OH,3,11,17,...,10.019137,1.569684,1.574747,1.571707,60,12.453633,94,62.433333,78.343492,314.992229
3,mof_unit_68617,2436.629312,995.80232,3521.09,0.40464,0.5963,OMe,2,1,28,...,23.461267,1.565566,1.590575,1.570093,94,10.593745,160,60.787234,108.832173,462.158400
4,mof_unit_68618,3123.418006,1337.53800,2678.46,0.38959,0.5479,NO2-Pr,3,8,19,...,16.653784,1.517304,1.599094,1.587196,118,11.335136,94,58.711864,120.398097,537.835886
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16995,mof_unit_85609,32660.944605,4723.68288,5720.14,0.77614,3.2318,OH-NO2,3,3,14,...,21.601534,1.565134,1.055786,1.568475,416,11.355106,768,61.711538,517.114417,2125.174857
16996,mof_unit_85610,5070.998617,1499.13262,4017.28,0.59192,1.2058,Me-OMe,3,1,11,...,18.721517,1.905919,1.908772,1.910598,124,12.089839,196,62.201613,175.008856,663.584114
16997,mof_unit_85611,4669.804446,1322.04892,4288.76,0.54950,1.1689,Me-CN,2,7,23,...,15.943922,1.570403,1.586349,1.570638,112,11.804214,198,62.821429,135.867491,579.618286
16998,mof_unit_85612,4682.120862,1213.51148,4331.86,0.60643,1.4091,OH-HCO,3,7,25,...,16.887609,1.571806,1.571627,1.605246,104,11.668462,184,60.884615,118.374717,510.525714


In [21]:
test_extra.to_csv('test_extra_2.csv', index=False)

---

# Get Description

## Metal linker

In [54]:
base_dir = 'metal_linker_xyz'

In [55]:
metal_len = len(os.listdir(base_dir))
metal_len

12

In [60]:
# Build one row of five descriptor values per metal linker, reading
# metal_linker_<k>.xyz for k = 1 .. metal_len from base_dir.
metal_linker_data = []
for linker_no in tqdm(range(1, metal_len + 1)):
    xyz_path = os.path.join(base_dir, f'metal_linker_{linker_no}.xyz')
    hba1, hba2, hbd, mw, mlinker_atoms = get_desc_xyz(xyz_path)
    metal_linker_data.append([hba1, hba2, hbd, mw, mlinker_atoms])

100%|█████████████████████████████████████████████████████████████████████████████████| 12/12 [00:00<00:00, 118.82it/s]


In [61]:
metal_linker_data

[[17.0, 13.0, 0.0, 541.5763999999998, 23],
 [10.0, 8.0, 0.0, 303.12999999999994, 14],
 [10.0, 8.0, 0.0, 306.79799999999994, 14],
 [16.0, 13.0, 0.0, 436.04470000000003, 22],
 [20.0, 16.0, 0.0, 801.72, 28],
 [12.0, 24.0, 0.0, 764.0521960000004, 44],
 [42.0, 32.0, 4.0, 1207.4849600000005, 54],
 [15.0, 10.0, 2.0, 290.9972939999999, 19],
 [12.0, 10.0, 0.0, 309.9198, 16],
 [12.0, 10.0, 0.0, 434.64799999999997, 12],
 [9.0, 8.0, 0.0, 290.856, 13],
 [36.0, 16.0, 0.0, 763.2646400000002, 60]]

In [62]:
# Tabulate the metal-linker descriptors, indexed by the 1-based linker number.
# The index length follows metal_len instead of a literal 13, mirroring how
# the organic-linker table uses organic_len, so the cell keeps working if
# the number of linker files changes.
metal_df = pd.DataFrame(
    data=metal_linker_data,
    columns=['HBA1', 'HBA2', 'HBD', 'MW', 'mlinker_atoms'],
    index=range(1, metal_len + 1),
)
metal_df

Unnamed: 0,HBA1,HBA2,HBD,MW,mlinker_atoms
1,17.0,13.0,0.0,541.5764,23
2,10.0,8.0,0.0,303.13,14
3,10.0,8.0,0.0,306.798,14
4,16.0,13.0,0.0,436.0447,22
5,20.0,16.0,0.0,801.72,28
6,12.0,24.0,0.0,764.052196,44
7,42.0,32.0,4.0,1207.48496,54
8,15.0,10.0,2.0,290.997294,19
9,12.0,10.0,0.0,309.9198,16
10,12.0,10.0,0.0,434.648,12


In [63]:
metal_df.to_csv('metal_linker_data.csv', index=True)

## Organic linker

In [64]:
base_dir = 'organic_linker_xyz'

In [65]:
organic_len = len(os.listdir(base_dir))
organic_len

59

In [66]:
# Build one row of five descriptor values per organic linker, reading
# organic_linker_<k>.xyz for k = 1 .. organic_len from base_dir.
organic_linker_data = []
for linker_no in tqdm(range(1, organic_len + 1)):
    xyz_path = os.path.join(base_dir, f'organic_linker_{linker_no}.xyz')
    hba1, hba2, hbd, mw, olinker_atoms = get_desc_xyz(xyz_path)
    organic_linker_data.append([hba1, hba2, hbd, mw, olinker_atoms])

100%|██████████████████████████████████████████████████████████████████████████████████| 59/59 [00:00<00:00, 80.93it/s]


In [67]:
# NOTE(review): this cell sits in the organic-linker section, right after
# organic_linker_data is filled, yet it displays metal_linker_data --
# organic_linker_data was probably intended; confirm.
metal_linker_data

[[17.0, 13.0, 0.0, 541.5763999999998, 23],
 [10.0, 8.0, 0.0, 303.12999999999994, 14],
 [10.0, 8.0, 0.0, 306.79799999999994, 14],
 [16.0, 13.0, 0.0, 436.04470000000003, 22],
 [20.0, 16.0, 0.0, 801.72, 28],
 [12.0, 24.0, 0.0, 764.0521960000004, 44],
 [42.0, 32.0, 4.0, 1207.4849600000005, 54],
 [15.0, 10.0, 2.0, 290.9972939999999, 19],
 [12.0, 10.0, 0.0, 309.9198, 16],
 [12.0, 10.0, 0.0, 434.64799999999997, 12],
 [9.0, 8.0, 0.0, 290.856, 13],
 [36.0, 16.0, 0.0, 763.2646400000002, 60]]

In [69]:
# Tabulate the organic-linker descriptors, indexed by the 1-based linker
# number; the last line displays the table.
organic_columns = ['HBA1', 'HBA2', 'HBD', 'MW', 'olinker_atoms']
organic_df = pd.DataFrame(
    data=organic_linker_data,
    columns=organic_columns,
    index=range(1, organic_len + 1),
)
organic_df

Unnamed: 0,HBA1,HBA2,HBD,MW,olinker_atoms
1,4.0,0.0,0.0,76.09596,10
2,8.0,0.0,0.0,152.19192,20
3,12.0,0.0,0.0,228.28788,30
4,6.0,0.0,0.0,102.13324,14
5,0.0,0.0,0.0,24.0214,2
6,4.0,0.0,0.0,124.13876,14
7,8.0,0.0,0.0,176.21332,22
8,10.0,2.0,0.0,180.20532,22
9,12.0,0.0,0.0,228.28788,30
10,2.0,0.0,0.0,26.03728,4


In [70]:
organic_df.to_csv('organic_linker_data.csv', index=True)

## Train

In [72]:
train_df = pd.read_csv('train_extra_2.csv')
train_df

Unnamed: 0,MOFname,volume [A^3],weight [u],surface_area [m^2/g],void_fraction,void_volume [cm^3/g],functional_groups,metal_linker,organic_linker1,organic_linker2,...,_cell_length_b,_cell_length_c,_cell_angle_alpha,_cell_angle_beta,_cell_angle_gamma,n_atoms,mol_avg_mass,charges,mol_avg_radius,atoms_volume
0,mof_unit_1,1116.667429,875.240600,0.00,0.07899,0.0607,COOH-OEt,3,4,11,...,10.643578,9.890832,1.569125,1.592480,1.575368,75,11.669907,92,59.653333,85.488522
1,mof_unit_2,2769.503842,2211.697211,603.61,0.13794,0.1040,F-OMe,10,44,57,...,17.684225,18.960098,1.746437,1.602488,1.691961,194,11.400559,250,60.515464,318.194213
2,mof_unit_3,1089.818728,773.687960,788.50,0.14874,0.1262,OMe-COOH,2,22,24,...,9.552271,10.631996,1.556872,1.569806,1.577559,82,9.435293,120,59.585366,92.531908
3,mof_unit_4,2205.198301,1304.638720,1441.53,0.21814,0.2220,H-SO3H,9,17,24,...,17.504896,19.274980,1.911789,1.574891,1.580099,112,11.648598,204,63.500000,185.375000
4,mof_unit_5,1137.800963,901.736120,0.00,0.07778,0.0591,NHMe-OH,2,1,22,...,9.699886,10.853274,1.565467,1.622999,1.595312,94,9.593000,90,58.425532,98.775910
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68608,mof_unit_68609,1188.302573,1001.700216,0.00,0.00000,0.0000,Pr-F,3,4,24,...,10.886490,10.193870,1.585497,1.609910,1.583947,119,8.417773,216,59.277311,124.277288
68609,mof_unit_68610,1506.660363,1493.296496,0.00,0.01108,0.0000,SO3H,10,42,46,...,12.576230,15.033794,1.661287,1.730445,1.700483,126,11.851548,126,62.460317,261.730396
68610,mof_unit_68611,2035.532738,1959.518320,0.00,0.00000,0.0000,OPr,4,14,22,...,11.321902,18.608120,1.574297,1.572863,1.034849,204,9.605572,366,61.578431,289.412179
68611,mof_unit_68612,3985.426053,3638.677280,0.00,0.00000,0.0000,OPr-Me,4,4,15,...,11.081428,18.544746,1.578949,1.585477,1.569257,364,9.996454,652,62.054945,541.832410


In [73]:
base_dir = 'mof_cif_train'

In [74]:
# Collect the seven CIF-derived descriptor values for every training MOF.
# Each CIF path is built from the row's own MOFname instead of from
# range(len(train_df)) + 1, so entry k of train_cif_desc always belongs to
# row k of train_df when the two are concatenated later.
train_cif_desc = []
for mof_name in tqdm(train_df['MOFname']):
    cif_path = os.path.join(base_dir, f'{mof_name}.cif')
    hba1, hba2, hbd, nF, logP, MR, TPSA = get_desc_cif(cif_path)
    train_cif_desc.append([hba1, hba2, hbd, nF, logP, MR, TPSA])

100%|████████████████████████████████████████████████████████████████████████████| 68613/68613 [37:50<00:00, 30.21it/s]


In [75]:
train_extra_desc = pd.DataFrame(train_cif_desc, columns=['hba1', 'hba2', 'hbd', 'nF', 'logP', 'MR', 'TPSA'])

In [76]:
train_extra_desc

Unnamed: 0,hba1,hba2,hbd,nF,logP,MR,TPSA
0,45.0,23.0,6.0,0.0,0.0,0.0,0.0
1,106.0,35.0,0.0,11.0,0.0,0.0,0.0
2,52.0,14.0,2.0,0.0,0.0,0.0,0.0
3,55.0,23.0,1.0,0.0,0.0,0.0,0.0
4,66.0,26.0,16.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...
68608,67.0,10.0,0.0,5.0,0.0,0.0,0.0
68609,83.0,28.0,1.0,0.0,0.0,0.0,0.0
68610,116.0,32.0,0.0,0.0,0.0,0.0,0.0
68611,204.0,60.0,0.0,0.0,0.0,0.0,0.0


In [77]:
train_extra_desc.describe()

Unnamed: 0,hba1,hba2,hbd,nF,logP,MR,TPSA
count,68613.0,68613.0,68613.0,68613.0,68613.0,68613.0,68613.0
mean,74.307755,26.261292,3.024689,0.824173,0.0,0.0,0.0
std,56.098649,21.102319,6.228315,3.846238,0.0,0.0,0.0
min,10.0,4.0,0.0,0.0,0.0,0.0,0.0
25%,40.0,13.0,0.0,0.0,0.0,0.0,0.0
50%,57.0,20.0,0.0,0.0,0.0,0.0,0.0
75%,88.0,30.0,4.0,0.0,0.0,0.0,0.0
max,904.0,480.0,144.0,128.0,0.0,0.0,0.0


In [78]:
# Keep only the descriptors that vary; logP, MR and TPSA are constant 0 in
# the describe() output above.
kept_columns = ['hba1', 'hba2', 'hbd', 'nF']
train_extra_desc = train_extra_desc[kept_columns]

In [79]:
train_extra_desc

Unnamed: 0,hba1,hba2,hbd,nF
0,45.0,23.0,6.0,0.0
1,106.0,35.0,0.0,11.0
2,52.0,14.0,2.0,0.0
3,55.0,23.0,1.0,0.0
4,66.0,26.0,16.0,0.0
...,...,...,...,...
68608,67.0,10.0,0.0,5.0
68609,83.0,28.0,1.0,0.0
68610,116.0,32.0,0.0,0.0
68611,204.0,60.0,0.0,0.0


**Note:** logP, MR, and TPSA are 0 for every row (see the `describe()` output above), so these columns are dropped.

In [80]:
train_extra = pd.concat([train_df, train_extra_desc], axis=1)
train_extra

Unnamed: 0,MOFname,volume [A^3],weight [u],surface_area [m^2/g],void_fraction,void_volume [cm^3/g],functional_groups,metal_linker,organic_linker1,organic_linker2,...,_cell_angle_gamma,n_atoms,mol_avg_mass,charges,mol_avg_radius,atoms_volume,hba1,hba2,hbd,nF
0,mof_unit_1,1116.667429,875.240600,0.00,0.07899,0.0607,COOH-OEt,3,4,11,...,1.575368,75,11.669907,92,59.653333,85.488522,45.0,23.0,6.0,0.0
1,mof_unit_2,2769.503842,2211.697211,603.61,0.13794,0.1040,F-OMe,10,44,57,...,1.691961,194,11.400559,250,60.515464,318.194213,106.0,35.0,0.0,11.0
2,mof_unit_3,1089.818728,773.687960,788.50,0.14874,0.1262,OMe-COOH,2,22,24,...,1.577559,82,9.435293,120,59.585366,92.531908,52.0,14.0,2.0,0.0
3,mof_unit_4,2205.198301,1304.638720,1441.53,0.21814,0.2220,H-SO3H,9,17,24,...,1.580099,112,11.648598,204,63.500000,185.375000,55.0,23.0,1.0,0.0
4,mof_unit_5,1137.800963,901.736120,0.00,0.07778,0.0591,NHMe-OH,2,1,22,...,1.595312,94,9.593000,90,58.425532,98.775910,66.0,26.0,16.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68608,mof_unit_68609,1188.302573,1001.700216,0.00,0.00000,0.0000,Pr-F,3,4,24,...,1.583947,119,8.417773,216,59.277311,124.277288,67.0,10.0,0.0,5.0
68609,mof_unit_68610,1506.660363,1493.296496,0.00,0.01108,0.0000,SO3H,10,42,46,...,1.700483,126,11.851548,126,62.460317,261.730396,83.0,28.0,1.0,0.0
68610,mof_unit_68611,2035.532738,1959.518320,0.00,0.00000,0.0000,OPr,4,14,22,...,1.034849,204,9.605572,366,61.578431,289.412179,116.0,32.0,0.0,0.0
68611,mof_unit_68612,3985.426053,3638.677280,0.00,0.00000,0.0000,OPr-Me,4,4,15,...,1.569257,364,9.996454,652,62.054945,541.832410,204.0,60.0,0.0,0.0


In [81]:
train_extra.to_csv('train_extra_3.csv', index=False)

## Test

In [82]:
test_df = pd.read_csv('test_extra_2.csv')
test_df

Unnamed: 0,MOFname,volume [A^3],weight [u],surface_area [m^2/g],void_fraction,void_volume [cm^3/g],functional_groups,metal_linker,organic_linker1,organic_linker2,...,_cell_length_b,_cell_length_c,_cell_angle_alpha,_cell_angle_beta,_cell_angle_gamma,n_atoms,mol_avg_mass,charges,mol_avg_radius,atoms_volume
0,mof_unit_68614,1208.301332,797.70936,586.54,0.11392,0.1039,OEt,2,5,26,...,9.084327,17.070225,1.556313,1.511002,1.575256,84,9.496619,128,59.761905,95.052588
1,mof_unit_68615,4126.414623,3733.65779,852.49,0.21367,0.1422,H-I,4,6,17,...,13.273001,27.655520,1.656643,1.574169,2.081566,158,23.630859,314,70.329114,349.189783
2,mof_unit_68616,1602.148373,747.21048,3155.73,0.33883,0.4375,CN-OH,3,11,17,...,10.697065,10.019137,1.569684,1.574747,1.571707,60,12.453633,94,62.433333,78.343492
3,mof_unit_68617,2436.629312,995.80232,3521.09,0.40464,0.5963,OMe,2,1,28,...,9.587345,23.461267,1.565566,1.590575,1.570093,94,10.593745,160,60.787234,108.832173
4,mof_unit_68618,3123.418006,1337.53800,2678.46,0.38959,0.5479,NO2-Pr,3,8,19,...,16.343084,16.653784,1.517304,1.599094,1.587196,118,11.335136,94,58.711864,120.398097
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16995,mof_unit_85609,32660.944605,4723.68288,5720.14,0.77614,3.2318,OH-NO2,3,3,14,...,38.924145,21.601534,1.565134,1.055786,1.568475,416,11.355106,768,61.711538,517.114417
16996,mof_unit_85610,5070.998617,1499.13262,4017.28,0.59192,1.2058,Me-OMe,3,1,11,...,18.775583,18.721517,1.905919,1.908772,1.910598,124,12.089839,196,62.201613,175.008856
16997,mof_unit_85611,4669.804446,1322.04892,4288.76,0.54950,1.1689,Me-CN,2,7,23,...,16.514905,15.943922,1.570403,1.586349,1.570638,112,11.804214,198,62.821429,135.867491
16998,mof_unit_85612,4682.120862,1213.51148,4331.86,0.60643,1.4091,OH-HCO,3,7,25,...,17.587298,16.887609,1.571806,1.571627,1.605246,104,11.668462,184,60.884615,118.374717


In [83]:
base_dir = 'mof_cif_test'

In [84]:
# Collect the seven CIF-derived descriptor values for every test MOF.
# Each CIF path is built from the row's own MOFname (e.g. mof_unit_68614)
# rather than from a hard-coded 68613 offset, so entry k of test_cif_desc
# always belongs to row k of test_df when the two are concatenated later.
test_cif_desc = []
for mof_name in tqdm(test_df['MOFname']):
    cif_path = os.path.join(base_dir, f'{mof_name}.cif')
    hba1, hba2, hbd, nF, logP, MR, TPSA = get_desc_cif(cif_path)
    test_cif_desc.append([hba1, hba2, hbd, nF, logP, MR, TPSA])

100%|████████████████████████████████████████████████████████████████████████████| 17000/17000 [09:10<00:00, 30.87it/s]


In [85]:
test_extra_desc = pd.DataFrame(test_cif_desc, columns=['hba1', 'hba2', 'hbd', 'nF', 'logP', 'MR', 'TPSA'])

In [86]:
test_extra_desc

Unnamed: 0,hba1,hba2,hbd,nF,logP,MR,TPSA
0,52.0,14.0,0.0,0.0,0.0,0.0,0.0
1,53.0,26.0,0.0,0.0,0.0,0.0,0.0
2,30.0,16.0,4.0,0.0,0.0,0.0,0.0
3,50.0,20.0,0.0,0.0,0.0,0.0,0.0
4,66.0,10.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...
16995,200.0,60.0,12.0,0.0,0.0,0.0,0.0
16996,71.0,27.0,0.0,0.0,0.0,0.0,0.0
16997,48.0,28.0,0.0,0.0,0.0,0.0,0.0
16998,50.0,26.0,4.0,0.0,0.0,0.0,0.0


In [87]:
# Drop the all-zero columns (logP, MR, TPSA) by keeping only the
# remaining descriptor columns.
kept_columns = ['hba1', 'hba2', 'hbd', 'nF']
test_extra_desc = test_extra_desc[kept_columns]

In [88]:
test_extra = pd.concat([test_df, test_extra_desc], axis=1)
test_extra

Unnamed: 0,MOFname,volume [A^3],weight [u],surface_area [m^2/g],void_fraction,void_volume [cm^3/g],functional_groups,metal_linker,organic_linker1,organic_linker2,...,_cell_angle_gamma,n_atoms,mol_avg_mass,charges,mol_avg_radius,atoms_volume,hba1,hba2,hbd,nF
0,mof_unit_68614,1208.301332,797.70936,586.54,0.11392,0.1039,OEt,2,5,26,...,1.575256,84,9.496619,128,59.761905,95.052588,52.0,14.0,0.0,0.0
1,mof_unit_68615,4126.414623,3733.65779,852.49,0.21367,0.1422,H-I,4,6,17,...,2.081566,158,23.630859,314,70.329114,349.189783,53.0,26.0,0.0,0.0
2,mof_unit_68616,1602.148373,747.21048,3155.73,0.33883,0.4375,CN-OH,3,11,17,...,1.571707,60,12.453633,94,62.433333,78.343492,30.0,16.0,4.0,0.0
3,mof_unit_68617,2436.629312,995.80232,3521.09,0.40464,0.5963,OMe,2,1,28,...,1.570093,94,10.593745,160,60.787234,108.832173,50.0,20.0,0.0,0.0
4,mof_unit_68618,3123.418006,1337.53800,2678.46,0.38959,0.5479,NO2-Pr,3,8,19,...,1.587196,118,11.335136,94,58.711864,120.398097,66.0,10.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16995,mof_unit_85609,32660.944605,4723.68288,5720.14,0.77614,3.2318,OH-NO2,3,3,14,...,1.568475,416,11.355106,768,61.711538,517.114417,200.0,60.0,12.0,0.0
16996,mof_unit_85610,5070.998617,1499.13262,4017.28,0.59192,1.2058,Me-OMe,3,1,11,...,1.910598,124,12.089839,196,62.201613,175.008856,71.0,27.0,0.0,0.0
16997,mof_unit_85611,4669.804446,1322.04892,4288.76,0.54950,1.1689,Me-CN,2,7,23,...,1.570638,112,11.804214,198,62.821429,135.867491,48.0,28.0,0.0,0.0
16998,mof_unit_85612,4682.120862,1213.51148,4331.86,0.60643,1.4091,OH-HCO,3,7,25,...,1.605246,104,11.668462,184,60.884615,118.374717,50.0,26.0,4.0,0.0


In [89]:
test_extra.to_csv('test_extra_3.csv', index=False)