In [1]:
from ccdc import io
from ccdc.descriptors import CrystalDescriptors
import pandas as pd
import numpy as np
import sklearn as sk
import os

In [2]:
# Collect the names of the CIF files that sit directly in the working directory.
current_directory = os.getcwd()
files_in_directory = os.listdir(current_directory)
# os.listdir returns names in arbitrary order; sort them so that every loop
# over cif_files visits the structures in the same order on every run.
cif_files = sorted(file for file in files_in_directory if file.endswith(".cif"))

In [14]:
import re
def parse_formula(formula: str) -> dict:
    '''Convert a formula string into a dict of element symbol -> atom count.

       This takes a string in the form 'C48 H32 Co4 N8 O16' and converts it
       into a dict in the form {'C': 48, 'H': 32, ... }

       A symbol with no trailing digits counts as 1. A symbol that appears
       more than once has its counts added together. An empty string gives
       an empty dict.
    '''
    # One capital letter plus any lowercase letters is taken as an element
    # symbol; the digits directly after it (possibly none) are its count.
    pattern = r'([A-Z][a-z]*)(\d*)'
    parsed_formula = {}

    for element, count in re.findall(pattern, formula):
        amount = int(count) if count else 1
        # Accumulate instead of assigning, so a repeated symbol is not
        # silently replaced by its last occurrence.
        parsed_formula[element] = parsed_formula.get(element, 0) + amount

    return parsed_formula

In [None]:
# Gather every element symbol that occurs in the formula of any structure
# read from cif_files.
formulas = []  # not read in this cell; kept so existing notebook state is unchanged
unique_elements = set()
count = 0      # not read in this cell; kept so existing notebook state is unchanged
for cif in cif_files:
    # Use the reader as a context manager so it is closed as soon as the
    # file has been consumed, instead of staying open for the whole session.
    with io.EntryReader(cif) as reader:
        for mol in reader:
            crystal = mol.crystal
            # Updating a set from a dict adds the dict's keys, i.e. the
            # element symbols returned by parse_formula.
            unique_elements.update(parse_formula(crystal.formula))

In [12]:
import sys

# Build one feature record per structure and write them all to mof_data.csv.

# Number of highest-intensity pattern points stored per structure. Defined
# once here rather than being re-assigned on every loop iteration.
XYE_POINTS = 10

# A set has no defined iteration order, so fix the order of the element
# columns once up front; otherwise the CSV columns can come out in a
# different order from one kernel session to the next.
element_columns = sorted(unique_elements)

mofs = []
count = 0
for cif in cif_files:
    # Context manager closes the reader once the file has been consumed.
    with io.EntryReader(cif) as reader:
        for mol in reader:
            crystal = mol.crystal

            pattern = CrystalDescriptors.PowderPattern.from_crystal(crystal)
            two_theta   = pattern.two_theta # x values in an xye file
            intensities = pattern.intensity # y values in an xye file
            # Pair as (intensity, two_theta) so a descending sort puts the
            # most intense points first.
            zipped      = sorted(((y, x) for x, y in zip(two_theta, intensities)), reverse=True)

            parsed_formula = parse_formula(crystal.formula)

            mof_data = {
                'molecule_name': cif,
                'disorder': int(crystal.has_disorder),
                'crystal_system': crystal.crystal_system,
                'packing_coefficient': crystal.packing_coefficient,
                'void_volume': crystal.void_volume(),
                'habit': mol.habit,
            }
            # Store the two-theta value of each of the XYE_POINTS most intense
            # points. Slicing (rather than indexing 0..XYE_POINTS-1) means a
            # pattern with fewer points leaves the remaining columns empty
            # instead of raising IndexError.
            # NOTE(review): this takes the highest-intensity points, so
            # neighbouring points of one strong peak may all be selected --
            # confirm that is what "peaks" should mean here.
            for x, (_, angle) in enumerate(zipped[:XYE_POINTS]):
                mof_data['2theta_' + str(x)] = angle
            # One column per known element holding its count in this formula
            # (0 when the element is absent). These are counts, not a 0/1
            # one-hot encoding.
            mof_data.update({key: parsed_formula.get(key, 0) for key in element_columns})
            print(mof_data)

            mofs.append(mof_data)

            print(count)
            count += 1

df = pd.DataFrame(mofs)
df.to_csv('mof_data.csv')


C48 H32 Co4 N8 O16
0
C288 H96 Cl48 Mn48 O192
1
C288 H96 Cl48 Co48 O192
2
C88 H48 N8 O24 S8 Tb2
3
O64 P16 U20
4
O64 P16 U20
5
C416 H288 Cu16 N64
6
C56 H16 La8 N16 O48
7
C240 H120 In12 O96 S12
8
C56 H16 Ce8 N16 O48
9
C56 H16 N16 O48 Pr8
10
C56 H16 N16 Nd8 O48
11
C56 H16 N16 O48 Sm8
12
C56 H16 Eu8 N16 O48
13
C56 H16 Gd8 N16 O48
14
Co3 Mo12 O52 P8
15
C160 H112 Co12 N40 O32
16
H16 Be12 O48 P16
17
C96 H56 O56 Zn16
18
C72 H102 Br6 N12 O12 Zn6
19
C90 H144 N144 Zn6
20
C88 H96 Cu8 N8 O32
21
C92 H104 Cu8 N8 O32
22
C72 H32 La4 N12 O32
23
C72 H32 Er4 N12 O32
24
C60 H72 K12 N12 O48 Pb12 S12
25
C72 H72 Co36 O144
26
C72 H24 O52 Zn12
27
C86 H50 O32 P4 Tb4
28
C126 H78 O12 Yb2
29
C64 H72 Br4 Cd2 N24 O8
30
C64 H72 Cd2 I4 N24 O8
31
C260 H190 Mn10 N50 O40
32
C44 H38 N10 O8 Zn2
33
C208 H160 Cd12 O76
34
C48 H168 Cs16 Cu16 O96 Si48
35
C72 H44 Cu4 N20 O24
36
C232 H128 Cd4 Cl8 N24 O64
37
C432 H216 Ag72 Ca36 N72 O144 S72
38


KeyboardInterrupt: 