# Advanced File Parsing

Parse an sdf file with RegEx

In [1]:
import pandas as pd
import numpy as np
import re

with open('SBE-b-CD-data.sdf') as f:
    sdf_string = f.read()
    
    # Info for molecules is given between $$$$
    pattern = re.compile('(?<=\$\$\$\$)((.|\n)*?)(?=\$\$\$\$)')
    matches = pattern.finditer(sdf_string)
    

In [2]:
#print(f'{len(list(matches))} molecules found in file')

In [3]:
molecule_info = []

for match in matches:
    
    # Strip to get rid of whitespace around the text.
    molecule_string = str(match.group().strip())
    
    # Get the molecule name
    molecule_name = molecule_string[:molecule_string.find('\n')]
    
    # Set up dictionary
    one_info = {}
    one_info['name'] = molecule_name
    
    
    # Get any key-value properties
    key_pattern = re.compile('\<(.*)\>\s[^\n]*')
    key_string = key_pattern.finditer(molecule_string)
    
    for k in key_string:
        search_string = k.group()
        
        # Group and span
        info_location = re.search('(?<=\<)((.|\n)*?)(?=\>)', search_string)
        key = info_location.group()
        end_key = info_location.span()[-1]
        value = search_string[end_key+1:].strip()

        try:
            one_info[key] = float(value)
        except ValueError:
            one_info[key] = value
    
    # Find the molecule coordinates.
    pattern = re.compile('([-,+]?[0-9]\.[0-9]{4}\s*){3}(\D)')
    coordinates = pattern.finditer(molecule_string)
    
    # Get coordinates and element name.
    molecule_coordinates = []
    molecule_elements = []
    for coordinate in coordinates:
        coord_info = coordinate.group().split()
        element = coord_info[-1]
        coordinates = coord_info[:-1]
        molecule_coordinates.append(coordinates)
        molecule_elements.append(element)
    
    one_info['coordinates'] = np.array(molecule_coordinates)
    one_info['elements'] = molecule_elements

    molecule_info.append(one_info)

In [4]:
df = pd.DataFrame.from_dict(molecule_info)

In [5]:
df

Unnamed: 0,name,ID,Temperature_K,Kapp,coordinates,elements
0,(-)__Sulpiride,(-)__Sulpiride,293.0,35.0,"[[0.3200, 0.2956, 0.0000], [1.1450, 0.2956, 0....","[C, C, C, C, C, C, C, O, N, O, C, C, N, C, C, ..."
1,1-naphthol,1-naphthol,298.0,1720.0,"[[-2.6563, 3.0063, 0.0000], [-3.3707, 3.4188, ...","[C, C, C, C, C, C, C, C, C, C, O]"
2,1-naphthylamine,1-naphthylamine,293.0,518.0,"[[0.7145, 1.2375, 0.0000], [0.7145, 0.4125, 0....","[N, C, C, C, C, C, C, C, C, C, C]"
3,1-phenylpyrrole,1-phenylpyrrole,293.0,555.0,"[[0.6027, 0.0000, 0.0000], [1.0877, 0.6674, 0....","[N, C, C, C, C, C, C, C, C, C, C]"
4,17-a-methyltestosterone,17-a-methyltestosterone,298.0,12933.0,"[[0.5086, -0.3695, 0.0000], [-0.2019, 0.0400, ...","[C, C, C, C, C, C, C, C, C, C, C, C, C, C, C, ..."
...,...,...,...,...,...,...
215,vanillin__acetate,vanillin__acetate,293.0,107.0,"[[-2.4750, -1.0717, 0.0000], [-2.0625, -0.3572...","[O, C, C, C, C, O, C, C, C, C, C, C, O, O]"
216,warfarin,warfarin,298.0,5542.0,"[[-2.1027, 1.6193, 0.0000], [-2.7138, 1.5987, ...","[C, C, C, C, C, C, C, O, O, O, C, C, C, C, C, ..."
217,warfarin_anion,warfarin_anion,298.0,130.0,"[[-2.8624, 2.2044, 0.0000], [-3.6944, 2.1764, ...","[C, C, C, C, C, C, C, O, O, O, C, C, C, C, C, ..."
218,xanthine,xanthine,298.0,0.0,"[[0.0360, -9.4160, 0.0000], [8.6112, -9.4160, ...","[C, N, C, N, C, O]"
